[v2,bpf-next,5/7] bpf: Consider non-owning refs to refcounted nodes RCU protected

Message ID 20230821193311.3290257-6-davemarchevsky@fb.com (mailing list archive)
State Accepted
Commit 0816b8c6bf7fc87cec4273dc199e8f0764b9e7b1
Delegated to: BPF
Series BPF Refcount followups 3: bpf_mem_free_rcu refcounted nodes

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2828 this patch: 2828
netdev/cc_maintainers warning 7 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com sdf@google.com song@kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1526 this patch: 1526
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2856 this patch: 2856
netdev/checkpatch warning WARNING: line length of 108 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix

Commit Message

Dave Marchevsky Aug. 21, 2023, 7:33 p.m. UTC
An earlier patch in the series ensures that the underlying memory of
nodes with bpf_refcount - which can have multiple owners - is not reused
until RCU grace period has elapsed. This prevents
use-after-free with non-owning references that may point to
recently-freed memory. While RCU read lock is held, it's safe to
dereference such a non-owning ref, as by definition RCU GP couldn't have
elapsed and therefore underlying memory couldn't have been reused.

From the perspective of verifier "trustedness" non-owning refs to
refcounted nodes are now trusted only in RCU CS and therefore should no
longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
MEM_RCU in order to reflect this new state.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
---
 include/linux/bpf.h   |  3 ++-
 kernel/bpf/verifier.c | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)
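
For illustration, below is a minimal BPF program sketch of the pattern this
patch affects. It is not code from the series; the struct layout and the
bpf_experimental.h/selftest-style macros (private(), __contains(), the rbtree
kfunc wrappers) are assumptions modeled loosely on the BPF selftests. The
point is that the non-owning ref returned by bpf_rbtree_first() is only
dereferenced inside the bpf_spin_lock CS, which (with preemption disabled)
implies an RCU CS, so tagging the reg MEM_RCU matches how the underlying
memory is actually protected:

/* Hypothetical sketch, not the series' selftest code. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

/* as in the BPF selftests */
#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))

struct node_data {
	long key;
	struct bpf_refcount ref;	/* refcounted: node can have multiple owners */
	struct bpf_rb_node r;
};

private(A) struct bpf_spin_lock glock;
private(A) struct bpf_rb_root groot __contains(node_data, r);

SEC("tc")
int read_refcounted_node(void *ctx)
{
	struct bpf_rb_node *rb;
	struct node_data *n;
	long key = 0;

	bpf_spin_lock(&glock);
	rb = bpf_rbtree_first(&groot);	/* non-owning ref to a refcounted node */
	if (rb) {
		n = container_of(rb, struct node_data, r);
		/* With this patch n's reg type is
		 * PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU.
		 * The read below is safe: the node's memory is freed via
		 * bpf_mem_free_rcu(), so no RCU GP can have elapsed here.
		 */
		key = n->key;
	}
	bpf_spin_unlock(&glock);
	return key != 0;
}

char _license[] SEC("license") = "GPL";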

Comments

Yonghong Song Aug. 22, 2023, 2:37 a.m. UTC | #1
On 8/21/23 12:33 PM, Dave Marchevsky wrote:
> An earlier patch in the series ensures that the underlying memory of
> nodes with bpf_refcount - which can have multiple owners - is not reused
> until RCU grace period has elapsed. This prevents
> use-after-free with non-owning references that may point to
> recently-freed memory. While RCU read lock is held, it's safe to
> dereference such a non-owning ref, as by definition RCU GP couldn't have
> elapsed and therefore underlying memory couldn't have been reused.
> 
>  From the perspective of verifier "trustedness" non-owning refs to
> refcounted nodes are now trusted only in RCU CS and therefore should no
> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
> MEM_RCU in order to reflect this new state.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>   include/linux/bpf.h   |  3 ++-
>   kernel/bpf/verifier.c | 13 ++++++++++++-
>   2 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index eced6400f778..12596af59c00 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>   	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
>   
>   	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
> -	 * Currently only valid for linked-list and rbtree nodes.
> +	 * Currently only valid for linked-list and rbtree nodes. If the nodes
> +	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>   	 */
>   	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
>   
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 8db0afa5985c..55607ab30522 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>   	case PTR_TO_BTF_ID | PTR_TRUSTED:
>   	case PTR_TO_BTF_ID | MEM_RCU:
>   	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
> +	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>   		/* When referenced PTR_TO_BTF_ID is passed to release function,
>   		 * its fixed offset must be 0. In the other cases, fixed offset
>   		 * can be non-zero. This was already checked above. So pass
> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>   static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>   {
>   	struct bpf_verifier_state *state = env->cur_state;
> +	struct btf_record *rec = reg_btf_record(reg);
>   
>   	if (!state->active_lock.ptr) {
>   		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>   	}
>   
>   	reg->type |= NON_OWN_REF;
> +	if (rec->refcount_off >= 0)
> +		reg->type |= MEM_RCU;

Should the above MEM_RCU marking be done only when the reg access is in
an RCU critical section?

I think we still have issues for state resetting
with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
will try to convert the reg state to PTR_UNTRUSTED.

Let us say reg state is
   PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU

(1). If hitting bpf_spin_unlock(), since MEM_RCU is in
the reg state, the state should become
   PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
Some additional code might be needed so we won't have
verifier complaints about ref_obj_id == 0.

(2). If hitting bpf_rcu_read_unlock(), the state should become
   PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
since the register access is still in the bpf_spin_lock() region.

Does this make sense?

> +
>   	return 0;
>   }
>   
> @@ -11328,6 +11333,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>   		struct bpf_func_state *state;
>   		struct bpf_reg_state *reg;
>   
> +		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
> +			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
> +			return -EACCES;
> +		}
> +
>   		if (rcu_lock) {
>   			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>   			return -EINVAL;
> @@ -16689,7 +16699,8 @@ static int do_check(struct bpf_verifier_env *env)
>   					return -EINVAL;
>   				}
>   
> -				if (env->cur_state->active_rcu_lock) {
> +				if (env->cur_state->active_rcu_lock &&
> +				    !in_rbtree_lock_required_cb(env)) {
>   					verbose(env, "bpf_rcu_read_unlock is missing\n");
>   					return -EINVAL;
>   				}
Yonghong Song Aug. 22, 2023, 3:19 a.m. UTC | #2
On 8/21/23 7:37 PM, Yonghong Song wrote:
> 
> 
> On 8/21/23 12:33 PM, Dave Marchevsky wrote:
>> An earlier patch in the series ensures that the underlying memory of
>> nodes with bpf_refcount - which can have multiple owners - is not reused
>> until RCU grace period has elapsed. This prevents
>> use-after-free with non-owning references that may point to
>> recently-freed memory. While RCU read lock is held, it's safe to
>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>> elapsed and therefore underlying memory couldn't have been reused.
>>
>>  From the perspective of verifier "trustedness" non-owning refs to
>> refcounted nodes are now trusted only in RCU CS and therefore should no
>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>> MEM_RCU in order to reflect this new state.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> ---
>>   include/linux/bpf.h   |  3 ++-
>>   kernel/bpf/verifier.c | 13 ++++++++++++-
>>   2 files changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index eced6400f778..12596af59c00 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>       MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
>>       /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are 
>> non-owning.
>> -     * Currently only valid for linked-list and rbtree nodes.
>> +     * Currently only valid for linked-list and rbtree nodes. If the 
>> nodes
>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>>        */
>>       NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 8db0afa5985c..55607ab30522 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct 
>> bpf_verifier_env *env,
>>       case PTR_TO_BTF_ID | PTR_TRUSTED:
>>       case PTR_TO_BTF_ID | MEM_RCU:
>>       case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>           /* When referenced PTR_TO_BTF_ID is passed to release function,
>>            * its fixed offset must be 0. In the other cases, fixed offset
>>            * can be non-zero. This was already checked above. So pass
>> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct 
>> bpf_verifier_env *env,
>>   static int ref_set_non_owning(struct bpf_verifier_env *env, struct 
>> bpf_reg_state *reg)
>>   {
>>       struct bpf_verifier_state *state = env->cur_state;
>> +    struct btf_record *rec = reg_btf_record(reg);
>>       if (!state->active_lock.ptr) {
>>           verbose(env, "verifier internal error: ref_set_non_owning 
>> w/o active lock\n");
>> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct 
>> bpf_verifier_env *env, struct bpf_reg_state
>>       }
>>       reg->type |= NON_OWN_REF;
>> +    if (rec->refcount_off >= 0)
>> +        reg->type |= MEM_RCU;
> 
> Should the above MEM_RCU marking be done unless reg access is in
> rcu critical section?
> 
> I think we still have issues for state resetting
> with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
> will try to convert the reg state to PTR_UNTRUSTED.
> 
> Let us say reg state is
>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU
> 
> (1). If hitting bpf_spin_unlock(), since MEM_RCU is in
> the reg state, the state should become
>    PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
> some additional code might be needed so we wont have
> verifier complaints about ref_obj_id == 0.
> 
> (2). If hitting bpf_rcu_read_unlock(), the state should become
>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
> since register access still in bpf_spin_lock() region.
> 
> Does this make sense?

Okay, it seems scenario (2) is not possible, as bpf_rcu_read_unlock()
is not allowed in a bpf spin lock region.

> 
>> +
>>       return 0;
>>   }
>> @@ -11328,6 +11333,11 @@ static int check_kfunc_call(struct 
>> bpf_verifier_env *env, struct bpf_insn *insn,
>>           struct bpf_func_state *state;
>>           struct bpf_reg_state *reg;
>> +        if (in_rbtree_lock_required_cb(env) && (rcu_lock || 
>> rcu_unlock)) {
>> +            verbose(env, "Calling bpf_rcu_read_{lock,unlock} in 
>> unnecessary rbtree callback\n");
>> +            return -EACCES;
>> +        }
>> +
>>           if (rcu_lock) {
>>               verbose(env, "nested rcu read lock (kernel function 
>> %s)\n", func_name);
>>               return -EINVAL;
>> @@ -16689,7 +16699,8 @@ static int do_check(struct bpf_verifier_env *env)
>>                       return -EINVAL;
>>                   }
>> -                if (env->cur_state->active_rcu_lock) {
>> +                if (env->cur_state->active_rcu_lock &&
>> +                    !in_rbtree_lock_required_cb(env)) {
>>                       verbose(env, "bpf_rcu_read_unlock is missing\n");
>>                       return -EINVAL;
>>                   }
>
David Marchevsky Aug. 22, 2023, 5:47 a.m. UTC | #3
On 8/21/23 10:37 PM, Yonghong Song wrote:
> 
> 
> On 8/21/23 12:33 PM, Dave Marchevsky wrote:
>> An earlier patch in the series ensures that the underlying memory of
>> nodes with bpf_refcount - which can have multiple owners - is not reused
>> until RCU grace period has elapsed. This prevents
>> use-after-free with non-owning references that may point to
>> recently-freed memory. While RCU read lock is held, it's safe to
>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>> elapsed and therefore underlying memory couldn't have been reused.
>>
>>  From the perspective of verifier "trustedness" non-owning refs to
>> refcounted nodes are now trusted only in RCU CS and therefore should no
>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>> MEM_RCU in order to reflect this new state.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> ---
>>   include/linux/bpf.h   |  3 ++-
>>   kernel/bpf/verifier.c | 13 ++++++++++++-
>>   2 files changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index eced6400f778..12596af59c00 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>       MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
>>         /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
>> -     * Currently only valid for linked-list and rbtree nodes.
>> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>>        */
>>       NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
>>   diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 8db0afa5985c..55607ab30522 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>>       case PTR_TO_BTF_ID | PTR_TRUSTED:
>>       case PTR_TO_BTF_ID | MEM_RCU:
>>       case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>           /* When referenced PTR_TO_BTF_ID is passed to release function,
>>            * its fixed offset must be 0. In the other cases, fixed offset
>>            * can be non-zero. This was already checked above. So pass
>> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>>   static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>>   {
>>       struct bpf_verifier_state *state = env->cur_state;
>> +    struct btf_record *rec = reg_btf_record(reg);
>>         if (!state->active_lock.ptr) {
>>           verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
>> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>>       }
>>         reg->type |= NON_OWN_REF;
>> +    if (rec->refcount_off >= 0)
>> +        reg->type |= MEM_RCU;
> 
> Should the above MEM_RCU marking be done unless reg access is in
> rcu critical section?

I think it is fine, since non-owning references currently exist only within
spin_lock CS. Based on Alexei's comments on v1 of this series [0], preemption
disabled + spin_lock CS should imply RCU CS.

  [0]: https://lore.kernel.org/bpf/20230802230715.3ltalexaczbomvbu@MacBook-Pro-8.local/

> 
> I think we still have issues for state resetting
> with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
> will try to convert the reg state to PTR_UNTRUSTED.
> 
> Let us say reg state is
>   PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU
> 
> (1). If hitting bpf_spin_unlock(), since MEM_RCU is in
> the reg state, the state should become
>   PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
> some additional code might be needed so we wont have
> verifier complaints about ref_obj_id == 0.
> 
> (2). If hitting bpf_rcu_read_unlock(), the state should become
>   PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
> since register access still in bpf_spin_lock() region.

I agree w/ your comment in the side reply stating that this
case isn't possible, since bpf_rcu_read_{lock,unlock} in a spin_lock CS
is currently not allowed.

> 
> Does this make sense?
> 


IIUC the specific reg state flow you're recommending is based on the convos
we've had over the past few weeks re: getting rid of the special non-owning ref
lifetime rules and instead using RCU as much as possible. Specifically, this
recommended change would remove non-owning ref clobbering and instead just drop
the NON_OWN_REF flag on bpf_spin_unlock, so that such nodes can no longer be
passed to collection kfuncs (refcount_acquire, etc).

I agree that in general we'll be able to loosen the lifetime logic for
non-owning refs, and your specific suggestion sounds reasonable. IMO it's
better to ship that as a separate series, though, as this series was meant
to be the minimal set of changes necessary to re-enable bpf_refcount_acquire,
and it has expanded a bit past that already. It's easier to reason about the
rest of this series' changes without having to account for clobbering changes.
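
To make the current clobbering behavior concrete, here is a hypothetical,
deliberately-rejected snippet (reusing the assumed node_data/glock/groot
definitions from the sketch after the commit message above):

SEC("tc")
int acquire_after_unlock(void *ctx)
{
	struct bpf_rb_node *rb;
	struct node_data *n = NULL, *m;

	bpf_spin_lock(&glock);
	rb = bpf_rbtree_first(&groot);	/* non-owning ref, now also MEM_RCU */
	if (rb)
		n = container_of(rb, struct node_data, r);
	bpf_spin_unlock(&glock);	/* invalidate_non_owning_refs(): n is clobbered */

	if (n) {
		m = bpf_refcount_acquire(n);	/* rejected by the verifier today */
		if (m)
			bpf_obj_drop(m);
	}
	return 0;
}

Under the loosening discussed above, bpf_spin_unlock would instead only drop
NON_OWN_REF (keeping MEM_RCU while still in an RCU CS), so a plain read such
as n->key after the unlock could still verify, while passing n to collection
kfuncs like bpf_refcount_acquire() would remain rejected.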

>> +
>>       return 0;
>>   }
>>   @@ -11328,6 +11333,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>           struct bpf_func_state *state;
>>           struct bpf_reg_state *reg;
>>   +        if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
>> +            verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
>> +            return -EACCES;
>> +        }
>> +
>>           if (rcu_lock) {
>>               verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>>               return -EINVAL;
>> @@ -16689,7 +16699,8 @@ static int do_check(struct bpf_verifier_env *env)
>>                       return -EINVAL;
>>                   }
>>   -                if (env->cur_state->active_rcu_lock) {
>> +                if (env->cur_state->active_rcu_lock &&
>> +                    !in_rbtree_lock_required_cb(env)) {
>>                       verbose(env, "bpf_rcu_read_unlock is missing\n");
>>                       return -EINVAL;
>>                   }
Yonghong Song Aug. 22, 2023, 4:02 p.m. UTC | #4
On 8/21/23 10:47 PM, David Marchevsky wrote:
> On 8/21/23 10:37 PM, Yonghong Song wrote:
>>
>>
>> On 8/21/23 12:33 PM, Dave Marchevsky wrote:
>>> An earlier patch in the series ensures that the underlying memory of
>>> nodes with bpf_refcount - which can have multiple owners - is not reused
>>> until RCU grace period has elapsed. This prevents
>>> use-after-free with non-owning references that may point to
>>> recently-freed memory. While RCU read lock is held, it's safe to
>>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>>> elapsed and therefore underlying memory couldn't have been reused.
>>>
>>>   From the perspective of verifier "trustedness" non-owning refs to
>>> refcounted nodes are now trusted only in RCU CS and therefore should no
>>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>>> MEM_RCU in order to reflect this new state.
>>>
>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>> ---
>>>    include/linux/bpf.h   |  3 ++-
>>>    kernel/bpf/verifier.c | 13 ++++++++++++-
>>>    2 files changed, 14 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index eced6400f778..12596af59c00 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>>        MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
>>>          /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
>>> -     * Currently only valid for linked-list and rbtree nodes.
>>> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
>>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>>>         */
>>>        NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
>>>    diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index 8db0afa5985c..55607ab30522 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>>>        case PTR_TO_BTF_ID | PTR_TRUSTED:
>>>        case PTR_TO_BTF_ID | MEM_RCU:
>>>        case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>>            /* When referenced PTR_TO_BTF_ID is passed to release function,
>>>             * its fixed offset must be 0. In the other cases, fixed offset
>>>             * can be non-zero. This was already checked above. So pass
>>> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>>>    static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>>>    {
>>>        struct bpf_verifier_state *state = env->cur_state;
>>> +    struct btf_record *rec = reg_btf_record(reg);
>>>          if (!state->active_lock.ptr) {
>>>            verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
>>> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>>>        }
>>>          reg->type |= NON_OWN_REF;
>>> +    if (rec->refcount_off >= 0)
>>> +        reg->type |= MEM_RCU;
>>
>> Should the above MEM_RCU marking be done unless reg access is in
>> rcu critical section?
> 
> I think it is fine, since non-owning references currently exist only within
> spin_lock CS. Based on Alexei's comments on v1 of this series [0], preemption
> disabled + spin_lock CS should imply RCU CS.
> 
>    [0]: https://lore.kernel.org/bpf/20230802230715.3ltalexaczbomvbu@MacBook-Pro-8.local/

Okay, I see. Thanks for the pointer.
If MEM_RCU is marked outside an explicit RCU CS, it does make sense
to clobber everything when bpf_spin_unlock is hit.
But see below.

> 
>>
>> I think we still have issues for state resetting
>> with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
>> will try to convert the reg state to PTR_UNTRUSTED.
>>
>> Let us say reg state is
>>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU
>>
>> (1). If hitting bpf_spin_unlock(), since MEM_RCU is in
>> the reg state, the state should become
>>    PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
>> some additional code might be needed so we wont have
>> verifier complaints about ref_obj_id == 0.
>>
>> (2). If hitting bpf_rcu_read_unlock(), the state should become
>>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
>> since register access still in bpf_spin_lock() region.
> 
> I agree w/ your comment in side reply stating that this
> case isn't possible since bpf_rcu_read_{lock,unlock} in spin_lock CS
> is currently not allowed.
> 
>>
>> Does this make sense?
>>
> 
> 
> IIUC the specific reg state flow you're recommending is based on the convos
> we've had over the past few weeks re: getting rid of special non-owning ref
> lifetime rules, instead using RCU as much as possible. Specifically, this
> recommended change would remove non-owning ref clobbering, instead just removing
> NON_OWN_REF flag on bpf_spin_unlock so that such nodes can no longer be passed
> to collection kfuncs (refcount_acquire, etc).
> 
> I agree that in general we'll be able to loosen the lifetime logic for
> non-owning refs, and your specific suggestion sounds reasonable. IMO it's
> better to ship that as a separate series, though, as this series was meant
> to be the minimum changes necessary to re-enable bpf_refcount_acquire, and
> it's expanded a bit past that already. Easier to reason about the rest
> of this series' changes without having to account for clobbering changes.

I removed
   >>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
and
   >>> +    if (rec->refcount_off >= 0)
   >>> +        reg->type |= MEM_RCU;

All 'refcount' selftests seem to work fine.
If this is indeed unnecessary for this patch set, maybe we can defer
this RCU marking to a future patch set?

> 
>>> +
>>>        return 0;
>>>    }
>>>    @@ -11328,6 +11333,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>>            struct bpf_func_state *state;
>>>            struct bpf_reg_state *reg;
>>>    +        if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
>>> +            verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
>>> +            return -EACCES;
>>> +        }
>>> +
>>>            if (rcu_lock) {
>>>                verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>>>                return -EINVAL;
>>> @@ -16689,7 +16699,8 @@ static int do_check(struct bpf_verifier_env *env)
>>>                        return -EINVAL;
>>>                    }
>>>    -                if (env->cur_state->active_rcu_lock) {
>>> +                if (env->cur_state->active_rcu_lock &&
>>> +                    !in_rbtree_lock_required_cb(env)) {
>>>                        verbose(env, "bpf_rcu_read_unlock is missing\n");
>>>                        return -EINVAL;
>>>                    }
Alexei Starovoitov Aug. 22, 2023, 11:45 p.m. UTC | #5
On Tue, Aug 22, 2023 at 01:47:01AM -0400, David Marchevsky wrote:
> On 8/21/23 10:37 PM, Yonghong Song wrote:
> > 
> > 
> > On 8/21/23 12:33 PM, Dave Marchevsky wrote:
> >> An earlier patch in the series ensures that the underlying memory of
> >> nodes with bpf_refcount - which can have multiple owners - is not reused
> >> until RCU grace period has elapsed. This prevents
> >> use-after-free with non-owning references that may point to
> >> recently-freed memory. While RCU read lock is held, it's safe to
> >> dereference such a non-owning ref, as by definition RCU GP couldn't have
> >> elapsed and therefore underlying memory couldn't have been reused.
> >>
> >>  From the perspective of verifier "trustedness" non-owning refs to
> >> refcounted nodes are now trusted only in RCU CS and therefore should no
> >> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
> >> MEM_RCU in order to reflect this new state.
> >>
> >> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >> ---
> >>   include/linux/bpf.h   |  3 ++-
> >>   kernel/bpf/verifier.c | 13 ++++++++++++-
> >>   2 files changed, 14 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index eced6400f778..12596af59c00 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -653,7 +653,8 @@ enum bpf_type_flag {
> >>       MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
> >>         /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
> >> -     * Currently only valid for linked-list and rbtree nodes.
> >> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
> >> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
> >>        */
> >>       NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
> >>   diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >> index 8db0afa5985c..55607ab30522 100644
> >> --- a/kernel/bpf/verifier.c
> >> +++ b/kernel/bpf/verifier.c
> >> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
> >>       case PTR_TO_BTF_ID | PTR_TRUSTED:
> >>       case PTR_TO_BTF_ID | MEM_RCU:
> >>       case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
> >> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
> >>           /* When referenced PTR_TO_BTF_ID is passed to release function,
> >>            * its fixed offset must be 0. In the other cases, fixed offset
> >>            * can be non-zero. This was already checked above. So pass
> >> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
> >>   static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> >>   {
> >>       struct bpf_verifier_state *state = env->cur_state;
> >> +    struct btf_record *rec = reg_btf_record(reg);
> >>         if (!state->active_lock.ptr) {
> >>           verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
> >> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
> >>       }
> >>         reg->type |= NON_OWN_REF;
> >> +    if (rec->refcount_off >= 0)
> >> +        reg->type |= MEM_RCU;
> > 
> > Should the above MEM_RCU marking be done unless reg access is in
> > rcu critical section?
> 
> I think it is fine, since non-owning references currently exist only within
> spin_lock CS. Based on Alexei's comments on v1 of this series [0], preemption
> disabled + spin_lock CS should imply RCU CS.
> 
>   [0]: https://lore.kernel.org/bpf/20230802230715.3ltalexaczbomvbu@MacBook-Pro-8.local/
> 
> > 
> > I think we still have issues for state resetting
> > with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
> > will try to convert the reg state to PTR_UNTRUSTED.
> > 
> > Let us say reg state is
> >   PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU
> > 
> > (1). If hitting bpf_spin_unlock(), since MEM_RCU is in
> > the reg state, the state should become
> >   PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
> > some additional code might be needed so we wont have
> > verifier complaints about ref_obj_id == 0.
> > 
> > (2). If hitting bpf_rcu_read_unlock(), the state should become
> >   PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
> > since register access still in bpf_spin_lock() region.
> 
> I agree w/ your comment in side reply stating that this
> case isn't possible since bpf_rcu_read_{lock,unlock} in spin_lock CS
> is currently not allowed.
> 
> > 
> > Does this make sense?
> > 
> 
> 
> IIUC the specific reg state flow you're recommending is based on the convos
> we've had over the past few weeks re: getting rid of special non-owning ref
> lifetime rules, instead using RCU as much as possible. Specifically, this
> recommended change would remove non-owning ref clobbering, instead just removing
> NON_OWN_REF flag on bpf_spin_unlock so that such nodes can no longer be passed
> to collection kfuncs (refcount_acquire, etc).

Overall the patch set makes sense to me, but I want to clarify the above.
My understanding is that after the patch set is applied, bpf_spin_unlock()
will invalidate_non_owning_refs(), so what Yonghong is saying in (1)
is not correct.
Instead, PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU will go through
mark_reg_invalid().

Re: (2) even if/when bpf_rcu_read_unlock() is allowed inside a spinlocked region,
it will convert PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU to
PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | PTR_UNTRUSTED,
which is a buggy combination that we would need to address if rcu_unlock is
eventually allowed.

Did I get it right?
If so, I think the whole set is good to go.
Yonghong Song Aug. 23, 2023, 12:18 a.m. UTC | #6
On 8/22/23 4:45 PM, Alexei Starovoitov wrote:
> On Tue, Aug 22, 2023 at 01:47:01AM -0400, David Marchevsky wrote:
>> On 8/21/23 10:37 PM, Yonghong Song wrote:
>>>
>>>
>>> On 8/21/23 12:33 PM, Dave Marchevsky wrote:
>>>> An earlier patch in the series ensures that the underlying memory of
>>>> nodes with bpf_refcount - which can have multiple owners - is not reused
>>>> until RCU grace period has elapsed. This prevents
>>>> use-after-free with non-owning references that may point to
>>>> recently-freed memory. While RCU read lock is held, it's safe to
>>>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>>>> elapsed and therefore underlying memory couldn't have been reused.
>>>>
>>>>   From the perspective of verifier "trustedness" non-owning refs to
>>>> refcounted nodes are now trusted only in RCU CS and therefore should no
>>>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>>>> MEM_RCU in order to reflect this new state.
>>>>
>>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>>> ---
>>>>    include/linux/bpf.h   |  3 ++-
>>>>    kernel/bpf/verifier.c | 13 ++++++++++++-
>>>>    2 files changed, 14 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index eced6400f778..12596af59c00 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>>>        MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
>>>>          /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
>>>> -     * Currently only valid for linked-list and rbtree nodes.
>>>> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
>>>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>>>>         */
>>>>        NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
>>>>    diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>>> index 8db0afa5985c..55607ab30522 100644
>>>> --- a/kernel/bpf/verifier.c
>>>> +++ b/kernel/bpf/verifier.c
>>>> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>>>>        case PTR_TO_BTF_ID | PTR_TRUSTED:
>>>>        case PTR_TO_BTF_ID | MEM_RCU:
>>>>        case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>>>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>>>            /* When referenced PTR_TO_BTF_ID is passed to release function,
>>>>             * its fixed offset must be 0. In the other cases, fixed offset
>>>>             * can be non-zero. This was already checked above. So pass
>>>> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>>>>    static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>>>>    {
>>>>        struct bpf_verifier_state *state = env->cur_state;
>>>> +    struct btf_record *rec = reg_btf_record(reg);
>>>>          if (!state->active_lock.ptr) {
>>>>            verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
>>>> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>>>>        }
>>>>          reg->type |= NON_OWN_REF;
>>>> +    if (rec->refcount_off >= 0)
>>>> +        reg->type |= MEM_RCU;
>>>
>>> Should the above MEM_RCU marking be done unless reg access is in
>>> rcu critical section?
>>
>> I think it is fine, since non-owning references currently exist only within
>> spin_lock CS. Based on Alexei's comments on v1 of this series [0], preemption
>> disabled + spin_lock CS should imply RCU CS.
>>
>>    [0]: https://lore.kernel.org/bpf/20230802230715.3ltalexaczbomvbu@MacBook-Pro-8.local/
>>
>>>
>>> I think we still have issues for state resetting
>>> with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
>>> will try to convert the reg state to PTR_UNTRUSTED.
>>>
>>> Let us say reg state is
>>>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU
>>>
>>> (1). If hitting bpf_spin_unlock(), since MEM_RCU is in
>>> the reg state, the state should become
>>>    PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
>>> some additional code might be needed so we wont have
>>> verifier complaints about ref_obj_id == 0.
>>>
>>> (2). If hitting bpf_rcu_read_unlock(), the state should become
>>>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
>>> since register access still in bpf_spin_lock() region.
>>
>> I agree w/ your comment in side reply stating that this
>> case isn't possible since bpf_rcu_read_{lock,unlock} in spin_lock CS
>> is currently not allowed.
>>
>>>
>>> Does this make sense?
>>>
>>
>>
>> IIUC the specific reg state flow you're recommending is based on the convos
>> we've had over the past few weeks re: getting rid of special non-owning ref
>> lifetime rules, instead using RCU as much as possible. Specifically, this
>> recommended change would remove non-owning ref clobbering, instead just removing
>> NON_OWN_REF flag on bpf_spin_unlock so that such nodes can no longer be passed
>> to collection kfuncs (refcount_acquire, etc).
> 
> Overall the patch set makes sense to me, but I want to clarify above.
> My understanding that after the patch set applied bpf_spin_unlock()
> will invalidate_non_owning_refs(), so what Yonghong is saying in (1)
> is not correct.
> Instead PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU will become mark_reg_invalid().

I said it 'should become ...', but you are right. Right now, it will do
mark_reg_invalid(). So it is correct, just MAYBE a little conservative.

> 
> Re: (2) even if/when bpf_rcu_read_unlock() will allowed inside spinlocked region
> it will convert PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU to
> PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | PTR_UNTRUSTED
> which is a buggy combination which we would need to address if rcu_unlock is allowed eventually.
> 
> Did I get it right?
> If so I think the whole set is good to do.
>
Alexei Starovoitov Aug. 23, 2023, 12:21 a.m. UTC | #7
On Tue, Aug 22, 2023 at 5:18 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
>
> On 8/22/23 4:45 PM, Alexei Starovoitov wrote:
> > On Tue, Aug 22, 2023 at 01:47:01AM -0400, David Marchevsky wrote:
> >> On 8/21/23 10:37 PM, Yonghong Song wrote:
> >>>
> >>>
> >>> On 8/21/23 12:33 PM, Dave Marchevsky wrote:
> >>>> An earlier patch in the series ensures that the underlying memory of
> >>>> nodes with bpf_refcount - which can have multiple owners - is not reused
> >>>> until RCU grace period has elapsed. This prevents
> >>>> use-after-free with non-owning references that may point to
> >>>> recently-freed memory. While RCU read lock is held, it's safe to
> >>>> dereference such a non-owning ref, as by definition RCU GP couldn't have
> >>>> elapsed and therefore underlying memory couldn't have been reused.
> >>>>
> >>>>   From the perspective of verifier "trustedness" non-owning refs to
> >>>> refcounted nodes are now trusted only in RCU CS and therefore should no
> >>>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
> >>>> MEM_RCU in order to reflect this new state.
> >>>>
> >>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >>>> ---
> >>>>    include/linux/bpf.h   |  3 ++-
> >>>>    kernel/bpf/verifier.c | 13 ++++++++++++-
> >>>>    2 files changed, 14 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >>>> index eced6400f778..12596af59c00 100644
> >>>> --- a/include/linux/bpf.h
> >>>> +++ b/include/linux/bpf.h
> >>>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
> >>>>        MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
> >>>>          /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
> >>>> -     * Currently only valid for linked-list and rbtree nodes.
> >>>> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
> >>>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
> >>>>         */
> >>>>        NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
> >>>>    diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >>>> index 8db0afa5985c..55607ab30522 100644
> >>>> --- a/kernel/bpf/verifier.c
> >>>> +++ b/kernel/bpf/verifier.c
> >>>> @@ -8013,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
> >>>>        case PTR_TO_BTF_ID | PTR_TRUSTED:
> >>>>        case PTR_TO_BTF_ID | MEM_RCU:
> >>>>        case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
> >>>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
> >>>>            /* When referenced PTR_TO_BTF_ID is passed to release function,
> >>>>             * its fixed offset must be 0. In the other cases, fixed offset
> >>>>             * can be non-zero. This was already checked above. So pass
> >>>> @@ -10479,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
> >>>>    static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> >>>>    {
> >>>>        struct bpf_verifier_state *state = env->cur_state;
> >>>> +    struct btf_record *rec = reg_btf_record(reg);
> >>>>          if (!state->active_lock.ptr) {
> >>>>            verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
> >>>> @@ -10491,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
> >>>>        }
> >>>>          reg->type |= NON_OWN_REF;
> >>>> +    if (rec->refcount_off >= 0)
> >>>> +        reg->type |= MEM_RCU;
> >>>
> >>> Should the above MEM_RCU marking be done unless reg access is in
> >>> rcu critical section?
> >>
> >> I think it is fine, since non-owning references currently exist only within
> >> spin_lock CS. Based on Alexei's comments on v1 of this series [0], preemption
> >> disabled + spin_lock CS should imply RCU CS.
> >>
> >>    [0]: https://lore.kernel.org/bpf/20230802230715.3ltalexaczbomvbu@MacBook-Pro-8.local/
> >>
> >>>
> >>> I think we still have issues for state resetting
> >>> with bpf_spin_unlock() and bpf_rcu_read_unlock(), both of which
> >>> will try to convert the reg state to PTR_UNTRUSTED.
> >>>
> >>> Let us say reg state is
> >>>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU
> >>>
> >>> (1). If hitting bpf_spin_unlock(), since MEM_RCU is in
> >>> the reg state, the state should become
> >>>    PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU
> >>> some additional code might be needed so we wont have
> >>> verifier complaints about ref_obj_id == 0.
> >>>
> >>> (2). If hitting bpf_rcu_read_unlock(), the state should become
> >>>    PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF
> >>> since register access still in bpf_spin_lock() region.
> >>
> >> I agree w/ your comment in side reply stating that this
> >> case isn't possible since bpf_rcu_read_{lock,unlock} in spin_lock CS
> >> is currently not allowed.
> >>
> >>>
> >>> Does this make sense?
> >>>
> >>
> >>
> >> IIUC the specific reg state flow you're recommending is based on the convos
> >> we've had over the past few weeks re: getting rid of special non-owning ref
> >> lifetime rules, instead using RCU as much as possible. Specifically, this
> >> recommended change would remove non-owning ref clobbering, instead just removing
> >> NON_OWN_REF flag on bpf_spin_unlock so that such nodes can no longer be passed
> >> to collection kfuncs (refcount_acquire, etc).
> >
> > Overall the patch set makes sense to me, but I want to clarify above.
> > My understanding that after the patch set applied bpf_spin_unlock()
> > will invalidate_non_owning_refs(), so what Yonghong is saying in (1)
> > is not correct.
> > Instead PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU will become mark_reg_invalid().
>
> I said it 'should become ...', but you are right. right now, it will do
> mark_reg_invalid(). So it is correct just MAYBE a little conservative.

Ahh. You mean that it should be fixed to do that. Got it.
non_own_ref after spin_unlock should become a pure mem_rcu pointer.
Need to think it through. Probably correct.

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eced6400f778..12596af59c00 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -653,7 +653,8 @@  enum bpf_type_flag {
 	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
 
 	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
-	 * Currently only valid for linked-list and rbtree nodes.
+	 * Currently only valid for linked-list and rbtree nodes. If the nodes
+	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
 	 */
 	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8db0afa5985c..55607ab30522 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8013,6 +8013,7 @@  int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	case PTR_TO_BTF_ID | PTR_TRUSTED:
 	case PTR_TO_BTF_ID | MEM_RCU:
 	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
 		/* When referenced PTR_TO_BTF_ID is passed to release function,
 		 * its fixed offset must be 0. In the other cases, fixed offset
 		 * can be non-zero. This was already checked above. So pass
@@ -10479,6 +10480,7 @@  static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_verifier_state *state = env->cur_state;
+	struct btf_record *rec = reg_btf_record(reg);
 
 	if (!state->active_lock.ptr) {
 		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
@@ -10491,6 +10493,9 @@  static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
 	}
 
 	reg->type |= NON_OWN_REF;
+	if (rec->refcount_off >= 0)
+		reg->type |= MEM_RCU;
+
 	return 0;
 }
 
@@ -11328,6 +11333,11 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
 
+		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+			return -EACCES;
+		}
+
 		if (rcu_lock) {
 			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
 			return -EINVAL;
@@ -16689,7 +16699,8 @@  static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				if (env->cur_state->active_rcu_lock) {
+				if (env->cur_state->active_rcu_lock &&
+				    !in_rbtree_lock_required_cb(env)) {
 					verbose(env, "bpf_rcu_read_unlock is missing\n");
 					return -EINVAL;
 				}
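
As a closing illustration of the two verifier.c hunks above: with this patch,
an rbtree callback may be verified while the caller holds bpf_rcu_read_lock()
without tripping "bpf_rcu_read_unlock is missing" at the callback's exit,
while calling bpf_rcu_read_{lock,unlock} inside the callback itself is now
rejected. The sketch below is hypothetical (not from the series' selftests)
and reuses the assumed node_data/glock/groot definitions from the sketch
after the commit message:

static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct node_data *na = container_of(a, struct node_data, r);
	struct node_data *nb = container_of(b, struct node_data, r);

	/* A bpf_rcu_read_lock()/unlock() call here would now fail with
	 * "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback".
	 */
	return na->key < nb->key;
}

SEC("tc")
int add_node_under_rcu(void *ctx)
{
	struct node_data *n = bpf_obj_new(typeof(*n));

	if (!n)
		return 0;
	n->key = 42;

	bpf_rcu_read_lock();
	bpf_spin_lock(&glock);
	/* less() is verified with active_rcu_lock set; the do_check() hunk
	 * lets the callback exit without its own bpf_rcu_read_unlock().
	 */
	bpf_rbtree_add(&groot, &n->r, less);
	bpf_spin_unlock(&glock);
	bpf_rcu_read_unlock();
	return 0;
}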