Message ID | 20221108074114.264485-1-yhs@fb.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | bpf: Add bpf_rcu_read_lock() support | expand |
On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote: > To simplify the design and support the common practice, no > nested bpf_rcu_read_lock() is allowed. During verification, > each paired bpf_rcu_read_lock()/unlock() has a unique > region id, starting from 1. Each rcu ptr register also > remembers the region id when the ptr reg is initialized. > The following is a simple example to illustrate the > rcu lock regions and usage of rcu ptr's. > > ... <=== rcu lock region 0 > bpf_rcu_read_lock() <=== rcu lock region 1 > rcu_ptr1 = ... <=== rcu_ptr1 with region 1 > ... using rcu_ptr1 ... > bpf_rcu_read_unlock() > ... <=== rcu lock region -1 > bpf_rcu_read_lock() <=== rcu lock region 2 > rcu_ptr2 = ... <=== rcu_ptr2 with region 2 > ... using rcu_ptr2 ... > ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2 > bpf_rcu_read_unlock() > > Outside the rcu lock region, the rcu lock region id is 0 or negative of > previous valid rcu lock region id, so the next valid rcu lock region > id can be easily computed. > > Note that rcu protection is not needed for non-sleepable program. But > it is supported to make cross-sleepable/nonsleepable development easier. > For non-sleepable program, the following insns can be inside the rcu > lock region: > - any non call insns except BPF_ABS/BPF_IND > - non sleepable helpers or kfuncs > Also, bpf_*_storage_get() helper's 5th hidden argument (for memory > allocation flag) should be GFP_ATOMIC. > > If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of > this pointer and the load which gets this pointer needs to be > protected by bpf_rcu_read_lock(). The following shows a couple > of examples: > struct task_struct { > ... > struct task_struct __rcu *real_parent; > struct css_set __rcu *cgroups; > ... > }; > struct css_set { > ... > struct cgroup *dfl_cgrp; > ... > } > ... > task = bpf_get_current_task_btf(); > cgroups = task->cgroups; > dfl_cgroup = cgroups->dfl_cgrp; > ... using dfl_cgroup ... > > The bpf_rcu_read_lock/unlock() should be added like below to > avoid verification failures. > task = bpf_get_current_task_btf(); > bpf_rcu_read_lock(); > cgroups = task->cgroups; > dfl_cgroup = cgroups->dfl_cgrp; > bpf_rcu_read_unlock(); > ... using dfl_cgroup ... > > The following is another example for task->real_parent. > task = bpf_get_current_task_btf(); > bpf_rcu_read_lock(); > real_parent = task->real_parent; > ... bpf_task_storage_get(&map, real_parent, 0, 0); > bpf_rcu_read_unlock(); > > Signed-off-by: Yonghong Song <yhs@fb.com> > --- > include/linux/bpf.h | 1 + > include/linux/bpf_verifier.h | 7 +++ > kernel/bpf/btf.c | 32 ++++++++++++- > kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++----- > 4 files changed, 120 insertions(+), 12 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index b4bbcafd1c9b..98af0c9ec721 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -761,6 +761,7 @@ struct bpf_prog_ops { > struct btf_struct_access_info { > u32 next_btf_id; > enum bpf_type_flag flag; > + bool is_rcu; > }; > > struct bpf_verifier_ops { > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h > index 1a32baa78ce2..5d703637bb12 100644 > --- a/include/linux/bpf_verifier.h > +++ b/include/linux/bpf_verifier.h > @@ -179,6 +179,10 @@ struct bpf_reg_state { > */ > s32 subreg_def; > enum bpf_reg_liveness live; > + /* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where > + * the rcu ptr reg is initialized. > + */ > + int active_rcu_lock; > /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ > bool precise; > }; > @@ -324,6 +328,8 @@ struct bpf_verifier_state { > u32 insn_idx; > u32 curframe; > u32 active_spin_lock; > + /* <= 0: not in rcu read lock region; > 0: the rcu lock region id */ > + int active_rcu_lock; > bool speculative; > > /* first and last insn idx of this verifier state */ > @@ -424,6 +430,7 @@ struct bpf_insn_aux_data { > u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ > bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ > bool zext_dst; /* this insn zero extends dst reg */ > + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ > u8 alu_state; /* used in combination with alu_limit */ > > /* below fields are initialized once */ > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > index d2ee1669a2f3..c5a9569f2ae0 100644 > --- a/kernel/bpf/btf.c > +++ b/kernel/bpf/btf.c > @@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, > if (btf_type_is_ptr(mtype)) { > const struct btf_type *stype, *t; > enum bpf_type_flag tmp_flag = 0; > + bool is_rcu = false; > u32 id; > > if (msize != size || off != moff) { > @@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, > /* check __percpu tag */ > if (strcmp(tag_value, "percpu") == 0) > tmp_flag = MEM_PERCPU; > + /* check __rcu tag */ > + if (strcmp(tag_value, "rcu") == 0) > + is_rcu = true; > } > > stype = btf_type_skip_modifiers(btf, mtype->type, &id); > if (btf_type_is_struct(stype)) { > info->next_btf_id = id; > info->flag = tmp_flag; > + info->is_rcu = is_rcu; > return WALK_PTR; > } > } > @@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, > { > enum bpf_prog_type prog_type = resolve_prog_type(env->prog); > bool rel = false, kptr_get = false, trusted_args = false; > - bool sleepable = false; > + bool sleepable = false, rcu_lock = false, rcu_unlock = false; > struct bpf_verifier_log *log = &env->log; > u32 i, nargs, ref_id, ref_obj_id = 0; > bool is_kfunc = btf_is_kernel(btf); > @@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, > kptr_get = kfunc_meta->flags & KF_KPTR_GET; > trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; > sleepable = kfunc_meta->flags & KF_SLEEPABLE; > + rcu_lock = kfunc_meta->flags & KF_RCU_LOCK; > + rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK; > + } > + > + /* checking rcu read lock/unlock */ > + if (env->cur_state->active_rcu_lock > 0) { > + if (rcu_lock) { > + bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name); > + return -EINVAL; > + } else if (rcu_unlock) { > + /* change active_rcu_lock to its corresponding negative value to > + * preserve the previous lock region id. > + */ > + env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock; > + } else if (sleepable) { > + bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n", > + func_name); > + return -EINVAL; > + } > + } else if (rcu_lock) { > + /* a new lock region started, increase the region id. */ > + env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1; > + } else if (rcu_unlock) { > + bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name); > + return -EINVAL; > } > Can you provide more context on why having ids is better than simply invalidating the registers when the section ends, and making active_rcu_lock a boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having MEM_RCU and mark it unknown. You won't have to match the id in btf_struct_access as such registers won't ever reach that function (if marked unknown on invalidation, they become scalars). The reg state won't need another active_rcu_lock member either, it is simply part of reg->type. It seems to that simply invalidating registers when rcu_read_unlock is called is both less code to write and simpler to understand. Having ids also makes the pruning algorithm unecessarily conservative. Later in states_equal, the check is: > + if (old->active_rcu_lock != cur->active_rcu_lock) > + return false; which means even though the current state just holding the RCU read lock would be enough to prune search, it would be rejected now due to distinct IDs (e.g. if the current path didn't make exactly the same number of rcu_read_lock calls compared to the old state).
On 11/8/22 9:04 AM, Kumar Kartikeya Dwivedi wrote: > On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote: >> To simplify the design and support the common practice, no >> nested bpf_rcu_read_lock() is allowed. During verification, >> each paired bpf_rcu_read_lock()/unlock() has a unique >> region id, starting from 1. Each rcu ptr register also >> remembers the region id when the ptr reg is initialized. >> The following is a simple example to illustrate the >> rcu lock regions and usage of rcu ptr's. >> >> ... <=== rcu lock region 0 >> bpf_rcu_read_lock() <=== rcu lock region 1 >> rcu_ptr1 = ... <=== rcu_ptr1 with region 1 >> ... using rcu_ptr1 ... >> bpf_rcu_read_unlock() >> ... <=== rcu lock region -1 >> bpf_rcu_read_lock() <=== rcu lock region 2 >> rcu_ptr2 = ... <=== rcu_ptr2 with region 2 >> ... using rcu_ptr2 ... >> ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2 >> bpf_rcu_read_unlock() >> >> Outside the rcu lock region, the rcu lock region id is 0 or negative of >> previous valid rcu lock region id, so the next valid rcu lock region >> id can be easily computed. >> >> Note that rcu protection is not needed for non-sleepable program. But >> it is supported to make cross-sleepable/nonsleepable development easier. >> For non-sleepable program, the following insns can be inside the rcu >> lock region: >> - any non call insns except BPF_ABS/BPF_IND >> - non sleepable helpers or kfuncs >> Also, bpf_*_storage_get() helper's 5th hidden argument (for memory >> allocation flag) should be GFP_ATOMIC. >> >> If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of >> this pointer and the load which gets this pointer needs to be >> protected by bpf_rcu_read_lock(). The following shows a couple >> of examples: >> struct task_struct { >> ... >> struct task_struct __rcu *real_parent; >> struct css_set __rcu *cgroups; >> ... >> }; >> struct css_set { >> ... >> struct cgroup *dfl_cgrp; >> ... >> } >> ... >> task = bpf_get_current_task_btf(); >> cgroups = task->cgroups; >> dfl_cgroup = cgroups->dfl_cgrp; >> ... using dfl_cgroup ... >> >> The bpf_rcu_read_lock/unlock() should be added like below to >> avoid verification failures. >> task = bpf_get_current_task_btf(); >> bpf_rcu_read_lock(); >> cgroups = task->cgroups; >> dfl_cgroup = cgroups->dfl_cgrp; >> bpf_rcu_read_unlock(); >> ... using dfl_cgroup ... >> >> The following is another example for task->real_parent. >> task = bpf_get_current_task_btf(); >> bpf_rcu_read_lock(); >> real_parent = task->real_parent; >> ... bpf_task_storage_get(&map, real_parent, 0, 0); >> bpf_rcu_read_unlock(); >> >> Signed-off-by: Yonghong Song <yhs@fb.com> >> --- >> include/linux/bpf.h | 1 + >> include/linux/bpf_verifier.h | 7 +++ >> kernel/bpf/btf.c | 32 ++++++++++++- >> kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++----- >> 4 files changed, 120 insertions(+), 12 deletions(-) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index b4bbcafd1c9b..98af0c9ec721 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -761,6 +761,7 @@ struct bpf_prog_ops { >> struct btf_struct_access_info { >> u32 next_btf_id; >> enum bpf_type_flag flag; >> + bool is_rcu; >> }; >> >> struct bpf_verifier_ops { >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h >> index 1a32baa78ce2..5d703637bb12 100644 >> --- a/include/linux/bpf_verifier.h >> +++ b/include/linux/bpf_verifier.h >> @@ -179,6 +179,10 @@ struct bpf_reg_state { >> */ >> s32 subreg_def; >> enum bpf_reg_liveness live; >> + /* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where >> + * the rcu ptr reg is initialized. >> + */ >> + int active_rcu_lock; >> /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ >> bool precise; >> }; >> @@ -324,6 +328,8 @@ struct bpf_verifier_state { >> u32 insn_idx; >> u32 curframe; >> u32 active_spin_lock; >> + /* <= 0: not in rcu read lock region; > 0: the rcu lock region id */ >> + int active_rcu_lock; >> bool speculative; >> >> /* first and last insn idx of this verifier state */ >> @@ -424,6 +430,7 @@ struct bpf_insn_aux_data { >> u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ >> bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ >> bool zext_dst; /* this insn zero extends dst reg */ >> + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ >> u8 alu_state; /* used in combination with alu_limit */ >> >> /* below fields are initialized once */ >> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c >> index d2ee1669a2f3..c5a9569f2ae0 100644 >> --- a/kernel/bpf/btf.c >> +++ b/kernel/bpf/btf.c >> @@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, >> if (btf_type_is_ptr(mtype)) { >> const struct btf_type *stype, *t; >> enum bpf_type_flag tmp_flag = 0; >> + bool is_rcu = false; >> u32 id; >> >> if (msize != size || off != moff) { >> @@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, >> /* check __percpu tag */ >> if (strcmp(tag_value, "percpu") == 0) >> tmp_flag = MEM_PERCPU; >> + /* check __rcu tag */ >> + if (strcmp(tag_value, "rcu") == 0) >> + is_rcu = true; >> } >> >> stype = btf_type_skip_modifiers(btf, mtype->type, &id); >> if (btf_type_is_struct(stype)) { >> info->next_btf_id = id; >> info->flag = tmp_flag; >> + info->is_rcu = is_rcu; >> return WALK_PTR; >> } >> } >> @@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, >> { >> enum bpf_prog_type prog_type = resolve_prog_type(env->prog); >> bool rel = false, kptr_get = false, trusted_args = false; >> - bool sleepable = false; >> + bool sleepable = false, rcu_lock = false, rcu_unlock = false; >> struct bpf_verifier_log *log = &env->log; >> u32 i, nargs, ref_id, ref_obj_id = 0; >> bool is_kfunc = btf_is_kernel(btf); >> @@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, >> kptr_get = kfunc_meta->flags & KF_KPTR_GET; >> trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; >> sleepable = kfunc_meta->flags & KF_SLEEPABLE; >> + rcu_lock = kfunc_meta->flags & KF_RCU_LOCK; >> + rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK; >> + } >> + >> + /* checking rcu read lock/unlock */ >> + if (env->cur_state->active_rcu_lock > 0) { >> + if (rcu_lock) { >> + bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name); >> + return -EINVAL; >> + } else if (rcu_unlock) { >> + /* change active_rcu_lock to its corresponding negative value to >> + * preserve the previous lock region id. >> + */ >> + env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock; >> + } else if (sleepable) { >> + bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n", >> + func_name); >> + return -EINVAL; >> + } >> + } else if (rcu_lock) { >> + /* a new lock region started, increase the region id. */ >> + env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1; >> + } else if (rcu_unlock) { >> + bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name); >> + return -EINVAL; >> } >> > > Can you provide more context on why having ids is better than simply > invalidating the registers when the section ends, and making active_rcu_lock a > boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having > MEM_RCU and mark it unknown. I think we also need to invalidate rcu-ptr related states as well in spills. I also tried to support cases like: bpf_rcu_read_lock(); rcu_ptr = ... ... rcu_ptr ... bpf_rcu_read_unlock(); ... rcu_ptr ... /* no load, just use the rcu_ptr somehow */ In the above case, outside the rcu read lock region, there is no load with rcu_ptr but it can still be used for other purposes with a property of a pointer. But for a second thought, it should be okay to invalidate rcu_ptr during bpf_rcu_read_unlock() as a scalar. This should satisfy almost all (if not all) cases. > > You won't have to match the id in btf_struct_access as such registers won't ever > reach that function (if marked unknown on invalidation, they become scalars). > The reg state won't need another active_rcu_lock member either, it is simply > part of reg->type. Right, if I don't maintain region id's, no need to have reg->active_rcu_lock and using MEM_RCU should be enough. > > It seems to that simply invalidating registers when rcu_read_unlock is called is > both less code to write and simpler to understand. invalidating rcu_ptr in registers and spills. Let me try to implement this and compare to my current approach. I guess MEM_RCU + invalidation at bpf_rcu_read_unlock() should be simpler as you suggested. > > Having ids also makes the pruning algorithm unecessarily conservative. > Later in states_equal, the check is: > >> + if (old->active_rcu_lock != cur->active_rcu_lock) >> + return false; > > which means even though the current state just holding the RCU read lock would > be enough to prune search, it would be rejected now due to distinct IDs (e.g. if > the current path didn't make exactly the same number of rcu_read_lock calls > compared to the old state). That is true. We should also check old/new verifier state. If both verifier state are not in rcu read lock region. The above check can be ignored. But agree this becomes a little bit more complicated.
On Wed, Nov 09, 2022 at 01:33:04AM IST, Yonghong Song wrote: > > > On 11/8/22 9:04 AM, Kumar Kartikeya Dwivedi wrote: > > On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote: > > > To simplify the design and support the common practice, no > > > nested bpf_rcu_read_lock() is allowed. During verification, > > > each paired bpf_rcu_read_lock()/unlock() has a unique > > > region id, starting from 1. Each rcu ptr register also > > > remembers the region id when the ptr reg is initialized. > > > The following is a simple example to illustrate the > > > rcu lock regions and usage of rcu ptr's. > > > > > > ... <=== rcu lock region 0 > > > bpf_rcu_read_lock() <=== rcu lock region 1 > > > rcu_ptr1 = ... <=== rcu_ptr1 with region 1 > > > ... using rcu_ptr1 ... > > > bpf_rcu_read_unlock() > > > ... <=== rcu lock region -1 > > > bpf_rcu_read_lock() <=== rcu lock region 2 > > > rcu_ptr2 = ... <=== rcu_ptr2 with region 2 > > > ... using rcu_ptr2 ... > > > ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2 > > > bpf_rcu_read_unlock() > > > > > > Outside the rcu lock region, the rcu lock region id is 0 or negative of > > > previous valid rcu lock region id, so the next valid rcu lock region > > > id can be easily computed. > > > > > > Note that rcu protection is not needed for non-sleepable program. But > > > it is supported to make cross-sleepable/nonsleepable development easier. > > > For non-sleepable program, the following insns can be inside the rcu > > > lock region: > > > - any non call insns except BPF_ABS/BPF_IND > > > - non sleepable helpers or kfuncs > > > Also, bpf_*_storage_get() helper's 5th hidden argument (for memory > > > allocation flag) should be GFP_ATOMIC. > > > > > > If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of > > > this pointer and the load which gets this pointer needs to be > > > protected by bpf_rcu_read_lock(). The following shows a couple > > > of examples: > > > struct task_struct { > > > ... > > > struct task_struct __rcu *real_parent; > > > struct css_set __rcu *cgroups; > > > ... > > > }; > > > struct css_set { > > > ... > > > struct cgroup *dfl_cgrp; > > > ... > > > } > > > ... > > > task = bpf_get_current_task_btf(); > > > cgroups = task->cgroups; > > > dfl_cgroup = cgroups->dfl_cgrp; > > > ... using dfl_cgroup ... > > > > > > The bpf_rcu_read_lock/unlock() should be added like below to > > > avoid verification failures. > > > task = bpf_get_current_task_btf(); > > > bpf_rcu_read_lock(); > > > cgroups = task->cgroups; > > > dfl_cgroup = cgroups->dfl_cgrp; > > > bpf_rcu_read_unlock(); > > > ... using dfl_cgroup ... > > > > > > The following is another example for task->real_parent. > > > task = bpf_get_current_task_btf(); > > > bpf_rcu_read_lock(); > > > real_parent = task->real_parent; > > > ... bpf_task_storage_get(&map, real_parent, 0, 0); > > > bpf_rcu_read_unlock(); > > > > > > Signed-off-by: Yonghong Song <yhs@fb.com> > > > --- > > > include/linux/bpf.h | 1 + > > > include/linux/bpf_verifier.h | 7 +++ > > > kernel/bpf/btf.c | 32 ++++++++++++- > > > kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++----- > > > 4 files changed, 120 insertions(+), 12 deletions(-) > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > index b4bbcafd1c9b..98af0c9ec721 100644 > > > --- a/include/linux/bpf.h > > > +++ b/include/linux/bpf.h > > > @@ -761,6 +761,7 @@ struct bpf_prog_ops { > > > struct btf_struct_access_info { > > > u32 next_btf_id; > > > enum bpf_type_flag flag; > > > + bool is_rcu; > > > }; > > > > > > struct bpf_verifier_ops { > > > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h > > > index 1a32baa78ce2..5d703637bb12 100644 > > > --- a/include/linux/bpf_verifier.h > > > +++ b/include/linux/bpf_verifier.h > > > @@ -179,6 +179,10 @@ struct bpf_reg_state { > > > */ > > > s32 subreg_def; > > > enum bpf_reg_liveness live; > > > + /* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where > > > + * the rcu ptr reg is initialized. > > > + */ > > > + int active_rcu_lock; > > > /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ > > > bool precise; > > > }; > > > @@ -324,6 +328,8 @@ struct bpf_verifier_state { > > > u32 insn_idx; > > > u32 curframe; > > > u32 active_spin_lock; > > > + /* <= 0: not in rcu read lock region; > 0: the rcu lock region id */ > > > + int active_rcu_lock; > > > bool speculative; > > > > > > /* first and last insn idx of this verifier state */ > > > @@ -424,6 +430,7 @@ struct bpf_insn_aux_data { > > > u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ > > > bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ > > > bool zext_dst; /* this insn zero extends dst reg */ > > > + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ > > > u8 alu_state; /* used in combination with alu_limit */ > > > > > > /* below fields are initialized once */ > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > > > index d2ee1669a2f3..c5a9569f2ae0 100644 > > > --- a/kernel/bpf/btf.c > > > +++ b/kernel/bpf/btf.c > > > @@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, > > > if (btf_type_is_ptr(mtype)) { > > > const struct btf_type *stype, *t; > > > enum bpf_type_flag tmp_flag = 0; > > > + bool is_rcu = false; > > > u32 id; > > > > > > if (msize != size || off != moff) { > > > @@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, > > > /* check __percpu tag */ > > > if (strcmp(tag_value, "percpu") == 0) > > > tmp_flag = MEM_PERCPU; > > > + /* check __rcu tag */ > > > + if (strcmp(tag_value, "rcu") == 0) > > > + is_rcu = true; > > > } > > > > > > stype = btf_type_skip_modifiers(btf, mtype->type, &id); > > > if (btf_type_is_struct(stype)) { > > > info->next_btf_id = id; > > > info->flag = tmp_flag; > > > + info->is_rcu = is_rcu; > > > return WALK_PTR; > > > } > > > } > > > @@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, > > > { > > > enum bpf_prog_type prog_type = resolve_prog_type(env->prog); > > > bool rel = false, kptr_get = false, trusted_args = false; > > > - bool sleepable = false; > > > + bool sleepable = false, rcu_lock = false, rcu_unlock = false; > > > struct bpf_verifier_log *log = &env->log; > > > u32 i, nargs, ref_id, ref_obj_id = 0; > > > bool is_kfunc = btf_is_kernel(btf); > > > @@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, > > > kptr_get = kfunc_meta->flags & KF_KPTR_GET; > > > trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; > > > sleepable = kfunc_meta->flags & KF_SLEEPABLE; > > > + rcu_lock = kfunc_meta->flags & KF_RCU_LOCK; > > > + rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK; > > > + } > > > + > > > + /* checking rcu read lock/unlock */ > > > + if (env->cur_state->active_rcu_lock > 0) { > > > + if (rcu_lock) { > > > + bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name); > > > + return -EINVAL; > > > + } else if (rcu_unlock) { > > > + /* change active_rcu_lock to its corresponding negative value to > > > + * preserve the previous lock region id. > > > + */ > > > + env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock; > > > + } else if (sleepable) { > > > + bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n", > > > + func_name); > > > + return -EINVAL; > > > + } > > > + } else if (rcu_lock) { > > > + /* a new lock region started, increase the region id. */ > > > + env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1; > > > + } else if (rcu_unlock) { > > > + bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name); > > > + return -EINVAL; > > > } > > > > > > > Can you provide more context on why having ids is better than simply > > invalidating the registers when the section ends, and making active_rcu_lock a > > boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having > > MEM_RCU and mark it unknown. > > I think we also need to invalidate rcu-ptr related states as well in spills. > > I also tried to support cases like: > bpf_rcu_read_lock(); > rcu_ptr = ... > ... rcu_ptr ... > bpf_rcu_read_unlock(); > ... rcu_ptr ... /* no load, just use the rcu_ptr somehow */ > > In the above case, outside the rcu read lock region, there is no > load with rcu_ptr but it can still be used for other purposes > with a property of a pointer. > > But for a second thought, it should be okay to invalidate > rcu_ptr during bpf_rcu_read_unlock() as a scalar. This should > satisfy almost all (if not all) cases. > > > > > You won't have to match the id in btf_struct_access as such registers won't ever > > reach that function (if marked unknown on invalidation, they become scalars). > > The reg state won't need another active_rcu_lock member either, it is simply > > part of reg->type. > > Right, if I don't maintain region id's, no need to have reg->active_rcu_lock > and using MEM_RCU should be enough. > > > > > It seems to that simply invalidating registers when rcu_read_unlock is called is > > both less code to write and simpler to understand. > > invalidating rcu_ptr in registers and spills. > If you use bpf_for_each_reg_in_vstate, it should cover both.
On 11/8/22 12:19 PM, Kumar Kartikeya Dwivedi wrote: > On Wed, Nov 09, 2022 at 01:33:04AM IST, Yonghong Song wrote: >> >> >> On 11/8/22 9:04 AM, Kumar Kartikeya Dwivedi wrote: >>> On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote: >>>> To simplify the design and support the common practice, no >>>> nested bpf_rcu_read_lock() is allowed. During verification, >>>> each paired bpf_rcu_read_lock()/unlock() has a unique >>>> region id, starting from 1. Each rcu ptr register also >>>> remembers the region id when the ptr reg is initialized. >>>> The following is a simple example to illustrate the >>>> rcu lock regions and usage of rcu ptr's. >>>> >>>> ... <=== rcu lock region 0 >>>> bpf_rcu_read_lock() <=== rcu lock region 1 >>>> rcu_ptr1 = ... <=== rcu_ptr1 with region 1 >>>> ... using rcu_ptr1 ... >>>> bpf_rcu_read_unlock() >>>> ... <=== rcu lock region -1 >>>> bpf_rcu_read_lock() <=== rcu lock region 2 >>>> rcu_ptr2 = ... <=== rcu_ptr2 with region 2 >>>> ... using rcu_ptr2 ... >>>> ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2 >>>> bpf_rcu_read_unlock() >>>> >>>> Outside the rcu lock region, the rcu lock region id is 0 or negative of >>>> previous valid rcu lock region id, so the next valid rcu lock region >>>> id can be easily computed. >>>> >>>> Note that rcu protection is not needed for non-sleepable program. But >>>> it is supported to make cross-sleepable/nonsleepable development easier. >>>> For non-sleepable program, the following insns can be inside the rcu >>>> lock region: >>>> - any non call insns except BPF_ABS/BPF_IND >>>> - non sleepable helpers or kfuncs >>>> Also, bpf_*_storage_get() helper's 5th hidden argument (for memory >>>> allocation flag) should be GFP_ATOMIC. >>>> >>>> If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of >>>> this pointer and the load which gets this pointer needs to be >>>> protected by bpf_rcu_read_lock(). The following shows a couple >>>> of examples: >>>> struct task_struct { >>>> ... >>>> struct task_struct __rcu *real_parent; >>>> struct css_set __rcu *cgroups; >>>> ... >>>> }; >>>> struct css_set { >>>> ... >>>> struct cgroup *dfl_cgrp; >>>> ... >>>> } >>>> ... >>>> task = bpf_get_current_task_btf(); >>>> cgroups = task->cgroups; >>>> dfl_cgroup = cgroups->dfl_cgrp; >>>> ... using dfl_cgroup ... >>>> >>>> The bpf_rcu_read_lock/unlock() should be added like below to >>>> avoid verification failures. >>>> task = bpf_get_current_task_btf(); >>>> bpf_rcu_read_lock(); >>>> cgroups = task->cgroups; >>>> dfl_cgroup = cgroups->dfl_cgrp; >>>> bpf_rcu_read_unlock(); >>>> ... using dfl_cgroup ... >>>> >>>> The following is another example for task->real_parent. >>>> task = bpf_get_current_task_btf(); >>>> bpf_rcu_read_lock(); >>>> real_parent = task->real_parent; >>>> ... bpf_task_storage_get(&map, real_parent, 0, 0); >>>> bpf_rcu_read_unlock(); >>>> >>>> Signed-off-by: Yonghong Song <yhs@fb.com> >>>> --- >>>> include/linux/bpf.h | 1 + >>>> include/linux/bpf_verifier.h | 7 +++ >>>> kernel/bpf/btf.c | 32 ++++++++++++- >>>> kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++----- >>>> 4 files changed, 120 insertions(+), 12 deletions(-) >>>> >>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>>> index b4bbcafd1c9b..98af0c9ec721 100644 >>>> --- a/include/linux/bpf.h >>>> +++ b/include/linux/bpf.h >>>> @@ -761,6 +761,7 @@ struct bpf_prog_ops { >>>> struct btf_struct_access_info { >>>> u32 next_btf_id; >>>> enum bpf_type_flag flag; >>>> + bool is_rcu; >>>> }; >>>> >>>> struct bpf_verifier_ops { >>>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h >>>> index 1a32baa78ce2..5d703637bb12 100644 >>>> --- a/include/linux/bpf_verifier.h >>>> +++ b/include/linux/bpf_verifier.h >>>> @@ -179,6 +179,10 @@ struct bpf_reg_state { >>>> */ >>>> s32 subreg_def; >>>> enum bpf_reg_liveness live; >>>> + /* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where >>>> + * the rcu ptr reg is initialized. >>>> + */ >>>> + int active_rcu_lock; >>>> /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ >>>> bool precise; >>>> }; >>>> @@ -324,6 +328,8 @@ struct bpf_verifier_state { >>>> u32 insn_idx; >>>> u32 curframe; >>>> u32 active_spin_lock; >>>> + /* <= 0: not in rcu read lock region; > 0: the rcu lock region id */ >>>> + int active_rcu_lock; >>>> bool speculative; >>>> >>>> /* first and last insn idx of this verifier state */ >>>> @@ -424,6 +430,7 @@ struct bpf_insn_aux_data { >>>> u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ >>>> bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ >>>> bool zext_dst; /* this insn zero extends dst reg */ >>>> + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ >>>> u8 alu_state; /* used in combination with alu_limit */ >>>> >>>> /* below fields are initialized once */ >>>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c >>>> index d2ee1669a2f3..c5a9569f2ae0 100644 >>>> --- a/kernel/bpf/btf.c >>>> +++ b/kernel/bpf/btf.c >>>> @@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, >>>> if (btf_type_is_ptr(mtype)) { >>>> const struct btf_type *stype, *t; >>>> enum bpf_type_flag tmp_flag = 0; >>>> + bool is_rcu = false; >>>> u32 id; >>>> >>>> if (msize != size || off != moff) { >>>> @@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, >>>> /* check __percpu tag */ >>>> if (strcmp(tag_value, "percpu") == 0) >>>> tmp_flag = MEM_PERCPU; >>>> + /* check __rcu tag */ >>>> + if (strcmp(tag_value, "rcu") == 0) >>>> + is_rcu = true; >>>> } >>>> >>>> stype = btf_type_skip_modifiers(btf, mtype->type, &id); >>>> if (btf_type_is_struct(stype)) { >>>> info->next_btf_id = id; >>>> info->flag = tmp_flag; >>>> + info->is_rcu = is_rcu; >>>> return WALK_PTR; >>>> } >>>> } >>>> @@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, >>>> { >>>> enum bpf_prog_type prog_type = resolve_prog_type(env->prog); >>>> bool rel = false, kptr_get = false, trusted_args = false; >>>> - bool sleepable = false; >>>> + bool sleepable = false, rcu_lock = false, rcu_unlock = false; >>>> struct bpf_verifier_log *log = &env->log; >>>> u32 i, nargs, ref_id, ref_obj_id = 0; >>>> bool is_kfunc = btf_is_kernel(btf); >>>> @@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, >>>> kptr_get = kfunc_meta->flags & KF_KPTR_GET; >>>> trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; >>>> sleepable = kfunc_meta->flags & KF_SLEEPABLE; >>>> + rcu_lock = kfunc_meta->flags & KF_RCU_LOCK; >>>> + rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK; >>>> + } >>>> + >>>> + /* checking rcu read lock/unlock */ >>>> + if (env->cur_state->active_rcu_lock > 0) { >>>> + if (rcu_lock) { >>>> + bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name); >>>> + return -EINVAL; >>>> + } else if (rcu_unlock) { >>>> + /* change active_rcu_lock to its corresponding negative value to >>>> + * preserve the previous lock region id. >>>> + */ >>>> + env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock; >>>> + } else if (sleepable) { >>>> + bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n", >>>> + func_name); >>>> + return -EINVAL; >>>> + } >>>> + } else if (rcu_lock) { >>>> + /* a new lock region started, increase the region id. */ >>>> + env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1; >>>> + } else if (rcu_unlock) { >>>> + bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name); >>>> + return -EINVAL; >>>> } >>>> >>> >>> Can you provide more context on why having ids is better than simply >>> invalidating the registers when the section ends, and making active_rcu_lock a >>> boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having >>> MEM_RCU and mark it unknown. >> >> I think we also need to invalidate rcu-ptr related states as well in spills. >> >> I also tried to support cases like: >> bpf_rcu_read_lock(); >> rcu_ptr = ... >> ... rcu_ptr ... >> bpf_rcu_read_unlock(); >> ... rcu_ptr ... /* no load, just use the rcu_ptr somehow */ >> >> In the above case, outside the rcu read lock region, there is no >> load with rcu_ptr but it can still be used for other purposes >> with a property of a pointer. >> >> But for a second thought, it should be okay to invalidate >> rcu_ptr during bpf_rcu_read_unlock() as a scalar. This should >> satisfy almost all (if not all) cases. >> >>> >>> You won't have to match the id in btf_struct_access as such registers won't ever >>> reach that function (if marked unknown on invalidation, they become scalars). >>> The reg state won't need another active_rcu_lock member either, it is simply >>> part of reg->type. >> >> Right, if I don't maintain region id's, no need to have reg->active_rcu_lock >> and using MEM_RCU should be enough. >> >>> >>> It seems to that simply invalidating registers when rcu_read_unlock is called is >>> both less code to write and simpler to understand. >> >> invalidating rcu_ptr in registers and spills. >> > > If you use bpf_for_each_reg_in_vstate, it should cover both. Just checked the macro implementation. Yes, it covers both reg and spills. Thanks for mentioning bpf_for_each_reg_in_vstate which I am not aware of.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4bbcafd1c9b..98af0c9ec721 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -761,6 +761,7 @@ struct bpf_prog_ops { struct btf_struct_access_info { u32 next_btf_id; enum bpf_type_flag flag; + bool is_rcu; }; struct bpf_verifier_ops { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..5d703637bb12 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -179,6 +179,10 @@ struct bpf_reg_state { */ s32 subreg_def; enum bpf_reg_liveness live; + /* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where + * the rcu ptr reg is initialized. + */ + int active_rcu_lock; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; @@ -324,6 +328,8 @@ struct bpf_verifier_state { u32 insn_idx; u32 curframe; u32 active_spin_lock; + /* <= 0: not in rcu read lock region; > 0: the rcu lock region id */ + int active_rcu_lock; bool speculative; /* first and last insn idx of this verifier state */ @@ -424,6 +430,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d2ee1669a2f3..c5a9569f2ae0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5831,6 +5831,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, if (btf_type_is_ptr(mtype)) { const struct btf_type *stype, *t; enum bpf_type_flag tmp_flag = 0; + bool is_rcu = false; u32 id; if (msize != size || off != moff) { @@ -5850,12 +5851,16 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; + /* check __rcu tag */ + if (strcmp(tag_value, "rcu") == 0) + is_rcu = true; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { info->next_btf_id = id; info->flag = tmp_flag; + info->is_rcu = is_rcu; return WALK_PTR; } } @@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); bool rel = false, kptr_get = false, trusted_args = false; - bool sleepable = false; + bool sleepable = false, rcu_lock = false, rcu_unlock = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); @@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, kptr_get = kfunc_meta->flags & KF_KPTR_GET; trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; sleepable = kfunc_meta->flags & KF_SLEEPABLE; + rcu_lock = kfunc_meta->flags & KF_RCU_LOCK; + rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK; + } + + /* checking rcu read lock/unlock */ + if (env->cur_state->active_rcu_lock > 0) { + if (rcu_lock) { + bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name); + return -EINVAL; + } else if (rcu_unlock) { + /* change active_rcu_lock to its corresponding negative value to + * preserve the previous lock region id. + */ + env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock; + } else if (sleepable) { + bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n", + func_name); + return -EINVAL; + } + } else if (rcu_lock) { + /* a new lock region started, increase the region id. */ + env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1; + } else if (rcu_unlock) { + bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name); + return -EINVAL; } /* check that BTF function arguments match actual types that the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4d50f9568245..85151f2a655a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,7 @@ #include <linux/error-injection.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> +#include <linux/trace_events.h> #include <linux/poison.h> #include "disasm.h" @@ -513,6 +514,14 @@ static bool is_callback_calling_function(enum bpf_func_id func_id) func_id == BPF_FUNC_user_ringbuf_drain; } +static bool is_storage_get_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_storage_get || + func_id == BPF_FUNC_inode_storage_get || + func_id == BPF_FUNC_task_storage_get || + func_id == BPF_FUNC_cgrp_storage_get; +} + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -1203,6 +1212,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; dst_state->active_spin_lock = src->active_spin_lock; + dst_state->active_rcu_lock = src->active_rcu_lock; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; @@ -1687,6 +1697,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->var_off = tnum_unknown; reg->frameno = 0; reg->precise = !env->bpf_capable; + reg->active_rcu_lock = 0; __mark_reg_unbounded(reg); } @@ -1727,7 +1738,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, struct btf *btf, u32 btf_id, - enum bpf_type_flag flag) + enum bpf_type_flag flag, bool set_rcu_lock) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); @@ -1737,6 +1748,9 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; + /* the reg rcu lock region id equals the current rcu lock region id */ + if (set_rcu_lock) + regs[regno].active_rcu_lock = env->cur_state->active_rcu_lock; } #define DEF_NOT_SUBREG (0) @@ -3940,7 +3954,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED, false); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -4681,6 +4695,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + /* If reg valid rcu region id does not equal to the current rcu region id, + * rcu access is not protected properly, either out of a valid rcu region, + * or in a different rcu region. + */ + if (env->prog->aux->sleepable && reg->active_rcu_lock && + reg->active_rcu_lock != env->cur_state->active_rcu_lock) { + verbose(env, + "R%d is ptr_%s access rcu-protected memory with off=%d, not rcu protected\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, off, size, atype, &info); @@ -4697,6 +4723,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; + /* The value is a rcu pointer. For a sleepable program, the load needs to be + * in a rcu lock region, similar to rcu_dereference(). + */ + if (info.is_rcu && env->prog->aux->sleepable && env->cur_state->active_rcu_lock <= 0) { + verbose(env, + "R%d is rcu dereference ptr_%s with off=%d, not in rcu_read_lock region\n", + regno, tname, off); + return -EACCES; + } + /* If this is an untrusted pointer, all pointers formed by walking it * also inherit the untrusted flag. */ @@ -4705,7 +4741,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, info.next_btf_id, - info.flag); + info.flag, info.is_rcu && env->prog->aux->sleepable); return 0; } @@ -4761,7 +4797,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, if (value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, info.next_btf_id, - info.flag); + info.flag, info.is_rcu && env->prog->aux->sleepable); return 0; } @@ -5874,6 +5910,17 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + /* If a rcu pointer is a helper argument, the helper should be protected in + * the same rcu lock region where the rcu pointer reg is initialized. + */ + if (env->prog->aux->sleepable && reg->active_rcu_lock && + reg->active_rcu_lock != env->cur_state->active_rcu_lock) { + verbose(env, + "R%d is arg type %s needs rcu protection\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -7418,6 +7465,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } + if (env->cur_state->active_rcu_lock > 0) { + if (bpf_lsm_sleepable_func_proto(func_id) || + bpf_tracing_sleepable_func_proto(func_id)) { + verbose(env, "sleepable helper %s#%din rcu_read_lock region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (env->prog->aux->sleepable && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -10605,6 +10664,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->prog->aux->sleepable && env->cur_state->active_rcu_lock > 0) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -11869,6 +11933,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_spin_lock != cur->active_spin_lock) return false; + if (old->active_rcu_lock != cur->active_rcu_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -12553,6 +12620,11 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_rcu_lock > 0) { + verbose(env, "bpf_rcu_read_unlock is missing\n"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback @@ -14289,14 +14361,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_task_storage_get || - insn->imm == BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get || - insn->imm == BPF_FUNC_cgrp_storage_get) { - if (env->prog->aux->sleepable) - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); - else + if (is_storage_get_function(insn->imm)) { + if (!env->prog->aux->sleepable || + env->insn_aux_data[i + delta].storage_get_func_atomic) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); insn_buf[1] = *insn; cnt = 2;
To simplify the design and support the common practice, no nested bpf_rcu_read_lock() is allowed. During verification, each paired bpf_rcu_read_lock()/unlock() has a unique region id, starting from 1. Each rcu ptr register also remembers the region id when the ptr reg is initialized. The following is a simple example to illustrate the rcu lock regions and usage of rcu ptr's. ... <=== rcu lock region 0 bpf_rcu_read_lock() <=== rcu lock region 1 rcu_ptr1 = ... <=== rcu_ptr1 with region 1 ... using rcu_ptr1 ... bpf_rcu_read_unlock() ... <=== rcu lock region -1 bpf_rcu_read_lock() <=== rcu lock region 2 rcu_ptr2 = ... <=== rcu_ptr2 with region 2 ... using rcu_ptr2 ... ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2 bpf_rcu_read_unlock() Outside the rcu lock region, the rcu lock region id is 0 or negative of previous valid rcu lock region id, so the next valid rcu lock region id can be easily computed. Note that rcu protection is not needed for non-sleepable program. But it is supported to make cross-sleepable/nonsleepable development easier. For non-sleepable program, the following insns can be inside the rcu lock region: - any non call insns except BPF_ABS/BPF_IND - non sleepable helpers or kfuncs Also, bpf_*_storage_get() helper's 5th hidden argument (for memory allocation flag) should be GFP_ATOMIC. If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of this pointer and the load which gets this pointer needs to be protected by bpf_rcu_read_lock(). The following shows a couple of examples: struct task_struct { ... struct task_struct __rcu *real_parent; struct css_set __rcu *cgroups; ... }; struct css_set { ... struct cgroup *dfl_cgrp; ... } ... task = bpf_get_current_task_btf(); cgroups = task->cgroups; dfl_cgroup = cgroups->dfl_cgrp; ... using dfl_cgroup ... The bpf_rcu_read_lock/unlock() should be added like below to avoid verification failures. task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); cgroups = task->cgroups; dfl_cgroup = cgroups->dfl_cgrp; bpf_rcu_read_unlock(); ... using dfl_cgroup ... The following is another example for task->real_parent. task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); real_parent = task->real_parent; ... bpf_task_storage_get(&map, real_parent, 0, 0); bpf_rcu_read_unlock(); Signed-off-by: Yonghong Song <yhs@fb.com> --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 7 +++ kernel/bpf/btf.c | 32 ++++++++++++- kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++----- 4 files changed, 120 insertions(+), 12 deletions(-)