Message ID | 37859ca03aaaba23f60288de044a3a10d52a79b4.1657576063.git.delyank@fb.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | BPF |
Headers | show |
Series | Execution context callbacks | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Clearly marked for bpf-next, async |
netdev/apply | fail | Patch does not apply to bpf-next |
bpf/vmtest-bpf-next-PR | fail | merge-conflict |
On Mon, Jul 11, 2022 at 2:48 PM Delyan Kratunov <delyank@fb.com> wrote: > > Similarly to bpf_timer, bpf_delayed_work represents a callback that will > be executed at a later time, in a different execution context. > > Its treatment in maps is practically the same as timers (to a degree > that perhaps calls for refactoring), except releasing the work does not > need to release any resources - we will wait for pending executions in > the program destruction path. > > Signed-off-by: Delyan Kratunov <delyank@fb.com> > --- > include/linux/bpf.h | 9 ++++++++- > include/linux/btf.h | 1 + > include/uapi/linux/bpf.h | 8 ++++++++ > kernel/bpf/btf.c | 21 +++++++++++++++++++++ > kernel/bpf/syscall.c | 24 ++++++++++++++++++++++-- > kernel/bpf/verifier.c | 9 +++++++++ > tools/include/uapi/linux/bpf.h | 8 ++++++++ > 7 files changed, 77 insertions(+), 3 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 0edd7d2c0064..ad9d2cfb0411 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -164,7 +164,8 @@ enum { > BPF_MAP_VALUE_OFF_MAX = 8, > BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + > 1 + /* for bpf_spin_lock */ > - 1, /* for bpf_timer */ > + 1 + /* for bpf_timer */ > + 1, /* for bpf_delayed_work */ > }; > > enum bpf_kptr_type { > @@ -212,6 +213,7 @@ struct bpf_map { > int spin_lock_off; /* >=0 valid offset, <0 error */ > struct bpf_map_value_off *kptr_off_tab; > int timer_off; /* >=0 valid offset, <0 error */ > + int delayed_work_off; /* >=0 valid offset, <0 error */ > u32 id; > int numa_node; > u32 btf_key_type_id; > @@ -256,6 +258,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map) > return map->timer_off >= 0; > } > > +static inline bool map_value_has_delayed_work(const struct bpf_map *map) > +{ > + return map->delayed_work_off >= 0; > +} > + > static inline bool map_value_has_kptrs(const struct bpf_map *map) > { > return !IS_ERR_OR_NULL(map->kptr_off_tab); > diff --git a/include/linux/btf.h b/include/linux/btf.h > 
index 1bfed7fa0428..2b8f473a6aa0 100644 > --- a/include/linux/btf.h > +++ b/include/linux/btf.h > @@ -132,6 +132,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, > u32 expected_offset, u32 expected_size); > int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); > int btf_find_timer(const struct btf *btf, const struct btf_type *t); > +int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t); > struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, > const struct btf_type *t); > bool btf_type_is_void(const struct btf_type *t); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index e81362891596..d68fc4f472f1 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -6691,6 +6691,14 @@ struct bpf_dynptr { > __u64 :64; > } __attribute__((aligned(8))); > > +struct bpf_delayed_work { > + __u64 :64; > + __u64 :64; > + __u64 :64; > + __u64 :64; > + __u64 :64; > +} __attribute__((aligned(8))); > + > struct bpf_sysctl { > __u32 write; /* Sysctl is being read (= 0) or written (= 1). > * Allows 1,2,4-byte read, but no write. > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > index f08037c31dd7..e4ab52cc25fe 100644 > --- a/kernel/bpf/btf.c > +++ b/kernel/bpf/btf.c > @@ -3196,6 +3196,7 @@ enum btf_field_type { > BTF_FIELD_SPIN_LOCK, > BTF_FIELD_TIMER, > BTF_FIELD_KPTR, > + BTF_FIELD_DELAYED_WORK, > }; > > enum { > @@ -3283,6 +3284,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t > switch (field_type) { > case BTF_FIELD_SPIN_LOCK: > case BTF_FIELD_TIMER: > + case BTF_FIELD_DELAYED_WORK: > ret = btf_find_struct(btf, member_type, off, sz, > idx < info_cnt ? 
&info[idx] : &tmp); > if (ret < 0) > @@ -3333,6 +3335,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, > switch (field_type) { > case BTF_FIELD_SPIN_LOCK: > case BTF_FIELD_TIMER: > + case BTF_FIELD_DELAYED_WORK: > ret = btf_find_struct(btf, var_type, off, sz, > idx < info_cnt ? &info[idx] : &tmp); > if (ret < 0) > @@ -3375,6 +3378,11 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, > sz = sizeof(struct bpf_timer); > align = __alignof__(struct bpf_timer); > break; > + case BTF_FIELD_DELAYED_WORK: > + name = "bpf_delayed_work"; > + sz = sizeof(struct bpf_delayed_work); > + align = __alignof__(struct bpf_delayed_work); > + break; > case BTF_FIELD_KPTR: > name = NULL; > sz = sizeof(u64); > @@ -3421,6 +3429,19 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t) > return info.off; > } > > +int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t) > +{ > + struct btf_field_info info; > + int ret; > + > + ret = btf_find_field(btf, t, BTF_FIELD_DELAYED_WORK, &info, 1); > + if (ret < 0) > + return ret; > + if (!ret) > + return -ENOENT; > + return info.off; > +} > + > struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, > const struct btf_type *t) > { > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 7d5af5b99f0d..041972305344 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -914,10 +914,11 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map) > bool has_spin_lock = map_value_has_spin_lock(map); > bool has_timer = map_value_has_timer(map); > bool has_kptrs = map_value_has_kptrs(map); > + bool has_delayed_work = map_value_has_delayed_work(map); > struct bpf_map_off_arr *off_arr; > u32 i; > > - if (!has_spin_lock && !has_timer && !has_kptrs) { > + if (!has_spin_lock && !has_timer && !has_kptrs && !has_delayed_work) { > map->off_arr = NULL; > return 0; > } > @@ -953,6 +954,13 @@ static int bpf_map_alloc_off_arr(struct bpf_map 
*map) > } > off_arr->cnt += tab->nr_off; > } > + if (has_delayed_work) { > + i = off_arr->cnt; > + > + off_arr->field_off[i] = map->delayed_work_off; > + off_arr->field_sz[i] = sizeof(struct bpf_delayed_work); > + off_arr->cnt++; > + } > > if (off_arr->cnt == 1) > return 0; > @@ -1014,6 +1022,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, > return -EOPNOTSUPP; > } > > + map->delayed_work_off = btf_find_delayed_work(btf, value_type); > + if (map_value_has_delayed_work(map)) { > + if (map->map_flags & BPF_F_RDONLY_PROG) > + return -EACCES; > + if (map->map_type != BPF_MAP_TYPE_HASH && > + map->map_type != BPF_MAP_TYPE_LRU_HASH && > + map->map_type != BPF_MAP_TYPE_ARRAY) > + return -EOPNOTSUPP; > + } > + > map->kptr_off_tab = btf_parse_kptrs(btf, value_type); > if (map_value_has_kptrs(map)) { > if (!bpf_capable()) { > @@ -1095,6 +1113,7 @@ static int map_create(union bpf_attr *attr) > > map->spin_lock_off = -EINVAL; > map->timer_off = -EINVAL; > + map->delayed_work_off = -EINVAL; > if (attr->btf_key_type_id || attr->btf_value_type_id || > /* Even the map's value is a kernel's struct, > * the bpf_prog.o must have BTF to begin with > @@ -1863,7 +1882,8 @@ static int map_freeze(const union bpf_attr *attr) > return PTR_ERR(map); > > if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || > - map_value_has_timer(map) || map_value_has_kptrs(map)) { > + map_value_has_timer(map) || map_value_has_kptrs(map) || > + map_value_has_delayed_work(map)) { not introduced by you, but shouldn't this check also check map_value_has_spin_lock()? > fdput(f); > return -ENOTSUPP; > } Also check if you need to modify bpf_map_mmap?
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 2859901ffbe3..9fd311b7a1ff 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -3817,6 +3817,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, > return -EACCES; > } > } > + if (map_value_has_delayed_work(map) && src == ACCESS_DIRECT) { > + u32 t = map->delayed_work_off; > + > + if (reg->smin_value + off < t + sizeof(struct bpf_delayed_work) && > + t < reg->umax_value + off + size) { > + verbose(env, "bpf_delayed_work cannot be accessed directly by load/store regno=%d off=%d\n", regno, off); > + return -EACCES; > + } > + } > if (map_value_has_kptrs(map)) { > struct bpf_map_value_off *tab = map->kptr_off_tab; > int i; > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index e81362891596..d68fc4f472f1 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -6691,6 +6691,14 @@ struct bpf_dynptr { > __u64 :64; > } __attribute__((aligned(8))); > > +struct bpf_delayed_work { > + __u64 :64; > + __u64 :64; > + __u64 :64; > + __u64 :64; > + __u64 :64; > +} __attribute__((aligned(8))); > + > struct bpf_sysctl { > __u32 write; /* Sysctl is being read (= 0) or written (= 1). > * Allows 1,2,4-byte read, but no write. > -- > 2.36.1
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0edd7d2c0064..ad9d2cfb0411 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -164,7 +164,8 @@ enum { BPF_MAP_VALUE_OFF_MAX = 8, BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + 1 + /* for bpf_timer */ + 1, /* for bpf_delayed_work */ }; enum bpf_kptr_type { @@ -212,6 +213,7 @@ struct bpf_map { int spin_lock_off; /* >=0 valid offset, <0 error */ struct bpf_map_value_off *kptr_off_tab; int timer_off; /* >=0 valid offset, <0 error */ + int delayed_work_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -256,6 +258,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } +static inline bool map_value_has_delayed_work(const struct bpf_map *map) +{ + return map->delayed_work_off >= 0; +} + static inline bool map_value_has_kptrs(const struct bpf_map *map) { return !IS_ERR_OR_NULL(map->kptr_off_tab); diff --git a/include/linux/btf.h b/include/linux/btf.h index 1bfed7fa0428..2b8f473a6aa0 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -132,6 +132,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); +int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t); struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e81362891596..d68fc4f472f1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6691,6 +6691,14 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_delayed_work { + __u64 :64; + __u64 :64; + __u64 :64; + __u64 :64; + __u64 :64; +} 
__attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f08037c31dd7..e4ab52cc25fe 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3196,6 +3196,7 @@ enum btf_field_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, BTF_FIELD_KPTR, + BTF_FIELD_DELAYED_WORK, }; enum { @@ -3283,6 +3284,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t switch (field_type) { case BTF_FIELD_SPIN_LOCK: case BTF_FIELD_TIMER: + case BTF_FIELD_DELAYED_WORK: ret = btf_find_struct(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3333,6 +3335,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BTF_FIELD_SPIN_LOCK: case BTF_FIELD_TIMER: + case BTF_FIELD_DELAYED_WORK: ret = btf_find_struct(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3375,6 +3378,11 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, sz = sizeof(struct bpf_timer); align = __alignof__(struct bpf_timer); break; + case BTF_FIELD_DELAYED_WORK: + name = "bpf_delayed_work"; + sz = sizeof(struct bpf_delayed_work); + align = __alignof__(struct bpf_delayed_work); + break; case BTF_FIELD_KPTR: name = NULL; sz = sizeof(u64); @@ -3421,6 +3429,19 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t) return info.off; } +int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t) +{ + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_DELAYED_WORK, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; +} + struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, const struct btf_type *t) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d5af5b99f0d..041972305344 100644 --- 
a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -914,10 +914,11 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map) bool has_spin_lock = map_value_has_spin_lock(map); bool has_timer = map_value_has_timer(map); bool has_kptrs = map_value_has_kptrs(map); + bool has_delayed_work = map_value_has_delayed_work(map); struct bpf_map_off_arr *off_arr; u32 i; - if (!has_spin_lock && !has_timer && !has_kptrs) { + if (!has_spin_lock && !has_timer && !has_kptrs && !has_delayed_work) { map->off_arr = NULL; return 0; } @@ -953,6 +954,13 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map) } off_arr->cnt += tab->nr_off; } + if (has_delayed_work) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->delayed_work_off; + off_arr->field_sz[i] = sizeof(struct bpf_delayed_work); + off_arr->cnt++; + } if (off_arr->cnt == 1) return 0; @@ -1014,6 +1022,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } + map->delayed_work_off = btf_find_delayed_work(btf, value_type); + if (map_value_has_delayed_work(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + return -EOPNOTSUPP; + } + map->kptr_off_tab = btf_parse_kptrs(btf, value_type); if (map_value_has_kptrs(map)) { if (!bpf_capable()) { @@ -1095,6 +1113,7 @@ static int map_create(union bpf_attr *attr) map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; + map->delayed_work_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1863,7 +1882,8 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || map_value_has_kptrs(map)) { + map_value_has_timer(map) || map_value_has_kptrs(map) || + map_value_has_delayed_work(map)) { fdput(f); return 
-ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2859901ffbe3..9fd311b7a1ff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3817,6 +3817,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_delayed_work(map) && src == ACCESS_DIRECT) { + u32 t = map->delayed_work_off; + + if (reg->smin_value + off < t + sizeof(struct bpf_delayed_work) && + t < reg->umax_value + off + size) { + verbose(env, "bpf_delayed_work cannot be accessed directly by load/store regno=%d off=%d\n", regno, off); + return -EACCES; + } + } if (map_value_has_kptrs(map)) { struct bpf_map_value_off *tab = map->kptr_off_tab; int i; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e81362891596..d68fc4f472f1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6691,6 +6691,14 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_delayed_work { + __u64 :64; + __u64 :64; + __u64 :64; + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write.
Similarly to bpf_timer, bpf_delayed_work represents a callback that will be executed at a later time, in a different execution context. Its treatment in maps is practically the same as timers (to a degree that perhaps calls for refactoring), except releasing the work does not need to release any resources - we will wait for pending executions in the program destruction path. Signed-off-by: Delyan Kratunov <delyank@fb.com> --- include/linux/bpf.h | 9 ++++++++- include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/btf.c | 21 +++++++++++++++++++++ kernel/bpf/syscall.c | 24 ++++++++++++++++++++++-- kernel/bpf/verifier.c | 9 +++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 7 files changed, 77 insertions(+), 3 deletions(-)