Message ID | 20230214221718.503964-4-kuifeng@meta.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Transit between BPF TCP congestion controls. | expand |
On 02/14, Kui-Feng Lee wrote: > Registration via bpf_links ensures a uniform behavior, just like other > BPF programs. BPF struct_ops were registered/unregistered when > updating/deleting their values. Only the maps of struct_ops having > the BPF_F_LINK flag are allowed to back a bpf_link. > Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> > --- > include/uapi/linux/bpf.h | 3 ++ > kernel/bpf/bpf_struct_ops.c | 59 +++++++++++++++++++++++++++++++--- > tools/include/uapi/linux/bpf.h | 3 ++ > 3 files changed, 61 insertions(+), 4 deletions(-) > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 1e6cdd0f355d..48d8b3058aa1 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1267,6 +1267,9 @@ enum { > /* Create a map that is suitable to be an inner map with dynamic max > entries */ > BPF_F_INNER_MAP = (1U << 12), > + > +/* Create a map that will be registered/unregesitered by the backed > bpf_link */ > + BPF_F_LINK = (1U << 13), > }; > /* Flags for BPF_PROG_QUERY. */ > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > index 621c8e24481a..d16ca06cf09a 100644 > --- a/kernel/bpf/bpf_struct_ops.c > +++ b/kernel/bpf/bpf_struct_ops.c > @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct > bpf_map *map, void *key, > mutex_lock(&st_map->lock); > - if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { > + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || > refcount_read(&kvalue->refcnt)) { > err = -EBUSY; > goto unlock; > } > @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct > bpf_map *map, void *key, > *(unsigned long *)(udata + moff) = prog->aux->id; > } > + if (st_map->map.map_flags & BPF_F_LINK) { > + /* Let bpf_link handle registration & unregistration. */ > + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); > + goto unlock; > + } > + > refcount_set(&kvalue->refcnt, 1); > bpf_map_inc(map); [..] > @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct > bpf_map *map, void *key, > kfree(tlinks); > mutex_unlock(&st_map->lock); > return err; > + > } > static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) Seems like a left over hunk? > @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct > bpf_map *map, void *key) > BPF_STRUCT_OPS_STATE_TOBEFREE); > switch (prev_state) { > case BPF_STRUCT_OPS_STATE_INUSE: > + if (st_map->map.map_flags & BPF_F_LINK) > + return 0; > st_map->st_ops->unreg(&st_map->kvalue.data); > if (refcount_dec_and_test(&st_map->kvalue.refcnt)) > bpf_map_put(map); > @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map > *map) > static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) > { > if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || > - attr->map_flags || !attr->btf_vmlinux_value_type_id) > + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) > return -EINVAL; > return 0; > } > @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union > bpf_attr *attr) > set_vm_flush_reset_perms(st_map->image); > bpf_map_init_from_attr(map, attr); [..] > + map->map_flags |= attr->map_flags & BPF_F_LINK; You seem to have the following check above: if (.... (attr->map_flags & ~BPF_F_LINK) ...) return -EINVAL; And here you do: map->map_flags |= attr->map_flags & BPF_F_LINK; So maybe we can simplify to: map->map_flags |= attr->map_flags; ? > + > return map; > } > @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata) > } > } > +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value > *kvalue) > +{ > + struct bpf_struct_ops_map *st_map; > + > + if (refcount_dec_and_test(&kvalue->refcnt)) { > + st_map = container_of(kvalue, struct bpf_struct_ops_map, > + kvalue); > + bpf_map_put(&st_map->map); > + } > +} > + > static void bpf_struct_ops_map_link_release(struct bpf_link *link) > { > + struct bpf_struct_ops_map *st_map; > + > if (link->map) { > - bpf_map_put(link->map); > + st_map = (struct bpf_struct_ops_map *)link->map; > + st_map->st_ops->unreg(&st_map->kvalue.data); > + bpf_struct_ops_kvalue_put(&st_map->kvalue); > link->map = NULL; > } > } > @@ -735,13 +761,15 @@ static const struct bpf_link_ops > bpf_struct_ops_map_lops = { > int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > { > + struct bpf_struct_ops_map *st_map; > struct bpf_link_primer link_primer; > + struct bpf_struct_ops_value *kvalue; > struct bpf_map *map; > struct bpf_link *link = NULL; > int err; > map = bpf_map_get(attr->link_create.prog_fd); > - if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) > + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & > BPF_F_LINK)) > return -EINVAL; > link = kzalloc(sizeof(*link), GFP_USER); > @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, > bpfptr_t uattr) > bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, > NULL); > link->map = map; [..] > + if (map->map_flags & BPF_F_LINK) { We seem to bail out above when we don't have BPF_F_LINK flags above? if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK)) return -EINVAL; So why check this 'if (map->map_flags & BPF_F_LINK)' condition here? > + st_map = (struct bpf_struct_ops_map *)map; > + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; > + > + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || > + refcount_read(&kvalue->refcnt) != 0) { > + err = -EINVAL; > + goto err_out; > + } > + > + refcount_set(&kvalue->refcnt, 1); > + > + set_memory_rox((long)st_map->image, 1); > + err = st_map->st_ops->reg(kvalue->data); > + if (err) { > + refcount_set(&kvalue->refcnt, 0); > + > + set_memory_nx((long)st_map->image, 1); > + set_memory_rw((long)st_map->image, 1); > + goto err_out; > + } > + } > + > err = bpf_link_prime(link, &link_primer); > if (err) > goto err_out; > diff --git a/tools/include/uapi/linux/bpf.h > b/tools/include/uapi/linux/bpf.h > index 1e6cdd0f355d..48d8b3058aa1 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -1267,6 +1267,9 @@ enum { > /* Create a map that is suitable to be an inner map with dynamic max > entries */ > BPF_F_INNER_MAP = (1U << 12), > + > +/* Create a map that will be registered/unregesitered by the backed > bpf_link */ > + BPF_F_LINK = (1U << 13), > }; > /* Flags for BPF_PROG_QUERY. */ > -- > 2.30.2
On 2/14/23 18:53, Stanislav Fomichev wrote: > On 02/14, Kui-Feng Lee wrote: >> Registration via bpf_links ensures a uniform behavior, just like other >> BPF programs. BPF struct_ops were registered/unregistered when >> updating/deleting their values. Only the maps of struct_ops having >> the BPF_F_LINK flag are allowed to back a bpf_link. > >> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> >> --- >> include/uapi/linux/bpf.h | 3 ++ >> kernel/bpf/bpf_struct_ops.c | 59 +++++++++++++++++++++++++++++++--- >> tools/include/uapi/linux/bpf.h | 3 ++ >> 3 files changed, 61 insertions(+), 4 deletions(-) > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 1e6cdd0f355d..48d8b3058aa1 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -1267,6 +1267,9 @@ enum { > >> /* Create a map that is suitable to be an inner map with dynamic >> max entries */ >> BPF_F_INNER_MAP = (1U << 12), >> + >> +/* Create a map that will be registered/unregesitered by the backed >> bpf_link */ >> + BPF_F_LINK = (1U << 13), >> }; > >> /* Flags for BPF_PROG_QUERY. */ >> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c >> index 621c8e24481a..d16ca06cf09a 100644 >> --- a/kernel/bpf/bpf_struct_ops.c >> +++ b/kernel/bpf/bpf_struct_ops.c >> @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, > >> mutex_lock(&st_map->lock); > >> - if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { >> + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || >> refcount_read(&kvalue->refcnt)) { >> err = -EBUSY; >> goto unlock; >> } >> @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> *(unsigned long *)(udata + moff) = prog->aux->id; >> } > >> + if (st_map->map.map_flags & BPF_F_LINK) { >> + /* Let bpf_link handle registration & unregistration. */ >> + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); >> + goto unlock; >> + } >> + >> refcount_set(&kvalue->refcnt, 1); >> bpf_map_inc(map); > > > [..] > >> @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> kfree(tlinks); >> mutex_unlock(&st_map->lock); >> return err; >> + >> } > >> static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void >> *key) > > Seems like a left over hunk? You are right. I will remove it. > >> @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct >> bpf_map *map, void *key) >> BPF_STRUCT_OPS_STATE_TOBEFREE); >> switch (prev_state) { >> case BPF_STRUCT_OPS_STATE_INUSE: >> + if (st_map->map.map_flags & BPF_F_LINK) >> + return 0; >> st_map->st_ops->unreg(&st_map->kvalue.data); >> if (refcount_dec_and_test(&st_map->kvalue.refcnt)) >> bpf_map_put(map); >> @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct >> bpf_map *map) >> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) >> { >> if (attr->key_size != sizeof(unsigned int) || attr->max_entries >> != 1 || >> - attr->map_flags || !attr->btf_vmlinux_value_type_id) >> + (attr->map_flags & ~BPF_F_LINK) || >> !attr->btf_vmlinux_value_type_id) >> return -EINVAL; >> return 0; >> } >> @@ -638,6 +647,8 @@ static struct bpf_map >> *bpf_struct_ops_map_alloc(union bpf_attr *attr) >> set_vm_flush_reset_perms(st_map->image); >> bpf_map_init_from_attr(map, attr); > > > [..] > >> + map->map_flags |= attr->map_flags & BPF_F_LINK; > > You seem to have the following check above: > > if (.... (attr->map_flags & ~BPF_F_LINK) ...) return -EINVAL; > > And here you do: > > map->map_flags |= attr->map_flags & BPF_F_LINK; > > So maybe we can simplify to: > map->map_flags |= attr->map_flags; > > ? Great catch! > >> + >> return map; >> } > >> @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata) >> } >> } > >> +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value >> *kvalue) >> +{ >> + struct bpf_struct_ops_map *st_map; >> + >> + if (refcount_dec_and_test(&kvalue->refcnt)) { >> + st_map = container_of(kvalue, struct bpf_struct_ops_map, >> + kvalue); >> + bpf_map_put(&st_map->map); >> + } >> +} >> + >> static void bpf_struct_ops_map_link_release(struct bpf_link *link) >> { >> + struct bpf_struct_ops_map *st_map; >> + >> if (link->map) { >> - bpf_map_put(link->map); >> + st_map = (struct bpf_struct_ops_map *)link->map; >> + st_map->st_ops->unreg(&st_map->kvalue.data); >> + bpf_struct_ops_kvalue_put(&st_map->kvalue); >> link->map = NULL; >> } >> } >> @@ -735,13 +761,15 @@ static const struct bpf_link_ops >> bpf_struct_ops_map_lops = { > >> int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) >> { >> + struct bpf_struct_ops_map *st_map; >> struct bpf_link_primer link_primer; >> + struct bpf_struct_ops_value *kvalue; >> struct bpf_map *map; >> struct bpf_link *link = NULL; >> int err; > >> map = bpf_map_get(attr->link_create.prog_fd); >> - if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) >> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags >> & BPF_F_LINK)) >> return -EINVAL; > >> link = kzalloc(sizeof(*link), GFP_USER); >> @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr >> *attr, bpfptr_t uattr) >> bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, >> &bpf_struct_ops_map_lops, NULL); >> link->map = map; > > > [..] > >> + if (map->map_flags & BPF_F_LINK) { > > We seem to bail out above when we don't have BPF_F_LINK flags above? > > if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & > BPF_F_LINK)) > return -EINVAL; > > So why check this 'if (map->map_flags & BPF_F_LINK)' condition here? You are right! This check is not necessary anymore. > > >> + st_map = (struct bpf_struct_ops_map *)map; >> + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; >> + >> + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || >> + refcount_read(&kvalue->refcnt) != 0) { >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + refcount_set(&kvalue->refcnt, 1); >> + >> + set_memory_rox((long)st_map->image, 1); >> + err = st_map->st_ops->reg(kvalue->data); >> + if (err) { >> + refcount_set(&kvalue->refcnt, 0); >> + >> + set_memory_nx((long)st_map->image, 1); >> + set_memory_rw((long)st_map->image, 1); >> + goto err_out; >> + } >> + } >> + >> err = bpf_link_prime(link, &link_primer); >> if (err) >> goto err_out; >> diff --git a/tools/include/uapi/linux/bpf.h >> b/tools/include/uapi/linux/bpf.h >> index 1e6cdd0f355d..48d8b3058aa1 100644 >> --- a/tools/include/uapi/linux/bpf.h >> +++ b/tools/include/uapi/linux/bpf.h >> @@ -1267,6 +1267,9 @@ enum { > >> /* Create a map that is suitable to be an inner map with dynamic >> max entries */ >> BPF_F_INNER_MAP = (1U << 12), >> + >> +/* Create a map that will be registered/unregesitered by the backed >> bpf_link */ >> + BPF_F_LINK = (1U << 13), >> }; > >> /* Flags for BPF_PROG_QUERY. */ >> -- >> 2.30.2 >
On 2/14/23 2:17 PM, Kui-Feng Lee wrote: > Registration via bpf_links ensures a uniform behavior, just like other > BPF programs. BPF struct_ops were registered/unregistered when > updating/deleting their values. Only the maps of struct_ops having > the BPF_F_LINK flag are allowed to back a bpf_link. > > Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> > --- > include/uapi/linux/bpf.h | 3 ++ > kernel/bpf/bpf_struct_ops.c | 59 +++++++++++++++++++++++++++++++--- > tools/include/uapi/linux/bpf.h | 3 ++ > 3 files changed, 61 insertions(+), 4 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 1e6cdd0f355d..48d8b3058aa1 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1267,6 +1267,9 @@ enum { > > /* Create a map that is suitable to be an inner map with dynamic max entries */ > BPF_F_INNER_MAP = (1U << 12), > + > +/* Create a map that will be registered/unregesitered by the backed bpf_link */ > + BPF_F_LINK = (1U << 13), > }; > > /* Flags for BPF_PROG_QUERY. */ > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > index 621c8e24481a..d16ca06cf09a 100644 > --- a/kernel/bpf/bpf_struct_ops.c > +++ b/kernel/bpf/bpf_struct_ops.c > @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > > mutex_lock(&st_map->lock); > > - if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { > + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) { Why it needs a new refcount_read(&kvalue->refcnt) check? > err = -EBUSY; > goto unlock; > } > @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > *(unsigned long *)(udata + moff) = prog->aux->id; > } > > + if (st_map->map.map_flags & BPF_F_LINK) { > + /* Let bpf_link handle registration & unregistration. */ > + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); INUSE is for registered struct_ops. It needs a new UNREG state to mean initialized but not registered. The kvalue->state is not in uapi but the user space can still introspect it (thanks to BTF), so having a correct semantic state is useful. Try 'bpftool struct_ops dump ...': "bpf_struct_ops_tcp_congestion_ops": { "refcnt": { "refs": { "counter": 1 } }, "state": "BPF_STRUCT_OPS_STATE_INUSE", > + goto unlock; > + } > + > refcount_set(&kvalue->refcnt, 1); > bpf_map_inc(map); > > @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > kfree(tlinks); > mutex_unlock(&st_map->lock); > return err; > + Unnecessary new line. > } > > static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) > @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) > BPF_STRUCT_OPS_STATE_TOBEFREE); > switch (prev_state) { > case BPF_STRUCT_OPS_STATE_INUSE: > + if (st_map->map.map_flags & BPF_F_LINK) > + return 0; This should be a -ENOTSUPP. > st_map->st_ops->unreg(&st_map->kvalue.data); > if (refcount_dec_and_test(&st_map->kvalue.refcnt)) > bpf_map_put(map); > @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) > static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) > { > if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || > - attr->map_flags || !attr->btf_vmlinux_value_type_id) > + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) > return -EINVAL; > return 0; > } > @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) > set_vm_flush_reset_perms(st_map->image); > bpf_map_init_from_attr(map, attr); > > + map->map_flags |= attr->map_flags & BPF_F_LINK; This should have already been done in bpf_map_init_from_attr(). > + > return map; > } > > @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata) > } > } > > +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue) > +{ > + struct bpf_struct_ops_map *st_map; > + > + if (refcount_dec_and_test(&kvalue->refcnt)) { > + st_map = container_of(kvalue, struct bpf_struct_ops_map, > + kvalue); > + bpf_map_put(&st_map->map); > + } > +} > + > static void bpf_struct_ops_map_link_release(struct bpf_link *link) > { > + struct bpf_struct_ops_map *st_map; > + > if (link->map) { > - bpf_map_put(link->map); > + st_map = (struct bpf_struct_ops_map *)link->map; > + st_map->st_ops->unreg(&st_map->kvalue.data); > + bpf_struct_ops_kvalue_put(&st_map->kvalue); > link->map = NULL; Does it need a lock or something to protect the link_release? or I am missing something and lock is not needed? The kvalue->value state should become UNREG. After UNREG, can the struct_ops map be used in creating a new link again? > } > } > @@ -735,13 +761,15 @@ static const struct bpf_link_ops bpf_struct_ops_map_lops = { > > int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > { > + struct bpf_struct_ops_map *st_map; > struct bpf_link_primer link_primer; > + struct bpf_struct_ops_value *kvalue; > struct bpf_map *map; > struct bpf_link *link = NULL; > int err; > > map = bpf_map_get(attr->link_create.prog_fd); > - if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) > + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK)) > return -EINVAL; > > link = kzalloc(sizeof(*link), GFP_USER); > @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); > link->map = map; > > + if (map->map_flags & BPF_F_LINK) { > + st_map = (struct bpf_struct_ops_map *)map; > + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; > + > + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || > + refcount_read(&kvalue->refcnt) != 0) { The refcount_read(&kvalue->refcnt) is to ensure it is not registered? It seems the UNREG state is useful here. > + err = -EINVAL; > + goto err_out; > + } > + > + refcount_set(&kvalue->refcnt, 1); If a struct_ops map is used to create multiple links in parallel, is it safe? > + > + set_memory_rox((long)st_map->image, 1); > + err = st_map->st_ops->reg(kvalue->data); After successful reg, the state can be changed from UNREG to INUSE. > + if (err) { > + refcount_set(&kvalue->refcnt, 0); > + > + set_memory_nx((long)st_map->image, 1); > + set_memory_rw((long)st_map->image, 1); > + goto err_out; > + } > + } This patch should be combined with patch 1. Otherwise, patch 1 is quite hard to understand without link_create_struct_ops_map() doing the actual "attach". > + > err = bpf_link_prime(link, &link_primer); > if (err) > goto err_out; > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index 1e6cdd0f355d..48d8b3058aa1 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -1267,6 +1267,9 @@ enum { > > /* Create a map that is suitable to be an inner map with dynamic max entries */ > BPF_F_INNER_MAP = (1U << 12), > + > +/* Create a map that will be registered/unregesitered by the backed bpf_link */ > + BPF_F_LINK = (1U << 13), > }; > > /* Flags for BPF_PROG_QUERY. */
On 2/15/23 16:37, Martin KaFai Lau wrote: > On 2/14/23 2:17 PM, Kui-Feng Lee wrote: >> Registration via bpf_links ensures a uniform behavior, just like other >> BPF programs. BPF struct_ops were registered/unregistered when >> updating/deleting their values. Only the maps of struct_ops having >> the BPF_F_LINK flag are allowed to back a bpf_link. >> >> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> >> --- >> include/uapi/linux/bpf.h | 3 ++ >> kernel/bpf/bpf_struct_ops.c | 59 +++++++++++++++++++++++++++++++--- >> tools/include/uapi/linux/bpf.h | 3 ++ >> 3 files changed, 61 insertions(+), 4 deletions(-) >> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 1e6cdd0f355d..48d8b3058aa1 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -1267,6 +1267,9 @@ enum { >> /* Create a map that is suitable to be an inner map with dynamic max >> entries */ >> BPF_F_INNER_MAP = (1U << 12), >> + >> +/* Create a map that will be registered/unregesitered by the backed >> bpf_link */ >> + BPF_F_LINK = (1U << 13), >> }; >> /* Flags for BPF_PROG_QUERY. */ >> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c >> index 621c8e24481a..d16ca06cf09a 100644 >> --- a/kernel/bpf/bpf_struct_ops.c >> +++ b/kernel/bpf/bpf_struct_ops.c >> @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> mutex_lock(&st_map->lock); >> - if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { >> + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || >> refcount_read(&kvalue->refcnt)) { > > Why it needs a new refcount_read(&kvalue->refcnt) check? It prohibits updating the value once it is registered. This refcnt is set to 1 when register it. But, yes, it is confusing since we never reset it back to *_INIT. The purpose of this refcount_read() will be clear once add *_UNREG, and reset it back to *_INIT properly. > >> err = -EBUSY; >> goto unlock; >> } >> @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> *(unsigned long *)(udata + moff) = prog->aux->id; >> } >> + if (st_map->map.map_flags & BPF_F_LINK) { >> + /* Let bpf_link handle registration & unregistration. */ >> + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); > > INUSE is for registered struct_ops. It needs a new UNREG state to mean > initialized but not registered. The kvalue->state is not in uapi but the > user space can still introspect it (thanks to BTF), so having a correct > semantic state is useful. Try 'bpftool struct_ops dump ...': > > "bpf_struct_ops_tcp_congestion_ops": { > "refcnt": { > "refs": { > "counter": 1 > } > }, > "state": "BPF_STRUCT_OPS_STATE_INUSE", Ok! That make sense. > >> + goto unlock; >> + } >> + >> refcount_set(&kvalue->refcnt, 1); >> bpf_map_inc(map); >> @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> kfree(tlinks); >> mutex_unlock(&st_map->lock); >> return err; >> + > > Unnecessary new line. > >> } >> static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void >> *key) >> @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct >> bpf_map *map, void *key) >> BPF_STRUCT_OPS_STATE_TOBEFREE); >> switch (prev_state) { >> case BPF_STRUCT_OPS_STATE_INUSE: >> + if (st_map->map.map_flags & BPF_F_LINK) >> + return 0; > > This should be a -ENOTSUPP. Sure! > >> st_map->st_ops->unreg(&st_map->kvalue.data); >> if (refcount_dec_and_test(&st_map->kvalue.refcnt)) >> bpf_map_put(map); >> @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map >> *map) >> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) >> { >> if (attr->key_size != sizeof(unsigned int) || attr->max_entries >> != 1 || >> - attr->map_flags || !attr->btf_vmlinux_value_type_id) >> + (attr->map_flags & ~BPF_F_LINK) || >> !attr->btf_vmlinux_value_type_id) >> return -EINVAL; >> return 0; >> } >> @@ -638,6 +647,8 @@ static struct bpf_map >> *bpf_struct_ops_map_alloc(union bpf_attr *attr) >> set_vm_flush_reset_perms(st_map->image); >> bpf_map_init_from_attr(map, attr); >> + map->map_flags |= attr->map_flags & BPF_F_LINK; > > This should have already been done in bpf_map_init_from_attr(). bpf_map_init_from_attr() will filter out all flags except BPF_F_RDONLY & BPF_F_WRONLY. But, I can move it to bpf_map_init_from_attr() by not filtering out it. > >> + >> return map; >> } >> @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata) >> } >> } >> +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value >> *kvalue) >> +{ >> + struct bpf_struct_ops_map *st_map; >> + >> + if (refcount_dec_and_test(&kvalue->refcnt)) { >> + st_map = container_of(kvalue, struct bpf_struct_ops_map, >> + kvalue); >> + bpf_map_put(&st_map->map); >> + } >> +} >> + >> static void bpf_struct_ops_map_link_release(struct bpf_link *link) >> { >> + struct bpf_struct_ops_map *st_map; >> + >> if (link->map) { >> - bpf_map_put(link->map); >> + st_map = (struct bpf_struct_ops_map *)link->map; >> + st_map->st_ops->unreg(&st_map->kvalue.data); >> + bpf_struct_ops_kvalue_put(&st_map->kvalue); >> link->map = NULL; > > Does it need a lock or something to protect the link_release? or I am > missing something and lock is not needed? This function will be called by bpf_link_free() following the pointer in bpf_link_ops. And bpf_link_free() is called by bpf_link_put(). The refcnt of bpf_link is maintained by bpf_link_put(), and the function here indirectly only if the refcnt reachs 0. If I don't miss anything, it should be safe to release a link without a lock. > > The kvalue->value state should become UNREG. > > After UNREG, can the struct_ops map be used in creating a new link again? > It should be. >> } >> } >> @@ -735,13 +761,15 @@ static const struct bpf_link_ops >> bpf_struct_ops_map_lops = { >> int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) >> { >> + struct bpf_struct_ops_map *st_map; >> struct bpf_link_primer link_primer; >> + struct bpf_struct_ops_value *kvalue; >> struct bpf_map *map; >> struct bpf_link *link = NULL; >> int err; >> map = bpf_map_get(attr->link_create.prog_fd); >> - if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) >> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags >> & BPF_F_LINK)) >> return -EINVAL; >> link = kzalloc(sizeof(*link), GFP_USER); >> @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr >> *attr, bpfptr_t uattr) >> bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, >> &bpf_struct_ops_map_lops, NULL); >> link->map = map; >> + if (map->map_flags & BPF_F_LINK) { >> + st_map = (struct bpf_struct_ops_map *)map; >> + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; >> + >> + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || >> + refcount_read(&kvalue->refcnt) != 0) { > > The refcount_read(&kvalue->refcnt) is to ensure it is not registered? > It seems the UNREG state is useful here. Yes! > >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + refcount_set(&kvalue->refcnt, 1); > > If a struct_ops map is used to create multiple links in parallel, is it > safe? > >> + >> + set_memory_rox((long)st_map->image, 1); >> + err = st_map->st_ops->reg(kvalue->data); > > After successful reg, the state can be changed from UNREG to INUSE. > >> + if (err) { >> + refcount_set(&kvalue->refcnt, 0); >> + >> + set_memory_nx((long)st_map->image, 1); >> + set_memory_rw((long)st_map->image, 1); >> + goto err_out; >> + } >> + } > > This patch should be combined with patch 1. Otherwise, patch 1 is quite > hard to understand without link_create_struct_ops_map() doing the actual > "attach". Ok! > >> + >> err = bpf_link_prime(link, &link_primer); >> if (err) >> goto err_out; >> diff --git a/tools/include/uapi/linux/bpf.h >> b/tools/include/uapi/linux/bpf.h >> index 1e6cdd0f355d..48d8b3058aa1 100644 >> --- a/tools/include/uapi/linux/bpf.h >> +++ b/tools/include/uapi/linux/bpf.h >> @@ -1267,6 +1267,9 @@ enum { >> /* Create a map that is suitable to be an inner map with dynamic max >> entries */ >> BPF_F_INNER_MAP = (1U << 12), >> + >> +/* Create a map that will be registered/unregesitered by the backed >> bpf_link */ >> + BPF_F_LINK = (1U << 13), >> }; >> /* Flags for BPF_PROG_QUERY. */ >
On 2/16/23 8:42 AM, Kui-Feng Lee wrote: >>> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union >>> bpf_attr *attr) >>> set_vm_flush_reset_perms(st_map->image); >>> bpf_map_init_from_attr(map, attr); >>> + map->map_flags |= attr->map_flags & BPF_F_LINK; >> >> This should have already been done in bpf_map_init_from_attr(). > > bpf_map_init_from_attr() will filter out all flags except BPF_F_RDONLY & > BPF_F_WRONLY. should be the opposite: static u32 bpf_map_flags_retain_permanent(u32 flags) { /* Some map creation flags are not tied to the map object but * rather to the map fd instead, so they have no meaning upon * map object inspection since multiple file descriptors with * different (access) properties can exist here. Thus, given * this has zero meaning for the map itself, lets clear these * from here. */ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); }
On 2/16/23 14:38, Martin KaFai Lau wrote: > On 2/16/23 8:42 AM, Kui-Feng Lee wrote: >>>> @@ -638,6 +647,8 @@ static struct bpf_map >>>> *bpf_struct_ops_map_alloc(union bpf_attr *attr) >>>> set_vm_flush_reset_perms(st_map->image); >>>> bpf_map_init_from_attr(map, attr); >>>> + map->map_flags |= attr->map_flags & BPF_F_LINK; >>> >>> This should have already been done in bpf_map_init_from_attr(). >> >> bpf_map_init_from_attr() will filter out all flags except BPF_F_RDONLY >> & BPF_F_WRONLY. > > should be the opposite: > > static u32 bpf_map_flags_retain_permanent(u32 flags) > { > /* Some map creation flags are not tied to the map object but > * rather to the map fd instead, so they have no meaning upon > * map object inspection since multiple file descriptors with > * different (access) properties can exist here. Thus, given > * this has zero meaning for the map itself, lets clear these > * from here. > */ > return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); > } Got it! Thank you!
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e6cdd0f355d..48d8b3058aa1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1267,6 +1267,9 @@ enum { /* Create a map that is suitable to be an inner map with dynamic max entries */ BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), }; /* Flags for BPF_PROG_QUERY. */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 621c8e24481a..d16ca06cf09a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, mutex_lock(&st_map->lock); - if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) { err = -EBUSY; goto unlock; } @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *(unsigned long *)(udata + moff) = prog->aux->id; } + if (st_map->map.map_flags & BPF_F_LINK) { + /* Let bpf_link handle registration & unregistration. */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + goto unlock; + } + refcount_set(&kvalue->refcnt, 1); bpf_map_inc(map); @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, kfree(tlinks); mutex_unlock(&st_map->lock); return err; + } static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: + if (st_map->map.map_flags & BPF_F_LINK) + return 0; st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || - attr->map_flags || !attr->btf_vmlinux_value_type_id) + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) return -EINVAL; return 0; } @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); + map->map_flags |= attr->map_flags & BPF_F_LINK; + return map; } @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata) } } +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue) +{ + struct bpf_struct_ops_map *st_map; + + if (refcount_dec_and_test(&kvalue->refcnt)) { + st_map = container_of(kvalue, struct bpf_struct_ops_map, + kvalue); + bpf_map_put(&st_map->map); + } +} + static void bpf_struct_ops_map_link_release(struct bpf_link *link) { + struct bpf_struct_ops_map *st_map; + if (link->map) { - bpf_map_put(link->map); + st_map = (struct bpf_struct_ops_map *)link->map; + st_map->st_ops->unreg(&st_map->kvalue.data); + bpf_struct_ops_kvalue_put(&st_map->kvalue); link->map = NULL; } } @@ -735,13 +761,15 @@ static const struct bpf_link_ops bpf_struct_ops_map_lops = { int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) { + struct bpf_struct_ops_map *st_map; struct bpf_link_primer link_primer; + struct bpf_struct_ops_value *kvalue; struct bpf_map *map; struct bpf_link *link = NULL; int err; map = bpf_map_get(attr->link_create.prog_fd); - if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK)) return -EINVAL; link = kzalloc(sizeof(*link), GFP_USER); @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); link->map = map; + if (map->map_flags & BPF_F_LINK) { + st_map = (struct bpf_struct_ops_map *)map; + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; + + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || + refcount_read(&kvalue->refcnt) != 0) { + err = -EINVAL; + goto err_out; + } + + refcount_set(&kvalue->refcnt, 1); + + set_memory_rox((long)st_map->image, 1); + err = st_map->st_ops->reg(kvalue->data); + if (err) { + refcount_set(&kvalue->refcnt, 0); + + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + goto err_out; + } + } + err = bpf_link_prime(link, &link_primer); if (err) goto err_out; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1e6cdd0f355d..48d8b3058aa1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1267,6 +1267,9 @@ enum { /* Create a map that is suitable to be an inner map with dynamic max entries */ BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), }; /* Flags for BPF_PROG_QUERY. */
Registration via bpf_links ensures a uniform behavior, just like other BPF programs. BPF struct_ops were registered/unregistered when updating/deleting their values. Only the maps of struct_ops having the BPF_F_LINK flag are allowed to back a bpf_link. Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> --- include/uapi/linux/bpf.h | 3 ++ kernel/bpf/bpf_struct_ops.c | 59 +++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 3 ++ 3 files changed, 61 insertions(+), 4 deletions(-)