Message ID | 20230214221718.503964-6-kuifeng@meta.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Transit between BPF TCP congestion controls. | expand |
On 2/14/23 2:17 PM, Kui-Feng Lee wrote: > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > index d16ca06cf09a..d329621fc721 100644 > --- a/kernel/bpf/bpf_struct_ops.c > +++ b/kernel/bpf/bpf_struct_ops.c > @@ -752,11 +752,66 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, > return 0; > } > > +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map) > +{ > + struct bpf_struct_ops_value *kvalue; > + struct bpf_struct_ops_map *st_map, *old_st_map; > + struct bpf_map *old_map; > + int err; > + > + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(new_map->map_flags & BPF_F_LINK)) > + return -EINVAL; > + > + old_map = link->map; > + > + /* It does nothing if the new map is the same as the old one. > + * A struct_ops that backs a bpf_link can not be updated or > + * its kvalue would be updated and causes inconsistencies. > + */ > + if (old_map == new_map) > + return 0; > + > + /* The new and old struct_ops must be the same type. */ > + st_map = (struct bpf_struct_ops_map *)new_map; > + old_st_map = (struct bpf_struct_ops_map *)old_map; > + if (st_map->st_ops != old_st_map->st_ops) > + return -EINVAL; > + > + /* Assure the struct_ops is updated (has value) and not > + * backing any other link. > + */ > + kvalue = &st_map->kvalue; > + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || > + refcount_read(&kvalue->refcnt) != 0) > + return -EINVAL; > + > + bpf_map_inc(new_map); > + refcount_set(&kvalue->refcnt, 1); > + > + set_memory_rox((long)st_map->image, 1); > + err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data); > + if (err) { > + refcount_set(&kvalue->refcnt, 0); > + > + set_memory_nx((long)st_map->image, 1); > + set_memory_rw((long)st_map->image, 1); > + bpf_map_put(new_map); > + return err; > + } > + > + link->map = new_map; Similar here, does this link_update operation needs a lock? > + > + bpf_struct_ops_kvalue_put(&old_st_map->kvalue); > + > + return 0; > +} > + > static const struct bpf_link_ops bpf_struct_ops_map_lops = { > .release = bpf_struct_ops_map_link_release, > .dealloc = bpf_struct_ops_map_link_dealloc, > .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > + .update_struct_ops = bpf_struct_ops_map_link_update, This seems a little non-intuitive to add a struct_ops specific thing to the generic bpf_link_ops. May be avoid adding ".update_struct_ops" and directly call the bpf_struct_ops_map_link_update() from link_update()? > }; > > int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 54e172d8f5d1..1341634863b5 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -4650,6 +4650,32 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) > return ret; > } > > +#define BPF_LINK_UPDATE_STRUCT_OPS_LAST_FIELD link_update_struct_ops.new_map_fd Why it is needed? Does it hit error without it? > + > +static int link_update_struct_ops(struct bpf_link *link, union bpf_attr *attr) > +{ > + struct bpf_map *new_map; > + int ret = 0; > + > + new_map = bpf_map_get(attr->link_update.new_map_fd); > + if (IS_ERR(new_map)) > + return -EINVAL; > + > + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { > + ret = -EINVAL; > + goto out_put_map; > + } How about BPF_F_REPLACE? > + > + if (link->ops->update_struct_ops) > + ret = link->ops->update_struct_ops(link, new_map); > + else > + ret = -EINVAL; > + > +out_put_map: > + bpf_map_put(new_map); > + return ret; > +} > + > #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd > > static int link_update(union bpf_attr *attr) > @@ -4670,6 +4696,11 @@ static int link_update(union bpf_attr *attr) > if (IS_ERR(link)) > return PTR_ERR(link); > > + if (link->type == BPF_LINK_TYPE_STRUCT_OPS) { > + ret = link_update_struct_ops(link, attr); > + goto out_put_link; > + } > + > new_prog = bpf_prog_get(attr->link_update.new_prog_fd); > if (IS_ERR(new_prog)) { > ret = PTR_ERR(new_prog); > diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c > index 66ce5fadfe42..558b01d5250f 100644 > --- a/net/ipv4/bpf_tcp_ca.c > +++ b/net/ipv4/bpf_tcp_ca.c > @@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, > if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, > sizeof(tcp_ca->name)) <= 0) > return -EINVAL; > - if (tcp_ca_find(utcp_ca->name)) > - return -EEXIST; This change is not obvious. Please put some comment in the commit message about this change.
On 2/15/23 17:02, Martin KaFai Lau wrote: > On 2/14/23 2:17 PM, Kui-Feng Lee wrote: >> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c >> index d16ca06cf09a..d329621fc721 100644 >> --- a/kernel/bpf/bpf_struct_ops.c >> +++ b/kernel/bpf/bpf_struct_ops.c >> @@ -752,11 +752,66 @@ static int >> bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, >> return 0; >> } >> +static int bpf_struct_ops_map_link_update(struct bpf_link *link, >> struct bpf_map *new_map) >> +{ >> + struct bpf_struct_ops_value *kvalue; >> + struct bpf_struct_ops_map *st_map, *old_st_map; >> + struct bpf_map *old_map; >> + int err; >> + >> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS || >> !(new_map->map_flags & BPF_F_LINK)) >> + return -EINVAL; >> + >> + old_map = link->map; >> + >> + /* It does nothing if the new map is the same as the old one. >> + * A struct_ops that backs a bpf_link can not be updated or >> + * its kvalue would be updated and causes inconsistencies. >> + */ >> + if (old_map == new_map) >> + return 0; >> + >> + /* The new and old struct_ops must be the same type. */ >> + st_map = (struct bpf_struct_ops_map *)new_map; >> + old_st_map = (struct bpf_struct_ops_map *)old_map; >> + if (st_map->st_ops != old_st_map->st_ops) >> + return -EINVAL; >> + >> + /* Assure the struct_ops is updated (has value) and not >> + * backing any other link. >> + */ >> + kvalue = &st_map->kvalue; >> + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || >> + refcount_read(&kvalue->refcnt) != 0) >> + return -EINVAL; >> + >> + bpf_map_inc(new_map); >> + refcount_set(&kvalue->refcnt, 1); >> + >> + set_memory_rox((long)st_map->image, 1); >> + err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data); >> + if (err) { >> + refcount_set(&kvalue->refcnt, 0); >> + >> + set_memory_nx((long)st_map->image, 1); >> + set_memory_rw((long)st_map->image, 1); >> + bpf_map_put(new_map); >> + return err; >> + } >> + >> + link->map = new_map; > > Similar here, does this link_update operation needs a lock? The update function of tcp_ca checks if the name is unique with the protection of a lock. bpf_struct_ops_map_update_elem() also check and update state of the kvalue to prevent changing kvalue. Only one of thread will success to register or update at any moment. > >> + >> + bpf_struct_ops_kvalue_put(&old_st_map->kvalue); >> + >> + return 0; >> +} >> + >> static const struct bpf_link_ops bpf_struct_ops_map_lops = { >> .release = bpf_struct_ops_map_link_release, >> .dealloc = bpf_struct_ops_map_link_dealloc, >> .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, >> .fill_link_info = bpf_struct_ops_map_link_fill_link_info, >> + .update_struct_ops = bpf_struct_ops_map_link_update, > > This seems a little non-intuitive to add a struct_ops specific thing to > the generic bpf_link_ops. May be avoid adding ".update_struct_ops" and > directly call the bpf_struct_ops_map_link_update() from link_update()? It has `.update_prog` for BPF programs so `.update_struct_ops` or `.update_map` is not that weird for me. It would be better to have a `.update_link` to receive either a bpf_prog or bpf_map, and remove `.update_prog`. > > >> }; >> int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index 54e172d8f5d1..1341634863b5 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -4650,6 +4650,32 @@ static int link_create(union bpf_attr *attr, >> bpfptr_t uattr) >> return ret; >> } >> +#define BPF_LINK_UPDATE_STRUCT_OPS_LAST_FIELD >> link_update_struct_ops.new_map_fd > > Why it is needed? Does it hit error without it? It can be removed now. > >> + >> +static int link_update_struct_ops(struct bpf_link *link, union >> bpf_attr *attr) >> +{ >> + struct bpf_map *new_map; >> + int ret = 0; >> + >> + new_map = bpf_map_get(attr->link_update.new_map_fd); >> + if (IS_ERR(new_map)) >> + return -EINVAL; >> + >> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { >> + ret = -EINVAL; >> + goto out_put_map; >> + } > > How about BPF_F_REPLACE? Do you mean the new_map should be labeled with BPF_F_REPLACE to replace the old one? > >> + >> + if (link->ops->update_struct_ops) >> + ret = link->ops->update_struct_ops(link, new_map); > + else >> + ret = -EINVAL; >> + >> +out_put_map: >> + bpf_map_put(new_map); >> + return ret; >> +} >> + >> #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd >> static int link_update(union bpf_attr *attr) >> @@ -4670,6 +4696,11 @@ static int link_update(union bpf_attr *attr) >> if (IS_ERR(link)) >> return PTR_ERR(link); >> + if (link->type == BPF_LINK_TYPE_STRUCT_OPS) { >> + ret = link_update_struct_ops(link, attr); >> + goto out_put_link; >> + } >> + >> new_prog = bpf_prog_get(attr->link_update.new_prog_fd); >> if (IS_ERR(new_prog)) { >> ret = PTR_ERR(new_prog); >> diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c >> index 66ce5fadfe42..558b01d5250f 100644 >> --- a/net/ipv4/bpf_tcp_ca.c >> +++ b/net/ipv4/bpf_tcp_ca.c >> @@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct >> btf_type *t, >> if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, >> sizeof(tcp_ca->name)) <= 0) >> return -EINVAL; >> - if (tcp_ca_find(utcp_ca->name)) >> - return -EEXIST; > > This change is not obvious. Please put some comment in the commit > message about this change. > sure!
On 2/16/23 11:17 AM, Kui-Feng Lee wrote: >>> +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct >>> bpf_map *new_map) >>> +{ >>> + struct bpf_struct_ops_value *kvalue; >>> + struct bpf_struct_ops_map *st_map, *old_st_map; >>> + struct bpf_map *old_map; >>> + int err; >>> + >>> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(new_map->map_flags >>> & BPF_F_LINK)) >>> + return -EINVAL; >>> + >>> + old_map = link->map; >>> + >>> + /* It does nothing if the new map is the same as the old one. >>> + * A struct_ops that backs a bpf_link can not be updated or >>> + * its kvalue would be updated and causes inconsistencies. >>> + */ >>> + if (old_map == new_map) >>> + return 0; >>> + >>> + /* The new and old struct_ops must be the same type. */ >>> + st_map = (struct bpf_struct_ops_map *)new_map; >>> + old_st_map = (struct bpf_struct_ops_map *)old_map; >>> + if (st_map->st_ops != old_st_map->st_ops) >>> + return -EINVAL; >>> + >>> + /* Assure the struct_ops is updated (has value) and not >>> + * backing any other link. >>> + */ >>> + kvalue = &st_map->kvalue; >>> + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || >>> + refcount_read(&kvalue->refcnt) != 0) >>> + return -EINVAL; >>> + >>> + bpf_map_inc(new_map); >>> + refcount_set(&kvalue->refcnt, 1); >>> + >>> + set_memory_rox((long)st_map->image, 1); >>> + err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data); >>> + if (err) { >>> + refcount_set(&kvalue->refcnt, 0); >>> + >>> + set_memory_nx((long)st_map->image, 1); >>> + set_memory_rw((long)st_map->image, 1); >>> + bpf_map_put(new_map); >>> + return err; >>> + } >>> + >>> + link->map = new_map; >> >> Similar here, does this link_update operation needs a lock? > > The update function of tcp_ca checks if the name is unique with the protection > of a lock. bpf_struct_ops_map_update_elem() also check and update state of the > kvalue to prevent changing kvalue. Only one of thread will success to register > or update at any moment. hmm... meaning the lock inside the "->update()" function? There are many variables outside of update() that this function (bpf_struct_ops_map_link_update) is setting and testing without a lock. eg. the succeeded thread will set refcnt to 1 while the failed thread will set it back to 0... >>> + >>> +static int link_update_struct_ops(struct bpf_link *link, union bpf_attr *attr) >>> +{ >>> + struct bpf_map *new_map; >>> + int ret = 0; >>> + >>> + new_map = bpf_map_get(attr->link_update.new_map_fd); >>> + if (IS_ERR(new_map)) >>> + return -EINVAL; >>> + >>> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { >>> + ret = -EINVAL; >>> + goto out_put_map; >>> + } >> >> How about BPF_F_REPLACE? > > Do you mean the new_map should be labeled with BPF_F_REPLACE to replace the old > one? was asking if BPF_F_REPLACE is supported.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5fe39f56a760..03a15dc26d7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1408,6 +1408,7 @@ struct bpf_link_ops { void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); + int (*update_struct_ops)(struct bpf_link *link, struct bpf_map *new_map); }; struct bpf_tramp_link { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 48d8b3058aa1..7c009ac859c8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1552,8 +1552,12 @@ union bpf_attr { struct { /* struct used by BPF_LINK_UPDATE command */ __u32 link_fd; /* link fd */ - /* new program fd to update link with */ - __u32 new_prog_fd; + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; __u32 flags; /* extra flags */ /* expected link's program fd; is specified only if * BPF_F_REPLACE flag is set in flags */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d16ca06cf09a..d329621fc721 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -752,11 +752,66 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, return 0; } +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map, *old_st_map; + struct bpf_map *old_map; + int err; + + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(new_map->map_flags & BPF_F_LINK)) + return -EINVAL; + + old_map = link->map; + + /* It does nothing if the new map is the same as the old one. + * A struct_ops that backs a bpf_link can not be updated or + * its kvalue would be updated and causes inconsistencies. + */ + if (old_map == new_map) + return 0; + + /* The new and old struct_ops must be the same type. */ + st_map = (struct bpf_struct_ops_map *)new_map; + old_st_map = (struct bpf_struct_ops_map *)old_map; + if (st_map->st_ops != old_st_map->st_ops) + return -EINVAL; + + /* Assure the struct_ops is updated (has value) and not + * backing any other link. + */ + kvalue = &st_map->kvalue; + if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE || + refcount_read(&kvalue->refcnt) != 0) + return -EINVAL; + + bpf_map_inc(new_map); + refcount_set(&kvalue->refcnt, 1); + + set_memory_rox((long)st_map->image, 1); + err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data); + if (err) { + refcount_set(&kvalue->refcnt, 0); + + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + bpf_map_put(new_map); + return err; + } + + link->map = new_map; + + bpf_struct_ops_kvalue_put(&old_st_map->kvalue); + + return 0; +} + static const struct bpf_link_ops bpf_struct_ops_map_lops = { .release = bpf_struct_ops_map_link_release, .dealloc = bpf_struct_ops_map_link_dealloc, .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, .fill_link_info = bpf_struct_ops_map_link_fill_link_info, + .update_struct_ops = bpf_struct_ops_map_link_update, }; int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 54e172d8f5d1..1341634863b5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4650,6 +4650,32 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) return ret; } +#define BPF_LINK_UPDATE_STRUCT_OPS_LAST_FIELD link_update_struct_ops.new_map_fd + +static int link_update_struct_ops(struct bpf_link *link, union bpf_attr *attr) +{ + struct bpf_map *new_map; + int ret = 0; + + new_map = bpf_map_get(attr->link_update.new_map_fd); + if (IS_ERR(new_map)) + return -EINVAL; + + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto out_put_map; + } + + if (link->ops->update_struct_ops) + ret = link->ops->update_struct_ops(link, new_map); + else + ret = -EINVAL; + +out_put_map: + bpf_map_put(new_map); + return ret; +} + #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) @@ -4670,6 +4696,11 @@ static int link_update(union bpf_attr *attr) if (IS_ERR(link)) return PTR_ERR(link); + if (link->type == BPF_LINK_TYPE_STRUCT_OPS) { + ret = link_update_struct_ops(link, attr); + goto out_put_link; + } + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if (IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 66ce5fadfe42..558b01d5250f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)) <= 0) return -EINVAL; - if (tcp_ca_find(utcp_ca->name)) - return -EEXIST; return 1; }
By improving the BPF_LINK_UPDATE command of bpf(), it should allow you to conveniently switch between different struct_ops on a single bpf_link. This would enable smoother transitions from one struct_ops to another. The struct_ops maps passing along with BPF_LINK_UPDATE should have the BPF_F_LINK flag. Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++-- kernel/bpf/bpf_struct_ops.c | 55 +++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 31 +++++++++++++++++++++ net/ipv4/bpf_tcp_ca.c | 2 -- 5 files changed, 93 insertions(+), 4 deletions(-)