Message ID | 20230214221718.503964-2-kuifeng@meta.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Transit between BPF TCP congestion controls. | expand |
Hi Kui-Feng, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on bpf-next/master] url: https://github.com/intel-lab-lkp/linux/commits/Kui-Feng-Lee/bpf-Create-links-for-BPF-struct_ops-maps/20230215-061816 base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master patch link: https://lore.kernel.org/r/20230214221718.503964-2-kuifeng%40meta.com patch subject: [PATCH bpf-next 1/7] bpf: Create links for BPF struct_ops maps. config: powerpc-allyesconfig (https://download.01.org/0day-ci/archive/20230215/202302150809.TbWg3iN6-lkp@intel.com/config) compiler: powerpc-linux-gcc (GCC) 12.1.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/c186fe88190559d4872279724d598b8d45ba3092 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Kui-Feng-Lee/bpf-Create-links-for-BPF-struct_ops-maps/20230215-061816 git checkout c186fe88190559d4872279724d598b8d45ba3092 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc olddefconfig COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash kernel/bpf/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202302150809.TbWg3iN6-lkp@intel.com/ All warnings (new ones prefixed by >>): >> kernel/bpf/bpf_struct_ops.c:736:5: warning: no previous prototype for 'link_create_struct_ops_map' [-Wmissing-prototypes] 736 | int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ vim +/link_create_struct_ops_map +736 kernel/bpf/bpf_struct_ops.c 735 > 736 int link_create_struct_ops_map(union bpf_attr *attr, 
bpfptr_t uattr) 737 { 738 struct bpf_link_primer link_primer; 739 struct bpf_map *map; 740 struct bpf_link *link = NULL; 741 int err; 742 743 map = bpf_map_get(attr->link_create.prog_fd); 744 if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) 745 return -EINVAL; 746 747 link = kzalloc(sizeof(*link), GFP_USER); 748 if (!link) { 749 err = -ENOMEM; 750 goto err_out; 751 } 752 bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); 753 link->map = map; 754 755 err = bpf_link_prime(link, &link_primer); 756 if (err) 757 goto err_out; 758 759 return bpf_link_settle(&link_primer); 760 761 err_out: 762 bpf_map_put(map); 763 kfree(link); 764 return err; 765 } 766
On 02/14, Kui-Feng Lee wrote: > BPF struct_ops maps are employed directly to register TCP Congestion > Control algorithms. Unlike other BPF programs that terminate when > their links gone, the struct_ops program reduces its refcount solely > upon death of its FD. The link of a BPF struct_ops map provides a > uniform experience akin to other types of BPF programs. A TCP > Congestion Control algorithm will be unregistered upon destroying the > FD in the following patches. > Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> > --- > include/linux/bpf.h | 6 +++- > include/uapi/linux/bpf.h | 4 +++ > kernel/bpf/bpf_struct_ops.c | 66 ++++++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 29 ++++++++++----- > tools/include/uapi/linux/bpf.h | 4 +++ > tools/lib/bpf/bpf.c | 2 ++ > tools/lib/bpf/libbpf.c | 1 + > 7 files changed, 102 insertions(+), 10 deletions(-) > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 8b5d0b4c4ada..13683584b071 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1391,7 +1391,11 @@ struct bpf_link { > u32 id; > enum bpf_link_type type; > const struct bpf_link_ops *ops; > - struct bpf_prog *prog; [..] > + union { > + struct bpf_prog *prog; > + /* Backed by a struct_ops (type == BPF_LINK_UPDATE_STRUCT_OPS) */ > + struct bpf_map *map; > + }; Any reason you're not using the approach that has been used for other links where we create a wrapping structure + container_of? 
struct bpt_struct_ops_link { struct bpf_link link; struct bpf_map *map; }; > struct work_struct work; > }; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 17afd2b35ee5..1e6cdd0f355d 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1033,6 +1033,7 @@ enum bpf_attach_type { > BPF_PERF_EVENT, > BPF_TRACE_KPROBE_MULTI, > BPF_LSM_CGROUP, > + BPF_STRUCT_OPS_MAP, > __MAX_BPF_ATTACH_TYPE > }; > @@ -6354,6 +6355,9 @@ struct bpf_link_info { > struct { > __u32 ifindex; > } xdp; > + struct { > + __u32 map_id; > + } struct_ops_map; > }; > } __attribute__((aligned(8))); > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > index ece9870cab68..621c8e24481a 100644 > --- a/kernel/bpf/bpf_struct_ops.c > +++ b/kernel/bpf/bpf_struct_ops.c > @@ -698,3 +698,69 @@ void bpf_struct_ops_put(const void *kdata) > call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); > } > } > + > +static void bpf_struct_ops_map_link_release(struct bpf_link *link) > +{ > + if (link->map) { > + bpf_map_put(link->map); > + link->map = NULL; > + } > +} > + > +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) > +{ > + kfree(link); > +} > + > +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link > *link, > + struct seq_file *seq) > +{ > + seq_printf(seq, "map_id:\t%d\n", > + link->map->id); > +} > + > +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link > *link, > + struct bpf_link_info *info) > +{ > + info->struct_ops_map.map_id = link->map->id; > + return 0; > +} > + > +static const struct bpf_link_ops bpf_struct_ops_map_lops = { > + .release = bpf_struct_ops_map_link_release, > + .dealloc = bpf_struct_ops_map_link_dealloc, > + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > + .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > +}; > + > +int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > +{ [..] 
> + struct bpf_link_primer link_primer; > + struct bpf_map *map; > + struct bpf_link *link = NULL; Are we still trying to keep reverse christmas trees? > + int err; > + > + map = bpf_map_get(attr->link_create.prog_fd); bpf_map_get can fail here? > + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) > + return -EINVAL; > + > + link = kzalloc(sizeof(*link), GFP_USER); > + if (!link) { > + err = -ENOMEM; > + goto err_out; > + } > + bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, > NULL); > + link->map = map; > + > + err = bpf_link_prime(link, &link_primer); > + if (err) > + goto err_out; > + > + return bpf_link_settle(&link_primer); > + > +err_out: > + bpf_map_put(map); > + kfree(link); > + return err; > +} > + > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index cda8d00f3762..54e172d8f5d1 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2738,7 +2738,9 @@ static void bpf_link_free(struct bpf_link *link) > if (link->prog) { > /* detach BPF program, clean up used resources */ > link->ops->release(link); > - bpf_prog_put(link->prog); > + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) > + bpf_prog_put(link->prog); > + /* The struct_ops links clean up map by them-selves. */ Why not more generic: if (link->prog) bpf_prog_put(link->prog); ? 
> } > /* free bpf_link and its containing memory */ > link->ops->dealloc(link); > @@ -2794,16 +2796,19 @@ static void bpf_link_show_fdinfo(struct seq_file > *m, struct file *filp) > const struct bpf_prog *prog = link->prog; > char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; > - bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); > seq_printf(m, > "link_type:\t%s\n" > - "link_id:\t%u\n" > - "prog_tag:\t%s\n" > - "prog_id:\t%u\n", > + "link_id:\t%u\n", > bpf_link_type_strs[link->type], > - link->id, > - prog_tag, > - prog->aux->id); > + link->id); > + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) { > + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); > + seq_printf(m, > + "prog_tag:\t%s\n" > + "prog_id:\t%u\n", > + prog_tag, > + prog->aux->id); > + } > if (link->ops->show_fdinfo) > link->ops->show_fdinfo(link, m); > } > @@ -4278,7 +4283,8 @@ static int bpf_link_get_info_by_fd(struct file > *file, > info.type = link->type; > info.id = link->id; > - info.prog_id = link->prog->aux->id; > + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) > + info.prog_id = link->prog->aux->id; Here as well: should we have "link->type != BPF_LINK_TYPE_STRUCT_OPS" vs "link->prog != NULL" ? 
> if (link->ops->fill_link_info) { > err = link->ops->fill_link_info(link, &info); > @@ -4531,6 +4537,8 @@ static int bpf_map_do_batch(const union bpf_attr > *attr, > return err; > } > +extern int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t > uattr); > + > #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies > static int link_create(union bpf_attr *attr, bpfptr_t uattr) > { > @@ -4541,6 +4549,9 @@ static int link_create(union bpf_attr *attr, > bpfptr_t uattr) > if (CHECK_ATTR(BPF_LINK_CREATE)) > return -EINVAL; > + if (attr->link_create.attach_type == BPF_STRUCT_OPS_MAP) > + return link_create_struct_ops_map(attr, uattr); > + > prog = bpf_prog_get(attr->link_create.prog_fd); > if (IS_ERR(prog)) > return PTR_ERR(prog); > diff --git a/tools/include/uapi/linux/bpf.h > b/tools/include/uapi/linux/bpf.h > index 17afd2b35ee5..1e6cdd0f355d 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -1033,6 +1033,7 @@ enum bpf_attach_type { > BPF_PERF_EVENT, > BPF_TRACE_KPROBE_MULTI, > BPF_LSM_CGROUP, > + BPF_STRUCT_OPS_MAP, > __MAX_BPF_ATTACH_TYPE > }; > @@ -6354,6 +6355,9 @@ struct bpf_link_info { > struct { > __u32 ifindex; > } xdp; > + struct { > + __u32 map_id; > + } struct_ops_map; > }; > } __attribute__((aligned(8))); > diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c > index 9aff98f42a3d..e44d49f17c86 100644 > --- a/tools/lib/bpf/bpf.c > +++ b/tools/lib/bpf/bpf.c > @@ -731,6 +731,8 @@ int bpf_link_create(int prog_fd, int target_fd, > if (!OPTS_ZEROED(opts, tracing)) > return libbpf_err(-EINVAL); > break; > + case BPF_STRUCT_OPS_MAP: > + break; > default: > if (!OPTS_ZEROED(opts, flags)) > return libbpf_err(-EINVAL); > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > index 35a698eb825d..75ed95b7e455 100644 > --- a/tools/lib/bpf/libbpf.c > +++ b/tools/lib/bpf/libbpf.c > @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { > [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = 
"sk_reuseport_select_or_migrate", > [BPF_PERF_EVENT] = "perf_event", > [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", > + [BPF_STRUCT_OPS_MAP] = "struct_ops_map", > }; > static const char * const link_type_name[] = { > -- > 2.30.2
Thank you for the feedback. On 2/14/23 18:39, Stanislav Fomichev wrote: > On 02/14, Kui-Feng Lee wrote: >> BPF struct_ops maps are employed directly to register TCP Congestion >> Control algorithms. Unlike other BPF programs that terminate when >> their links gone, the struct_ops program reduces its refcount solely >> upon death of its FD. The link of a BPF struct_ops map provides a >> uniform experience akin to other types of BPF programs. A TCP >> Congestion Control algorithm will be unregistered upon destroying the >> FD in the following patches. > >> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> >> --- >> include/linux/bpf.h | 6 +++- >> include/uapi/linux/bpf.h | 4 +++ >> kernel/bpf/bpf_struct_ops.c | 66 ++++++++++++++++++++++++++++++++++ >> kernel/bpf/syscall.c | 29 ++++++++++----- >> tools/include/uapi/linux/bpf.h | 4 +++ >> tools/lib/bpf/bpf.c | 2 ++ >> tools/lib/bpf/libbpf.c | 1 + >> 7 files changed, 102 insertions(+), 10 deletions(-) > >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index 8b5d0b4c4ada..13683584b071 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -1391,7 +1391,11 @@ struct bpf_link { >> u32 id; >> enum bpf_link_type type; >> const struct bpf_link_ops *ops; >> - struct bpf_prog *prog; > > [..] > >> + union { >> + struct bpf_prog *prog; >> + /* Backed by a struct_ops (type == >> BPF_LINK_UPDATE_STRUCT_OPS) */ >> + struct bpf_map *map; >> + }; > > Any reason you're not using the approach that has been used for other > links where we create a wrapping structure + container_of? > > struct bpt_struct_ops_link { > struct bpf_link link; > struct bpf_map *map; > }; > `map` and `prog` are meant to be used separately, while `union` is designed for this purpose. The `container_of` approach also works. While both `container_of` and `union` are often used, is there any factor that makes the former a better choice than the latter? 
>> struct work_struct work; >> }; > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 17afd2b35ee5..1e6cdd0f355d 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -1033,6 +1033,7 @@ enum bpf_attach_type { >> BPF_PERF_EVENT, >> BPF_TRACE_KPROBE_MULTI, >> BPF_LSM_CGROUP, >> + BPF_STRUCT_OPS_MAP, >> __MAX_BPF_ATTACH_TYPE >> }; > >> @@ -6354,6 +6355,9 @@ struct bpf_link_info { >> struct { >> __u32 ifindex; >> } xdp; >> + struct { >> + __u32 map_id; >> + } struct_ops_map; >> }; >> } __attribute__((aligned(8))); > >> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c >> index ece9870cab68..621c8e24481a 100644 >> --- a/kernel/bpf/bpf_struct_ops.c >> +++ b/kernel/bpf/bpf_struct_ops.c >> @@ -698,3 +698,69 @@ void bpf_struct_ops_put(const void *kdata) >> call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); >> } >> } >> + >> +static void bpf_struct_ops_map_link_release(struct bpf_link *link) >> +{ >> + if (link->map) { >> + bpf_map_put(link->map); >> + link->map = NULL; >> + } >> +} >> + >> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) >> +{ >> + kfree(link); >> +} >> + >> +static void bpf_struct_ops_map_link_show_fdinfo(const struct >> bpf_link *link, >> + struct seq_file *seq) >> +{ >> + seq_printf(seq, "map_id:\t%d\n", >> + link->map->id); >> +} >> + >> +static int bpf_struct_ops_map_link_fill_link_info(const struct >> bpf_link *link, >> + struct bpf_link_info *info) >> +{ >> + info->struct_ops_map.map_id = link->map->id; >> + return 0; >> +} >> + >> +static const struct bpf_link_ops bpf_struct_ops_map_lops = { >> + .release = bpf_struct_ops_map_link_release, >> + .dealloc = bpf_struct_ops_map_link_dealloc, >> + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, >> + .fill_link_info = bpf_struct_ops_map_link_fill_link_info, >> +}; >> + >> +int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) >> +{ > > [..] 
> >> + struct bpf_link_primer link_primer; >> + struct bpf_map *map; >> + struct bpf_link *link = NULL; > > Are we still trying to keep reverse christmas trees? Yes, I will reorder them. > >> + int err; >> + >> + map = bpf_map_get(attr->link_create.prog_fd); > > bpf_map_get can fail here? We have already verified the `attach_type` of the link before calling this function, so an error should not occur. If it does happen, however, something truly unusual must be happening. To ensure maximum protection and avoid this issue in the future, I will add a check here as well. > >> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) >> + return -EINVAL; >> + >> + link = kzalloc(sizeof(*link), GFP_USER); >> + if (!link) { >> + err = -ENOMEM; >> + goto err_out; >> + } >> + bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, >> &bpf_struct_ops_map_lops, NULL); >> + link->map = map; >> + >> + err = bpf_link_prime(link, &link_primer); >> + if (err) >> + goto err_out; >> + >> + return bpf_link_settle(&link_primer); >> + >> +err_out: >> + bpf_map_put(map); >> + kfree(link); >> + return err; >> +} >> + >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index cda8d00f3762..54e172d8f5d1 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -2738,7 +2738,9 @@ static void bpf_link_free(struct bpf_link *link) >> if (link->prog) { >> /* detach BPF program, clean up used resources */ >> link->ops->release(link); >> - bpf_prog_put(link->prog); >> + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) >> + bpf_prog_put(link->prog); >> + /* The struct_ops links clean up map by them-selves. */ > > Why not more generic: > > if (link->prog) > bpf_prog_put(link->prog); > > ? The `prog` and `map` functions are now occupying the same space. I'm afraid this check won't work. 
> > >> } >> /* free bpf_link and its containing memory */ >> link->ops->dealloc(link); >> @@ -2794,16 +2796,19 @@ static void bpf_link_show_fdinfo(struct >> seq_file *m, struct file *filp) >> const struct bpf_prog *prog = link->prog; >> char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; > >> - bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); >> seq_printf(m, >> "link_type:\t%s\n" >> - "link_id:\t%u\n" >> - "prog_tag:\t%s\n" >> - "prog_id:\t%u\n", >> + "link_id:\t%u\n", >> bpf_link_type_strs[link->type], >> - link->id, >> - prog_tag, >> - prog->aux->id); >> + link->id); >> + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) { >> + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); >> + seq_printf(m, >> + "prog_tag:\t%s\n" >> + "prog_id:\t%u\n", >> + prog_tag, >> + prog->aux->id); >> + } >> if (link->ops->show_fdinfo) >> link->ops->show_fdinfo(link, m); >> } >> @@ -4278,7 +4283,8 @@ static int bpf_link_get_info_by_fd(struct file >> *file, > >> info.type = link->type; >> info.id = link->id; >> - info.prog_id = link->prog->aux->id; >> + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) >> + info.prog_id = link->prog->aux->id; > > Here as well: should we have "link->type != BPF_LINK_TYPE_STRUCT_OPS" vs > "link->prog != NULL" ? Same as above. `map` and `prog` share the same memory space. 
> > >> if (link->ops->fill_link_info) { >> err = link->ops->fill_link_info(link, &info); >> @@ -4531,6 +4537,8 @@ static int bpf_map_do_batch(const union >> bpf_attr *attr, >> return err; >> } > >> +extern int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t >> uattr); >> + >> #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies >> static int link_create(union bpf_attr *attr, bpfptr_t uattr) >> { >> @@ -4541,6 +4549,9 @@ static int link_create(union bpf_attr *attr, >> bpfptr_t uattr) >> if (CHECK_ATTR(BPF_LINK_CREATE)) >> return -EINVAL; > >> + if (attr->link_create.attach_type == BPF_STRUCT_OPS_MAP) >> + return link_create_struct_ops_map(attr, uattr); >> + >> prog = bpf_prog_get(attr->link_create.prog_fd); >> if (IS_ERR(prog)) >> return PTR_ERR(prog); >> diff --git a/tools/include/uapi/linux/bpf.h >> b/tools/include/uapi/linux/bpf.h >> index 17afd2b35ee5..1e6cdd0f355d 100644 >> --- a/tools/include/uapi/linux/bpf.h >> +++ b/tools/include/uapi/linux/bpf.h >> @@ -1033,6 +1033,7 @@ enum bpf_attach_type { >> BPF_PERF_EVENT, >> BPF_TRACE_KPROBE_MULTI, >> BPF_LSM_CGROUP, >> + BPF_STRUCT_OPS_MAP, >> __MAX_BPF_ATTACH_TYPE >> }; > >> @@ -6354,6 +6355,9 @@ struct bpf_link_info { >> struct { >> __u32 ifindex; >> } xdp; >> + struct { >> + __u32 map_id; >> + } struct_ops_map; >> }; >> } __attribute__((aligned(8))); > >> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c >> index 9aff98f42a3d..e44d49f17c86 100644 >> --- a/tools/lib/bpf/bpf.c >> +++ b/tools/lib/bpf/bpf.c >> @@ -731,6 +731,8 @@ int bpf_link_create(int prog_fd, int target_fd, >> if (!OPTS_ZEROED(opts, tracing)) >> return libbpf_err(-EINVAL); >> break; >> + case BPF_STRUCT_OPS_MAP: >> + break; >> default: >> if (!OPTS_ZEROED(opts, flags)) >> return libbpf_err(-EINVAL); >> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c >> index 35a698eb825d..75ed95b7e455 100644 >> --- a/tools/lib/bpf/libbpf.c >> +++ b/tools/lib/bpf/libbpf.c >> @@ -115,6 +115,7 @@ static const char * const 
attach_type_name[] = { >> [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = >> "sk_reuseport_select_or_migrate", >> [BPF_PERF_EVENT] = "perf_event", >> [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", >> + [BPF_STRUCT_OPS_MAP] = "struct_ops_map", >> }; > >> static const char * const link_type_name[] = { >> -- >> 2.30.2 >
On 02/15, Kui-Feng Lee wrote: > Thank you for the feedback. > On 2/14/23 18:39, Stanislav Fomichev wrote: > > On 02/14, Kui-Feng Lee wrote: > > > BPF struct_ops maps are employed directly to register TCP Congestion > > > Control algorithms. Unlike other BPF programs that terminate when > > > their links gone, the struct_ops program reduces its refcount solely > > > upon death of its FD. The link of a BPF struct_ops map provides a > > > uniform experience akin to other types of BPF programs.� A TCP > > > Congestion Control algorithm will be unregistered upon destroying the > > > FD in the following patches. > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> > > > --- > > > � include/linux/bpf.h����������� |� 6 +++- > > > � include/uapi/linux/bpf.h������ |� 4 +++ > > > � kernel/bpf/bpf_struct_ops.c��� | 66 > ++++++++++++++++++++++++++++++++++ > > > � kernel/bpf/syscall.c���������� | 29 ++++++++++----- > > > � tools/include/uapi/linux/bpf.h |� 4 +++ > > > � tools/lib/bpf/bpf.c����������� |� 2 ++ > > > � tools/lib/bpf/libbpf.c�������� |� 1 + > > > � 7 files changed, 102 insertions(+), 10 deletions(-) > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > index 8b5d0b4c4ada..13683584b071 100644 > > > --- a/include/linux/bpf.h > > > +++ b/include/linux/bpf.h > > > @@ -1391,7 +1391,11 @@ struct bpf_link { > > > ����� u32 id; > > > ����� enum bpf_link_type type; > > > ����� const struct bpf_link_ops *ops; > > > -��� struct bpf_prog *prog; > > > > [..] > > > > > +��� union { > > > +������� struct bpf_prog *prog; > > > +������� /* Backed by a struct_ops (type == > > > BPF_LINK_UPDATE_STRUCT_OPS) */ > > > +������� struct bpf_map *map; > > > +��� }; > > > > Any reason you're not using the approach that has been used for other > > links where we create a wrapping structure + container_of? 
> > > > struct bpt_struct_ops_link { > > ����struct bpf_link link; > > ����struct bpf_map *map; > > }; > > > `map` and `prog` are meant to be used separately, while `union` is > designed > for this purpose. > The `container_of` approach also works. While both `container_of` and > `union` are often used, is there any factor that makes the former a better > choice than the latter? I guess I'm missing something here. Because you seem to add that container_of approach later on with 'fake' links. Maybe you can clarify on the patch where I made that comment? > > > ����� struct work_struct work; > > > � }; > > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > > index 17afd2b35ee5..1e6cdd0f355d 100644 > > > --- a/include/uapi/linux/bpf.h > > > +++ b/include/uapi/linux/bpf.h > > > @@ -1033,6 +1033,7 @@ enum bpf_attach_type { > > > ����� BPF_PERF_EVENT, > > > ����� BPF_TRACE_KPROBE_MULTI, > > > ����� BPF_LSM_CGROUP, > > > +��� BPF_STRUCT_OPS_MAP, > > > ����� __MAX_BPF_ATTACH_TYPE > > > � }; > > > > > @@ -6354,6 +6355,9 @@ struct bpf_link_info { > > > ��������� struct { > > > ������������� __u32 ifindex; > > > ��������� } xdp; > > > +������� struct { > > > +����������� __u32 map_id; > > > +������� } struct_ops_map; > > > ����� }; > > > � } __attribute__((aligned(8))); > > > > > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > > > index ece9870cab68..621c8e24481a 100644 > > > --- a/kernel/bpf/bpf_struct_ops.c > > > +++ b/kernel/bpf/bpf_struct_ops.c > > > @@ -698,3 +698,69 @@ void bpf_struct_ops_put(const void *kdata) > > > ��������� call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); > > > ����� } > > > � } > > > + > > > +static void bpf_struct_ops_map_link_release(struct bpf_link *link) > > > +{ > > > +��� if (link->map) { > > > +������� bpf_map_put(link->map); > > > +������� link->map = NULL; > > > +��� } > > > +} > > > + > > > +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) > > > +{ > > > +��� kfree(link); > > 
> +} > > > + > > > +static void bpf_struct_ops_map_link_show_fdinfo(const struct > > > bpf_link *link, > > > +����������������������� struct seq_file *seq) > > > +{ > > > +��� seq_printf(seq, "map_id:\t%d\n", > > > +��������� link->map->id); > > > +} > > > + > > > +static int bpf_struct_ops_map_link_fill_link_info(const struct > > > bpf_link *link, > > > +�������������������������� struct bpf_link_info *info) > > > +{ > > > +��� info->struct_ops_map.map_id = link->map->id; > > > +��� return 0; > > > +} > > > + > > > +static const struct bpf_link_ops bpf_struct_ops_map_lops = { > > > +��� .release = bpf_struct_ops_map_link_release, > > > +��� .dealloc = bpf_struct_ops_map_link_dealloc, > > > +��� .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > > > +��� .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > > > +}; > > > + > > > +int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > > > +{ > > > > [..] > > > > > +��� struct bpf_link_primer link_primer; > > > +��� struct bpf_map *map; > > > +��� struct bpf_link *link = NULL; > > > > Are we still trying to keep reverse christmas trees? > Yes, I will reorder them. > > > > > +��� int err; > > > + > > > +��� map = bpf_map_get(attr->link_create.prog_fd); > > > > bpf_map_get can fail here? > We have already verified the `attach_type` of the link before calling this > function, so an error should not occur. If it does happen, however, > something truly unusual must be happening. To ensure maximum protection > and > avoid this issue in the future, I will add a check here as well. If we've already checked, it's fine not to check here. I haven't looked at the real path, thanks for clarifying. 
> > > > > +��� if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) > > > +������� return -EINVAL; > > > + > > > +��� link = kzalloc(sizeof(*link), GFP_USER); > > > +��� if (!link) { > > > +������� err = -ENOMEM; > > > +������� goto err_out; > > > +��� } > > > +��� bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, > > > &bpf_struct_ops_map_lops, NULL); > > > +��� link->map = map; > > > + > > > +��� err = bpf_link_prime(link, &link_primer); > > > +��� if (err) > > > +������� goto err_out; > > > + > > > +��� return bpf_link_settle(&link_primer); > > > + > > > +err_out: > > > +��� bpf_map_put(map); > > > +��� kfree(link); > > > +��� return err; > > > +} > > > + > > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > > > index cda8d00f3762..54e172d8f5d1 100644 > > > --- a/kernel/bpf/syscall.c > > > +++ b/kernel/bpf/syscall.c > > > @@ -2738,7 +2738,9 @@ static void bpf_link_free(struct bpf_link *link) > > > ����� if (link->prog) { > > > ��������� /* detach BPF program, clean up used resources */ > > > ��������� link->ops->release(link); > > > -������� bpf_prog_put(link->prog); > > > +������� if (link->type != BPF_LINK_TYPE_STRUCT_OPS) > > > +����������� bpf_prog_put(link->prog); > > > +������� /* The struct_ops links clean up map by them-selves. */ > > > > Why not more generic: > > > > if (link->prog) > > ����bpf_prog_put(link->prog); > > > > ? > The `prog` and `map` functions are now occupying the same space. I'm > afraid > this check won't work. Hmm, good point. In this case: why not have separate prog/map pointers instead of a union? Are we 100% sure struct_ops is unique enough and there won't ever be another map-based links? 
> > > > > > > ����� } > > > ����� /* free bpf_link and its containing memory */ > > > ����� link->ops->dealloc(link); > > > @@ -2794,16 +2796,19 @@ static void bpf_link_show_fdinfo(struct > > > seq_file *m, struct file *filp) > > > ����� const struct bpf_prog *prog = link->prog; > > > ����� char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; > > > > > -��� bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); > > > ����� seq_printf(m, > > > ������������ "link_type:\t%s\n" > > > -���������� "link_id:\t%u\n" > > > -���������� "prog_tag:\t%s\n" > > > -���������� "prog_id:\t%u\n", > > > +���������� "link_id:\t%u\n", > > > ������������ bpf_link_type_strs[link->type], > > > -���������� link->id, > > > -���������� prog_tag, > > > -���������� prog->aux->id); > > > +���������� link->id); > > > +��� if (link->type != BPF_LINK_TYPE_STRUCT_OPS) { > > > +������� bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); > > > +������� seq_printf(m, > > > +�������������� "prog_tag:\t%s\n" > > > +�������������� "prog_id:\t%u\n", > > > +�������������� prog_tag, > > > +�������������� prog->aux->id); > > > +��� } > > > ����� if (link->ops->show_fdinfo) > > > ��������� link->ops->show_fdinfo(link, m); > > > � } > > > @@ -4278,7 +4283,8 @@ static int bpf_link_get_info_by_fd(struct file > > > *file, > > > > > ����� info.type = link->type; > > > ����� info.id = link->id; > > > -��� info.prog_id = link->prog->aux->id; > > > +��� if (link->type != BPF_LINK_TYPE_STRUCT_OPS) > > > +������� info.prog_id = link->prog->aux->id; > > > > Here as well: should we have "link->type != BPF_LINK_TYPE_STRUCT_OPS" vs > > "link->prog != NULL" ? > Same as above.� `map` and `prog` share the same memory space. 
> > > > > > > ����� if (link->ops->fill_link_info) { > > > ��������� err = link->ops->fill_link_info(link, &info); > > > @@ -4531,6 +4537,8 @@ static int bpf_map_do_batch(const union > > > bpf_attr *attr, > > > ����� return err; > > > � } > > > > > +extern int link_create_struct_ops_map(union bpf_attr *attr, > > > bpfptr_t uattr); > > > + > > > � #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies > > > � static int link_create(union bpf_attr *attr, bpfptr_t uattr) > > > � { > > > @@ -4541,6 +4549,9 @@ static int link_create(union bpf_attr *attr, > > > bpfptr_t uattr) > > > ����� if (CHECK_ATTR(BPF_LINK_CREATE)) > > > ��������� return -EINVAL; > > > > > +��� if (attr->link_create.attach_type == BPF_STRUCT_OPS_MAP) > > > +������� return link_create_struct_ops_map(attr, uattr); > > > + > > > ����� prog = bpf_prog_get(attr->link_create.prog_fd); > > > ����� if (IS_ERR(prog)) > > > ��������� return PTR_ERR(prog); > > > diff --git a/tools/include/uapi/linux/bpf.h > > > b/tools/include/uapi/linux/bpf.h > > > index 17afd2b35ee5..1e6cdd0f355d 100644 > > > --- a/tools/include/uapi/linux/bpf.h > > > +++ b/tools/include/uapi/linux/bpf.h > > > @@ -1033,6 +1033,7 @@ enum bpf_attach_type { > > > ����� BPF_PERF_EVENT, > > > ����� BPF_TRACE_KPROBE_MULTI, > > > ����� BPF_LSM_CGROUP, > > > +��� BPF_STRUCT_OPS_MAP, > > > ����� __MAX_BPF_ATTACH_TYPE > > > � }; > > > > > @@ -6354,6 +6355,9 @@ struct bpf_link_info { > > > ��������� struct { > > > ������������� __u32 ifindex; > > > ��������� } xdp; > > > +������� struct { > > > +����������� __u32 map_id; > > > +������� } struct_ops_map; > > > ����� }; > > > � } __attribute__((aligned(8))); > > > > > diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c > > > index 9aff98f42a3d..e44d49f17c86 100644 > > > --- a/tools/lib/bpf/bpf.c > > > +++ b/tools/lib/bpf/bpf.c > > > @@ -731,6 +731,8 @@ int bpf_link_create(int prog_fd, int target_fd, > > > ��������� if (!OPTS_ZEROED(opts, tracing)) > > > ������������� return 
libbpf_err(-EINVAL); > > > ��������� break; > > > +��� case BPF_STRUCT_OPS_MAP: > > > +������� break; > > > ����� default: > > > ��������� if (!OPTS_ZEROED(opts, flags)) > > > ������������� return libbpf_err(-EINVAL); > > > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > > > index 35a698eb825d..75ed95b7e455 100644 > > > --- a/tools/lib/bpf/libbpf.c > > > +++ b/tools/lib/bpf/libbpf.c > > > @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { > > > ����� [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE]��� = > > > "sk_reuseport_select_or_migrate", > > > ����� [BPF_PERF_EVENT]������� = "perf_event", > > > ����� [BPF_TRACE_KPROBE_MULTI]��� = "trace_kprobe_multi", > > > +��� [BPF_STRUCT_OPS_MAP]������� = "struct_ops_map", > > > � }; > > > > > � static const char * const link_type_name[] = { > > > -- > > > 2.30.2 > >
On 2/15/23 10:44, Stanislav Fomichev wrote: > On 02/15, Kui-Feng Lee wrote: >> Thank you for the feedback. > > >> On 2/14/23 18:39, Stanislav Fomichev wrote: >> > On 02/14, Kui-Feng Lee wrote: [..] >> > >> > > +��� if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) >> > > +������� return -EINVAL; >> > > + >> > > +��� link = kzalloc(sizeof(*link), GFP_USER); >> > > +��� if (!link) { >> > > +������� err = -ENOMEM; >> > > +������� goto err_out; >> > > +��� } >> > > +��� bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, >> > > &bpf_struct_ops_map_lops, NULL); >> > > +��� link->map = map; >> > > + >> > > +��� err = bpf_link_prime(link, &link_primer); >> > > +��� if (err) >> > > +������� goto err_out; >> > > + >> > > +��� return bpf_link_settle(&link_primer); >> > > + >> > > +err_out: >> > > +��� bpf_map_put(map); >> > > +��� kfree(link); >> > > +��� return err; >> > > +} >> > > + >> > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> > > index cda8d00f3762..54e172d8f5d1 100644 >> > > --- a/kernel/bpf/syscall.c >> > > +++ b/kernel/bpf/syscall.c >> > > @@ -2738,7 +2738,9 @@ static void bpf_link_free(struct bpf_link >> *link) >> > > ����� if (link->prog) { >> > > ��������� /* detach BPF program, clean up used resources */ >> > > ��������� link->ops->release(link); >> > > -������� bpf_prog_put(link->prog); >> > > +������� if (link->type != BPF_LINK_TYPE_STRUCT_OPS) >> > > +����������� bpf_prog_put(link->prog); >> > > +������� /* The struct_ops links clean up map by them-selves. */ >> > >> > Why not more generic: >> > >> > if (link->prog) >> > ����bpf_prog_put(link->prog); >> > >> > ? >> The `prog` and `map` functions are now occupying the same space. I'm >> afraid >> this check won't work. > > Hmm, good point. In this case: why not have separate prog/map pointers > instead of a union? Are we 100% sure struct_ops is unique enough > and there won't ever be another map-based links? > Good question! bpf_link is used to attach a single BPF program with a hook now. 
This patch takes things one step further, allowing the attachment of struct_ops. We can think of it as attaching a set of BPF programs to a pre-existing set of hooks. I would say that bpf_link now represents the attachment of a set of BPF programs to a pre-defined set of hooks. The size of the set is one in the case of attaching a single BPF program. Would creating a new map-based link mean introducing an entirely distinct type of map that represents a set of BPF programs? If so, why wouldn't struct_ops work? If that happens, we may need to add a function to validate the attach_type associated with any given map type.
On 2/15/23 10:04 AM, Kui-Feng Lee wrote: >>> + int err; >>> + >>> + map = bpf_map_get(attr->link_create.prog_fd); >> >> bpf_map_get can fail here? > > > We have already verified the `attach_type` of the link before calling this > function, so an error should not occur. If it does happen, however, something > truly unusual must be happening. To ensure maximum protection and avoid this > issue in the future, I will add a check here as well. bpf_map_get() could fail. A valid attach_type does not mean prog_fd (actually map_fd here) is also valid.
On 2/15/23 12:30, Martin KaFai Lau wrote: > On 2/15/23 10:04 AM, Kui-Feng Lee wrote: >>>> + int err; >>>> + >>>> + map = bpf_map_get(attr->link_create.prog_fd); >>> >>> bpf_map_get can fail here? >> >> >> We have already verified the `attach_type` of the link before calling >> this function, so an error should not occur. If it does happen, >> however, something truly unusual must be happening. To ensure maximum >> protection and avoid this issue in the future, I will add a check >> here as well. > > bpf_map_get() could fail. A valid attach_type does not mean prog_fd > (actually map_fd here) is also valid. You are right! I must have messed up the update. I will add a check.
On 2/15/23 12:24 PM, Kui-Feng Lee wrote: > > On 2/15/23 10:44, Stanislav Fomichev wrote: >> On 02/15, Kui-Feng Lee wrote: >>> Thank you for the feedback. >> >> >>> On 2/14/23 18:39, Stanislav Fomichev wrote: >>> > On 02/14, Kui-Feng Lee wrote: > [..] >>> > >>> > > +��� if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) >>> > > +������� return -EINVAL; >>> > > + >>> > > +��� link = kzalloc(sizeof(*link), GFP_USER); >>> > > +��� if (!link) { >>> > > +������� err = -ENOMEM; >>> > > +������� goto err_out; >>> > > +��� } >>> > > +��� bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, >>> > > &bpf_struct_ops_map_lops, NULL); >>> > > +��� link->map = map; >>> > > + >>> > > +��� err = bpf_link_prime(link, &link_primer); >>> > > +��� if (err) >>> > > +������� goto err_out; >>> > > + >>> > > +��� return bpf_link_settle(&link_primer); >>> > > + >>> > > +err_out: >>> > > +��� bpf_map_put(map); >>> > > +��� kfree(link); >>> > > +��� return err; >>> > > +} >>> > > + >>> > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >>> > > index cda8d00f3762..54e172d8f5d1 100644 >>> > > --- a/kernel/bpf/syscall.c >>> > > +++ b/kernel/bpf/syscall.c >>> > > @@ -2738,7 +2738,9 @@ static void bpf_link_free(struct bpf_link *link) >>> > > ����� if (link->prog) { >>> > > ��������� /* detach BPF program, clean up used resources */ >>> > > ��������� link->ops->release(link); >>> > > -������� bpf_prog_put(link->prog); >>> > > +������� if (link->type != BPF_LINK_TYPE_STRUCT_OPS) >>> > > +����������� bpf_prog_put(link->prog); >>> > > +������� /* The struct_ops links clean up map by them-selves. */ >>> > >>> > Why not more generic: >>> > >>> > if (link->prog) >>> > ����bpf_prog_put(link->prog); >>> > >>> > ? >>> The `prog` and `map` functions are now occupying the same space. I'm afraid >>> this check won't work. >> >> Hmm, good point. In this case: why not have separate prog/map pointers >> instead of a union? 
Are we 100% sure struct_ops is unique enough >> and there won't ever be another map-based links? >> > Good question! > > bpf_link is used to attach a single BPF program with a hook now. This patch > takes things one step further, allowing the attachment of struct_ops. We can > think of it as attaching a set of BPF programs to a pre-existing set of hooks. I > would say that bpf_link now represents the attachments of a set of BPF programs > to a pre-defined set of hooks. The size of a set is one for the case of > attaching single BPF program. > > Is creating a new map-based link indicative of introducing an entirely distinct > type of map that reflects a set of BPF programs? If so, why doesn't struct_ops > work? If it happens, we may need to create a function to validate the > attach_type associated with any given map type. > I also prefer separating 'prog' and 'map' in the link. It may be only struct_ops link that has no prog now but the future link type may not have prog also, so testing link->prog is less tie-up to a specific link type. Once separated, it makes sense to push the link's specific field to a new struct, so the following (from Stan) makes more sense: struct bpt_struct_ops_link { struct bpf_link link; struct bpf_map *map; }; In bpf_link_free(), there is no need to do an extra check for 'link->type != BPF_LINK_TYPE_STRUCT_OPS'. bpf_struct_ops_map_link_release() can be done together in bpf_struct_ops_map_link_dealloc().
On 2/14/23 2:17 PM, Kui-Feng Lee wrote: > BPF struct_ops maps are employed directly to register TCP Congestion > Control algorithms. Unlike other BPF programs that terminate when > their links gone, the struct_ops program reduces its refcount solely > upon death of its FD. I think the refcount comment probably not needed for this patch. > The link of a BPF struct_ops map provides a > uniform experience akin to other types of BPF programs. A TCP > Congestion Control algorithm will be unregistered upon destroying the > FD in the following patches. > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 17afd2b35ee5..1e6cdd0f355d 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h The existing BPF_LINK_TYPE_STRUCT_OPS enum is reused. Please explain why it can be reused in the commit message and also add comments around the existing "bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS...)" in bpf_struct_ops_map_update_elem(). > @@ -1033,6 +1033,7 @@ enum bpf_attach_type { > BPF_PERF_EVENT, > BPF_TRACE_KPROBE_MULTI, > BPF_LSM_CGROUP, > + BPF_STRUCT_OPS_MAP, nit. Only BPF_STRUCT_OPS. No need for "_MAP". > __MAX_BPF_ATTACH_TYPE > }; > > @@ -6354,6 +6355,9 @@ struct bpf_link_info { > struct { > __u32 ifindex; > } xdp; > + struct { > + __u32 map_id; > + } struct_ops_map; nit. Same here, skip the "_map"; This looks good instead of union. In case the user space tool directly uses the prog_id without checking the type. 
> }; > } __attribute__((aligned(8))); > > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > index ece9870cab68..621c8e24481a 100644 > --- a/kernel/bpf/bpf_struct_ops.c > +++ b/kernel/bpf/bpf_struct_ops.c > @@ -698,3 +698,69 @@ void bpf_struct_ops_put(const void *kdata) > call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); > } > } > + > +static void bpf_struct_ops_map_link_release(struct bpf_link *link) > +{ > + if (link->map) { > + bpf_map_put(link->map); > + link->map = NULL; > + } > +} > + > +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) > +{ > + kfree(link); > +} > + > +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, > + struct seq_file *seq) > +{ > + seq_printf(seq, "map_id:\t%d\n", > + link->map->id); > +} > + > +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, > + struct bpf_link_info *info) > +{ > + info->struct_ops_map.map_id = link->map->id; > + return 0; > +} > + > +static const struct bpf_link_ops bpf_struct_ops_map_lops = { > + .release = bpf_struct_ops_map_link_release, > + .dealloc = bpf_struct_ops_map_link_dealloc, > + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > + .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > +}; Can .detach be supported also? > + > +int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) Does it need uattr? nit. Rename to bpf_struct_ops_link_attach(), like how other link type's "attach" functions are named. or may be even just bpf_struct_ops_attach(). > +{ > + struct bpf_link_primer link_primer; > + struct bpf_map *map; > + struct bpf_link *link = NULL; > + int err; > + > + map = bpf_map_get(attr->link_create.prog_fd); This one looks weird. passing prog_fd to bpf_map_get(). I think in this case it makes sense to union map_fd with prog_fd in attr->link_create ? > + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) map is leaked. 
> + return -EINVAL; > + > + link = kzalloc(sizeof(*link), GFP_USER); > + if (!link) { > + err = -ENOMEM; > + goto err_out; > + } > + bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); > + link->map = map; > + > + err = bpf_link_prime(link, &link_primer); > + if (err) > + goto err_out; > + > + return bpf_link_settle(&link_primer); > + > +err_out: > + bpf_map_put(map); > + kfree(link); > + return err; > +} > + [ ... ] > +extern int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr); Move it to bpf.h. > + > #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies > static int link_create(union bpf_attr *attr, bpfptr_t uattr) > { > @@ -4541,6 +4549,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) > if (CHECK_ATTR(BPF_LINK_CREATE)) > return -EINVAL; > > + if (attr->link_create.attach_type == BPF_STRUCT_OPS_MAP) > + return link_create_struct_ops_map(attr, uattr); > + > prog = bpf_prog_get(attr->link_create.prog_fd); > if (IS_ERR(prog)) > return PTR_ERR(prog); > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index 17afd2b35ee5..1e6cdd0f355d 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -1033,6 +1033,7 @@ enum bpf_attach_type { > BPF_PERF_EVENT, > BPF_TRACE_KPROBE_MULTI, > BPF_LSM_CGROUP, > + BPF_STRUCT_OPS_MAP, > __MAX_BPF_ATTACH_TYPE > }; > > @@ -6354,6 +6355,9 @@ struct bpf_link_info { > struct { > __u32 ifindex; > } xdp; > + struct { > + __u32 map_id; > + } struct_ops_map; > }; > } __attribute__((aligned(8))); > > diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c > index 9aff98f42a3d..e44d49f17c86 100644 > --- a/tools/lib/bpf/bpf.c > +++ b/tools/lib/bpf/bpf.c Not necessary in this set and could be a follow up. Have you thought about how to generate a skel including the struct_ops link? 
> @@ -731,6 +731,8 @@ int bpf_link_create(int prog_fd, int target_fd, > if (!OPTS_ZEROED(opts, tracing)) > return libbpf_err(-EINVAL); > break; > + case BPF_STRUCT_OPS_MAP: > + break; > default: > if (!OPTS_ZEROED(opts, flags)) > return libbpf_err(-EINVAL); > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > index 35a698eb825d..75ed95b7e455 100644 > --- a/tools/lib/bpf/libbpf.c > +++ b/tools/lib/bpf/libbpf.c > @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { > [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", > [BPF_PERF_EVENT] = "perf_event", > [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", > + [BPF_STRUCT_OPS_MAP] = "struct_ops_map", > }; > > static const char * const link_type_name[] = {
On 2/15/23 14:58, Martin KaFai Lau wrote: > On 2/14/23 2:17 PM, Kui-Feng Lee wrote: >> BPF struct_ops maps are employed directly to register TCP Congestion >> Control algorithms. Unlike other BPF programs that terminate when >> their links gone, the struct_ops program reduces its refcount solely >> upon death of its FD. > > I think the refcount comment probably not needed for this patch. Got it! > >> The link of a BPF struct_ops map provides a >> uniform experience akin to other types of BPF programs. A TCP >> Congestion Control algorithm will be unregistered upon destroying the >> FD in the following patches. > > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 17afd2b35ee5..1e6cdd0f355d 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h > > The existing BPF_LINK_TYPE_STRUCT_OPS enum is reused. Please explain why > it can be reused in the commit message and also add comments around the > existing "bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS...)" in > bpf_struct_ops_map_update_elem(). Sure! > >> @@ -1033,6 +1033,7 @@ enum bpf_attach_type { >> BPF_PERF_EVENT, >> BPF_TRACE_KPROBE_MULTI, >> BPF_LSM_CGROUP, >> + BPF_STRUCT_OPS_MAP, > > nit. Only BPF_STRUCT_OPS. No need for "_MAP". > >> __MAX_BPF_ATTACH_TYPE >> }; >> @@ -6354,6 +6355,9 @@ struct bpf_link_info { >> struct { >> __u32 ifindex; >> } xdp; >> + struct { >> + __u32 map_id; >> + } struct_ops_map; > > nit. Same here, skip the "_map"; Got! > > This looks good instead of union. In case the user space tool directly > uses the prog_id without checking the type. 
> >> }; >> } __attribute__((aligned(8))); >> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c >> index ece9870cab68..621c8e24481a 100644 >> --- a/kernel/bpf/bpf_struct_ops.c >> +++ b/kernel/bpf/bpf_struct_ops.c >> @@ -698,3 +698,69 @@ void bpf_struct_ops_put(const void *kdata) >> call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); >> } >> } >> + >> +static void bpf_struct_ops_map_link_release(struct bpf_link *link) >> +{ >> + if (link->map) { >> + bpf_map_put(link->map); >> + link->map = NULL; >> + } >> +} >> + >> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) >> +{ >> + kfree(link); >> +} >> + >> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link >> *link, >> + struct seq_file *seq) >> +{ >> + seq_printf(seq, "map_id:\t%d\n", >> + link->map->id); >> +} >> + >> +static int bpf_struct_ops_map_link_fill_link_info(const struct >> bpf_link *link, >> + struct bpf_link_info *info) >> +{ >> + info->struct_ops_map.map_id = link->map->id; >> + return 0; >> +} >> + >> +static const struct bpf_link_ops bpf_struct_ops_map_lops = { >> + .release = bpf_struct_ops_map_link_release, >> + .dealloc = bpf_struct_ops_map_link_dealloc, >> + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, >> + .fill_link_info = bpf_struct_ops_map_link_fill_link_info, >> +}; > > Can .detach be supported also? Sure! > >> + >> +int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) > > Does it need uattr? > > nit. Rename to bpf_struct_ops_link_attach(), like how other link type's > "attach" functions are named. or may be even just bpf_struct_ops_attach(). Got it! > >> +{ >> + struct bpf_link_primer link_primer; >> + struct bpf_map *map; >> + struct bpf_link *link = NULL; >> + int err; >> + >> + map = bpf_map_get(attr->link_create.prog_fd); > > This one looks weird. passing prog_fd to bpf_map_get(). I think in this > case it makes sense to union map_fd with prog_fd in attr->link_create ? 
> >> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) > > map is leaked. Yes, I will fix it. > >> + return -EINVAL; >> + >> + link = kzalloc(sizeof(*link), GFP_USER); >> + if (!link) { >> + err = -ENOMEM; >> + goto err_out; >> + } >> + bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, >> &bpf_struct_ops_map_lops, NULL); >> + link->map = map; >> + >> + err = bpf_link_prime(link, &link_primer); >> + if (err) >> + goto err_out; >> + >> + return bpf_link_settle(&link_primer); >> + >> +err_out: >> + bpf_map_put(map); >> + kfree(link); >> + return err; >> +} >> + > > [ ... ] > >> +extern int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t >> uattr); > > Move it to bpf.h. > >> + >> #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies >> static int link_create(union bpf_attr *attr, bpfptr_t uattr) >> { >> @@ -4541,6 +4549,9 @@ static int link_create(union bpf_attr *attr, >> bpfptr_t uattr) >> if (CHECK_ATTR(BPF_LINK_CREATE)) >> return -EINVAL; >> + if (attr->link_create.attach_type == BPF_STRUCT_OPS_MAP) >> + return link_create_struct_ops_map(attr, uattr); >> + >> prog = bpf_prog_get(attr->link_create.prog_fd); >> if (IS_ERR(prog)) >> return PTR_ERR(prog); >> diff --git a/tools/include/uapi/linux/bpf.h >> b/tools/include/uapi/linux/bpf.h >> index 17afd2b35ee5..1e6cdd0f355d 100644 >> --- a/tools/include/uapi/linux/bpf.h >> +++ b/tools/include/uapi/linux/bpf.h >> @@ -1033,6 +1033,7 @@ enum bpf_attach_type { >> BPF_PERF_EVENT, >> BPF_TRACE_KPROBE_MULTI, >> BPF_LSM_CGROUP, >> + BPF_STRUCT_OPS_MAP, >> __MAX_BPF_ATTACH_TYPE >> }; >> @@ -6354,6 +6355,9 @@ struct bpf_link_info { >> struct { >> __u32 ifindex; >> } xdp; >> + struct { >> + __u32 map_id; >> + } struct_ops_map; >> }; >> } __attribute__((aligned(8))); >> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c >> index 9aff98f42a3d..e44d49f17c86 100644 >> --- a/tools/lib/bpf/bpf.c >> +++ b/tools/lib/bpf/bpf.c > > Not necessary in this set and could be a follow up. 
Have you thought > about how to generate a skel including the struct_ops link? The user must now call bpf_map__set_map_flags() between XXXX__open() and XXX__load(). To simplify the process, skel should invoke bpf_map__set_map_flags() in the function of XXX__open_and _load(). Therefore, a method to indicate which struct_ops need a link is necessary. For instance, SEC(".struct_ops") struct tcp_congestion_ops xxx_map = { ... .flags = BPF_F_LINK, ... }; We probably can do it without any change to the code generator. > >> @@ -731,6 +731,8 @@ int bpf_link_create(int prog_fd, int target_fd, >> if (!OPTS_ZEROED(opts, tracing)) >> return libbpf_err(-EINVAL); >> break; >> + case BPF_STRUCT_OPS_MAP: >> + break; >> default: >> if (!OPTS_ZEROED(opts, flags)) >> return libbpf_err(-EINVAL); >> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c >> index 35a698eb825d..75ed95b7e455 100644 >> --- a/tools/lib/bpf/libbpf.c >> +++ b/tools/lib/bpf/libbpf.c >> @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { >> [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = >> "sk_reuseport_select_or_migrate", >> [BPF_PERF_EVENT] = "perf_event", >> [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", >> + [BPF_STRUCT_OPS_MAP] = "struct_ops_map", >> }; >> static const char * const link_type_name[] = { >
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b5d0b4c4ada..13683584b071 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1391,7 +1391,11 @@ struct bpf_link { u32 id; enum bpf_link_type type; const struct bpf_link_ops *ops; - struct bpf_prog *prog; + union { + struct bpf_prog *prog; + /* Backed by a struct_ops (type == BPF_LINK_UPDATE_STRUCT_OPS) */ + struct bpf_map *map; + }; struct work_struct work; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17afd2b35ee5..1e6cdd0f355d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1033,6 +1033,7 @@ enum bpf_attach_type { BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, BPF_LSM_CGROUP, + BPF_STRUCT_OPS_MAP, __MAX_BPF_ATTACH_TYPE }; @@ -6354,6 +6355,9 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 map_id; + } struct_ops_map; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ece9870cab68..621c8e24481a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -698,3 +698,69 @@ void bpf_struct_ops_put(const void *kdata) call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); } } + +static void bpf_struct_ops_map_link_release(struct bpf_link *link) +{ + if (link->map) { + bpf_map_put(link->map); + link->map = NULL; + } +} + +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + seq_printf(seq, "map_id:\t%d\n", + link->map->id); +} + +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + info->struct_ops_map.map_id = link->map->id; + return 0; +} + +static const struct bpf_link_ops bpf_struct_ops_map_lops = { + .release = bpf_struct_ops_map_link_release, + .dealloc = bpf_struct_ops_map_link_dealloc, + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, + 
.fill_link_info = bpf_struct_ops_map_link_fill_link_info, +}; + +int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr) +{ + struct bpf_link_primer link_primer; + struct bpf_map *map; + struct bpf_link *link = NULL; + int err; + + map = bpf_map_get(attr->link_create.prog_fd); + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto err_out; + } + bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + link->map = map; + + err = bpf_link_prime(link, &link_primer); + if (err) + goto err_out; + + return bpf_link_settle(&link_primer); + +err_out: + bpf_map_put(map); + kfree(link); + return err; +} + diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cda8d00f3762..54e172d8f5d1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2738,7 +2738,9 @@ static void bpf_link_free(struct bpf_link *link) if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); - bpf_prog_put(link->prog); + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) + bpf_prog_put(link->prog); + /* The struct_ops links clean up map by them-selves. 
*/ } /* free bpf_link and its containing memory */ link->ops->dealloc(link); @@ -2794,16 +2796,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" - "link_id:\t%u\n" - "prog_tag:\t%s\n" - "prog_id:\t%u\n", + "link_id:\t%u\n", bpf_link_type_strs[link->type], - link->id, - prog_tag, - prog->aux->id); + link->id); + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) { + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + prog_tag, + prog->aux->id); + } if (link->ops->show_fdinfo) link->ops->show_fdinfo(link, m); } @@ -4278,7 +4283,8 @@ static int bpf_link_get_info_by_fd(struct file *file, info.type = link->type; info.id = link->id; - info.prog_id = link->prog->aux->id; + if (link->type != BPF_LINK_TYPE_STRUCT_OPS) + info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); @@ -4531,6 +4537,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, return err; } +extern int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr); + #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { @@ -4541,6 +4549,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; + if (attr->link_create.attach_type == BPF_STRUCT_OPS_MAP) + return link_create_struct_ops_map(attr, uattr); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 17afd2b35ee5..1e6cdd0f355d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1033,6 +1033,7 @@ enum bpf_attach_type { BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, 
BPF_LSM_CGROUP, + BPF_STRUCT_OPS_MAP, __MAX_BPF_ATTACH_TYPE }; @@ -6354,6 +6355,9 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 map_id; + } struct_ops_map; }; } __attribute__((aligned(8))); diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9aff98f42a3d..e44d49f17c86 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -731,6 +731,8 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, tracing)) return libbpf_err(-EINVAL); break; + case BPF_STRUCT_OPS_MAP: + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 35a698eb825d..75ed95b7e455 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", [BPF_PERF_EVENT] = "perf_event", [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", + [BPF_STRUCT_OPS_MAP] = "struct_ops_map", }; static const char * const link_type_name[] = {
BPF struct_ops maps are employed directly to register TCP Congestion Control algorithms. Unlike other BPF programs that terminate when their links gone, the struct_ops program reduces its refcount solely upon death of its FD. The link of a BPF struct_ops map provides a uniform experience akin to other types of BPF programs. A TCP Congestion Control algorithm will be unregistered upon destroying the FD in the following patches. Signed-off-by: Kui-Feng Lee <kuifeng@meta.com> --- include/linux/bpf.h | 6 +++- include/uapi/linux/bpf.h | 4 +++ kernel/bpf/bpf_struct_ops.c | 66 ++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 29 ++++++++++----- tools/include/uapi/linux/bpf.h | 4 +++ tools/lib/bpf/bpf.c | 2 ++ tools/lib/bpf/libbpf.c | 1 + 7 files changed, 102 insertions(+), 10 deletions(-)