[bpf-next,v3,2/8] bpf: Create links for BPF struct_ops maps.

Message ID 20230303012122.852654-3-kuifeng@meta.com (mailing list archive)
State Superseded
Delegated to: BPF
Series Transit between BPF TCP congestion controls.

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-17

Commit Message

Kui-Feng Lee March 3, 2023, 1:21 a.m. UTC
BPF struct_ops maps are employed directly to register TCP congestion
control algorithms, unlike other BPF programs, which terminate when
their links are gone. Backing a BPF struct_ops map with a link
provides a uniform experience akin to other types of BPF programs.

bpf_links are responsible for registering their associated
struct_ops. A bpf_link can only be created for a struct_ops map that
has the BPF_F_LINK flag set; a struct_ops without this flag behaves
in the same manner as before and is registered upon updating its value.

The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
used to craft the links for BPF struct_ops programs, but also to
create links for BPF struct_ops maps themselves. Since the links of
BPF struct_ops programs are only used to create trampolines
internally, they are never seen in other contexts. Thus, the type can
be reused for struct_ops maps themselves.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
 include/linux/bpf.h            |  11 +++
 include/uapi/linux/bpf.h       |  12 +++-
 kernel/bpf/bpf_struct_ops.c    | 118 +++++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c           |  26 +++++---
 tools/include/uapi/linux/bpf.h |  12 +++-
 5 files changed, 164 insertions(+), 15 deletions(-)
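
For illustration, a minimal user-space sketch of the new flow once this
patch is applied. The struct_ops_attach() helper below is a name made up
for this sketch; map_fd is assumed to refer to a BPF_MAP_TYPE_STRUCT_OPS
map created with BPF_F_LINK whose value has already been updated (i.e.
the kernel side is in BPF_STRUCT_OPS_STATE_READY):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int struct_ops_attach(int map_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	/* link_create.map_fd and BPF_STRUCT_OPS are both added by this patch. */
	attr.link_create.map_fd = map_fd;
	attr.link_create.attach_type = BPF_STRUCT_OPS;

	/* On success this returns a link fd; releasing the last reference
	 * to the link unregisters the struct_ops (see the dealloc callback
	 * in the patch below).
	 */
	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}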

Comments

Stanislav Fomichev March 6, 2023, 8:23 p.m. UTC | #1
On 03/02, Kui-Feng Lee wrote:
> BPF struct_ops maps are employed directly to register TCP congestion
> control algorithms, unlike other BPF programs, which terminate when
> their links are gone. Backing a BPF struct_ops map with a link
> provides a uniform experience akin to other types of BPF programs.

> bpf_links are responsible for registering their associated
> struct_ops. A bpf_link can only be created for a struct_ops map that
> has the BPF_F_LINK flag set; a struct_ops without this flag behaves
> in the same manner as before and is registered upon updating its value.

> The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
> used to craft the links for BPF struct_ops programs, but also to
> create links for BPF struct_ops maps themselves. Since the links of
> BPF struct_ops programs are only used to create trampolines
> internally, they are never seen in other contexts. Thus, the type can
> be reused for struct_ops maps themselves.

> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
>   include/linux/bpf.h            |  11 +++
>   include/uapi/linux/bpf.h       |  12 +++-
>   kernel/bpf/bpf_struct_ops.c    | 118 +++++++++++++++++++++++++++++++--
>   kernel/bpf/syscall.c           |  26 +++++---
>   tools/include/uapi/linux/bpf.h |  12 +++-
>   5 files changed, 164 insertions(+), 15 deletions(-)

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cb837f42b99d..b845be719422 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1396,6 +1396,11 @@ struct bpf_link {
>   	struct work_struct work;
>   };

> +struct bpf_struct_ops_link {
> +	struct bpf_link link;
> +	struct bpf_map __rcu *map;
> +};
> +
>   struct bpf_link_ops {
>   	void (*release)(struct bpf_link *link);
>   	void (*dealloc)(struct bpf_link *link);
> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>   struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
>   struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>   struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
> +int bpf_struct_ops_link_create(union bpf_attr *attr);

>   int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>   int bpf_obj_get_user(const char __user *pathname, int flags);
> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link  
> *link)
>   {
>   }

> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>   static inline int bpf_obj_get_user(const char __user *pathname, int  
> flags)
>   {
>   	return -EOPNOTSUPP;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 17afd2b35ee5..cd0ff39981e8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>   	BPF_PERF_EVENT,
>   	BPF_TRACE_KPROBE_MULTI,
>   	BPF_LSM_CGROUP,
> +	BPF_STRUCT_OPS,
>   	__MAX_BPF_ATTACH_TYPE
>   };

> @@ -1266,6 +1267,9 @@ enum {

>   /* Create a map that is suitable to be an inner map with dynamic max  
> entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backing
> bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };

>   /* Flags for BPF_PROG_QUERY. */
> @@ -1507,7 +1511,10 @@ union bpf_attr {
>   	} task_fd_query;

>   	struct { /* struct used by BPF_LINK_CREATE command */
> -		__u32		prog_fd;	/* eBPF program to attach */
> +		union {
> +			__u32		prog_fd;	/* eBPF program to attach */
> +			__u32		map_fd;		/* eBPF struct_ops to attach */
> +		};
>   		union {
>   			__u32		target_fd;	/* object to attach to */
>   			__u32		target_ifindex; /* target ifindex */
> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>   		struct {
>   			__u32 ifindex;
>   		} xdp;
> +		struct {
> +			__u32 map_id;
> +		} struct_ops;
>   	};
>   } __attribute__((aligned(8)));

> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index bba03b6b010b..9ec675576d97 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -14,6 +14,7 @@

>   enum bpf_struct_ops_state {
>   	BPF_STRUCT_OPS_STATE_INIT,
> +	BPF_STRUCT_OPS_STATE_READY,
>   	BPF_STRUCT_OPS_STATE_INUSE,
>   	BPF_STRUCT_OPS_STATE_TOBEFREE,
>   };
> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct  
> bpf_map *map, void *key,
>   		*(unsigned long *)(udata + moff) = prog->aux->id;
>   	}

> -	bpf_map_inc(map);
> -
>   	set_memory_rox((long)st_map->image, 1);
> +	if (st_map->map.map_flags & BPF_F_LINK) {
> +		/* Let bpf_link handle registration & unregistration.
> +		 *
> +		 * Pair with smp_load_acquire() during lookup_elem().
> +		 */
> +		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);

Any reason not to create a link here and return to the user? (without
extra link_create API)

> +		goto unlock;
> +	}
> +
>   	err = st_ops->reg(kdata);
>   	if (likely(!err)) {
> +		bpf_map_inc(map);
>   		/* Pair with smp_load_acquire() during lookup_elem().
>   		 * It ensures the above udata updates (e.g. prog->aux->id)
>   		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct  
> bpf_map *map, void *key,
>   	 */
>   	set_memory_nx((long)st_map->image, 1);
>   	set_memory_rw((long)st_map->image, 1);
> -	bpf_map_put(map);

>   reset_unlock:
>   	bpf_struct_ops_map_put_progs(st_map);
> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct  
> bpf_map *map, void *key)
>   	struct bpf_struct_ops_map *st_map;

>   	st_map = (struct bpf_struct_ops_map *)map;
> +	if (st_map->map.map_flags & BPF_F_LINK)
> +		return -EOPNOTSUPP;
> +
>   	prev_state = cmpxchg(&st_map->kvalue.state,
>   			     BPF_STRUCT_OPS_STATE_INUSE,
>   			     BPF_STRUCT_OPS_STATE_TOBEFREE);
>   	switch (prev_state) {
> +	case BPF_STRUCT_OPS_STATE_READY:
> +		return -EOPNOTSUPP;
>   	case BPF_STRUCT_OPS_STATE_INUSE:
>   		st_map->st_ops->unreg(&st_map->kvalue.data);
>   		bpf_map_put(map);
> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct  
> bpf_map *map)
>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>   {
>   	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> -	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
> +	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
>   		return -EINVAL;
>   	return 0;
>   }
> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)

>   	bpf_map_put(&st_map->map);
>   }
> +
> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
> +{
> +	struct bpf_struct_ops_link *st_link;
> +	struct bpf_struct_ops_map *st_map;
> +
> +	st_link = container_of(link, struct bpf_struct_ops_link, link);
> +	if (st_link->map) {
> +		st_map = (struct bpf_struct_ops_map *)st_link->map;
> +		st_map->st_ops->unreg(&st_map->kvalue.data);
> +		bpf_map_put(st_link->map);
> +	}
> +	kfree(st_link);
> +}
> +
> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link  
> *link,
> +					    struct seq_file *seq)
> +{
> +	struct bpf_struct_ops_link *st_link;
> +	struct bpf_map *map;
> +
> +	st_link = container_of(link, struct bpf_struct_ops_link, link);
> +	rcu_read_lock_trace();
> +	map = rcu_dereference(st_link->map);
> +	if (map)
> +		seq_printf(seq, "map_id:\t%d\n", map->id);
> +	rcu_read_unlock_trace();
> +}
> +
> +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link  
> *link,
> +					       struct bpf_link_info *info)
> +{
> +	struct bpf_struct_ops_link *st_link;
> +	struct bpf_map *map;
> +
> +	st_link = container_of(link, struct bpf_struct_ops_link, link);
> +	rcu_read_lock_trace();
> +	map = rcu_dereference(st_link->map);
> +	if (map)
> +		info->struct_ops.map_id = map->id;
> +	rcu_read_unlock_trace();
> +	return 0;
> +}
> +
> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
> +	.dealloc = bpf_struct_ops_map_link_dealloc,
> +	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
> +	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
> +};
> +
> +int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> +	struct bpf_struct_ops_link *link = NULL;
> +	struct bpf_link_primer link_primer;
> +	struct bpf_struct_ops_map *st_map;
> +	struct bpf_map *map;
> +	int err;
> +
> +	map = bpf_map_get(attr->link_create.map_fd);
> +	if (!map)
> +		return -EINVAL;
> +
> +	st_map = (struct bpf_struct_ops_map *)map;
> +
> +	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags &  
> BPF_F_LINK) ||
> +	    /* Pair with smp_store_release() during map_update */
> +	    smp_load_acquire(&st_map->kvalue.state) !=  
> BPF_STRUCT_OPS_STATE_READY) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	link = kzalloc(sizeof(*link), GFP_USER);
> +	if (!link) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,  
> &bpf_struct_ops_map_lops, NULL);
> +	link->map = map;
> +
> +	err = bpf_link_prime(&link->link, &link_primer);
> +	if (err)
> +		goto err_out;
> +
> +	err = st_map->st_ops->reg(st_map->kvalue.data);
> +	if (err) {
> +		bpf_link_cleanup(&link_primer);
> +		goto err_out;
> +	}
> +
> +	return bpf_link_settle(&link_primer);
> +
> +err_out:
> +	bpf_map_put(map);
> +	kfree(link);
> +	return err;
> +}
> +
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 358a0e40555e..3db4938212d6 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
>   static void bpf_link_free(struct bpf_link *link)
>   {
>   	bpf_link_free_id(link->id);
> +	/* detach BPF program, clean up used resources */
>   	if (link->prog) {
> -		/* detach BPF program, clean up used resources */
>   		link->ops->release(link);
>   		bpf_prog_put(link->prog);
> +		/* The struct_ops links clean up the map by themselves. */
>   	}
>   	/* free bpf_link and its containing memory */
>   	link->ops->dealloc(link);
> @@ -2802,16 +2803,19 @@ static void bpf_link_show_fdinfo(struct seq_file  
> *m, struct file *filp)
>   	const struct bpf_prog *prog = link->prog;
>   	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };

> -	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
>   	seq_printf(m,
>   		   "link_type:\t%s\n"
> -		   "link_id:\t%u\n"
> -		   "prog_tag:\t%s\n"
> -		   "prog_id:\t%u\n",
> +		   "link_id:\t%u\n",
>   		   bpf_link_type_strs[link->type],
> -		   link->id,
> -		   prog_tag,
> -		   prog->aux->id);
> +		   link->id);
> +	if (prog) {
> +		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
> +		seq_printf(m,
> +			   "prog_tag:\t%s\n"
> +			   "prog_id:\t%u\n",
> +			   prog_tag,
> +			   prog->aux->id);
> +	}
>   	if (link->ops->show_fdinfo)
>   		link->ops->show_fdinfo(link, m);
>   }
> @@ -4286,7 +4290,8 @@ static int bpf_link_get_info_by_fd(struct file  
> *file,

>   	info.type = link->type;
>   	info.id = link->id;
> -	info.prog_id = link->prog->aux->id;
> +	if (link->prog)
> +		info.prog_id = link->prog->aux->id;

>   	if (link->ops->fill_link_info) {
>   		err = link->ops->fill_link_info(link, &info);
> @@ -4549,6 +4554,9 @@ static int link_create(union bpf_attr *attr,  
> bpfptr_t uattr)
>   	if (CHECK_ATTR(BPF_LINK_CREATE))
>   		return -EINVAL;

> +	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
> +		return bpf_struct_ops_link_create(attr);
> +
>   	prog = bpf_prog_get(attr->link_create.prog_fd);
>   	if (IS_ERR(prog))
>   		return PTR_ERR(prog);
> diff --git a/tools/include/uapi/linux/bpf.h  
> b/tools/include/uapi/linux/bpf.h
> index 17afd2b35ee5..cd0ff39981e8 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>   	BPF_PERF_EVENT,
>   	BPF_TRACE_KPROBE_MULTI,
>   	BPF_LSM_CGROUP,
> +	BPF_STRUCT_OPS,
>   	__MAX_BPF_ATTACH_TYPE
>   };

> @@ -1266,6 +1267,9 @@ enum {

>   /* Create a map that is suitable to be an inner map with dynamic max  
> entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backing
> bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };

>   /* Flags for BPF_PROG_QUERY. */
> @@ -1507,7 +1511,10 @@ union bpf_attr {
>   	} task_fd_query;

>   	struct { /* struct used by BPF_LINK_CREATE command */
> -		__u32		prog_fd;	/* eBPF program to attach */
> +		union {
> +			__u32		prog_fd;	/* eBPF program to attach */
> +			__u32		map_fd;		/* eBPF struct_ops to attach */
> +		};
>   		union {
>   			__u32		target_fd;	/* object to attach to */
>   			__u32		target_ifindex; /* target ifindex */
> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>   		struct {
>   			__u32 ifindex;
>   		} xdp;
> +		struct {
> +			__u32 map_id;
> +		} struct_ops;
>   	};
>   } __attribute__((aligned(8)));

> --
> 2.30.2
Kuifeng Lee March 6, 2023, 10:02 p.m. UTC | #2
On 3/6/23 12:23, Stanislav Fomichev wrote:
> On 03/02, Kui-Feng Lee wrote:
>> BPF struct_ops maps are employed directly to register TCP congestion
>> control algorithms, unlike other BPF programs, which terminate when
>> their links are gone. Backing a BPF struct_ops map with a link
>> provides a uniform experience akin to other types of BPF programs.
> 
>> bpf_links are responsible for registering their associated
>> struct_ops. A bpf_link can only be created for a struct_ops map that
>> has the BPF_F_LINK flag set; a struct_ops without this flag behaves
>> in the same manner as before and is registered upon updating its value.
> 
>> The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
>> used to craft the links for BPF struct_ops programs, but also to
>> create links for BPF struct_ops maps themselves. Since the links of
>> BPF struct_ops programs are only used to create trampolines
>> internally, they are never seen in other contexts. Thus, the type can
>> be reused for struct_ops maps themselves.
> 
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>>   include/linux/bpf.h            |  11 +++
>>   include/uapi/linux/bpf.h       |  12 +++-
>>   kernel/bpf/bpf_struct_ops.c    | 118 +++++++++++++++++++++++++++++++--
>>   kernel/bpf/syscall.c           |  26 +++++---
>>   tools/include/uapi/linux/bpf.h |  12 +++-
>>   5 files changed, 164 insertions(+), 15 deletions(-)
> 
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index cb837f42b99d..b845be719422 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1396,6 +1396,11 @@ struct bpf_link {
>>       struct work_struct work;
>>   };
> 
>> +struct bpf_struct_ops_link {
>> +    struct bpf_link link;
>> +    struct bpf_map __rcu *map;
>> +};
>> +
>>   struct bpf_link_ops {
>>       void (*release)(struct bpf_link *link);
>>       void (*dealloc)(struct bpf_link *link);
>> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>>   struct file *bpf_link_new_file(struct bpf_link *link, int 
>> *reserved_fd);
>>   struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>>   struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
>> +int bpf_struct_ops_link_create(union bpf_attr *attr);
> 
>>   int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>>   int bpf_obj_get_user(const char __user *pathname, int flags);
>> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link 
>> *link)
>>   {
>>   }
> 
>> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> +    return -EOPNOTSUPP;
>> +}
>> +
>>   static inline int bpf_obj_get_user(const char __user *pathname, int 
>> flags)
>>   {
>>       return -EOPNOTSUPP;
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 17afd2b35ee5..cd0ff39981e8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>>       BPF_PERF_EVENT,
>>       BPF_TRACE_KPROBE_MULTI,
>>       BPF_LSM_CGROUP,
>> +    BPF_STRUCT_OPS,
>>       __MAX_BPF_ATTACH_TYPE
>>   };
> 
>> @@ -1266,6 +1267,9 @@ enum {
> 
>>   /* Create a map that is suitable to be an inner map with dynamic max 
>> entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backing
>> bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
> 
>>   /* Flags for BPF_PROG_QUERY. */
>> @@ -1507,7 +1511,10 @@ union bpf_attr {
>>       } task_fd_query;
> 
>>       struct { /* struct used by BPF_LINK_CREATE command */
>> -        __u32        prog_fd;    /* eBPF program to attach */
>> +        union {
>> +            __u32        prog_fd;    /* eBPF program to attach */
>> +            __u32        map_fd;        /* eBPF struct_ops to attach */
>> +        };
>>           union {
>>               __u32        target_fd;    /* object to attach to */
>>               __u32        target_ifindex; /* target ifindex */
>> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>>           struct {
>>               __u32 ifindex;
>>           } xdp;
>> +        struct {
>> +            __u32 map_id;
>> +        } struct_ops;
>>       };
>>   } __attribute__((aligned(8)));
> 
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index bba03b6b010b..9ec675576d97 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -14,6 +14,7 @@
> 
>>   enum bpf_struct_ops_state {
>>       BPF_STRUCT_OPS_STATE_INIT,
>> +    BPF_STRUCT_OPS_STATE_READY,
>>       BPF_STRUCT_OPS_STATE_INUSE,
>>       BPF_STRUCT_OPS_STATE_TOBEFREE,
>>   };
>> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>           *(unsigned long *)(udata + moff) = prog->aux->id;
>>       }
> 
>> -    bpf_map_inc(map);
>> -
>>       set_memory_rox((long)st_map->image, 1);
>> +    if (st_map->map.map_flags & BPF_F_LINK) {
>> +        /* Let bpf_link handle registration & unregistration.
>> +         *
>> +         * Pair with smp_load_acquire() during lookup_elem().
>> +         */
>> +        smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
> 
> Any reason not to create a link here and return to the user? (without
> extra link_create API)
> 

We want the ability to replace an existing struct_ops with a
different one. If we established the link here, we could not load
multiple struct_ops and switch between them, since each struct_ops
would be registered immediately. Likewise, bpf_map_update_elem() is
not supposed to create a new FD.
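
For example, with two such BPF_F_LINK maps already updated (a sketch
only: struct_ops_attach() is the hypothetical BPF_LINK_CREATE wrapper
from the usage sketch near the top of this page, and bbr_map_fd and
dctcp_map_fd are made-up fds):

	int link_fd;

	link_fd = struct_ops_attach(bbr_map_fd);   /* registers this CC */
	/* ... later, switch to the other CC ... */
	close(link_fd);                            /* dropping the last ref unregisters it */
	link_fd = struct_ops_attach(dctcp_map_fd); /* registers the other one */

With this patch alone, switching means closing one link and creating
another; patch 5 of this series adds BPF_LINK_UPDATE so an existing
link can swap maps without producing a new FD.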

>> +        goto unlock;
>> +    }
>> +
>>       err = st_ops->reg(kdata);
>>       if (likely(!err)) {
>> +        bpf_map_inc(map);
>>           /* Pair with smp_load_acquire() during lookup_elem().
>>            * It ensures the above udata updates (e.g. prog->aux->id)
>>            * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
>> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>        */
>>       set_memory_nx((long)st_map->image, 1);
>>       set_memory_rw((long)st_map->image, 1);
>> -    bpf_map_put(map);
> 
>>   reset_unlock:
>>       bpf_struct_ops_map_put_progs(st_map);
>> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct 
>> bpf_map *map, void *key)
>>       struct bpf_struct_ops_map *st_map;
> 
>>       st_map = (struct bpf_struct_ops_map *)map;
>> +    if (st_map->map.map_flags & BPF_F_LINK)
>> +        return -EOPNOTSUPP;
>> +
>>       prev_state = cmpxchg(&st_map->kvalue.state,
>>                    BPF_STRUCT_OPS_STATE_INUSE,
>>                    BPF_STRUCT_OPS_STATE_TOBEFREE);
>>       switch (prev_state) {
>> +    case BPF_STRUCT_OPS_STATE_READY:
>> +        return -EOPNOTSUPP;
>>       case BPF_STRUCT_OPS_STATE_INUSE:
>>           st_map->st_ops->unreg(&st_map->kvalue.data);
>>           bpf_map_put(map);
>> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct 
>> bpf_map *map)
>>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>>   {
>>       if (attr->key_size != sizeof(unsigned int) || attr->max_entries 
>> != 1 ||
>> -        attr->map_flags || !attr->btf_vmlinux_value_type_id)
>> +        (attr->map_flags & ~BPF_F_LINK) || 
>> !attr->btf_vmlinux_value_type_id)
>>           return -EINVAL;
>>       return 0;
>>   }
>> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
> 
>>       bpf_map_put(&st_map->map);
>>   }
>> +
>> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
>> +{
>> +    struct bpf_struct_ops_link *st_link;
>> +    struct bpf_struct_ops_map *st_map;
>> +
>> +    st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +    if (st_link->map) {
>> +        st_map = (struct bpf_struct_ops_map *)st_link->map;
>> +        st_map->st_ops->unreg(&st_map->kvalue.data);
>> +        bpf_map_put(st_link->map);
>> +    }
>> +    kfree(st_link);
>> +}
>> +
>> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link 
>> *link,
>> +                        struct seq_file *seq)
>> +{
>> +    struct bpf_struct_ops_link *st_link;
>> +    struct bpf_map *map;
>> +
>> +    st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +    rcu_read_lock_trace();
>> +    map = rcu_dereference(st_link->map);
>> +    if (map)
>> +        seq_printf(seq, "map_id:\t%d\n", map->id);
>> +    rcu_read_unlock_trace();
>> +}
>> +
>> +static int bpf_struct_ops_map_link_fill_link_info(const struct 
>> bpf_link *link,
>> +                           struct bpf_link_info *info)
>> +{
>> +    struct bpf_struct_ops_link *st_link;
>> +    struct bpf_map *map;
>> +
>> +    st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +    rcu_read_lock_trace();
>> +    map = rcu_dereference(st_link->map);
>> +    if (map)
>> +        info->struct_ops.map_id = map->id;
>> +    rcu_read_unlock_trace();
>> +    return 0;
>> +}
>> +
>> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>> +    .dealloc = bpf_struct_ops_map_link_dealloc,
>> +    .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
>> +    .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
>> +};
>> +
>> +int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> +    struct bpf_struct_ops_link *link = NULL;
>> +    struct bpf_link_primer link_primer;
>> +    struct bpf_struct_ops_map *st_map;
>> +    struct bpf_map *map;
>> +    int err;
>> +
>> +    map = bpf_map_get(attr->link_create.map_fd);
>> +    if (!map)
>> +        return -EINVAL;
>> +
>> +    st_map = (struct bpf_struct_ops_map *)map;
>> +
>> +    if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags 
>> & BPF_F_LINK) ||
>> +        /* Pair with smp_store_release() during map_update */
>> +        smp_load_acquire(&st_map->kvalue.state) != 
>> BPF_STRUCT_OPS_STATE_READY) {
>> +        err = -EINVAL;
>> +        goto err_out;
>> +    }
>> +
>> +    link = kzalloc(sizeof(*link), GFP_USER);
>> +    if (!link) {
>> +        err = -ENOMEM;
>> +        goto err_out;
>> +    }
>> +    bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, 
>> &bpf_struct_ops_map_lops, NULL);
>> +    link->map = map;
>> +
>> +    err = bpf_link_prime(&link->link, &link_primer);
>> +    if (err)
>> +        goto err_out;
>> +
>> +    err = st_map->st_ops->reg(st_map->kvalue.data);
>> +    if (err) {
>> +        bpf_link_cleanup(&link_primer);
>> +        goto err_out;
>> +    }
>> +
>> +    return bpf_link_settle(&link_primer);
>> +
>> +err_out:
>> +    bpf_map_put(map);
>> +    kfree(link);
>> +    return err;
>> +}
>> +
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 358a0e40555e..3db4938212d6 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
>>   static void bpf_link_free(struct bpf_link *link)
>>   {
>>       bpf_link_free_id(link->id);
>> +    /* detach BPF program, clean up used resources */
>>       if (link->prog) {
>> -        /* detach BPF program, clean up used resources */
>>           link->ops->release(link);
>>           bpf_prog_put(link->prog);
>> +        /* The struct_ops links clean up the map by themselves. */
>>       }
>>       /* free bpf_link and its containing memory */
>>       link->ops->dealloc(link);
>> @@ -2802,16 +2803,19 @@ static void bpf_link_show_fdinfo(struct 
>> seq_file *m, struct file *filp)
>>       const struct bpf_prog *prog = link->prog;
>>       char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
> 
>> -    bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
>>       seq_printf(m,
>>              "link_type:\t%s\n"
>> -           "link_id:\t%u\n"
>> -           "prog_tag:\t%s\n"
>> -           "prog_id:\t%u\n",
>> +           "link_id:\t%u\n",
>>              bpf_link_type_strs[link->type],
>> -           link->id,
>> -           prog_tag,
>> -           prog->aux->id);
>> +           link->id);
>> +    if (prog) {
>> +        bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
>> +        seq_printf(m,
>> +               "prog_tag:\t%s\n"
>> +               "prog_id:\t%u\n",
>> +               prog_tag,
>> +               prog->aux->id);
>> +    }
>>       if (link->ops->show_fdinfo)
>>           link->ops->show_fdinfo(link, m);
>>   }
>> @@ -4286,7 +4290,8 @@ static int bpf_link_get_info_by_fd(struct file 
>> *file,
> 
>>       info.type = link->type;
>>       info.id = link->id;
>> -    info.prog_id = link->prog->aux->id;
>> +    if (link->prog)
>> +        info.prog_id = link->prog->aux->id;
> 
>>       if (link->ops->fill_link_info) {
>>           err = link->ops->fill_link_info(link, &info);
>> @@ -4549,6 +4554,9 @@ static int link_create(union bpf_attr *attr, 
>> bpfptr_t uattr)
>>       if (CHECK_ATTR(BPF_LINK_CREATE))
>>           return -EINVAL;
> 
>> +    if (attr->link_create.attach_type == BPF_STRUCT_OPS)
>> +        return bpf_struct_ops_link_create(attr);
>> +
>>       prog = bpf_prog_get(attr->link_create.prog_fd);
>>       if (IS_ERR(prog))
>>           return PTR_ERR(prog);
>> diff --git a/tools/include/uapi/linux/bpf.h 
>> b/tools/include/uapi/linux/bpf.h
>> index 17afd2b35ee5..cd0ff39981e8 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>>       BPF_PERF_EVENT,
>>       BPF_TRACE_KPROBE_MULTI,
>>       BPF_LSM_CGROUP,
>> +    BPF_STRUCT_OPS,
>>       __MAX_BPF_ATTACH_TYPE
>>   };
> 
>> @@ -1266,6 +1267,9 @@ enum {
> 
>>   /* Create a map that is suitable to be an inner map with dynamic max 
>> entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backing
>> bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
> 
>>   /* Flags for BPF_PROG_QUERY. */
>> @@ -1507,7 +1511,10 @@ union bpf_attr {
>>       } task_fd_query;
> 
>>       struct { /* struct used by BPF_LINK_CREATE command */
>> -        __u32        prog_fd;    /* eBPF program to attach */
>> +        union {
>> +            __u32        prog_fd;    /* eBPF program to attach */
>> +            __u32        map_fd;        /* eBPF struct_ops to attach */
>> +        };
>>           union {
>>               __u32        target_fd;    /* object to attach to */
>>               __u32        target_ifindex; /* target ifindex */
>> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>>           struct {
>>               __u32 ifindex;
>>           } xdp;
>> +        struct {
>> +            __u32 map_id;
>> +        } struct_ops;
>>       };
>>   } __attribute__((aligned(8)));
> 
>> -- 
>> 2.30.2
>
Martin KaFai Lau March 7, 2023, 2:11 a.m. UTC | #3
On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cb837f42b99d..b845be719422 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1396,6 +1396,11 @@ struct bpf_link {
>   	struct work_struct work;
>   };
>   
> +struct bpf_struct_ops_link {
> +	struct bpf_link link;
> +	struct bpf_map __rcu *map;

__rcu is only needed after the link_update change in patch 5?
It is fine to keep it in this patch but please leave a comment in the commit 
message.

Does 'struct bpf_struct_ops_link' have to be in bpf.h?

> +};
> +
>   struct bpf_link_ops {
>   	void (*release)(struct bpf_link *link);
>   	void (*dealloc)(struct bpf_link *link);
> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>   struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
>   struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>   struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
> +int bpf_struct_ops_link_create(union bpf_attr *attr);
>   
>   int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>   int bpf_obj_get_user(const char __user *pathname, int flags);
> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link *link)
>   {
>   }
>   
> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> +	return -EOPNOTSUPP;
> +}

Is this currently under '#ifdef CONFIG_BPF_SYSCALL' alone?

Not sure if it is correct. Please double check.

ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif

obj-$(CONFIG_BPF_SYSCALL) += syscall.o ...
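
So the stub presumably needs to be gated the same way bpf_struct_ops.o
is built. A sketch of one possible arrangement in bpf.h (just to
illustrate the point, not the actual fix):

#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
int bpf_struct_ops_link_create(union bpf_attr *attr);
#else
static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	return -EOPNOTSUPP;
}
#endif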

> +
>   static inline int bpf_obj_get_user(const char __user *pathname, int flags)
>   {
>   	return -EOPNOTSUPP;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 17afd2b35ee5..cd0ff39981e8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>   	BPF_PERF_EVENT,
>   	BPF_TRACE_KPROBE_MULTI,
>   	BPF_LSM_CGROUP,
> +	BPF_STRUCT_OPS,
>   	__MAX_BPF_ATTACH_TYPE
>   };
>   
> @@ -1266,6 +1267,9 @@ enum {
>   
>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backing bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };
>   
>   /* Flags for BPF_PROG_QUERY. */
> @@ -1507,7 +1511,10 @@ union bpf_attr {
>   	} task_fd_query;
>   
>   	struct { /* struct used by BPF_LINK_CREATE command */
> -		__u32		prog_fd;	/* eBPF program to attach */
> +		union {
> +			__u32		prog_fd;	/* eBPF program to attach */
> +			__u32		map_fd;		/* eBPF struct_ops to attach */

nit. Remove eBPF. "struct_ops to attach"

> +		};
>   		union {
>   			__u32		target_fd;	/* object to attach to */
>   			__u32		target_ifindex; /* target ifindex */
> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>   		struct {
>   			__u32 ifindex;
>   		} xdp;
> +		struct {
> +			__u32 map_id;
> +		} struct_ops;
>   	};
>   } __attribute__((aligned(8)));
>   
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index bba03b6b010b..9ec675576d97 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -14,6 +14,7 @@
>   
>   enum bpf_struct_ops_state {
>   	BPF_STRUCT_OPS_STATE_INIT,
> +	BPF_STRUCT_OPS_STATE_READY,

Please add it to the end. Although it is not in uapi, try not to disrupt
userspace introspection tools when it can be avoided.

>   	BPF_STRUCT_OPS_STATE_INUSE,
>   	BPF_STRUCT_OPS_STATE_TOBEFREE,
>   };
> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		*(unsigned long *)(udata + moff) = prog->aux->id;
>   	}
>   
> -	bpf_map_inc(map);
> -
>   	set_memory_rox((long)st_map->image, 1);
> +	if (st_map->map.map_flags & BPF_F_LINK) {
> +		/* Let bpf_link handle registration & unregistration.
> +		 *
> +		 * Pair with smp_load_acquire() during lookup_elem().
> +		 */
> +		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
> +		goto unlock;
> +	}
> +
>   	err = st_ops->reg(kdata);
>   	if (likely(!err)) {
> +		bpf_map_inc(map);
>   		/* Pair with smp_load_acquire() during lookup_elem().
>   		 * It ensures the above udata updates (e.g. prog->aux->id)
>   		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	 */
>   	set_memory_nx((long)st_map->image, 1);
>   	set_memory_rw((long)st_map->image, 1);
> -	bpf_map_put(map);
>   
>   reset_unlock:
>   	bpf_struct_ops_map_put_progs(st_map);
> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>   	struct bpf_struct_ops_map *st_map;
>   
>   	st_map = (struct bpf_struct_ops_map *)map;
> +	if (st_map->map.map_flags & BPF_F_LINK)
> +		return -EOPNOTSUPP;
> +
>   	prev_state = cmpxchg(&st_map->kvalue.state,
>   			     BPF_STRUCT_OPS_STATE_INUSE,
>   			     BPF_STRUCT_OPS_STATE_TOBEFREE);
>   	switch (prev_state) {
> +	case BPF_STRUCT_OPS_STATE_READY:
> +		return -EOPNOTSUPP;

If this case never happens, it should be removed; the WARN in the default
case at the end handles it better.

>   	case BPF_STRUCT_OPS_STATE_INUSE:
>   		st_map->st_ops->unreg(&st_map->kvalue.data);
>   		bpf_map_put(map);
> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>   {
>   	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> -	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
> +	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
>   		return -EINVAL;
>   	return 0;
>   }
> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
>   
>   	bpf_map_put(&st_map->map);
>   }
> +
> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
> +{
> +	struct bpf_struct_ops_link *st_link;
> +	struct bpf_struct_ops_map *st_map;
> +
> +	st_link = container_of(link, struct bpf_struct_ops_link, link);
> +	if (st_link->map) {

Will map ever be NULL?

> +		st_map = (struct bpf_struct_ops_map *)st_link->map;
> +		st_map->st_ops->unreg(&st_map->kvalue.data);
> +		bpf_map_put(st_link->map);
> +	}
> +	kfree(st_link);
> +}
> +
> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
> +					    struct seq_file *seq)
> +{
> +	struct bpf_struct_ops_link *st_link;
> +	struct bpf_map *map;
> +
> +	st_link = container_of(link, struct bpf_struct_ops_link, link);
> +	rcu_read_lock_trace();

Should it be rcu_read_lock()?

> +	map = rcu_dereference(st_link->map);
> +	if (map)
> +		seq_printf(seq, "map_id:\t%d\n", map->id);
> +	rcu_read_unlock_trace();
> +}
> +
> +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
> +					       struct bpf_link_info *info)
> +{
> +	struct bpf_struct_ops_link *st_link;
> +	struct bpf_map *map;
> +
> +	st_link = container_of(link, struct bpf_struct_ops_link, link);
> +	rcu_read_lock_trace();
> +	map = rcu_dereference(st_link->map);
> +	if (map)
> +		info->struct_ops.map_id = map->id;
> +	rcu_read_unlock_trace();
> +	return 0;
> +}
> +
> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
> +	.dealloc = bpf_struct_ops_map_link_dealloc,
> +	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
> +	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
> +};
> +
> +int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> +	struct bpf_struct_ops_link *link = NULL;
> +	struct bpf_link_primer link_primer;
> +	struct bpf_struct_ops_map *st_map;
> +	struct bpf_map *map;
> +	int err;
> +
> +	map = bpf_map_get(attr->link_create.map_fd);
> +	if (!map)
> +		return -EINVAL;
> +
> +	st_map = (struct bpf_struct_ops_map *)map;
> +
> +	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
> +	    /* Pair with smp_store_release() during map_update */
> +	    smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	link = kzalloc(sizeof(*link), GFP_USER);
> +	if (!link) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
> +	link->map = map;

RCU_INIT_POINTER().
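
That is, presumably:

	RCU_INIT_POINTER(link->map, map);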

> +
> +	err = bpf_link_prime(&link->link, &link_primer);
> +	if (err)
> +		goto err_out;
> +
> +	err = st_map->st_ops->reg(st_map->kvalue.data);
> +	if (err) {
> +		bpf_link_cleanup(&link_primer);
> +		goto err_out;
> +	}
> +
> +	return bpf_link_settle(&link_primer);
> +
> +err_out:
> +	bpf_map_put(map);
> +	kfree(link);
> +	return err;
> +}
> +
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 358a0e40555e..3db4938212d6 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
>   static void bpf_link_free(struct bpf_link *link)
>   {
>   	bpf_link_free_id(link->id);
> +	/* detach BPF program, clean up used resources */
>   	if (link->prog) {
> -		/* detach BPF program, clean up used resources */

This comment move seems unnecessary.

>   		link->ops->release(link);
>   		bpf_prog_put(link->prog);
> +		/* The struct_ops links clean up the map by themselves. */

It also seems unnecessary to spell this out only for the struct_ops link;
each specific link type does its cleanup in ->dealloc.
Kuifeng Lee March 7, 2023, 6:04 p.m. UTC | #4
On 3/6/23 18:11, Martin KaFai Lau wrote:
> On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index cb837f42b99d..b845be719422 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1396,6 +1396,11 @@ struct bpf_link {
>>       struct work_struct work;
>>   };
>> +struct bpf_struct_ops_link {
>> +    struct bpf_link link;
>> +    struct bpf_map __rcu *map;
> 
> __rcu is only needed after the link_update change in patch 5?
> It is fine to keep it in this patch but please leave a comment in the 
> commit message.
> 
> Does 'struct bpf_struct_ops_link' have to be in bpf.h?
> 
>> +};
>> +
>>   struct bpf_link_ops {
>>       void (*release)(struct bpf_link *link);
>>       void (*dealloc)(struct bpf_link *link);
>> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>>   struct file *bpf_link_new_file(struct bpf_link *link, int 
>> *reserved_fd);
>>   struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>>   struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
>> +int bpf_struct_ops_link_create(union bpf_attr *attr);
>>   int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>>   int bpf_obj_get_user(const char __user *pathname, int flags);
>> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link 
>> *link)
>>   {
>>   }
>> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> +    return -EOPNOTSUPP;
>> +}
> 
> Is this currently under '#ifdef CONFIG_BPF_SYSCALL' alone?
> 
> Not sure if it is correct. Please double check.
> 
> ifeq ($(CONFIG_BPF_JIT),y)
> obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
> obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
> obj-${CONFIG_BPF_LSM} += bpf_lsm.o
> endif
> 
> obj-$(CONFIG_BPF_SYSCALL) += syscall.o ...

You are right! Should be under CONFIG_BPF_JIT.

> 
>> +
>>   static inline int bpf_obj_get_user(const char __user *pathname, int 
>> flags)
>>   {
>>       return -EOPNOTSUPP;
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 17afd2b35ee5..cd0ff39981e8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>>       BPF_PERF_EVENT,
>>       BPF_TRACE_KPROBE_MULTI,
>>       BPF_LSM_CGROUP,
>> +    BPF_STRUCT_OPS,
>>       __MAX_BPF_ATTACH_TYPE
>>   };
>> @@ -1266,6 +1267,9 @@ enum {
>>   /* Create a map that is suitable to be an inner map with dynamic max 
>> entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backing
>> bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
>>   /* Flags for BPF_PROG_QUERY. */
>> @@ -1507,7 +1511,10 @@ union bpf_attr {
>>       } task_fd_query;
>>       struct { /* struct used by BPF_LINK_CREATE command */
>> -        __u32        prog_fd;    /* eBPF program to attach */
>> +        union {
>> +            __u32        prog_fd;    /* eBPF program to attach */
>> +            __u32        map_fd;        /* eBPF struct_ops to attach */
> 
> nit. Remove eBPF. "struct_ops to attach"
> 
>> +        };
>>           union {
>>               __u32        target_fd;    /* object to attach to */
>>               __u32        target_ifindex; /* target ifindex */
>> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>>           struct {
>>               __u32 ifindex;
>>           } xdp;
>> +        struct {
>> +            __u32 map_id;
>> +        } struct_ops;
>>       };
>>   } __attribute__((aligned(8)));
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index bba03b6b010b..9ec675576d97 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -14,6 +14,7 @@
>>   enum bpf_struct_ops_state {
>>       BPF_STRUCT_OPS_STATE_INIT,
>> +    BPF_STRUCT_OPS_STATE_READY,
> 
> Please add it to the end. Although it is not in uapi, try not to disrupt
> userspace introspection tools when it can be avoided.
> 

Got it!

>>       BPF_STRUCT_OPS_STATE_INUSE,
>>       BPF_STRUCT_OPS_STATE_TOBEFREE,
>>   };
>> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>           *(unsigned long *)(udata + moff) = prog->aux->id;
>>       }
>> -    bpf_map_inc(map);
>> -
>>       set_memory_rox((long)st_map->image, 1);
>> +    if (st_map->map.map_flags & BPF_F_LINK) {
>> +        /* Let bpf_link handle registration & unregistration.
>> +         *
>> +         * Pair with smp_load_acquire() during lookup_elem().
>> +         */
>> +        smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
>> +        goto unlock;
>> +    }
>> +
>>       err = st_ops->reg(kdata);
>>       if (likely(!err)) {
>> +        bpf_map_inc(map);
>>           /* Pair with smp_load_acquire() during lookup_elem().
>>            * It ensures the above udata updates (e.g. prog->aux->id)
>>            * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
>> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>        */
>>       set_memory_nx((long)st_map->image, 1);
>>       set_memory_rw((long)st_map->image, 1);
>> -    bpf_map_put(map);
>>   reset_unlock:
>>       bpf_struct_ops_map_put_progs(st_map);
>> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct 
>> bpf_map *map, void *key)
>>       struct bpf_struct_ops_map *st_map;
>>       st_map = (struct bpf_struct_ops_map *)map;
>> +    if (st_map->map.map_flags & BPF_F_LINK)
>> +        return -EOPNOTSUPP;
>> +
>>       prev_state = cmpxchg(&st_map->kvalue.state,
>>                    BPF_STRUCT_OPS_STATE_INUSE,
>>                    BPF_STRUCT_OPS_STATE_TOBEFREE);
>>       switch (prev_state) {
>> +    case BPF_STRUCT_OPS_STATE_READY:
>> +        return -EOPNOTSUPP;
> 
> If this case never happens, it should be removed; the WARN in the
> default case at the end handles it better.

No, it never happens.  I will remove it.

> 
>>       case BPF_STRUCT_OPS_STATE_INUSE:
>>           st_map->st_ops->unreg(&st_map->kvalue.data);
>>           bpf_map_put(map);
>> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct 
>> bpf_map *map)
>>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>>   {
>>       if (attr->key_size != sizeof(unsigned int) || attr->max_entries 
>> != 1 ||
>> -        attr->map_flags || !attr->btf_vmlinux_value_type_id)
>> +        (attr->map_flags & ~BPF_F_LINK) || 
>> !attr->btf_vmlinux_value_type_id)
>>           return -EINVAL;
>>       return 0;
>>   }
>> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
>>       bpf_map_put(&st_map->map);
>>   }
>> +
>> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
>> +{
>> +    struct bpf_struct_ops_link *st_link;
>> +    struct bpf_struct_ops_map *st_map;
>> +
>> +    st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +    if (st_link->map) {
> 
> Will map ever be NULL?

No, it is never NULL after removing the detach feature.

> 
>> +        st_map = (struct bpf_struct_ops_map *)st_link->map;
>> +        st_map->st_ops->unreg(&st_map->kvalue.data);
>> +        bpf_map_put(st_link->map);
>> +    }
>> +    kfree(st_link);
>> +}
>> +
>> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link 
>> *link,
>> +                        struct seq_file *seq)
>> +{
>> +    struct bpf_struct_ops_link *st_link;
>> +    struct bpf_map *map;
>> +
>> +    st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +    rcu_read_lock_trace();
> 
> Should it be rcu_read_lock()?

Got it!

> 
>> +    map = rcu_dereference(st_link->map);
>> +    if (map)
>> +        seq_printf(seq, "map_id:\t%d\n", map->id);
>> +    rcu_read_unlock_trace();
>> +}
>> +
>> +static int bpf_struct_ops_map_link_fill_link_info(const struct 
>> bpf_link *link,
>> +                           struct bpf_link_info *info)
>> +{
>> +    struct bpf_struct_ops_link *st_link;
>> +    struct bpf_map *map;
>> +
>> +    st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +    rcu_read_lock_trace();
>> +    map = rcu_dereference(st_link->map);
>> +    if (map)
>> +        info->struct_ops.map_id = map->id;
>> +    rcu_read_unlock_trace();
>> +    return 0;
>> +}
>> +
>> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>> +    .dealloc = bpf_struct_ops_map_link_dealloc,
>> +    .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
>> +    .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
>> +};
>> +
>> +int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> +    struct bpf_struct_ops_link *link = NULL;
>> +    struct bpf_link_primer link_primer;
>> +    struct bpf_struct_ops_map *st_map;
>> +    struct bpf_map *map;
>> +    int err;
>> +
>> +    map = bpf_map_get(attr->link_create.map_fd);
>> +    if (!map)
>> +        return -EINVAL;
>> +
>> +    st_map = (struct bpf_struct_ops_map *)map;
>> +
>> +    if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags 
>> & BPF_F_LINK) ||
>> +        /* Pair with smp_store_release() during map_update */
>> +        smp_load_acquire(&st_map->kvalue.state) != 
>> BPF_STRUCT_OPS_STATE_READY) {
>> +        err = -EINVAL;
>> +        goto err_out;
>> +    }
>> +
>> +    link = kzalloc(sizeof(*link), GFP_USER);
>> +    if (!link) {
>> +        err = -ENOMEM;
>> +        goto err_out;
>> +    }
>> +    bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, 
>> &bpf_struct_ops_map_lops, NULL);
>> +    link->map = map;
> 
> RCU_INIT_POINTER().

Got it!

> 
>> +
>> +    err = bpf_link_prime(&link->link, &link_primer);
>> +    if (err)
>> +        goto err_out;
>> +
>> +    err = st_map->st_ops->reg(st_map->kvalue.data);
>> +    if (err) {
>> +        bpf_link_cleanup(&link_primer);
>> +        goto err_out;
>> +    }
>> +
>> +    return bpf_link_settle(&link_primer);
>> +
>> +err_out:
>> +    bpf_map_put(map);
>> +    kfree(link);
>> +    return err;
>> +}
>> +
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 358a0e40555e..3db4938212d6 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
>>   static void bpf_link_free(struct bpf_link *link)
>>   {
>>       bpf_link_free_id(link->id);
>> +    /* detach BPF program, clean up used resources */
>>       if (link->prog) {
>> -        /* detach BPF program, clean up used resources */
> 
> This comment move seems unnecessary.
> 
>>           link->ops->release(link);
>>           bpf_prog_put(link->prog);
>> +        /* The struct_ops links clean up the map by themselves. */
> 
> It also seems unnecessary to spell this out only for the struct_ops link;
> each specific link type does its cleanup in ->dealloc.
> 

Got it!

>

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cb837f42b99d..b845be719422 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1396,6 +1396,11 @@  struct bpf_link {
 	struct work_struct work;
 };
 
+struct bpf_struct_ops_link {
+	struct bpf_link link;
+	struct bpf_map __rcu *map;
+};
+
 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
 	void (*dealloc)(struct bpf_link *link);
@@ -1964,6 +1969,7 @@  int bpf_link_new_fd(struct bpf_link *link);
 struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
+int bpf_struct_ops_link_create(union bpf_attr *attr);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
@@ -2308,6 +2314,11 @@  static inline void bpf_link_put(struct bpf_link *link)
 {
 }
 
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	return -EOPNOTSUPP;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 17afd2b35ee5..cd0ff39981e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@  enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@  enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backing bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@  union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* eBPF struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6354,6 +6361,9 @@  struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index bba03b6b010b..9ec675576d97 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -14,6 +14,7 @@ 
 
 enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
+	BPF_STRUCT_OPS_STATE_READY,
 	BPF_STRUCT_OPS_STATE_INUSE,
 	BPF_STRUCT_OPS_STATE_TOBEFREE,
 };
@@ -494,11 +495,19 @@  static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	bpf_map_inc(map);
-
 	set_memory_rox((long)st_map->image, 1);
+	if (st_map->map.map_flags & BPF_F_LINK) {
+		/* Let bpf_link handle registration & unregistration.
+		 *
+		 * Pair with smp_load_acquire() during lookup_elem().
+		 */
+		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+		goto unlock;
+	}
+
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
+		bpf_map_inc(map);
 		/* Pair with smp_load_acquire() during lookup_elem().
 		 * It ensures the above udata updates (e.g. prog->aux->id)
 		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -514,7 +523,6 @@  static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 */
 	set_memory_nx((long)st_map->image, 1);
 	set_memory_rw((long)st_map->image, 1);
-	bpf_map_put(map);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -532,10 +540,15 @@  static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	struct bpf_struct_ops_map *st_map;
 
 	st_map = (struct bpf_struct_ops_map *)map;
+	if (st_map->map.map_flags & BPF_F_LINK)
+		return -EOPNOTSUPP;
+
 	prev_state = cmpxchg(&st_map->kvalue.state,
 			     BPF_STRUCT_OPS_STATE_INUSE,
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
 	switch (prev_state) {
+	case BPF_STRUCT_OPS_STATE_READY:
+		return -EOPNOTSUPP;
 	case BPF_STRUCT_OPS_STATE_INUSE:
 		st_map->st_ops->unreg(&st_map->kvalue.data);
 		bpf_map_put(map);
@@ -618,7 +631,7 @@  static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
+	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
 		return -EINVAL;
 	return 0;
 }
@@ -714,3 +727,100 @@  void bpf_struct_ops_put(const void *kdata)
 
 	bpf_map_put(&st_map->map);
 }
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_struct_ops_map *st_map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	if (st_link->map) {
+		st_map = (struct bpf_struct_ops_map *)st_link->map;
+		st_map->st_ops->unreg(&st_map->kvalue.data);
+		bpf_map_put(st_link->map);
+	}
+	kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+					    struct seq_file *seq)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock_trace();
+	map = rcu_dereference(st_link->map);
+	if (map)
+		seq_printf(seq, "map_id:\t%d\n", map->id);
+	rcu_read_unlock_trace();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+					       struct bpf_link_info *info)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock_trace();
+	map = rcu_dereference(st_link->map);
+	if (map)
+		info->struct_ops.map_id = map->id;
+	rcu_read_unlock_trace();
+	return 0;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+	.dealloc = bpf_struct_ops_map_link_dealloc,
+	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	struct bpf_struct_ops_link *link = NULL;
+	struct bpf_link_primer link_primer;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
+	int err;
+
+	map = bpf_map_get(attr->link_create.map_fd);
+	if (!map)
+		return -EINVAL;
+
+	st_map = (struct bpf_struct_ops_map *)map;
+
+	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
+	    /* Pair with smp_store_release() during map_update */
+	    smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+	link->map = map;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto err_out;
+
+	err = st_map->st_ops->reg(st_map->kvalue.data);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		goto err_out;
+	}
+
+	return bpf_link_settle(&link_primer);
+
+err_out:
+	bpf_map_put(map);
+	kfree(link);
+	return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 358a0e40555e..3db4938212d6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2743,10 +2743,11 @@  void bpf_link_inc(struct bpf_link *link)
 static void bpf_link_free(struct bpf_link *link)
 {
 	bpf_link_free_id(link->id);
+	/* detach BPF program, clean up used resources */
 	if (link->prog) {
-		/* detach BPF program, clean up used resources */
 		link->ops->release(link);
 		bpf_prog_put(link->prog);
+		/* The struct_ops links clean up the map by themselves. */
 	}
 	/* free bpf_link and its containing memory */
 	link->ops->dealloc(link);
@@ -2802,16 +2803,19 @@  static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 	const struct bpf_prog *prog = link->prog;
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
 
-	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
 	seq_printf(m,
 		   "link_type:\t%s\n"
-		   "link_id:\t%u\n"
-		   "prog_tag:\t%s\n"
-		   "prog_id:\t%u\n",
+		   "link_id:\t%u\n",
 		   bpf_link_type_strs[link->type],
-		   link->id,
-		   prog_tag,
-		   prog->aux->id);
+		   link->id);
+	if (prog) {
+		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+		seq_printf(m,
+			   "prog_tag:\t%s\n"
+			   "prog_id:\t%u\n",
+			   prog_tag,
+			   prog->aux->id);
+	}
 	if (link->ops->show_fdinfo)
 		link->ops->show_fdinfo(link, m);
 }
@@ -4286,7 +4290,8 @@  static int bpf_link_get_info_by_fd(struct file *file,
 
 	info.type = link->type;
 	info.id = link->id;
-	info.prog_id = link->prog->aux->id;
+	if (link->prog)
+		info.prog_id = link->prog->aux->id;
 
 	if (link->ops->fill_link_info) {
 		err = link->ops->fill_link_info(link, &info);
@@ -4549,6 +4554,9 @@  static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	if (CHECK_ATTR(BPF_LINK_CREATE))
 		return -EINVAL;
 
+	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+		return bpf_struct_ops_link_create(attr);
+
 	prog = bpf_prog_get(attr->link_create.prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 17afd2b35ee5..cd0ff39981e8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@  enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@  enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backing bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@  union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* eBPF struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6354,6 +6361,9 @@  struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));