[bpf-next,3/7] bpf: Register and unregister a struct_ops by their bpf_links.

Message ID 20230214221718.503964-4-kuifeng@meta.com (mailing list archive)
State Superseded
Delegated to: BPF
Series Transit between BPF TCP congestion controls.

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1749 this patch: 1749
netdev/cc_maintainers warning 8 maintainers not CCed: john.fastabend@gmail.com daniel@iogearbox.net sdf@google.com jolsa@kernel.org netdev@vger.kernel.org haoluo@google.com yhs@fb.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 164 this patch: 164
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1746 this patch: 1746
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc

Commit Message

Kui-Feng Lee Feb. 14, 2023, 10:17 p.m. UTC
Registration via bpf_links ensures uniform behavior, just like other
BPF programs.  Previously, BPF struct_ops were registered/unregistered
when updating/deleting their values.  Only struct_ops maps created with
the BPF_F_LINK flag are allowed to back a bpf_link.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
 include/uapi/linux/bpf.h       |  3 ++
 kernel/bpf/bpf_struct_ops.c    | 59 +++++++++++++++++++++++++++++++---
 tools/include/uapi/linux/bpf.h |  3 ++
 3 files changed, 61 insertions(+), 4 deletions(-)
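
For illustration, the user-space flow this series enables might look
roughly like the sketch below.  This is not part of the patch: it uses
raw bpf(2) syscall wrappers, assumes the caller has already resolved
btf_vmlinux_value_type_id and prepared the value buffer, and omits
error handling.  Note that this revision reuses the
attr->link_create.prog_fd field to carry the map fd.

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int attach_struct_ops(__u32 vtype_id, const void *value,
				     __u32 value_size)
	{
		union bpf_attr attr;
		int zero = 0, map_fd, link_fd;

		/* 1. Create the struct_ops map with BPF_F_LINK. */
		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_STRUCT_OPS;
		attr.key_size = sizeof(unsigned int);
		attr.value_size = value_size;
		attr.max_entries = 1;
		attr.map_flags = BPF_F_LINK;
		attr.btf_vmlinux_value_type_id = vtype_id;
		map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

		/* 2. Write the single element.  With BPF_F_LINK this only
		 * flips kvalue->state; ->reg() is deferred to step 3.
		 */
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (__u64)(unsigned long)&zero;
		attr.value = (__u64)(unsigned long)value;
		syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));

		/* 3. Back the map with a bpf_link; this is where the kernel
		 * calls st_ops->reg().  Releasing the last link reference
		 * unregisters the struct_ops.
		 */
		memset(&attr, 0, sizeof(attr));
		attr.link_create.prog_fd = map_fd;	/* map fd, per this patch */
		link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
		return link_fd;
	}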

Comments

Stanislav Fomichev Feb. 15, 2023, 2:53 a.m. UTC | #1
On 02/14, Kui-Feng Lee wrote:
> Registration via bpf_links ensures uniform behavior, just like other
> BPF programs.  Previously, BPF struct_ops were registered/unregistered
> when updating/deleting their values.  Only struct_ops maps created with
> the BPF_F_LINK flag are allowed to back a bpf_link.

> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
>   include/uapi/linux/bpf.h       |  3 ++
>   kernel/bpf/bpf_struct_ops.c    | 59 +++++++++++++++++++++++++++++++---
>   tools/include/uapi/linux/bpf.h |  3 ++
>   3 files changed, 61 insertions(+), 4 deletions(-)

> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 1e6cdd0f355d..48d8b3058aa1 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1267,6 +1267,9 @@ enum {

>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backed bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };

>   /* Flags for BPF_PROG_QUERY. */
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 621c8e24481a..d16ca06cf09a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,

>   	mutex_lock(&st_map->lock);

> -	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
> +	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) {
>   		err = -EBUSY;
>   		goto unlock;
>   	}
> @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		*(unsigned long *)(udata + moff) = prog->aux->id;
>   	}

> +	if (st_map->map.map_flags & BPF_F_LINK) {
> +		/* Let bpf_link handle registration & unregistration. */
> +		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
> +		goto unlock;
> +	}
> +
>   	refcount_set(&kvalue->refcnt, 1);
>   	bpf_map_inc(map);


[..]

> @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	kfree(tlinks);
>   	mutex_unlock(&st_map->lock);
>   	return err;
> +
>   }

>   static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)

Seems like a leftover hunk?

> @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>   			     BPF_STRUCT_OPS_STATE_TOBEFREE);
>   	switch (prev_state) {
>   	case BPF_STRUCT_OPS_STATE_INUSE:
> +		if (st_map->map.map_flags & BPF_F_LINK)
> +			return 0;
>   		st_map->st_ops->unreg(&st_map->kvalue.data);
>   		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
>   			bpf_map_put(map);
> @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>   {
>   	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> -	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
> +	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
>   		return -EINVAL;
>   	return 0;
>   }
> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   	set_vm_flush_reset_perms(st_map->image);
>   	bpf_map_init_from_attr(map, attr);


[..]

> +	map->map_flags |= attr->map_flags & BPF_F_LINK;

You seem to have the following check above:

if (.... (attr->map_flags & ~BPF_F_LINK) ...) return -EINVAL;

And here you do:

map->map_flags |= attr->map_flags & BPF_F_LINK;

So maybe we can simplify to:
map->map_flags |= attr->map_flags;

?

> +
>   	return map;
>   }

> @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata)
>   	}
>   }

> +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue)
> +{
> +	struct bpf_struct_ops_map *st_map;
> +
> +	if (refcount_dec_and_test(&kvalue->refcnt)) {
> +		st_map = container_of(kvalue, struct bpf_struct_ops_map,
> +				      kvalue);
> +		bpf_map_put(&st_map->map);
> +	}
> +}
> +
>   static void bpf_struct_ops_map_link_release(struct bpf_link *link)
>   {
> +	struct bpf_struct_ops_map *st_map;
> +
>   	if (link->map) {
> -		bpf_map_put(link->map);
> +		st_map = (struct bpf_struct_ops_map *)link->map;
> +		st_map->st_ops->unreg(&st_map->kvalue.data);
> +		bpf_struct_ops_kvalue_put(&st_map->kvalue);
>   		link->map = NULL;
>   	}
>   }
> @@ -735,13 +761,15 @@ static const struct bpf_link_ops bpf_struct_ops_map_lops = {

>   int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>   {
> +	struct bpf_struct_ops_map *st_map;
>   	struct bpf_link_primer link_primer;
> +	struct bpf_struct_ops_value *kvalue;
>   	struct bpf_map *map;
>   	struct bpf_link *link = NULL;
>   	int err;

>   	map = bpf_map_get(attr->link_create.prog_fd);
> -	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS)
> +	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
>   		return -EINVAL;

>   	link = kzalloc(sizeof(*link), GFP_USER);
> @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>   	bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
>   	link->map = map;


[..]

> +	if (map->map_flags & BPF_F_LINK) {

We seem to bail out above when we don't have the BPF_F_LINK flag?

if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
	return -EINVAL;

So why check this 'if (map->map_flags & BPF_F_LINK)' condition here?


> +		st_map = (struct bpf_struct_ops_map *)map;
> +		kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
> +
> +		if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE ||
> +		    refcount_read(&kvalue->refcnt) != 0) {
> +			err = -EINVAL;
> +			goto err_out;
> +		}
> +
> +		refcount_set(&kvalue->refcnt, 1);
> +
> +		set_memory_rox((long)st_map->image, 1);
> +		err = st_map->st_ops->reg(kvalue->data);
> +		if (err) {
> +			refcount_set(&kvalue->refcnt, 0);
> +
> +			set_memory_nx((long)st_map->image, 1);
> +			set_memory_rw((long)st_map->image, 1);
> +			goto err_out;
> +		}
> +	}
> +
>   	err = bpf_link_prime(link, &link_primer);
>   	if (err)
>   		goto err_out;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 1e6cdd0f355d..48d8b3058aa1 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1267,6 +1267,9 @@ enum {

>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backed bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };

>   /* Flags for BPF_PROG_QUERY. */
> --
> 2.30.2
Kui-Feng Lee Feb. 15, 2023, 6:29 p.m. UTC | #2
On 2/14/23 18:53, Stanislav Fomichev wrote:
> On 02/14, Kui-Feng Lee wrote:
>> Registration via bpf_links ensures uniform behavior, just like other
>> BPF programs.  Previously, BPF struct_ops were registered/unregistered
>> when updating/deleting their values.  Only struct_ops maps created with
>> the BPF_F_LINK flag are allowed to back a bpf_link.
>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>>   include/uapi/linux/bpf.h       |  3 ++
>>   kernel/bpf/bpf_struct_ops.c    | 59 +++++++++++++++++++++++++++++++---
>>   tools/include/uapi/linux/bpf.h |  3 ++
>>   3 files changed, 61 insertions(+), 4 deletions(-)
>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 1e6cdd0f355d..48d8b3058aa1 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1267,6 +1267,9 @@ enum {
>
>>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backed bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
>
>>   /* Flags for BPF_PROG_QUERY. */
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index 621c8e24481a..d16ca06cf09a 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>
>>       mutex_lock(&st_map->lock);
>
>> -    if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
>> +    if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) {
>>           err = -EBUSY;
>>           goto unlock;
>>       }
>> @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>           *(unsigned long *)(udata + moff) = prog->aux->id;
>>       }
>
>> +    if (st_map->map.map_flags & BPF_F_LINK) {
>> +        /* Let bpf_link handle registration & unregistration. */
>> +        smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
>> +        goto unlock;
>> +    }
>> +
>>       refcount_set(&kvalue->refcnt, 1);
>>       bpf_map_inc(map);
>
>
> [..]
>
>> @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>       kfree(tlinks);
>>       mutex_unlock(&st_map->lock);
>>       return err;
>> +
>>   }
>
>>   static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>
> Seems like a leftover hunk?


You are right.  I will remove it.


>
>> @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>>                    BPF_STRUCT_OPS_STATE_TOBEFREE);
>>       switch (prev_state) {
>>       case BPF_STRUCT_OPS_STATE_INUSE:
>> +        if (st_map->map.map_flags & BPF_F_LINK)
>> +            return 0;
>>           st_map->st_ops->unreg(&st_map->kvalue.data);
>>           if (refcount_dec_and_test(&st_map->kvalue.refcnt))
>>               bpf_map_put(map);
>> @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>>   {
>>       if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
>> -        attr->map_flags || !attr->btf_vmlinux_value_type_id)
>> +        (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
>>           return -EINVAL;
>>       return 0;
>>   }
>> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>       set_vm_flush_reset_perms(st_map->image);
>>       bpf_map_init_from_attr(map, attr);
>
>
> [..]
>
>> +    map->map_flags |= attr->map_flags & BPF_F_LINK;
>
> You seem to have the following check above:
>
> if (.... (attr->map_flags & ~BPF_F_LINK) ...) return -EINVAL;
>
> And here you do:
>
> map->map_flags |= attr->map_flags & BPF_F_LINK;
>
> So maybe we can simplify to:
> map->map_flags |= attr->map_flags;
>
> ?

Great catch!


>
>> +
>>       return map;
>>   }
>
>> @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata)
>>       }
>>   }
>
>> +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue)
>> +{
>> +    struct bpf_struct_ops_map *st_map;
>> +
>> +    if (refcount_dec_and_test(&kvalue->refcnt)) {
>> +        st_map = container_of(kvalue, struct bpf_struct_ops_map,
>> +                      kvalue);
>> +        bpf_map_put(&st_map->map);
>> +    }
>> +}
>> +
>>   static void bpf_struct_ops_map_link_release(struct bpf_link *link)
>>   {
>> +    struct bpf_struct_ops_map *st_map;
>> +
>>       if (link->map) {
>> -        bpf_map_put(link->map);
>> +        st_map = (struct bpf_struct_ops_map *)link->map;
>> +        st_map->st_ops->unreg(&st_map->kvalue.data);
>> +        bpf_struct_ops_kvalue_put(&st_map->kvalue);
>>           link->map = NULL;
>>       }
>>   }
>> @@ -735,13 +761,15 @@ static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>
>>   int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>>   {
>> +    struct bpf_struct_ops_map *st_map;
>>       struct bpf_link_primer link_primer;
>> +    struct bpf_struct_ops_value *kvalue;
>>       struct bpf_map *map;
>>       struct bpf_link *link = NULL;
>>       int err;
>
>>       map = bpf_map_get(attr->link_create.prog_fd);
>> -    if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS)
>> +    if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
>>           return -EINVAL;
>
>>       link = kzalloc(sizeof(*link), GFP_USER);
>> @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>>       bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
>>       link->map = map;
>
>
> [..]
>
>> +    if (map->map_flags & BPF_F_LINK) {
>
> We seem to bail out above when we don't have the BPF_F_LINK flag?
>
> if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
>     return -EINVAL;
>
> So why check this 'if (map->map_flags & BPF_F_LINK)' condition here?


You are right! This check is not necessary anymore.


>
>
>> +        st_map = (struct bpf_struct_ops_map *)map;
>> +        kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
>> +
>> +        if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE ||
>> +            refcount_read(&kvalue->refcnt) != 0) {
>> +            err = -EINVAL;
>> +            goto err_out;
>> +        }
>> +
>> +        refcount_set(&kvalue->refcnt, 1);
>> +
>> +        set_memory_rox((long)st_map->image, 1);
>> +        err = st_map->st_ops->reg(kvalue->data);
>> +        if (err) {
>> +            refcount_set(&kvalue->refcnt, 0);
>> +
>> +            set_memory_nx((long)st_map->image, 1);
>> +            set_memory_rw((long)st_map->image, 1);
>> +            goto err_out;
>> +        }
>> +    }
>> +
>>       err = bpf_link_prime(link, &link_primer);
>>       if (err)
>>           goto err_out;
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 1e6cdd0f355d..48d8b3058aa1 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1267,6 +1267,9 @@ enum {
>
>>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backed bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
>
>>   /* Flags for BPF_PROG_QUERY. */
>> -- 
>> 2.30.2
>
Martin KaFai Lau Feb. 16, 2023, 12:37 a.m. UTC | #3
On 2/14/23 2:17 PM, Kui-Feng Lee wrote:
> Registration via bpf_links ensures uniform behavior, just like other
> BPF programs.  Previously, BPF struct_ops were registered/unregistered
> when updating/deleting their values.  Only struct_ops maps created with
> the BPF_F_LINK flag are allowed to back a bpf_link.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
>   include/uapi/linux/bpf.h       |  3 ++
>   kernel/bpf/bpf_struct_ops.c    | 59 +++++++++++++++++++++++++++++++---
>   tools/include/uapi/linux/bpf.h |  3 ++
>   3 files changed, 61 insertions(+), 4 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 1e6cdd0f355d..48d8b3058aa1 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1267,6 +1267,9 @@ enum {
>   
>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backed bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };
>   
>   /* Flags for BPF_PROG_QUERY. */
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 621c8e24481a..d16ca06cf09a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   
>   	mutex_lock(&st_map->lock);
>   
> -	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
> +	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) {

Why does it need a new refcount_read(&kvalue->refcnt) check?

>   		err = -EBUSY;
>   		goto unlock;
>   	}
> @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		*(unsigned long *)(udata + moff) = prog->aux->id;
>   	}
>   
> +	if (st_map->map.map_flags & BPF_F_LINK) {
> +		/* Let bpf_link handle registration & unregistration. */
> +		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);

INUSE is for registered struct_ops. It needs a new UNREG state to mean 
initialized but not registered. The kvalue->state is not in uapi but the user 
space can still introspect it (thanks to BTF), so having a correct semantic 
state is useful. Try 'bpftool struct_ops dump ...':

     "bpf_struct_ops_tcp_congestion_ops": {
         "refcnt": {
             "refs": {
                 "counter": 1
             }
         },
         "state": "BPF_STRUCT_OPS_STATE_INUSE",

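A sketch of what the extended enum could look like (the name
BPF_STRUCT_OPS_STATE_READY below is only a placeholder for whatever the
next revision picks; the first three values are the existing ones):

	enum bpf_struct_ops_state {
		BPF_STRUCT_OPS_STATE_INIT,	/* no value written yet */
		BPF_STRUCT_OPS_STATE_INUSE,	/* registered with the subsystem */
		BPF_STRUCT_OPS_STATE_TOBEFREE,
		/* New: value written but not registered.  A BPF_F_LINK map
		 * would sit here until a LINK_CREATE calls ->reg(), and
		 * could return here once its link is released.
		 */
		BPF_STRUCT_OPS_STATE_READY,
	};
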
> +		goto unlock;
> +	}
> +
>   	refcount_set(&kvalue->refcnt, 1);
>   	bpf_map_inc(map);
>   
> @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	kfree(tlinks);
>   	mutex_unlock(&st_map->lock);
>   	return err;
> +

Unnecessary newline.

>   }
>   
>   static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
> @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>   			     BPF_STRUCT_OPS_STATE_TOBEFREE);
>   	switch (prev_state) {
>   	case BPF_STRUCT_OPS_STATE_INUSE:
> +		if (st_map->map.map_flags & BPF_F_LINK)
> +			return 0;

This should be a -ENOTSUPP.

>   		st_map->st_ops->unreg(&st_map->kvalue.data);
>   		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
>   			bpf_map_put(map);
> @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>   {
>   	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> -	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
> +	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
>   		return -EINVAL;
>   	return 0;
>   }
> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   	set_vm_flush_reset_perms(st_map->image);
>   	bpf_map_init_from_attr(map, attr);
>   
> +	map->map_flags |= attr->map_flags & BPF_F_LINK;

This should have already been done in bpf_map_init_from_attr().

> +
>   	return map;
>   }
>   
> @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata)
>   	}
>   }
>   
> +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue)
> +{
> +	struct bpf_struct_ops_map *st_map;
> +
> +	if (refcount_dec_and_test(&kvalue->refcnt)) {
> +		st_map = container_of(kvalue, struct bpf_struct_ops_map,
> +				      kvalue);
> +		bpf_map_put(&st_map->map);
> +	}
> +}
> +
>   static void bpf_struct_ops_map_link_release(struct bpf_link *link)
>   {
> +	struct bpf_struct_ops_map *st_map;
> +
>   	if (link->map) {
> -		bpf_map_put(link->map);
> +		st_map = (struct bpf_struct_ops_map *)link->map;
> +		st_map->st_ops->unreg(&st_map->kvalue.data);
> +		bpf_struct_ops_kvalue_put(&st_map->kvalue);
>   		link->map = NULL;

Does it need a lock or something to protect the link_release? Or am I
missing something and a lock is not needed?

The kvalue->state should become UNREG.

After UNREG, can the struct_ops map be used in creating a new link again?

>   	}
>   }
> @@ -735,13 +761,15 @@ static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>   
>   int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>   {
> +	struct bpf_struct_ops_map *st_map;
>   	struct bpf_link_primer link_primer;
> +	struct bpf_struct_ops_value *kvalue;
>   	struct bpf_map *map;
>   	struct bpf_link *link = NULL;
>   	int err;
>   
>   	map = bpf_map_get(attr->link_create.prog_fd);
> -	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS)
> +	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
>   		return -EINVAL;
>   
>   	link = kzalloc(sizeof(*link), GFP_USER);
> @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>   	bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
>   	link->map = map;
>   
> +	if (map->map_flags & BPF_F_LINK) {
> +		st_map = (struct bpf_struct_ops_map *)map;
> +		kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
> +
> +		if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE ||
> +		    refcount_read(&kvalue->refcnt) != 0) {

The refcount_read(&kvalue->refcnt) is to ensure it is not registered?
It seems the UNREG state is useful here.

> +			err = -EINVAL;
> +			goto err_out;
> +		}
> +
> +		refcount_set(&kvalue->refcnt, 1);

If a struct_ops map is used to create multiple links in parallel, is it safe?
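
One way to make the transition atomic would be to mirror the existing
cmpxchg() in bpf_struct_ops_map_delete_elem(), e.g. (a sketch only,
reusing the placeholder READY state suggested above):

	/* Only one racing LINK_CREATE can win the READY -> INUSE
	 * transition; the losers observe the old state and bail out.
	 */
	prev_state = cmpxchg(&kvalue->state,
			     BPF_STRUCT_OPS_STATE_READY,
			     BPF_STRUCT_OPS_STATE_INUSE);
	if (prev_state != BPF_STRUCT_OPS_STATE_READY) {
		err = -EBUSY;
		goto err_out;
	}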

> +
> +		set_memory_rox((long)st_map->image, 1);
> +		err = st_map->st_ops->reg(kvalue->data);

After successful reg, the state can be changed from UNREG to INUSE.

> +		if (err) {
> +			refcount_set(&kvalue->refcnt, 0);
> +
> +			set_memory_nx((long)st_map->image, 1);
> +			set_memory_rw((long)st_map->image, 1);
> +			goto err_out;
> +		}
> +	}

This patch should be combined with patch 1. Otherwise, patch 1 is quite hard to 
understand without link_create_struct_ops_map() doing the actual "attach".

> +
>   	err = bpf_link_prime(link, &link_primer);
>   	if (err)
>   		goto err_out;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 1e6cdd0f355d..48d8b3058aa1 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1267,6 +1267,9 @@ enum {
>   
>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>   	BPF_F_INNER_MAP		= (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backed bpf_link */
> +	BPF_F_LINK		= (1U << 13),
>   };
>   
>   /* Flags for BPF_PROG_QUERY. */
Kui-Feng Lee Feb. 16, 2023, 4:42 p.m. UTC | #4
On 2/15/23 16:37, Martin KaFai Lau wrote:
> On 2/14/23 2:17 PM, Kui-Feng Lee wrote:
>> Registration via bpf_links ensures uniform behavior, just like other
>> BPF programs.  Previously, BPF struct_ops were registered/unregistered
>> when updating/deleting their values.  Only struct_ops maps created with
>> the BPF_F_LINK flag are allowed to back a bpf_link.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>>   include/uapi/linux/bpf.h       |  3 ++
>>   kernel/bpf/bpf_struct_ops.c    | 59 +++++++++++++++++++++++++++++++---
>>   tools/include/uapi/linux/bpf.h |  3 ++
>>   3 files changed, 61 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 1e6cdd0f355d..48d8b3058aa1 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1267,6 +1267,9 @@ enum {
>>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backed bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
>>   /* Flags for BPF_PROG_QUERY. */
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index 621c8e24481a..d16ca06cf09a 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -390,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>       mutex_lock(&st_map->lock);
>> -    if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
>> +    if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) {
> 
> Why does it need a new refcount_read(&kvalue->refcnt) check?

It prohibits updating the value once it is registered.
This refcnt is set to 1 when the struct_ops is registered.

But, yes, it is confusing since we never reset it back to *_INIT.
The purpose of this refcount_read() will be clear once we add *_UNREG
and reset it back to *_INIT properly.
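
To spell out the v1 lifecycle of a BPF_F_LINK map as implemented here
(a summary sketch, not code from the patch):

	/*
	 *   update_elem():  state = INUSE, refcnt stays 0
	 *   LINK_CREATE:    refcnt = 1, st_ops->reg()
	 *   link release:   st_ops->unreg(), refcnt drops to 0,
	 *                   bpf_map_put()
	 *
	 * So refcount_read(&kvalue->refcnt) != 0 here means
	 * "currently registered".
	 */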


> 
>>           err = -EBUSY;
>>           goto unlock;
>>       }
>> @@ -491,6 +491,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>           *(unsigned long *)(udata + moff) = prog->aux->id;
>>       }
>> +    if (st_map->map.map_flags & BPF_F_LINK) {
>> +        /* Let bpf_link handle registration & unregistration. */
>> +        smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
> 
> INUSE is for registered struct_ops. It needs a new UNREG state to mean 
> initialized but not registered. The kvalue->state is not in uapi but the 
> user space can still introspect it (thanks to BTF), so having a correct 
> semantic state is useful. Try 'bpftool struct_ops dump ...':
> 
>      "bpf_struct_ops_tcp_congestion_ops": {
>          "refcnt": {
>              "refs": {
>                  "counter": 1
>              }
>          },
>          "state": "BPF_STRUCT_OPS_STATE_INUSE",

Ok! That makes sense.

> 
>> +        goto unlock;
>> +    }
>> +
>>       refcount_set(&kvalue->refcnt, 1);
>>       bpf_map_inc(map);
>> @@ -522,6 +528,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>       kfree(tlinks);
>>       mutex_unlock(&st_map->lock);
>>       return err;
>> +
> 
> Unnecessary newline.
> 
>>   }
>>   static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>> @@ -535,6 +542,8 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
>>                    BPF_STRUCT_OPS_STATE_TOBEFREE);
>>       switch (prev_state) {
>>       case BPF_STRUCT_OPS_STATE_INUSE:
>> +        if (st_map->map.map_flags & BPF_F_LINK)
>> +            return 0;
> 
> This should be a -ENOTSUPP.
Sure!

> 
>>           st_map->st_ops->unreg(&st_map->kvalue.data);
>>           if (refcount_dec_and_test(&st_map->kvalue.refcnt))
>>               bpf_map_put(map);
>> @@ -585,7 +594,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>>   static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>>   {
>>       if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
>> -        attr->map_flags || !attr->btf_vmlinux_value_type_id)
>> +        (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
>>           return -EINVAL;
>>       return 0;
>>   }
>> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>       set_vm_flush_reset_perms(st_map->image);
>>       bpf_map_init_from_attr(map, attr);
>> +    map->map_flags |= attr->map_flags & BPF_F_LINK;
> 
> This should have already been done in bpf_map_init_from_attr().

bpf_map_init_from_attr() will filter out all flags except BPF_F_RDONLY &
BPF_F_WRONLY.  But I can move it into bpf_map_init_from_attr() by not
filtering it out.

> 
>> +
>>       return map;
>>   }
>> @@ -699,10 +710,25 @@ void bpf_struct_ops_put(const void *kdata)
>>       }
>>   }
>> +static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue)
>> +{
>> +    struct bpf_struct_ops_map *st_map;
>> +
>> +    if (refcount_dec_and_test(&kvalue->refcnt)) {
>> +        st_map = container_of(kvalue, struct bpf_struct_ops_map,
>> +                      kvalue);
>> +        bpf_map_put(&st_map->map);
>> +    }
>> +}
>> +
>>   static void bpf_struct_ops_map_link_release(struct bpf_link *link)
>>   {
>> +    struct bpf_struct_ops_map *st_map;
>> +
>>       if (link->map) {
>> -        bpf_map_put(link->map);
>> +        st_map = (struct bpf_struct_ops_map *)link->map;
>> +        st_map->st_ops->unreg(&st_map->kvalue.data);
>> +        bpf_struct_ops_kvalue_put(&st_map->kvalue);
>>           link->map = NULL;
> 
> Does it need a lock or something to protect the link_release? Or am I
> missing something and a lock is not needed?

This function will be called by bpf_link_free() through the pointer in
bpf_link_ops, and bpf_link_free() is called by bpf_link_put().  The
refcnt of the bpf_link is maintained by bpf_link_put(), and the function
here is reached, indirectly, only when that refcnt drops to 0.  If I am
not missing anything, it should be safe to release a link without a
lock.
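
Paraphrasing the call chain in kernel/bpf/syscall.c (a sketch from my
reading, not code in this patch):

	/*
	 * bpf_link_put()
	 *   -> atomic64_dec_and_test(&link->refcnt)   // last reference?
	 *      -> bpf_link_free()
	 *         -> link->ops->release(link)         // runs at most once
	 */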

> 
> The kvalue->state should become UNREG.
> 
> After UNREG, can the struct_ops map be used in creating a new link again?
> 

It should be.

>>       }
>>   }
>> @@ -735,13 +761,15 @@ static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>>   int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>>   {
>> +    struct bpf_struct_ops_map *st_map;
>>       struct bpf_link_primer link_primer;
>> +    struct bpf_struct_ops_value *kvalue;
>>       struct bpf_map *map;
>>       struct bpf_link *link = NULL;
>>       int err;
>>       map = bpf_map_get(attr->link_create.prog_fd);
>> -    if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS)
>> +    if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
>>           return -EINVAL;
>>       link = kzalloc(sizeof(*link), GFP_USER);
>> @@ -752,6 +780,29 @@ int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
>>       bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
>>       link->map = map;
>> +    if (map->map_flags & BPF_F_LINK) {
>> +        st_map = (struct bpf_struct_ops_map *)map;
>> +        kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
>> +
>> +        if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE ||
>> +            refcount_read(&kvalue->refcnt) != 0) {
> 
> The refcount_read(&kvalue->refcnt) is to ensure it is not registered?
> It seems the UNREG state is useful here.

Yes!

> 
>> +            err = -EINVAL;
>> +            goto err_out;
>> +        }
>> +
>> +        refcount_set(&kvalue->refcnt, 1);
> 
> If a struct_ops map is used to create multiple links in parallel, is it 
> safe?
> 
>> +
>> +        set_memory_rox((long)st_map->image, 1);
>> +        err = st_map->st_ops->reg(kvalue->data);
> 
> After successful reg, the state can be changed from UNREG to INUSE.
> 
>> +        if (err) {
>> +            refcount_set(&kvalue->refcnt, 0);
>> +
>> +            set_memory_nx((long)st_map->image, 1);
>> +            set_memory_rw((long)st_map->image, 1);
>> +            goto err_out;
>> +        }
>> +    }
> 
> This patch should be combined with patch 1. Otherwise, patch 1 is quite 
> hard to understand without link_create_struct_ops_map() doing the actual 
> "attach".

Ok!

> 
>> +
>>       err = bpf_link_prime(link, &link_primer);
>>       if (err)
>>           goto err_out;
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 1e6cdd0f355d..48d8b3058aa1 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1267,6 +1267,9 @@ enum {
>>   /* Create a map that is suitable to be an inner map with dynamic max entries */
>>       BPF_F_INNER_MAP        = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backed bpf_link */
>> +    BPF_F_LINK        = (1U << 13),
>>   };
>>   /* Flags for BPF_PROG_QUERY. */
>
Martin KaFai Lau Feb. 16, 2023, 10:38 p.m. UTC | #5
On 2/16/23 8:42 AM, Kui-Feng Lee wrote:
>>> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>>       set_vm_flush_reset_perms(st_map->image);
>>>       bpf_map_init_from_attr(map, attr);
>>> +    map->map_flags |= attr->map_flags & BPF_F_LINK;
>>
>> This should have already been done in bpf_map_init_from_attr().
> 
> bpf_map_init_from_attr() will filter out all flags except BPF_F_RDONLY & 
> BPF_F_WRONLY.

should be the opposite:

static u32 bpf_map_flags_retain_permanent(u32 flags)
{
	/* Some map creation flags are not tied to the map object but
	 * rather to the map fd instead, so they have no meaning upon
	 * map object inspection since multiple file descriptors with
	 * different (access) properties can exist here. Thus, given
	 * this has zero meaning for the map itself, lets clear these
	 * from here.
	 */
	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
}
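
In other words, the caller already does something like the line below
(paraphrased from bpf_map_init_from_attr() in kernel/bpf/syscall.c), so
BPF_F_LINK survives the call and the extra "|=" in
bpf_struct_ops_map_alloc() becomes unnecessary:

	/* Only the per-fd access flags are cleared; BPF_F_LINK is kept. */
	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);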
Kui-Feng Lee Feb. 17, 2023, 10:17 p.m. UTC | #6
On 2/16/23 14:38, Martin KaFai Lau wrote:
> On 2/16/23 8:42 AM, Kui-Feng Lee wrote:
>>>> @@ -638,6 +647,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>>>       set_vm_flush_reset_perms(st_map->image);
>>>>       bpf_map_init_from_attr(map, attr);
>>>> +    map->map_flags |= attr->map_flags & BPF_F_LINK;
>>>
>>> This should have already been done in bpf_map_init_from_attr().
>>
>> bpf_map_init_from_attr() will filter out all flags except BPF_F_RDONLY 
>> & BPF_F_WRONLY.
> 
> should be the opposite:
> 
> static u32 bpf_map_flags_retain_permanent(u32 flags)
> {
> 	/* Some map creation flags are not tied to the map object but
> 	 * rather to the map fd instead, so they have no meaning upon
> 	 * map object inspection since multiple file descriptors with
> 	 * different (access) properties can exist here. Thus, given
> 	 * this has zero meaning for the map itself, lets clear these
> 	 * from here.
> 	 */
> 	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
> }

Got it! Thank you!

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1e6cdd0f355d..48d8b3058aa1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1267,6 +1267,9 @@  enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 621c8e24481a..d16ca06cf09a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -390,7 +390,7 @@  static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 	mutex_lock(&st_map->lock);
 
-	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT || refcount_read(&kvalue->refcnt)) {
 		err = -EBUSY;
 		goto unlock;
 	}
@@ -491,6 +491,12 @@  static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
+	if (st_map->map.map_flags & BPF_F_LINK) {
+		/* Let bpf_link handle registration & unregistration. */
+		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+		goto unlock;
+	}
+
 	refcount_set(&kvalue->refcnt, 1);
 	bpf_map_inc(map);
 
@@ -522,6 +528,7 @@  static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	kfree(tlinks);
 	mutex_unlock(&st_map->lock);
 	return err;
+
 }
 
 static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
@@ -535,6 +542,8 @@  static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
+		if (st_map->map.map_flags & BPF_F_LINK)
+			return 0;
 		st_map->st_ops->unreg(&st_map->kvalue.data);
 		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
 			bpf_map_put(map);
@@ -585,7 +594,7 @@  static void bpf_struct_ops_map_free(struct bpf_map *map)
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
+	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
 		return -EINVAL;
 	return 0;
 }
@@ -638,6 +647,8 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	set_vm_flush_reset_perms(st_map->image);
 	bpf_map_init_from_attr(map, attr);
 
+	map->map_flags |= attr->map_flags & BPF_F_LINK;
+
 	return map;
 }
 
@@ -699,10 +710,25 @@  void bpf_struct_ops_put(const void *kdata)
 	}
 }
 
+static void bpf_struct_ops_kvalue_put(struct bpf_struct_ops_value *kvalue)
+{
+	struct bpf_struct_ops_map *st_map;
+
+	if (refcount_dec_and_test(&kvalue->refcnt)) {
+		st_map = container_of(kvalue, struct bpf_struct_ops_map,
+				      kvalue);
+		bpf_map_put(&st_map->map);
+	}
+}
+
 static void bpf_struct_ops_map_link_release(struct bpf_link *link)
 {
+	struct bpf_struct_ops_map *st_map;
+
 	if (link->map) {
-		bpf_map_put(link->map);
+		st_map = (struct bpf_struct_ops_map *)link->map;
+		st_map->st_ops->unreg(&st_map->kvalue.data);
+		bpf_struct_ops_kvalue_put(&st_map->kvalue);
 		link->map = NULL;
 	}
 }
@@ -735,13 +761,15 @@  static const struct bpf_link_ops bpf_struct_ops_map_lops = {
 
 int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
 {
+	struct bpf_struct_ops_map *st_map;
 	struct bpf_link_primer link_primer;
+	struct bpf_struct_ops_value *kvalue;
 	struct bpf_map *map;
 	struct bpf_link *link = NULL;
 	int err;
 
 	map = bpf_map_get(attr->link_create.prog_fd);
-	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS)
+	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK))
 		return -EINVAL;
 
 	link = kzalloc(sizeof(*link), GFP_USER);
@@ -752,6 +780,29 @@  int link_create_struct_ops_map(union bpf_attr *attr, bpfptr_t uattr)
 	bpf_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
 	link->map = map;
 
+	if (map->map_flags & BPF_F_LINK) {
+		st_map = (struct bpf_struct_ops_map *)map;
+		kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
+
+		if (kvalue->state != BPF_STRUCT_OPS_STATE_INUSE ||
+		    refcount_read(&kvalue->refcnt) != 0) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		refcount_set(&kvalue->refcnt, 1);
+
+		set_memory_rox((long)st_map->image, 1);
+		err = st_map->st_ops->reg(kvalue->data);
+		if (err) {
+			refcount_set(&kvalue->refcnt, 0);
+
+			set_memory_nx((long)st_map->image, 1);
+			set_memory_rw((long)st_map->image, 1);
+			goto err_out;
+		}
+	}
+
 	err = bpf_link_prime(link, &link_primer);
 	if (err)
 		goto err_out;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1e6cdd0f355d..48d8b3058aa1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1267,6 +1267,9 @@  enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */