diff mbox series

[bpf-next,1/2] bpf: struct_ops supports more than one page for trampolines.

Message ID 20240216182828.201727-2-thinker.li@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Allow struct_ops maps with a large number of programs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1051 this patch: 1051
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 9 maintainers not CCed: jolsa@kernel.org daniel@iogearbox.net john.fastabend@gmail.com yonghong.song@linux.dev netdev@vger.kernel.org sdf@google.com eddyz87@gmail.com kpsingh@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1066 this patch: 1066
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1068 this patch: 1068
netdev/checkpatch warning WARNING: line length of 101 exceeds 80 columns WARNING: line length of 102 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Kui-Feng Lee Feb. 16, 2024, 6:28 p.m. UTC
From: Kui-Feng Lee <thinker.li@gmail.com>

The BPF struct_ops previously only allowed for one page to be used for the
trampolines of all links in a map. However, we have recently run out of
space due to the large number of BPF program links. By allocating
additional pages when we exhaust an existing page, we can accommodate more
links in a single map.

The variable st_map->image has been changed to st_map->image_pages, and its
type has been changed to an array of pointers to buffers of PAGE_SIZE. The
array is dynamically resized and additional pages are allocated when all
existing pages are exhausted.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
 kernel/bpf/bpf_struct_ops.c | 99 ++++++++++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 19 deletions(-)

Comments

Martin KaFai Lau Feb. 20, 2024, 9:47 p.m. UTC | #1
On 2/16/24 10:28 AM, thinker.li@gmail.com wrote:
> From: Kui-Feng Lee <thinker.li@gmail.com>
> 
> The BPF struct_ops previously only allowed for one page to be used for the
> trampolines of all links in a map. However, we have recently run out of
> space due to the large number of BPF program links. By allocating
> additional pages when we exhaust an existing page, we can accommodate more
> links in a single map.
> 
> The variable st_map->image has been changed to st_map->image_pages, and its
> type has been changed to an array of pointers to buffers of PAGE_SIZE. The
> array is dynamically resized and additional pages are allocated when all
> existing pages are exhausted.
> 
> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
> ---
>   kernel/bpf/bpf_struct_ops.c | 99 ++++++++++++++++++++++++++++++-------
>   1 file changed, 80 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 0d7be97a2411..bb7ae665006a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -30,12 +30,11 @@ struct bpf_struct_ops_map {
>   	 */
>   	struct bpf_link **links;
>   	u32 links_cnt;
> -	/* image is a page that has all the trampolines
> +	u32 image_pages_cnt;
> +	/* image_pages is an array of pages that has all the trampolines
>   	 * that stores the func args before calling the bpf_prog.
> -	 * A PAGE_SIZE "image" is enough to store all trampoline for
> -	 * "links[]".
>   	 */
> -	void *image;
> +	void **image_pages;
>   	/* The owner moduler's btf. */
>   	struct btf *btf;
>   	/* uvalue->data stores the kernel struct
> @@ -535,7 +534,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	void *udata, *kdata;
>   	int prog_fd, err;
>   	void *image, *image_end;
> -	u32 i;
> +	void **image_pages;
> +	u32 i, next_page = 0;
>   
>   	if (flags)
>   		return -EINVAL;
> @@ -573,8 +573,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   
>   	udata = &uvalue->data;
>   	kdata = &kvalue->data;
> -	image = st_map->image;
> -	image_end = st_map->image + PAGE_SIZE;
> +	image = st_map->image_pages[next_page++];
> +	image_end = image + PAGE_SIZE;
>   
>   	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
>   	for_each_member(i, t, member) {
> @@ -657,6 +657,43 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   							&st_ops->func_models[i],
>   							*(void **)(st_ops->cfi_stubs + moff),
>   							image, image_end);
> +		if (err == -E2BIG) {
> +			/* Use an additional page to try again.
> +			 *
> +			 * It may reuse pages allocated for the previous
> +			 * failed calls.
> +			 */
> +			if (next_page >= st_map->image_pages_cnt) {

This check (more on this later) ...

> +				/* Allocate an additional page */
> +				image_pages = krealloc(st_map->image_pages,
> +						       (st_map->image_pages_cnt + 1) * sizeof(void *),
> +						       GFP_KERNEL);

 From the patch 2 test, one page is enough for at least 20 ops. How about keep 
it simple and allow a max 8 pages which should be much more than enough for sane 
usage. (i.e. add "void *images[MAX_STRUCT_OPS_PAGES];" to "struct 
bpf_struct_ops_map").

> +				if (!image_pages) {
> +					err = -ENOMEM;
> +					goto reset_unlock;
> +				}
> +				st_map->image_pages = image_pages;
> +
> +				err = bpf_jit_charge_modmem(PAGE_SIZE);
> +				if (err)
> +					goto reset_unlock;
> +
> +				image = arch_alloc_bpf_trampoline(PAGE_SIZE);
> +				if (!image) {
> +					bpf_jit_uncharge_modmem(PAGE_SIZE);
> +					err = -ENOMEM;
> +					goto reset_unlock;
> +				}
> +				st_map->image_pages[st_map->image_pages_cnt++] = image;
> +			}
> +			image = st_map->image_pages[next_page++];
> +			image_end = image + PAGE_SIZE;
> +
> +			err = bpf_struct_ops_prepare_trampoline(tlinks, link,
> +								&st_ops->func_models[i],
> +								*(void **)(st_ops->cfi_stubs + moff),
> +								image, image_end);
> +		}
>   		if (err < 0)
>   			goto reset_unlock;
>   
> @@ -667,6 +704,18 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		*(unsigned long *)(udata + moff) = prog->aux->id;
>   	}
>   
> +	while (next_page < st_map->image_pages_cnt) {

This check and the above "if (next_page >= st_map->image_pages_cnt)" should not 
happen for the common case?

Together with the new comment after the above "if (err == -E2BIG)", is it trying 
to optimize to reuse the pages allocated in the previous error-ed out 
map_update_elem() call?

How about keep it simple for the common case and always free all pages when 
map_update_elem() failed?

Also, after this patch, the same calls are used in different places.

arch_alloc_bpf_trampoline() is done in two different places, one in map_alloc() 
and one in map_update_elem(). How about do all the page alloc in map_update_elem()?

bpf_struct_ops_prepare_trampoline() is also called in two different places 
within the same map_update_elem(). When looking inside the 
bpf_struct_ops_prepare_trampoline(), it does call arch_bpf_trampoline_size() to 
learn the required size first. bpf_struct_ops_prepare_trampoline() should be a 
better place to call arch_alloc_bpf_trampoline() when needed. Then there is no 
need to retry bpf_struct_ops_prepare_trampoline() in map_update_elem()?


> +		/* Free unused pages
> +		 *
> +		 * The value can not be updated anymore if the value is not
> +		 * rejected by st_ops->validate() or st_ops->reg().  So,
> +		 * there is no reason to keep the unused pages.
> +		 */
> +		bpf_jit_uncharge_modmem(PAGE_SIZE);
> +		image = st_map->image_pages[--st_map->image_pages_cnt];
> +		arch_free_bpf_trampoline(image, PAGE_SIZE);
> +	}
> +
>   	if (st_map->map.map_flags & BPF_F_LINK) {
>   		err = 0;
>   		if (st_ops->validate) {
> @@ -674,7 +723,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   			if (err)
>   				goto reset_unlock;
>   		}
> -		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
> +		for (i = 0; i < next_page; i++)
> +			arch_protect_bpf_trampoline(st_map->image_pages[i],
> +						    PAGE_SIZE);

arch_protect_bpf_trampoline() is called here for BPF_F_LINK.

>   		/* Let bpf_link handle registration & unregistration.
>   		 *
>   		 * Pair with smp_load_acquire() during lookup_elem().
> @@ -683,7 +734,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		goto unlock;
>   	}
>   
> -	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
> +	for (i = 0; i < next_page; i++)
> +		arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);

arch_protect_bpf_trampoline() is called here also in the same function for non 
BPF_F_LINK.

Can this be cleaned up a bit? For example, "st_ops->validate(kdata);" should not 
be specific to BPF_F_LINK which had been brought up earlier when making the 
"->validate" optional. It is a good time to clean this up also.

----
pw-bot: cr

>   	err = st_ops->reg(kdata);
>   	if (likely(!err)) {
>   		/* This refcnt increment on the map here after
> @@ -706,7 +758,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	 * there was a race in registering the struct_ops (under the same name) to
>   	 * a sub-system through different struct_ops's maps.
>   	 */
> -	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
> +	for (i = 0; i < next_page; i++)
> +		arch_unprotect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
>   
>   reset_unlock:
>   	bpf_struct_ops_map_put_progs(st_map);
> @@ -771,14 +824,15 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
>   static void __bpf_struct_ops_map_free(struct bpf_map *map)
>   {
>   	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
> +	int i;
>   
>   	if (st_map->links)
>   		bpf_struct_ops_map_put_progs(st_map);
>   	bpf_map_area_free(st_map->links);
> -	if (st_map->image) {
> -		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
> -		bpf_jit_uncharge_modmem(PAGE_SIZE);
> -	}
> +	for (i = 0; i < st_map->image_pages_cnt; i++)
> +		arch_free_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
> +	bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
> +	kfree(st_map->image_pages);
>   	bpf_map_area_free(st_map->uvalue);
>   	bpf_map_area_free(st_map);
>   }
> @@ -888,20 +942,27 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   	st_map->st_ops_desc = st_ops_desc;
>   	map = &st_map->map;
>   
> +	st_map->image_pages = kcalloc(1, sizeof(void *), GFP_KERNEL);
> +	if (!st_map->image_pages) {
> +		ret = -ENOMEM;
> +		goto errout_free;
> +	}
> +
>   	ret = bpf_jit_charge_modmem(PAGE_SIZE);
>   	if (ret)
>   		goto errout_free;
>   
> -	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
> -	if (!st_map->image) {
> -		/* __bpf_struct_ops_map_free() uses st_map->image as flag
> -		 * for "charged or not". In this case, we need to unchange
> -		 * here.
> +	st_map->image_pages[0] = arch_alloc_bpf_trampoline(PAGE_SIZE);
> +	if (!st_map->image_pages[0]) {
> +		/* __bpf_struct_ops_map_free() uses st_map->image_pages_cnt
> +		 * for uncharging a number of pages.  In this case, we
> +		 * need to uncharge here.
>   		 */
>   		bpf_jit_uncharge_modmem(PAGE_SIZE);
>   		ret = -ENOMEM;
>   		goto errout_free;
>   	}
> +	st_map->image_pages_cnt = 1;
>   	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
>   	st_map->links_cnt = btf_type_vlen(t);
>   	st_map->links =
Kui-Feng Lee Feb. 20, 2024, 11:23 p.m. UTC | #2
On 2/20/24 13:47, Martin KaFai Lau wrote:
> On 2/16/24 10:28 AM, thinker.li@gmail.com wrote:
>> From: Kui-Feng Lee <thinker.li@gmail.com>
>>
>> The BPF struct_ops previously only allowed for one page to be used for 
>> the
>> trampolines of all links in a map. However, we have recently run out of
>> space due to the large number of BPF program links. By allocating
>> additional pages when we exhaust an existing page, we can accommodate 
>> more
>> links in a single map.
>>
>> The variable st_map->image has been changed to st_map->image_pages, 
>> and its
>> type has been changed to an array of pointers to buffers of PAGE_SIZE. 
>> The
>> array is dynamically resized and additional pages are allocated when all
>> existing pages are exhausted.
>>
>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>> ---
>>   kernel/bpf/bpf_struct_ops.c | 99 ++++++++++++++++++++++++++++++-------
>>   1 file changed, 80 insertions(+), 19 deletions(-)
>>
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index 0d7be97a2411..bb7ae665006a 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -30,12 +30,11 @@ struct bpf_struct_ops_map {
>>        */
>>       struct bpf_link **links;
>>       u32 links_cnt;
>> -    /* image is a page that has all the trampolines
>> +    u32 image_pages_cnt;
>> +    /* image_pages is an array of pages that has all the trampolines
>>        * that stores the func args before calling the bpf_prog.
>> -     * A PAGE_SIZE "image" is enough to store all trampoline for
>> -     * "links[]".
>>        */
>> -    void *image;
>> +    void **image_pages;
>>       /* The owner moduler's btf. */
>>       struct btf *btf;
>>       /* uvalue->data stores the kernel struct
>> @@ -535,7 +534,8 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>       void *udata, *kdata;
>>       int prog_fd, err;
>>       void *image, *image_end;
>> -    u32 i;
>> +    void **image_pages;
>> +    u32 i, next_page = 0;
>>       if (flags)
>>           return -EINVAL;
>> @@ -573,8 +573,8 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>       udata = &uvalue->data;
>>       kdata = &kvalue->data;
>> -    image = st_map->image;
>> -    image_end = st_map->image + PAGE_SIZE;
>> +    image = st_map->image_pages[next_page++];
>> +    image_end = image + PAGE_SIZE;
>>       module_type = btf_type_by_id(btf_vmlinux, 
>> st_ops_ids[IDX_MODULE_ID]);
>>       for_each_member(i, t, member) {
>> @@ -657,6 +657,43 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>                               &st_ops->func_models[i],
>>                               *(void **)(st_ops->cfi_stubs + moff),
>>                               image, image_end);
>> +        if (err == -E2BIG) {
>> +            /* Use an additional page to try again.
>> +             *
>> +             * It may reuse pages allocated for the previous
>> +             * failed calls.
>> +             */
>> +            if (next_page >= st_map->image_pages_cnt) {
> 
> This check (more on this later) ...
> 
>> +                /* Allocate an additional page */
>> +                image_pages = krealloc(st_map->image_pages,
>> +                               (st_map->image_pages_cnt + 1) * 
>> sizeof(void *),
>> +                               GFP_KERNEL);
> 
>  From the patch 2 test, one page is enough for at least 20 ops. How 
> about keep it simple and allow a max 8 pages which should be much more 
> than enough for sane usage. (i.e. add "void 
> *images[MAX_STRUCT_OPS_PAGES];" to "struct bpf_struct_ops_map").

Ok!

> 
>> +                if (!image_pages) {
>> +                    err = -ENOMEM;
>> +                    goto reset_unlock;
>> +                }
>> +                st_map->image_pages = image_pages;
>> +
>> +                err = bpf_jit_charge_modmem(PAGE_SIZE);
>> +                if (err)
>> +                    goto reset_unlock;
>> +
>> +                image = arch_alloc_bpf_trampoline(PAGE_SIZE);
>> +                if (!image) {
>> +                    bpf_jit_uncharge_modmem(PAGE_SIZE);
>> +                    err = -ENOMEM;
>> +                    goto reset_unlock;
>> +                }
>> +                st_map->image_pages[st_map->image_pages_cnt++] = image;
>> +            }
>> +            image = st_map->image_pages[next_page++];
>> +            image_end = image + PAGE_SIZE;
>> +
>> +            err = bpf_struct_ops_prepare_trampoline(tlinks, link,
>> +                                &st_ops->func_models[i],
>> +                                *(void **)(st_ops->cfi_stubs + moff),
>> +                                image, image_end);
>> +        }
>>           if (err < 0)
>>               goto reset_unlock;
>> @@ -667,6 +704,18 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>           *(unsigned long *)(udata + moff) = prog->aux->id;
>>       }
>> +    while (next_page < st_map->image_pages_cnt) {
> 
> This check and the above "if (next_page >= st_map->image_pages_cnt)" 
> should not happen for the common case?
> 
> Together with the new comment after the above "if (err == -E2BIG)", is 
> it trying to optimize to reuse the pages allocated in the previous 
> error-ed out map_update_elem() call?
> 
> How about keep it simple for the common case and always free all pages 
> when map_update_elem() failed?

Yes, it is an optimization. So, together with a max of 8 pages and freeing all
pages on failure, we can remove all this code.

> 
> Also, after this patch, the same calls are used in different places.
> 
> arch_alloc_bpf_trampoline() is done in two different places, one in 
> map_alloc() and one in map_update_elem(). How about do all the page 
> alloc in map_update_elem()?
> 
> bpf_struct_ops_prepare_trampoline() is also called in two different 
> places within the same map_update_elem(). When looking inside the 
> bpf_struct_ops_prepare_trampoline(), it does call 
> arch_bpf_trampoline_size() to learn the required size first. 
> bpf_struct_ops_prepare_trampoline() should be a better place to call 
> arch_alloc_bpf_trampoline() when needed. Then there is no need to retry 
> bpf_struct_ops_prepare_trampoline() in map_update_elem()?

Agree!
> 
> 
>> +        /* Free unused pages
>> +         *
>> +         * The value can not be updated anymore if the value is not
>> +         * rejected by st_ops->validate() or st_ops->reg().  So,
>> +         * there is no reason to keep the unused pages.
>> +         */
>> +        bpf_jit_uncharge_modmem(PAGE_SIZE);
>> +        image = st_map->image_pages[--st_map->image_pages_cnt];
>> +        arch_free_bpf_trampoline(image, PAGE_SIZE);
>> +    }
>> +
>>       if (st_map->map.map_flags & BPF_F_LINK) {
>>           err = 0;
>>           if (st_ops->validate) {
>> @@ -674,7 +723,9 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>               if (err)
>>                   goto reset_unlock;
>>           }
>> -        arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
>> +        for (i = 0; i < next_page; i++)
>> +            arch_protect_bpf_trampoline(st_map->image_pages[i],
>> +                            PAGE_SIZE);
> 
> arch_protect_bpf_trampoline() is called here for BPF_F_LINK.
> 
>>           /* Let bpf_link handle registration & unregistration.
>>            *
>>            * Pair with smp_load_acquire() during lookup_elem().
>> @@ -683,7 +734,8 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>           goto unlock;
>>       }
>> -    arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
>> +    for (i = 0; i < next_page; i++)
>> +        arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
> 
> arch_protect_bpf_trampoline() is called here also in the same function 
> for non BPF_F_LINK.
> 
> Can this be cleaned up a bit? For example, "st_ops->validate(kdata);" 
> should not be specific to BPF_F_LINK which had been brought up earlier 
> when making the "->validate" optional. It is a good time to clean this 
> up also.

Ok

> 
> ----
> pw-bot: cr
> 
>>       err = st_ops->reg(kdata);
>>       if (likely(!err)) {
>>           /* This refcnt increment on the map here after
>> @@ -706,7 +758,8 @@ static long bpf_struct_ops_map_update_elem(struct 
>> bpf_map *map, void *key,
>>        * there was a race in registering the struct_ops (under the 
>> same name) to
>>        * a sub-system through different struct_ops's maps.
>>        */
>> -    arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
>> +    for (i = 0; i < next_page; i++)
>> +        arch_unprotect_bpf_trampoline(st_map->image_pages[i], 
>> PAGE_SIZE);
>>   reset_unlock:
>>       bpf_struct_ops_map_put_progs(st_map);
>> @@ -771,14 +824,15 @@ static void 
>> bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
>>   static void __bpf_struct_ops_map_free(struct bpf_map *map)
>>   {
>>       struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map 
>> *)map;
>> +    int i;
>>       if (st_map->links)
>>           bpf_struct_ops_map_put_progs(st_map);
>>       bpf_map_area_free(st_map->links);
>> -    if (st_map->image) {
>> -        arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
>> -        bpf_jit_uncharge_modmem(PAGE_SIZE);
>> -    }
>> +    for (i = 0; i < st_map->image_pages_cnt; i++)
>> +        arch_free_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
>> +    bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
>> +    kfree(st_map->image_pages);
>>       bpf_map_area_free(st_map->uvalue);
>>       bpf_map_area_free(st_map);
>>   }
>> @@ -888,20 +942,27 @@ static struct bpf_map 
>> *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>       st_map->st_ops_desc = st_ops_desc;
>>       map = &st_map->map;
>> +    st_map->image_pages = kcalloc(1, sizeof(void *), GFP_KERNEL);
>> +    if (!st_map->image_pages) {
>> +        ret = -ENOMEM;
>> +        goto errout_free;
>> +    }
>> +
>>       ret = bpf_jit_charge_modmem(PAGE_SIZE);
>>       if (ret)
>>           goto errout_free;
>> -    st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
>> -    if (!st_map->image) {
>> -        /* __bpf_struct_ops_map_free() uses st_map->image as flag
>> -         * for "charged or not". In this case, we need to unchange
>> -         * here.
>> +    st_map->image_pages[0] = arch_alloc_bpf_trampoline(PAGE_SIZE);
>> +    if (!st_map->image_pages[0]) {
>> +        /* __bpf_struct_ops_map_free() uses st_map->image_pages_cnt
>> +         * for uncharging a number of pages.  In this case, we
>> +         * need to uncharge here.
>>            */
>>           bpf_jit_uncharge_modmem(PAGE_SIZE);
>>           ret = -ENOMEM;
>>           goto errout_free;
>>       }
>> +    st_map->image_pages_cnt = 1;
>>       st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
>>       st_map->links_cnt = btf_type_vlen(t);
>>       st_map->links =
>
diff mbox series

Patch

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 0d7be97a2411..bb7ae665006a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -30,12 +30,11 @@  struct bpf_struct_ops_map {
 	 */
 	struct bpf_link **links;
 	u32 links_cnt;
-	/* image is a page that has all the trampolines
+	u32 image_pages_cnt;
+	/* image_pages is an array of pages that has all the trampolines
 	 * that stores the func args before calling the bpf_prog.
-	 * A PAGE_SIZE "image" is enough to store all trampoline for
-	 * "links[]".
 	 */
-	void *image;
+	void **image_pages;
 	/* The owner moduler's btf. */
 	struct btf *btf;
 	/* uvalue->data stores the kernel struct
@@ -535,7 +534,8 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	void *udata, *kdata;
 	int prog_fd, err;
 	void *image, *image_end;
-	u32 i;
+	void **image_pages;
+	u32 i, next_page = 0;
 
 	if (flags)
 		return -EINVAL;
@@ -573,8 +573,8 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 	udata = &uvalue->data;
 	kdata = &kvalue->data;
-	image = st_map->image;
-	image_end = st_map->image + PAGE_SIZE;
+	image = st_map->image_pages[next_page++];
+	image_end = image + PAGE_SIZE;
 
 	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
 	for_each_member(i, t, member) {
@@ -657,6 +657,43 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 							&st_ops->func_models[i],
 							*(void **)(st_ops->cfi_stubs + moff),
 							image, image_end);
+		if (err == -E2BIG) {
+			/* Use an additional page to try again.
+			 *
+			 * It may reuse pages allocated for the previous
+			 * failed calls.
+			 */
+			if (next_page >= st_map->image_pages_cnt) {
+				/* Allocate an additional page */
+				image_pages = krealloc(st_map->image_pages,
+						       (st_map->image_pages_cnt + 1) * sizeof(void *),
+						       GFP_KERNEL);
+				if (!image_pages) {
+					err = -ENOMEM;
+					goto reset_unlock;
+				}
+				st_map->image_pages = image_pages;
+
+				err = bpf_jit_charge_modmem(PAGE_SIZE);
+				if (err)
+					goto reset_unlock;
+
+				image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+				if (!image) {
+					bpf_jit_uncharge_modmem(PAGE_SIZE);
+					err = -ENOMEM;
+					goto reset_unlock;
+				}
+				st_map->image_pages[st_map->image_pages_cnt++] = image;
+			}
+			image = st_map->image_pages[next_page++];
+			image_end = image + PAGE_SIZE;
+
+			err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+								&st_ops->func_models[i],
+								*(void **)(st_ops->cfi_stubs + moff),
+								image, image_end);
+		}
 		if (err < 0)
 			goto reset_unlock;
 
@@ -667,6 +704,18 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
+	while (next_page < st_map->image_pages_cnt) {
+		/* Free unused pages
+		 *
+		 * The value can not be updated anymore if the value is not
+		 * rejected by st_ops->validate() or st_ops->reg().  So,
+		 * there is no reason to keep the unused pages.
+		 */
+		bpf_jit_uncharge_modmem(PAGE_SIZE);
+		image = st_map->image_pages[--st_map->image_pages_cnt];
+		arch_free_bpf_trampoline(image, PAGE_SIZE);
+	}
+
 	if (st_map->map.map_flags & BPF_F_LINK) {
 		err = 0;
 		if (st_ops->validate) {
@@ -674,7 +723,9 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			if (err)
 				goto reset_unlock;
 		}
-		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
+		for (i = 0; i < next_page; i++)
+			arch_protect_bpf_trampoline(st_map->image_pages[i],
+						    PAGE_SIZE);
 		/* Let bpf_link handle registration & unregistration.
 		 *
 		 * Pair with smp_load_acquire() during lookup_elem().
@@ -683,7 +734,8 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		goto unlock;
 	}
 
-	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
+	for (i = 0; i < next_page; i++)
+		arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
 		/* This refcnt increment on the map here after
@@ -706,7 +758,8 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 * there was a race in registering the struct_ops (under the same name) to
 	 * a sub-system through different struct_ops's maps.
 	 */
-	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
+	for (i = 0; i < next_page; i++)
+		arch_unprotect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -771,14 +824,15 @@  static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
 static void __bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+	int i;
 
 	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
-	if (st_map->image) {
-		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
-		bpf_jit_uncharge_modmem(PAGE_SIZE);
-	}
+	for (i = 0; i < st_map->image_pages_cnt; i++)
+		arch_free_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+	bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
+	kfree(st_map->image_pages);
 	bpf_map_area_free(st_map->uvalue);
 	bpf_map_area_free(st_map);
 }
@@ -888,20 +942,27 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	st_map->st_ops_desc = st_ops_desc;
 	map = &st_map->map;
 
+	st_map->image_pages = kcalloc(1, sizeof(void *), GFP_KERNEL);
+	if (!st_map->image_pages) {
+		ret = -ENOMEM;
+		goto errout_free;
+	}
+
 	ret = bpf_jit_charge_modmem(PAGE_SIZE);
 	if (ret)
 		goto errout_free;
 
-	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-	if (!st_map->image) {
-		/* __bpf_struct_ops_map_free() uses st_map->image as flag
-		 * for "charged or not". In this case, we need to unchange
-		 * here.
+	st_map->image_pages[0] = arch_alloc_bpf_trampoline(PAGE_SIZE);
+	if (!st_map->image_pages[0]) {
+		/* __bpf_struct_ops_map_free() uses st_map->image_pages_cnt
+		 * for uncharging a number of pages.  In this case, we
+		 * need to uncharge here.
 		 */
 		bpf_jit_uncharge_modmem(PAGE_SIZE);
 		ret = -ENOMEM;
 		goto errout_free;
 	}
+	st_map->image_pages_cnt = 1;
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
 	st_map->links_cnt = btf_type_vlen(t);
 	st_map->links =