
[bpf-next,v2,2/3] bpf: struct_ops supports more than one page for trampolines.

Message ID 20240221225911.757861-3-thinker.li@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series: Allow struct_ops maps with a large number of programs

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1051 this patch: 1051
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 9 maintainers not CCed: jolsa@kernel.org daniel@iogearbox.net john.fastabend@gmail.com yonghong.song@linux.dev netdev@vger.kernel.org sdf@google.com eddyz87@gmail.com kpsingh@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1066 this patch: 1066
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1068 this patch: 1068
netdev/checkpatch warning WARNING: line length of 88 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization

Commit Message

Kui-Feng Lee Feb. 21, 2024, 10:59 p.m. UTC
From: Kui-Feng Lee <thinker.li@gmail.com>

The BPF struct_ops previously allowed only one page to be used for the
trampolines of all links in a map. However, we have recently run out of
space due to the large number of BPF program links. By allocating
additional pages when an existing page is exhausted, a single map can
accommodate more links.

The st_map->image field has been renamed to st_map->image_pages, and its
type has been changed to an array of pointers to buffers of
PAGE_SIZE. Each struct_ops map can have at most MAX_IMAGE_PAGES (8)
pages for trampolines.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
 kernel/bpf/bpf_struct_ops.c | 112 +++++++++++++++++++++++++-----------
 1 file changed, 79 insertions(+), 33 deletions(-)
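
For orientation, here is a minimal sketch of the map-side bookkeeping this
patch introduces, assuming the field names (image_pages, image_pages_cnt)
used in the hunks quoted below; the macro (spelled MAX_IMAGE_PAGES above
and MAX_TRAMP_IMAGE_PAGES later in the thread) and the surrounding struct
layout are illustrative, not the applied patch:

#define MAX_TRAMP_IMAGE_PAGES 8

struct bpf_struct_ops_map {
	struct bpf_map map;
	/* Trampoline pages for all links in this map: each entry is
	 * one PAGE_SIZE buffer from arch_alloc_bpf_trampoline(),
	 * allocated on demand when the current page runs out of room.
	 */
	void *image_pages[MAX_TRAMP_IMAGE_PAGES];
	u32 image_pages_cnt;
	/* ... links, uvalue, kvalue, and other existing fields ... */
};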

Comments

Martin KaFai Lau Feb. 23, 2024, 12:33 a.m. UTC | #1
On 2/21/24 2:59 PM, thinker.li@gmail.com wrote:
> @@ -531,10 +567,10 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	const struct btf_type *module_type;
>   	const struct btf_member *member;
>   	const struct btf_type *t = st_ops_desc->type;
> +	void *image = NULL, *image_end = NULL;
>   	struct bpf_tramp_links *tlinks;
>   	void *udata, *kdata;
>   	int prog_fd, err;
> -	void *image, *image_end;
>   	u32 i;
>   
>   	if (flags)
> @@ -573,15 +609,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   
>   	udata = &uvalue->data;
>   	kdata = &kvalue->data;
> -	image = st_map->image;
> -	image_end = st_map->image + PAGE_SIZE;
>   
>   	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
>   	for_each_member(i, t, member) {
>   		const struct btf_type *mtype, *ptype;
>   		struct bpf_prog *prog;
>   		struct bpf_tramp_link *link;
> -		u32 moff;
> +		u32 moff, tflags;
> +		int tsize;
>   
>   		moff = __btf_member_bit_offset(t, member) / 8;
>   		ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
> @@ -653,10 +688,38 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   			      &bpf_struct_ops_link_lops, prog);
>   		st_map->links[i] = &link->link;
>   
> -		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
> -							&st_ops->func_models[i],
> -							*(void **)(st_ops->cfi_stubs + moff),
> -							image, image_end);
> +		tflags = BPF_TRAMP_F_INDIRECT;
> +		if (st_ops->func_models[i].ret_size > 0)
> +			tflags |= BPF_TRAMP_F_RET_FENTRY_RET;
> +
> +		/* Compute the size of the trampoline */
> +		tlinks[BPF_TRAMP_FENTRY].links[0] = link;
> +		tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
> +		tsize = arch_bpf_trampoline_size(&st_ops->func_models[i],
> +						 tflags, tlinks, NULL);
> +		if (tsize < 0) {
> +			err = tsize;
> +			goto reset_unlock;
> +		}
> +
> +		/* Allocate pages */
> +		if (tsize > (unsigned long)image_end - (unsigned long)image) {
> +			if (tsize > PAGE_SIZE) {
> +				err = -E2BIG;
> +				goto reset_unlock;
> +			}
> +			image = bpf_struct_ops_map_inc_image(st_map);
> +			if (IS_ERR(image)) {
> +				err = PTR_ERR(image);
> +				goto reset_unlock;
> +			}
> +			image_end = image + PAGE_SIZE;
> +		}
> +
> +		err = arch_prepare_bpf_trampoline(NULL, image, image_end,
> +						  &st_ops->func_models[i],
> +						  tflags, tlinks,
> +						  *(void **)(st_ops->cfi_stubs + moff));

I would prefer not to copy the BPF_TRAMP_F_* setting of tflags, the
tlinks setup, and the arch_*_trampoline_*() logic out of
bpf_struct_ops_prepare_trampoline(), which is also used by bpf_dummy_ops
for testing. Considering that struct_ops supports kernel modules now, in
the future it would be better to somehow move bpf_dummy_ops out to the
bpf_testmod and avoid its bpf_struct_ops_prepare_trampoline() usage. For
now, it is still better to keep bpf_struct_ops_prepare_trampoline()
reusable by both.

Have you thought about the earlier suggestion in v1 to do the
arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline()
instead of copying code from bpf_struct_ops_prepare_trampoline() into
bpf_struct_ops_map_update_elem()?

Something like this (untested code):

void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
					struct bpf_tramp_link *link,
					const struct btf_func_model *model,
					void *stub_func, void *image,
					u32 *image_off,
					bool allow_alloc)
{
	u32 flags = BPF_TRAMP_F_INDIRECT;
	void *new_image = NULL;
	int size;

	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;

	if (model->ret_size > 0)
		flags |= BPF_TRAMP_F_RET_FENTRY_RET;

	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
	if (size < 0)
		return ERR_PTR(size);
	if (!image || size > PAGE_SIZE - *image_off) {
		int err;

		if (!allow_alloc)
			return ERR_PTR(-E2BIG);

		err = bpf_jit_charge_modmem(PAGE_SIZE);
		if (err)
			return ERR_PTR(err);

		new_image = image = arch_alloc_bpf_trampoline(PAGE_SIZE);
		if (!new_image) {
			bpf_jit_uncharge_modmem(PAGE_SIZE);
			return ERR_PTR(-ENOMEM);
		}
		*image_off = 0;
	}


	size = arch_prepare_bpf_trampoline(NULL, image + *image_off,
					   image + PAGE_SIZE,
					   model, flags, tlinks, stub_func);
	if (size >= 0) {
		*image_off += size;
		return image;
	}

	if (new_image) {
		bpf_jit_uncharge_modmem(PAGE_SIZE);
		arch_free_bpf_trampoline(new_image, PAGE_SIZE);
	}

	return ERR_PTR(size);
}

----

pw-bot: cr

>   		if (err < 0)
>   			goto reset_unlock;
>   
> @@ -672,10 +735,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		if (err)
>   			goto reset_unlock;
>   	}
> +	for (i = 0; i < st_map->image_pages_cnt; i++)
> +		arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
>   
>   	if (st_map->map.map_flags & BPF_F_LINK) {
>   		err = 0;
> -		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
>   		/* Let bpf_link handle registration & unregistration.
>   		 *
>   		 * Pair with smp_load_acquire() during lookup_elem().
> @@ -684,7 +748,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   		goto unlock;
>   	}
>   
> -	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
>   	err = st_ops->reg(kdata);
>   	if (likely(!err)) {
>   		/* This refcnt increment on the map here after
> @@ -707,9 +770,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	 * there was a race in registering the struct_ops (under the same name) to
>   	 * a sub-system through different struct_ops's maps.
>   	 */
> -	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
>   
>   reset_unlock:
> +	bpf_struct_ops_map_free_image(st_map);
>   	bpf_struct_ops_map_put_progs(st_map);
>   	memset(uvalue, 0, map->value_size);
>   	memset(kvalue, 0, map->value_size);
> @@ -776,10 +839,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
>   	if (st_map->links)
>   		bpf_struct_ops_map_put_progs(st_map);
>   	bpf_map_area_free(st_map->links);
> -	if (st_map->image) {
> -		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
> -		bpf_jit_uncharge_modmem(PAGE_SIZE);
> -	}
> +	bpf_struct_ops_map_free_image(st_map);
>   	bpf_map_area_free(st_map->uvalue);
>   	bpf_map_area_free(st_map);
>   }
> @@ -889,20 +949,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   	st_map->st_ops_desc = st_ops_desc;
>   	map = &st_map->map;
>   
> -	ret = bpf_jit_charge_modmem(PAGE_SIZE);
> -	if (ret)
> -		goto errout_free;
> -
> -	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
> -	if (!st_map->image) {
> -		/* __bpf_struct_ops_map_free() uses st_map->image as flag
> -		 * for "charged or not". In this case, we need to unchange
> -		 * here.
> -		 */
> -		bpf_jit_uncharge_modmem(PAGE_SIZE);
> -		ret = -ENOMEM;
> -		goto errout_free;
> -	}
>   	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
>   	st_map->links_cnt = btf_type_vlen(t);
>   	st_map->links =
Kui-Feng Lee Feb. 23, 2024, 1:35 a.m. UTC | #2
On 2/22/24 16:33, Martin KaFai Lau wrote:
> [ ... ]
> 
> Something like this (untested code):
> 
> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
>                      struct bpf_tramp_link *link,
>                      const struct btf_func_model *model,
>                      void *stub_func, void *image,
>                      u32 *image_off,
>                      bool allow_alloc)


How about passing a struct bpf_struct_ops_map pointer to
bpf_struct_ops_prepare_trampoline()? If the pointer is not NULL, try to
allocate new pages for the map.

For example,

static int
_bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                                   struct bpf_tramp_link *link,
                                   const struct btf_func_model *model,
                                   void *stub_func, void *image,
                                   void *image_end,
                                   struct bpf_struct_ops_map *st_map)
{
...
     if (!image || size > PAGE_SIZE - *image_off) {
         if (!st_map)
             return -E2BIG;
         image = bpf_struct_ops_map_inc_image(st_map);
         if (IS_ERR(image))
             return PTR_ERR(image);
         image_end = image + PAGE_SIZE;
     }
...
}

int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                                      struct bpf_tramp_link *link,
                                      const struct btf_func_model *model,
                                      void *stub_func, void *image,
                                      void *image_end)
{
     return _bpf_struct_ops_prepare_trampoline(tlinks, link, model,
                                               stub_func, image,
                                               image_end, NULL);
}

> [ ... ]
Martin KaFai Lau Feb. 23, 2024, 2:16 a.m. UTC | #3
On 2/22/24 5:35 PM, Kui-Feng Lee wrote:
>> [ ... ]
>>
>> Something like this (untested code):
>>
>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
>>                      struct bpf_tramp_link *link,
>>                      const struct btf_func_model *model,
>>                      void *stub_func, void *image,
>>                      u32 *image_off,
>>                      bool allow_alloc)

To be a little more specific, the changes in bpf_struct_ops_map_update_elem()
could be mostly like this (untested):

		ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link,
							      &st_ops->func_models[i],
							      *(void **)(st_ops->cfi_stubs + moff),
							      image, &image_off,
							      st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
		if (IS_ERR(ret_image))
			goto reset_unlock;

		if (image != ret_image) {
			image = ret_image;
			st_map->image_pages[st_map->image_pages_cnt++] = image;
		}

> 
> [ ... ]
>      if (!image || size > PAGE_SIZE - *image_off) {
>          if (!st_map)

Why only limit to st_map != NULL?

arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops.
If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as well simplify
bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc.
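
For illustration, a simplified bpf_dummy_ops call site might then look
something like this (untested sketch against the signature suggested
above; op_idx and stub_func are stand-ins, and this is not the actual
net/bpf/bpf_dummy_struct_ops.c code):

	u32 image_off = 0;
	void *image;

	/* image == NULL and allow_alloc == true ask the helper to
	 * charge modmem and allocate a fresh trampoline page itself.
	 */
	image = bpf_struct_ops_prepare_trampoline(tlinks, link,
						  &st_ops->func_models[op_idx],
						  stub_func, NULL, &image_off,
						  true);
	if (IS_ERR(image))
		return PTR_ERR(image);
	arch_protect_bpf_trampoline(image, PAGE_SIZE);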

> [ ... ]
Kui-Feng Lee Feb. 23, 2024, 3:01 a.m. UTC | #4
On 2/22/24 18:16, Martin KaFai Lau wrote:
> [ ... ]
> 
> To be a little more specific, the changes in bpf_struct_ops_map_update_elem()
> could be mostly like this (untested):
> 
>          ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link,
>                                    &st_ops->func_models[i],
>                                    *(void **)(st_ops->cfi_stubs + moff),
>                                    image, &image_off,
>                                    st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
>          if (IS_ERR(ret_image))
>              goto reset_unlock;
> 
>          if (image != ret_image) {
>              image = ret_image;
>              st_map->image_pages[st_map->image_pages_cnt++] = image;
>          }
> 

What I don't like is that the memory management code was in two named
functions, bpf_struct_ops_map_free_image() and
bpf_struct_ops_map_inc_image().
Now it falls apart: allocation in one place, accounting in another
place, and freeing in yet a third.

>> [ ... ]
>>      if (!image || size > PAGE_SIZE - *image_off) {
>>          if (!st_map)
> 
> Why only limit to st_map != NULL?
> 
> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops.
> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as well
> simplify bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline()
> to alloc.


Yes, it can save a few lines in bpf_dummy_ops. But bpf_dummy_ops
still needs to free the memory, and that doesn't pair the alloc and
free in the same function. Usually, pairing alloc and free in the same
function, nearby, or in the same module is easier to understand.
[ skip ]
Martin KaFai Lau Feb. 23, 2024, 5:25 a.m. UTC | #5
On 2/22/24 7:01 PM, Kui-Feng Lee wrote:
> [ ... ]
> 
> What I don't like is that the memory management code was in two named
> functions, bpf_struct_ops_map_free_image() and
> bpf_struct_ops_map_inc_image().

bpf_struct_ops_map_inc_image() is not needed.

> Now it falls apart: allocation in one place, accounting in another
> place, and freeing in yet a third.
> 
>>> [ ... ]
>>>      if (!image || size > PAGE_SIZE - *image_off) {
>>>          if (!st_map)
>>
>> Why only limit to st_map != NULL?
>>
>> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops.
>> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as well simplify
>> bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc.
> 
> 
> Yes, it can save a few lines in bpf_dummy_ops. But bpf_dummy_ops
> still needs to free the memory, and that doesn't pair the alloc and
> free in the same function. Usually, pairing alloc and free in the same
> function, nearby, or in the same module is easier to understand.

It is not only about saving a few lines. It just does not make sense to
add all this complexity and another "_"bpf_struct_ops_prepare_trampoline()
variant to make it conform to the alloc/free pairing idea. To be clear, I
don't see an alloc/free pair as a must-have in all cases. There are many
situations where a function without "alloc" in its name calls kmalloc()
in multiple places and one xyz_free() releases everything. Even where
alloc/free pairing is really preferred, there has to be a better way, or
else a trade-off has to be made.

I suggested the high-level idea of making
bpf_struct_ops_prepare_trampoline() allocate the page. We can surely add
a bpf_struct_ops_free_trampoline() if you see fit, to match
bpf_struct_ops_prepare_trampoline() as an alloc/free pair, for example,

void bpf_struct_ops_free_trampoline(void *image)
{
	bpf_jit_uncharge_modmem(PAGE_SIZE);
	arch_free_bpf_trampoline(image, PAGE_SIZE);
}

and make bpf_struct_ops_map_free_image() use
bpf_struct_ops_free_trampoline():

static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
	int i;

	for (i = 0; i < st_map->image_pages_cnt; i++) {
		bpf_struct_ops_free_trampoline(st_map->image_pages[i]);
		st_map->image_pages[i] = NULL;
	}
	st_map->image_pages_cnt = 0;
}

Then it should work like an alloc/free pair.
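
A caller that let bpf_struct_ops_prepare_trampoline() allocate its page
(e.g. the hypothetical simplified bpf_dummy_ops sketch earlier in the
thread) would then release it with the matching helper; a minimal
sketch, assuming the caller kept the image pointer:

	/* teardown path in the hypothetical caller */
	if (!IS_ERR_OR_NULL(image))
		bpf_struct_ops_free_trampoline(image);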
Kui-Feng Lee Feb. 23, 2024, 5:36 p.m. UTC | #6
On 2/22/24 21:25, Martin KaFai Lau wrote:
> On 2/22/24 7:01 PM, Kui-Feng Lee wrote:
>>
>>
>>
>> On 2/22/24 18:16, Martin KaFai Lau wrote:
>>> On 2/22/24 5:35 PM, Kui-Feng Lee wrote:
>>>>
>>>>
>>>> On 2/22/24 16:33, Martin KaFai Lau wrote:
>>>>> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote:
>>>>>> @@ -531,10 +567,10 @@ static long 
>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>>>       const struct btf_type *module_type;
>>>>>>       const struct btf_member *member;
>>>>>>       const struct btf_type *t = st_ops_desc->type;
>>>>>> +    void *image = NULL, *image_end = NULL;
>>>>>>       struct bpf_tramp_links *tlinks;
>>>>>>       void *udata, *kdata;
>>>>>>       int prog_fd, err;
>>>>>> -    void *image, *image_end;
>>>>>>       u32 i;
>>>>>>       if (flags)
>>>>>> @@ -573,15 +609,14 @@ static long 
>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>>>       udata = &uvalue->data;
>>>>>>       kdata = &kvalue->data;
>>>>>> -    image = st_map->image;
>>>>>> -    image_end = st_map->image + PAGE_SIZE;
>>>>>>       module_type = btf_type_by_id(btf_vmlinux, 
>>>>>> st_ops_ids[IDX_MODULE_ID]);
>>>>>>       for_each_member(i, t, member) {
>>>>>>           const struct btf_type *mtype, *ptype;
>>>>>>           struct bpf_prog *prog;
>>>>>>           struct bpf_tramp_link *link;
>>>>>> -        u32 moff;
>>>>>> +        u32 moff, tflags;
>>>>>> +        int tsize;
>>>>>>           moff = __btf_member_bit_offset(t, member) / 8;
>>>>>>           ptype = btf_type_resolve_ptr(st_map->btf, member->type, 
>>>>>> NULL);
>>>>>> @@ -653,10 +688,38 @@ static long 
>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>>>                     &bpf_struct_ops_link_lops, prog);
>>>>>>           st_map->links[i] = &link->link;
>>>>>> -        err = bpf_struct_ops_prepare_trampoline(tlinks, link,
>>>>>> -                            &st_ops->func_models[i],
>>>>>> -                            *(void **)(st_ops->cfi_stubs + moff),
>>>>>> -                            image, image_end);
>>>>>> +        tflags = BPF_TRAMP_F_INDIRECT;
>>>>>> +        if (st_ops->func_models[i].ret_size > 0)
>>>>>> +            tflags |= BPF_TRAMP_F_RET_FENTRY_RET;
>>>>>> +
>>>>>> +        /* Compute the size of the trampoline */
>>>>>> +        tlinks[BPF_TRAMP_FENTRY].links[0] = link;
>>>>>> +        tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
>>>>>> +        tsize = arch_bpf_trampoline_size(&st_ops->func_models[i],
>>>>>> +                         tflags, tlinks, NULL);
>>>>>> +        if (tsize < 0) {
>>>>>> +            err = tsize;
>>>>>> +            goto reset_unlock;
>>>>>> +        }
>>>>>> +
>>>>>> +        /* Allocate pages */
>>>>>> +        if (tsize > (unsigned long)image_end - (unsigned 
>>>>>> long)image) {
>>>>>> +            if (tsize > PAGE_SIZE) {
>>>>>> +                err = -E2BIG;
>>>>>> +                goto reset_unlock;
>>>>>> +            }
>>>>>> +            image = bpf_struct_ops_map_inc_image(st_map);
>>>>>> +            if (IS_ERR(image)) {
>>>>>> +                err = PTR_ERR(image);
>>>>>> +                goto reset_unlock;
>>>>>> +            }
>>>>>> +            image_end = image + PAGE_SIZE;
>>>>>> +        }
>>>>>> +
>>>>>> +        err = arch_prepare_bpf_trampoline(NULL, image, image_end,
>>>>>> +                          &st_ops->func_models[i],
>>>>>> +                          tflags, tlinks,
>>>>>> +                          *(void **)(st_ops->cfi_stubs + moff));
>>>>>
>>>>> I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, 
>>>>> and the arch_*_trampoline_*() logic from 
>>>>> bpf_struct_ops_prepare_trampoline() which is used by the 
>>>>> bpf_dummy_ops for testing also. Considering struct_ops supports 
>>>>> kernel module now, in the future, it is better to move 
>>>>> bpf_dummy_ops out to the bpf_testmod somehow and avoid its 
>>>>> bpf_struct_ops_prepare_trampoline() usage. For now, it is still 
>>>>> better to keep bpf_struct_ops_prepare_trampoline() to be reusable 
>>>>> by both.
>>>>>
>>>>> Have you thought about the earlier suggestion in v1 to do 
>>>>> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() 
>>>>> instead of copying codes from bpf_struct_ops_prepare_trampoline() 
>>>>> to bpf_struct_ops_map_update_elem()?
>>>>>
>>>>> Something like this (untested code):
>>>>>
>>>>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links 
>>>>> *tlinks,
>>>>>                      struct bpf_tramp_link *link,
>>>>>                      const struct btf_func_model *model,
>>>>>                      void *stub_func, void *image,
>>>>>                      u32 *image_off,
>>>>>                      bool allow_alloc)
>>>
>>> To be a little more specific, the changes in 
>>> bpf_struct_ops_map_update_elem()
>>> could be mostly like this (untested):
>>>
>>>          ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link,
>>>                                    &st_ops->func_models[i],
>>>                                    *(void **)(st_ops->cfi_stubs + moff),
>>>                                    image, &image_off,
>>>                                    st_map->image_pages_cnt < 
>>> MAX_TRAMP_IMAGE_PAGES);
>>>          if (IS_ERR(ret_image))
>>>              goto reset_unlock;
>>>
>>>          if (image != ret_image) {
>>>              image = ret_image;
>>>              st_map->image_pages[st_map->image_pages_cnt++] = image;
>>>          }
>>>
>>
>> What I don't like is the memory management code was in two named
>> functions, bpf_struct_ops_map_free_image() and
>> bpf_struct_ops_map_inc_image().
> 
> bpf_struct_ops_map_inc_image() is not needed.
> 
>> Now, it falls apart.  Allocate in one place, keep accounting in another
>> place, and free yet at the 3rd place.
>>
>>>>
>>>>
>>>> How about pass a struct bpf_struct_ops_map to
>>>> bpf_struct_ops_prepare_trampoline(). If the pointer of struct
>>>> bpf_struct_ops_map is not NULL, try to allocate new pages for the map?
>>>>
>>>> For example,
>>>>
>>>> static int
>>>> _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
>>>>
>>>>                                     struct bpf_tramp_link *link,
>>>>
>>>>                                     const struct btf_func_model *model,
>>>>
>>>>                                     void *stub_func, void *image,
>>>>                                     void *image_end,
>>>>                                     struct bpf_struct_ops_map *st_map)
>>>> {
>>>>
>>>> ...
>>>>
>>>>      if (!image || size > PAGE_SIZE - *image_off) {
>>>>          if (!st_map)
>>>
>>> Why only limit to st_map != NULL?
>>>
>>> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops.
>>> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as 
>>> well simplify
>>> bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc.
>>
>>
>> Yes, it can save a few lines in bpf_dummy_ops. But bpf_dummy_ops
>> still needs to free the memory, and that doesn't pair alloc and free in
>> the same function. Usually, pairing alloc and free in the same function,
>> nearby, or in the same module is easier to understand.
> 
> It is not only about saving a few lines. It just does not make sense to
> add all this complexity and another "_"bpf_struct_ops_prepare_trampoline()
> variant to make it conform to the alloc/free-pair idea. To be clear, I
> don't

To be clear, we are not talking about computational or memory complexity
here. I consider the complexity in another way. When I look at the code of
bpf_dummy_ops and see it free the memory at the very end of a function,
I have to guess who allocated the memory by looking around, without a
clear sign or hint, if we move the allocation to
bpf_struct_ops_prepare_trampoline(). That is a source of complexity.
Very often, a duplication is much simpler and easier to understand,
especially when the duplication follows a very well-known/recognized
pattern. Here we would replace a well-recognized pattern with an unusual
one in order to simplify the code.
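
For illustration, a minimal untested sketch of the pattern I mean, where
the alloc and the free are visible in the same function (the xyz_* names
are made up):

static int xyz_test_run(void)
{
	void *image;
	int err;

	image = arch_alloc_bpf_trampoline(PAGE_SIZE);	/* alloc here */
	if (!image)
		return -ENOMEM;

	/* the callee only fills the page; it does not own it */
	err = xyz_fill_trampoline(image);

	arch_free_bpf_trampoline(image, PAGE_SIZE);	/* free here, nearby */
	return err;
}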

My reason for duplicating the code from
bpf_struct_ops_prepare_trampoline() was that we wouldn't need
bpf_struct_ops_prepare_trampoline() in the future if we were going to move
bpf_dummy_ops out. But, just like you said, we still have bpf_dummy_ops
now, so it is a good trade-off to move the memory allocation into
bpf_struct_ops_prepare_trampoline() to avoid duplicating the code for
flags and tlinks. But the trade-off we are talking about here goes the
opposite way.

By the way, I am not insisting on these tiny details. I am just trying
to explain what I don't like here.

> see an alloc/free pair as a must-have in all cases. There are many
> situations where a non-alloc-named function calls kmalloc() multiple
> times in different places and one xyz_free() releases everything. Even if
> alloc/free is really preferred, there has to be a better way, or else a
> trade-off needs to be made.
> 
> I suggested the high-level idea of making
> bpf_struct_ops_prepare_trampoline() allocate the page. We can surely add a
> bpf_struct_ops_free_trampoline() if you see fit, to match
> bpf_struct_ops_prepare_trampoline() as an alloc/free pair, for example,
> 
> void bpf_struct_ops_free_trampoline(void *image)
> {
>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>      arch_free_bpf_trampoline(image, PAGE_SIZE);
> }
> 
> and make bpf_struct_ops_map_free_image() to use
> bpf_struct_ops_free_trampoline()
> 
> static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map 
> *st_map)
> {
>      int i;
> 
>      for (i = 0; i < st_map->image_pages_cnt; i++) {
>          bpf_struct_ops_free_trampoline(st_map->image_pages[i]);
>          st_map->image_pages[i] = NULL;
>      }
>      st_map->image_pages_cnt = 0;
> }
> 
> Then it should work like alloc/free pair.
Kui-Feng Lee Feb. 23, 2024, 6:29 p.m. UTC | #7
On 2/23/24 09:36, Kui-Feng Lee wrote:
> 
> 
> On 2/22/24 21:25, Martin KaFai Lau wrote:
>> On 2/22/24 7:01 PM, Kui-Feng Lee wrote:
>>>
>>>
>>>
>>> On 2/22/24 18:16, Martin KaFai Lau wrote:
>>>> On 2/22/24 5:35 PM, Kui-Feng Lee wrote:
>>>>>
>>>>>
>>>>> On 2/22/24 16:33, Martin KaFai Lau wrote:
>>>>>> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote:
>>>>>>> @@ -531,10 +567,10 @@ static long 
>>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>>>>       const struct btf_type *module_type;
>>>>>>>       const struct btf_member *member;
>>>>>>>       const struct btf_type *t = st_ops_desc->type;
>>>>>>> +    void *image = NULL, *image_end = NULL;
>>>>>>>       struct bpf_tramp_links *tlinks;
>>>>>>>       void *udata, *kdata;
>>>>>>>       int prog_fd, err;
>>>>>>> -    void *image, *image_end;
>>>>>>>       u32 i;
>>>>>>>       if (flags)
>>>>>>> @@ -573,15 +609,14 @@ static long 
>>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>>>>       udata = &uvalue->data;
>>>>>>>       kdata = &kvalue->data;
>>>>>>> -    image = st_map->image;
>>>>>>> -    image_end = st_map->image + PAGE_SIZE;
>>>>>>>       module_type = btf_type_by_id(btf_vmlinux, 
>>>>>>> st_ops_ids[IDX_MODULE_ID]);
>>>>>>>       for_each_member(i, t, member) {
>>>>>>>           const struct btf_type *mtype, *ptype;
>>>>>>>           struct bpf_prog *prog;
>>>>>>>           struct bpf_tramp_link *link;
>>>>>>> -        u32 moff;
>>>>>>> +        u32 moff, tflags;
>>>>>>> +        int tsize;
>>>>>>>           moff = __btf_member_bit_offset(t, member) / 8;
>>>>>>>           ptype = btf_type_resolve_ptr(st_map->btf, member->type, 
>>>>>>> NULL);
>>>>>>> @@ -653,10 +688,38 @@ static long 
>>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>>>>                     &bpf_struct_ops_link_lops, prog);
>>>>>>>           st_map->links[i] = &link->link;
>>>>>>> -        err = bpf_struct_ops_prepare_trampoline(tlinks, link,
>>>>>>> -                            &st_ops->func_models[i],
>>>>>>> -                            *(void **)(st_ops->cfi_stubs + moff),
>>>>>>> -                            image, image_end);
>>>>>>> +        tflags = BPF_TRAMP_F_INDIRECT;
>>>>>>> +        if (st_ops->func_models[i].ret_size > 0)
>>>>>>> +            tflags |= BPF_TRAMP_F_RET_FENTRY_RET;
>>>>>>> +
>>>>>>> +        /* Compute the size of the trampoline */
>>>>>>> +        tlinks[BPF_TRAMP_FENTRY].links[0] = link;
>>>>>>> +        tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
>>>>>>> +        tsize = arch_bpf_trampoline_size(&st_ops->func_models[i],
>>>>>>> +                         tflags, tlinks, NULL);
>>>>>>> +        if (tsize < 0) {
>>>>>>> +            err = tsize;
>>>>>>> +            goto reset_unlock;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        /* Allocate pages */
>>>>>>> +        if (tsize > (unsigned long)image_end - (unsigned 
>>>>>>> long)image) {
>>>>>>> +            if (tsize > PAGE_SIZE) {
>>>>>>> +                err = -E2BIG;
>>>>>>> +                goto reset_unlock;
>>>>>>> +            }
>>>>>>> +            image = bpf_struct_ops_map_inc_image(st_map);
>>>>>>> +            if (IS_ERR(image)) {
>>>>>>> +                err = PTR_ERR(image);
>>>>>>> +                goto reset_unlock;
>>>>>>> +            }
>>>>>>> +            image_end = image + PAGE_SIZE;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        err = arch_prepare_bpf_trampoline(NULL, image, image_end,
>>>>>>> +                          &st_ops->func_models[i],
>>>>>>> +                          tflags, tlinks,
>>>>>>> +                          *(void **)(st_ops->cfi_stubs + moff));
>>>>>>
>>>>>> I'd prefer not to copy the BPF_TRAMP_F_* setting on tflags,
>>>>>> tlinks, and the arch_*_trampoline_*() logic from
>>>>>> bpf_struct_ops_prepare_trampoline(), which is also used by
>>>>>> bpf_dummy_ops for testing. Considering struct_ops supports
>>>>>> kernel modules now, in the future it is better to move
>>>>>> bpf_dummy_ops out to the bpf_testmod somehow and avoid its
>>>>>> bpf_struct_ops_prepare_trampoline() usage. For now, it is still
>>>>>> better to keep bpf_struct_ops_prepare_trampoline() reusable
>>>>>> by both.
>>>>>>
>>>>>> Have you thought about the earlier suggestion in v1 to do
>>>>>> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline()
>>>>>> instead of copying code from bpf_struct_ops_prepare_trampoline()
>>>>>> to bpf_struct_ops_map_update_elem()?
>>>>>>
>>>>>> Something like this (untested code):
>>>>>>
>>>>>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links 
>>>>>> *tlinks,
>>>>>>                      struct bpf_tramp_link *link,
>>>>>>                      const struct btf_func_model *model,
>>>>>>                      void *stub_func, void *image,
>>>>>>                      u32 *image_off,
>>>>>>                      bool allow_alloc)
>>>>
>>>> To be a little more specific, the changes in 
>>>> bpf_struct_ops_map_update_elem()
>>>> could be mostly like this (untested):
>>>>
>>>>          ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link,
>>>>                                    &st_ops->func_models[i],
>>>>                                    *(void **)(st_ops->cfi_stubs + 
>>>> moff),
>>>>                                    image, &image_off,
>>>>                                    st_map->image_pages_cnt < 
>>>> MAX_TRAMP_IMAGE_PAGES);
>>>>          if (IS_ERR(ret_image))
>>>>              goto reset_unlock;
>>>>
>>>>          if (image != ret_image) {
>>>>              image = ret_image;
>>>>              st_map->image_pages[st_map->image_pages_cnt++] = image;
>>>>          }
>>>>
>>>
>>> What I don't like is that the memory-management code was in two named
>>> functions, bpf_struct_ops_map_free_image() and
>>> bpf_struct_ops_map_inc_image().
>>
>> bpf_struct_ops_map_inc_image() is not needed.
>>
>>> Now it falls apart: allocate in one place, keep the accounting in
>>> another place, and free in yet a third place.
>>>
>>>>>
>>>>>
>>>>> How about passing a struct bpf_struct_ops_map to
>>>>> bpf_struct_ops_prepare_trampoline()? If the pointer of struct
>>>>> bpf_struct_ops_map is not NULL, try to allocate new pages for the map.
>>>>>
>>>>> For example,
>>>>>
>>>>> static int
>>>>> _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
>>>>>
>>>>>                                     struct bpf_tramp_link *link,
>>>>>
>>>>>                                     const struct btf_func_model 
>>>>> *model,
>>>>>
>>>>>                                     void *stub_func, void *image,
>>>>>                                     void *image_end,
>>>>>                                     struct bpf_struct_ops_map *st_map)
>>>>> {
>>>>>
>>>>> ...
>>>>>
>>>>>      if (!image || size > PAGE_SIZE - *image_off) {
>>>>>          if (!st_map)
>>>>
>>>> Why only limit to st_map != NULL?
>>>>
>>>> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops.
>>>> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as 
>>>> well simplify
>>>> bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to 
>>>> alloc.
>>>
>>>
>>> Yes, it can save a few lines in bpf_dummy_ops. But bpf_dummy_ops
>>> still needs to free the memory, and that doesn't pair alloc and free in
>>> the same function. Usually, pairing alloc and free in the same function,
>>> nearby, or in the same module is easier to understand.
>>
>> It is not only about saving a few lines. It just does not make sense to
>> add all this complexity and another
>> "_"bpf_struct_ops_prepare_trampoline() variant to make it conform to the
>> alloc/free-pair idea. To be clear, I
>> don't
> 
> To be clear, we are not talking about computational or memory complexity
> here. I consider the complexity in another way. When I look at the code of
> bpf_dummy_ops and see it free the memory at the very end of a function,
> I have to guess who allocated the memory by looking around, without a
> clear sign or hint, if we move the allocation to
> bpf_struct_ops_prepare_trampoline(). That is a source of complexity.
> Very often, a duplication is much simpler and easier to understand,
> especially when the duplication follows a very well-known/recognized
> pattern. Here we would replace a well-recognized pattern with an unusual
> one in order to simplify the code.
> 
> My reason for duplicating the code from
> bpf_struct_ops_prepare_trampoline() was that we wouldn't need
> bpf_struct_ops_prepare_trampoline() in the future if we were going to move
> bpf_dummy_ops out. But, just like you said, we still have bpf_dummy_ops
> now, so it is a good trade-off to move the memory allocation into
> bpf_struct_ops_prepare_trampoline() to avoid duplicating the code for
> flags and tlinks. But the trade-off we are talking about here goes the
> opposite way.
> 
> By the way, I am not insisting on these tiny details. I am just trying
> to explain what I don't like here.

One thing I forgot to mention is that bpf_dummy_ops has to call
bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move
bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(),
meaning bpf_struct_ops_map_update_elem() should handle the case where the
allocation in bpf_struct_ops_prepare_trampoline() succeeds but
bpf_jit_charge_modmem() fails.
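
Something like this (untested; arguments elided, and reset_unlock is the
label already in bpf_struct_ops_map_update_elem()) is what the caller
would then need:

	image = bpf_struct_ops_prepare_trampoline(...);	/* page already allocated */
	if (IS_ERR(image)) {
		err = PTR_ERR(image);
		goto reset_unlock;
	}

	err = bpf_jit_charge_modmem(PAGE_SIZE);
	if (err) {
		/* the allocation succeeded but the charge failed,
		 * so the page has to be unwound here
		 */
		arch_free_bpf_trampoline(image, PAGE_SIZE);
		goto reset_unlock;
	}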


> 
>> see an alloc/free pair as a must-have in all cases. There are many
>> situations where a non-alloc-named function calls kmalloc() multiple
>> times in different places and one xyz_free() releases everything. Even if
>> alloc/free is really preferred, there has to be a better way, or else a
>> trade-off needs to be made.
>>
>> I suggested the high-level idea of making
>> bpf_struct_ops_prepare_trampoline() allocate the page. We can surely add a
>> bpf_struct_ops_free_trampoline() if you see fit, to match
>> bpf_struct_ops_prepare_trampoline() as an alloc/free pair, for example,
>>
>> void bpf_struct_ops_free_trampoline(void *image)
>> {
>>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>>      arch_free_bpf_trampoline(image, PAGE_SIZE);
>> }
>>
>> and make bpf_struct_ops_map_free_image() to use
>> bpf_struct_ops_free_trampoline()
>>
>> static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map 
>> *st_map)
>> {
>>      int i;
>>
>>      for (i = 0; i < st_map->image_pages_cnt; i++) {
>>          bpf_struct_ops_free_trampoline(st_map->image_pages[i]);
>>          st_map->image_pages[i] = NULL;
>>      }
>>      st_map->image_pages_cnt = 0;
>> }
>>
>> Then it should work like alloc/free pair.
Martin KaFai Lau Feb. 23, 2024, 6:32 p.m. UTC | #8
On 2/23/24 9:36 AM, Kui-Feng Lee wrote:
> 
> To be clear, we are not talking about computational or memory complexity
> here. I consider the complexity in another way. When I look at the code of
> bpf_dummy_ops and see it free the memory at the very end of a function,
> I have to guess who allocated the memory by looking around, without a
> clear sign or hint, if we move the allocation to
> bpf_struct_ops_prepare_trampoline(). That is a source of complexity.

It still sounds like a naming perception issue more than a practical code-wise
complexity/readability problem. Rename it to
bpf_struct_ops_"s/prepare/alloc/"_trampoline() if that makes it more obvious
that it is an alloc function. imo, that function returning a page is a clear
sign that it can alloc, but I don't mind renaming it if that helps make it
sound more like an alloc/free pair.

> Very often, a duplication is much simpler and easier to understand,
> especially when the duplication follows a very well-known/recognized
> pattern. Here we would replace a well-recognized pattern with an unusual
> one in order to simplify the code.

Sorry, I don't agree on this: this patch is duplicating lines of code that
are not obvious, like the BPF_TRAMP_F_* setting. At least I often have to go
back to arch_prepare_bpf_trampoline() to understand how it works.

Not copy-and-pasting this piece of code everywhere is more important than
making bpf_dummy_ops look better.

> 
> My reason for duplicating the code from
> bpf_struct_ops_prepare_trampoline() was that we wouldn't need
> bpf_struct_ops_prepare_trampoline() in the future if we were going to move
> bpf_dummy_ops out. But, just like you said, we still have bpf_dummy_ops

Yep, it will be great to move bpf_dummy_ops out, but how it can be done and
whether it can remove its bpf_struct_ops_prepare_trampoline() usage is still
TBD. I think it should be possible. Even if it is moved out in the future,
bpf_struct_ops_(prepare|alloc)_trampoline() can be kept as is.

> now, so it is a good trade-off to move the memory allocation into
> bpf_struct_ops_prepare_trampoline() to avoid duplicating the code for
> flags and tlinks. But the trade-off we are talking about here goes the
> opposite way.
Martin KaFai Lau Feb. 23, 2024, 6:42 p.m. UTC | #9
On 2/23/24 10:29 AM, Kui-Feng Lee wrote:
> One thing I forgot to mention is that bpf_dummy_ops has to call
> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move
> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(),
> meaning bpf_struct_ops_map_update_elem() should handle the case where the
> allocation in bpf_struct_ops_prepare_trampoline() succeeds but
> bpf_jit_charge_modmem() fails.

Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline().

It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. There is 
no need to optimize for bpf_dummy_ops. Use bpf_struct_ops_free_trampoline() in 
bpf_dummy_ops to uncharge and free.


>>> void bpf_struct_ops_free_trampoline(void *image)
>>> {
>>>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>>>      arch_free_bpf_trampoline(image, PAGE_SIZE);
>>> }
>>>
Kui-Feng Lee Feb. 23, 2024, 7:05 p.m. UTC | #10
On 2/23/24 10:42, Martin KaFai Lau wrote:
> On 2/23/24 10:29 AM, Kui-Feng Lee wrote:
>> One thing I forgot to mention is that bpf_dummy_ops has to call
>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move
>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(),
>> meaning bpf_struct_ops_map_update_elem() should handle the case where the
>> allocation in bpf_struct_ops_prepare_trampoline() succeeds but
>> bpf_jit_charge_modmem() fails.
> 
> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline().
> 
> It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. 
> There is no need to optimize for bpf_dummy_ops. Use 
> bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free.


Then, I don't get the point here.
I agree with moving the allocation into
bpf_struct_ops_prepare_trampoline() to avoid duplication of the code
for flags and tlinks. It really simplifies the code, given that
bpf_dummy_ops is still there. So, I tried to pass an st_map to
bpf_struct_ops_prepare_trampoline() to keep the page-management code
together. But you said to simplify the code of bpf_dummy_ops by
allocating pages in bpf_struct_ops_prepare_trampoline() and doing the
bookkeeping in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't
have to allocate memory. But then we have to move a
bpf_jit_uncharge_modmem() into bpf_dummy_ops. For me, a trade-off that
consists of removing an allocation and adding a
bpf_jit_uncharge_modmem() makes no sense.

> 
> 
>>>> void bpf_struct_ops_free_trampoline(void *image)
>>>> {
>>>>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>>>>      arch_free_bpf_trampoline(image, PAGE_SIZE);
>>>> }
>>>>
>
Martin KaFai Lau Feb. 23, 2024, 7:15 p.m. UTC | #11
On 2/23/24 11:05 AM, Kui-Feng Lee wrote:
> 
> 
> 
> On 2/23/24 10:42, Martin KaFai Lau wrote:
>> On 2/23/24 10:29 AM, Kui-Feng Lee wrote:
>>> One thing I forgot to mention is that bpf_dummy_ops has to call
>>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move
>>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(),
>>> meaning bpf_struct_ops_map_update_elem() should handle the case where the
>>> allocation in bpf_struct_ops_prepare_trampoline() succeeds but
>>> bpf_jit_charge_modmem() fails.
>>
>> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline().
>>
>> It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. There 
>> is no need to optimize for bpf_dummy_ops. Use bpf_struct_ops_free_trampoline() 
>> in bpf_dummy_ops to uncharge and free.
> 
> 
> Then, I don't get the point here.
> I agree with moving the allocation into
> bpf_struct_ops_prepare_trampoline() to avoid duplication of the code
> for flags and tlinks. It really simplifies the code, given that
> bpf_dummy_ops is still there. So, I tried to pass an st_map to
> bpf_struct_ops_prepare_trampoline() to keep the page-management code
> together. But you said to simplify the code of bpf_dummy_ops by
> allocating pages in bpf_struct_ops_prepare_trampoline() and doing the
> bookkeeping in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't
> have to

I don't think I ever mentioned doing the bookkeeping in
bpf_struct_ops_map_update_elem(). Have you looked at my earlier code in
bpf_struct_ops_prepare_trampoline(), which does the memory charging as well?

> allocate memory. But then we have to move a bpf_jit_uncharge_modmem()
> into bpf_dummy_ops. For me, a trade-off that consists of removing an
> allocation and adding a bpf_jit_uncharge_modmem() makes no sense.

Which part makes no sense? Having bpf_dummy_ops charge/uncharge memory as well?

bpf_dummy_ops uses the bpf_struct_ops_free_trampoline() below, which does
the uncharge and free. bpf_struct_ops_prepare_trampoline() does the charge
and alloc. charge/alloc matches uncharge/free.
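
As an untested sketch (image, err, and the image_off parameter are assumed
from the earlier suggestion), the charge+alloc side inside
bpf_struct_ops_prepare_trampoline() would be:

	/* when a new page is needed and allow_alloc is true:
	 * charge first, then allocate
	 */
	err = bpf_jit_charge_modmem(PAGE_SIZE);
	if (err)
		return ERR_PTR(err);

	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
	if (!image) {
		bpf_jit_uncharge_modmem(PAGE_SIZE);
		return ERR_PTR(-ENOMEM);
	}
	*image_off = 0;	/* fresh page, the offset starts over */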

> 
>>
>>
>>>>> void bpf_struct_ops_free_trampoline(void *image)
>>>>> {
>>>>>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>>>>>      arch_free_bpf_trampoline(image, PAGE_SIZE);
>>>>> }
>>>>>
>>
Kui-Feng Lee Feb. 23, 2024, 10:06 p.m. UTC | #12
On 2/23/24 11:15, Martin KaFai Lau wrote:
> On 2/23/24 11:05 AM, Kui-Feng Lee wrote:
>>
>>
>>
>> On 2/23/24 10:42, Martin KaFai Lau wrote:
>>> On 2/23/24 10:29 AM, Kui-Feng Lee wrote:
>>>> One thing I forgot to mention is that bpf_dummy_ops has to call
>>>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move
>>>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(),
>>>> meaning bpf_struct_ops_map_update_elem() should handle the case where
>>>> the
>>>> allocation in bpf_struct_ops_prepare_trampoline() succeeds but
>>>> bpf_jit_charge_modmem() fails.
>>>
>>> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline().
>>>
>>> It is fine to have bpf_dummy_ops charge and then uncharge a 
>>> PAGE_SIZE. There is no need to optimize for bpf_dummy_ops. Use 
>>> bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free.
>>
>>
>> Then, I don't get the point here.
>> I agree with moving the allocation into
>> bpf_struct_ops_prepare_trampoline() to avoid duplication of the code
>> for flags and tlinks. It really simplifies the code, given that
>> bpf_dummy_ops is still there. So, I tried to pass an st_map to
>> bpf_struct_ops_prepare_trampoline() to keep the page-management code
>> together. But you said to simplify the code of bpf_dummy_ops by
>> allocating pages in bpf_struct_ops_prepare_trampoline() and doing the
>> bookkeeping in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't
>> have to
> 
> I don't think I ever mentioned doing the bookkeeping in
> bpf_struct_ops_map_update_elem(). Have you looked at my earlier code in
> bpf_struct_ops_prepare_trampoline(), which does the memory charging as
> well?

The bookkeeping I am referring to is the maintenance of image_pages and
image_pages_cnt.

> 
>> allocate memory. But then we have to move a bpf_jit_uncharge_modmem()
>> into bpf_dummy_ops. For me, a trade-off that consists of removing an
>> allocation and adding a bpf_jit_uncharge_modmem() makes no sense.
> 
> Which part makes no sense? Having bpf_dummy_ops charge/uncharge memory as well?

Simplifying bpf_dummy_ops by removing the duty of allocation but adding
the duty of uncharging memory doesn't make sense to me in terms of
simplification. Although the number of lines of code would be similar, it
actually makes things more complicated than before.

> 
> bpf_dummy_ops uses the bpf_struct_ops_free_trampoline() below, which
> does the uncharge and free. bpf_struct_ops_prepare_trampoline() does
> the charge and alloc.
> charge/alloc matches uncharge/free.
> 
>>
>>>
>>>
>>>>>> void bpf_struct_ops_free_trampoline(void *image)
>>>>>> {
>>>>>>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>>>>>>      arch_free_bpf_trampoline(image, PAGE_SIZE);
>>>>>> }
>>>>>>
>>>
>
Martin KaFai Lau Feb. 24, 2024, 3:20 a.m. UTC | #13
On 2/23/24 2:06 PM, Kui-Feng Lee wrote:
> 
> 
> On 2/23/24 11:15, Martin KaFai Lau wrote:
>> On 2/23/24 11:05 AM, Kui-Feng Lee wrote:
>>>
>>>
>>>
>>> On 2/23/24 10:42, Martin KaFai Lau wrote:
>>>> On 2/23/24 10:29 AM, Kui-Feng Lee wrote:
>>>>> One thing I forgot to mention is that bpf_dummy_ops has to call
>>>>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move
>>>>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(),
>>>>> meaning bpf_struct_ops_map_update_elem() should handle the case where the
>>>>> allocation in bpf_struct_ops_prepare_trampoline() succeeds but
>>>>> bpf_jit_charge_modmem() fails.
>>>>
>>>> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline().
>>>>
>>>> It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. There 
>>>> is no need to optimize for bpf_dummy_ops. Use 
>>>> bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free.
>>>
>>>
>>> Then, I don't get the point here.
>>> I agree with moving the allocation into
>>> bpf_struct_ops_prepare_trampoline() to avoid duplication of the code
>>> for flags and tlinks. It really simplifies the code, given that
>>> bpf_dummy_ops is still there. So, I tried to pass an st_map to
>>> bpf_struct_ops_prepare_trampoline() to keep the page-management code
>>> together. But you said to simplify the code of bpf_dummy_ops by
>>> allocating pages in bpf_struct_ops_prepare_trampoline() and doing the
>>> bookkeeping in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't
>>> have to
>>
>> I don't think I ever mentioned doing the bookkeeping in
>> bpf_struct_ops_map_update_elem(). Have you looked at my earlier code in
>> bpf_struct_ops_prepare_trampoline(), which does the memory charging as well?
> 
> The bookkeeping I am referring to is the maintenance of image_pages and
> image_pages_cnt.

image_pages and image_pages_cnt are in the st_map (struct_ops map) itself.
Why can't bpf_struct_ops_map_update_elem() keep track of the pages itself
instead of needing "_"bpf_struct_ops_prepare_trampoline() to handle it? It
is not as if refactoring the image_pages[_cnt] handling into
_bpf_struct_ops_prepare_trampoline() would let it be reused by another
caller. bpf_struct_ops_map_update_elem() is the only one that needs to keep
track of its own image_pages and image_pages_cnt.

> 
>>
>>> allocate memory. But then we have to move a bpf_jit_uncharge_modmem()
>>> into bpf_dummy_ops. For me, a trade-off that consists of removing an
>>> allocation and adding a bpf_jit_uncharge_modmem() makes no sense.
>>
>> Which part makes no sense? Having bpf_dummy_ops charge/uncharge memory as well?
> 
> Simplifying bpf_dummy_ops by removing the duty of allocation but adding
> the duty of uncharging memory doesn't make sense to me in terms of
> simplification. Although the number of lines of code would be similar, it
> actually makes things more complicated than before.

bpf_dummy_ops does not need to call uncharge. It does not need
to know whether the page is charged or not. It uses
bpf_struct_ops_prepare_trampoline() to get a prepared trampoline page and
bpf_struct_ops_free_trampoline(), which does the uncharge+free. This is the
alloc/free concept that you have been proposing, and it fits well here.

bpf_dummy_ops is currently doing arch_alloc_bpf_trampoline() and
arch_free_bpf_trampoline().

The arch_alloc_bpf_trampoline() call will be gone (-1).

The arch_free_bpf_trampoline() call will be replaced (+0) by
bpf_struct_ops_free_trampoline().

Untested code:

diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 02de71719aed..a3bb89eb037f 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -89,8 +89,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
  	struct bpf_dummy_ops_test_args *args;
  	struct bpf_tramp_links *tlinks;
  	struct bpf_tramp_link *link = NULL;
+	unsigned int op_idx, image_off = 0;
  	void *image = NULL;
-	unsigned int op_idx;
  	int prog_ret;
  	s32 type_id;
  	int err;
@@ -114,12 +114,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
  		goto out;
  	}
  
-	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-	if (!image) {
-		err = -ENOMEM;
-		goto out;
-	}
-
  	link = kzalloc(sizeof(*link), GFP_USER);
  	if (!link) {
  		err = -ENOMEM;
@@ -130,12 +124,14 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
  	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog);
  
  	op_idx = prog->expected_attach_type;
-	err = bpf_struct_ops_prepare_trampoline(tlinks, link,
-						&st_ops->func_models[op_idx],
-						&dummy_ops_test_ret_function,
-						image, image + PAGE_SIZE);
-	if (err < 0)
+	image = bpf_struct_ops_prepare_trampoline(tlinks, link,
+						  &st_ops->func_models[op_idx],
+						  &dummy_ops_test_ret_function,
+						  NULL, &image_off, true);
+	if (IS_ERR(image)) {
+		err = PTR_ERR(image);
  		goto out;
+	}
  
  	arch_protect_bpf_trampoline(image, PAGE_SIZE);
  	prog_ret = dummy_ops_call_op(image, args);
@@ -147,7 +143,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
  		err = -EFAULT;
  out:
  	kfree(args);
-	arch_free_bpf_trampoline(image, PAGE_SIZE);
+	if (!IS_ERR_OR_NULL(image))
+		bpf_struct_ops_free_trampoline(image);
  	if (link)
  		bpf_link_put(&link->link);
  	kfree(tlinks);


>>>>>>> void bpf_struct_ops_free_trampoline(void *image)
>>>>>>> {
>>>>>>>      bpf_jit_uncharge_modmem(PAGE_SIZE);
>>>>>>>      arch_free_bpf_trampoline(image, PAGE_SIZE);
>>>>>>> }

^^^^^^^^^^^^^^^
To be clear, this is the bpf_struct_ops_free_trampoline() function that
does the uncharge+free.

~~~~~~~~~~~~~~~~

Let's summarize the thread a bit:

. I think we agree that duplicating code from
   bpf_struct_ops_prepare_trampoline() is not good.

. bpf_struct_ops_prepare_trampoline() can allocate the page.

. The first concern was that there is no alloc/free pairing.
   Adding bpf_struct_ops_free_trampoline() was then suggested
   to do the uncharge+free together,
   while bpf_struct_ops_prepare_trampoline() does the charge+alloc.

. The second concern was that it is not obvious from
   bpf_struct_ops_prepare_trampoline() that a free(page) needs to be done.
   That is more of a naming perception issue, since returning a page is
   already a good signal that it needs to be freed. However, a function
   name change is an option if it can make the "alloc" part more obvious.

. Another concern was that bpf_dummy_ops now needs to remember to
   uncharge. bpf_struct_ops_free_trampoline() should address that too,
   since it does the uncharge+free.

. Another point brought up was having bpf_struct_ops_prepare_trampoline()
   store the allocated page in st_map->image_pages instead of
   map_update doing it itself, which was covered in the first part of
   this email.

I hope the code pieces have addressed the points brought up above.
I have combined these code pieces into a patch
that is only compile-tested. Hopefully it will make things clearer (first patch):
https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=struct_ops.images
diff mbox series

Patch

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index c244ed5114fd..15bf8058161b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -18,6 +18,8 @@  struct bpf_struct_ops_value {
 	char data[] ____cacheline_aligned_in_smp;
 };
 
+#define MAX_TRAMP_IMAGE_PAGES 8
+
 struct bpf_struct_ops_map {
 	struct bpf_map map;
 	struct rcu_head rcu;
@@ -30,12 +32,11 @@  struct bpf_struct_ops_map {
 	 */
 	struct bpf_link **links;
 	u32 links_cnt;
-	/* image is a page that has all the trampolines
+	u32 image_pages_cnt;
+	/* image_pages is an array of pages that has all the trampolines
 	 * that stores the func args before calling the bpf_prog.
-	 * A PAGE_SIZE "image" is enough to store all trampoline for
-	 * "links[]".
 	 */
-	void *image;
+	void *image_pages[MAX_TRAMP_IMAGE_PAGES];
 	/* The owner moduler's btf. */
 	struct btf *btf;
 	/* uvalue->data stores the kernel struct
@@ -456,6 +457,41 @@  static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 	}
 }
 
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+	int i;
+	void *image;
+
+	bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
+	for (i = 0; i < st_map->image_pages_cnt; i++) {
+		image = st_map->image_pages[i];
+		arch_free_bpf_trampoline(image, PAGE_SIZE);
+	}
+	st_map->image_pages_cnt = 0;
+}
+
+static void *bpf_struct_ops_map_inc_image(struct bpf_struct_ops_map *st_map)
+{
+	void *image;
+	int err;
+
+	if (st_map->image_pages_cnt >= MAX_TRAMP_IMAGE_PAGES)
+		return ERR_PTR(-E2BIG);
+
+	err = bpf_jit_charge_modmem(PAGE_SIZE);
+	if (err)
+		return ERR_PTR(err);
+
+	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+	if (!image) {
+		bpf_jit_uncharge_modmem(PAGE_SIZE);
+		return ERR_PTR(-ENOMEM);
+	}
+	st_map->image_pages[st_map->image_pages_cnt++] = image;
+
+	return image;
+}
+
 static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
 {
 	const struct btf_member *member;
@@ -531,10 +567,10 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	const struct btf_type *module_type;
 	const struct btf_member *member;
 	const struct btf_type *t = st_ops_desc->type;
+	void *image = NULL, *image_end = NULL;
 	struct bpf_tramp_links *tlinks;
 	void *udata, *kdata;
 	int prog_fd, err;
-	void *image, *image_end;
 	u32 i;
 
 	if (flags)
@@ -573,15 +609,14 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 	udata = &uvalue->data;
 	kdata = &kvalue->data;
-	image = st_map->image;
-	image_end = st_map->image + PAGE_SIZE;
 
 	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
 	for_each_member(i, t, member) {
 		const struct btf_type *mtype, *ptype;
 		struct bpf_prog *prog;
 		struct bpf_tramp_link *link;
-		u32 moff;
+		u32 moff, tflags;
+		int tsize;
 
 		moff = __btf_member_bit_offset(t, member) / 8;
 		ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
@@ -653,10 +688,38 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			      &bpf_struct_ops_link_lops, prog);
 		st_map->links[i] = &link->link;
 
-		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
-							&st_ops->func_models[i],
-							*(void **)(st_ops->cfi_stubs + moff),
-							image, image_end);
+		tflags = BPF_TRAMP_F_INDIRECT;
+		if (st_ops->func_models[i].ret_size > 0)
+			tflags |= BPF_TRAMP_F_RET_FENTRY_RET;
+
+		/* Compute the size of the trampoline */
+		tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+		tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+		tsize = arch_bpf_trampoline_size(&st_ops->func_models[i],
+						 tflags, tlinks, NULL);
+		if (tsize < 0) {
+			err = tsize;
+			goto reset_unlock;
+		}
+
+		/* Allocate pages */
+		if (tsize > (unsigned long)image_end - (unsigned long)image) {
+			if (tsize > PAGE_SIZE) {
+				err = -E2BIG;
+				goto reset_unlock;
+			}
+			image = bpf_struct_ops_map_inc_image(st_map);
+			if (IS_ERR(image)) {
+				err = PTR_ERR(image);
+				goto reset_unlock;
+			}
+			image_end = image + PAGE_SIZE;
+		}
+
+		err = arch_prepare_bpf_trampoline(NULL, image, image_end,
+						  &st_ops->func_models[i],
+						  tflags, tlinks,
+						  *(void **)(st_ops->cfi_stubs + moff));
 		if (err < 0)
 			goto reset_unlock;
 
@@ -672,10 +735,11 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		if (err)
 			goto reset_unlock;
 	}
+	for (i = 0; i < st_map->image_pages_cnt; i++)
+		arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
 
 	if (st_map->map.map_flags & BPF_F_LINK) {
 		err = 0;
-		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 		/* Let bpf_link handle registration & unregistration.
 		 *
 		 * Pair with smp_load_acquire() during lookup_elem().
@@ -684,7 +748,6 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		goto unlock;
 	}
 
-	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
 		/* This refcnt increment on the map here after
@@ -707,9 +770,9 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 * there was a race in registering the struct_ops (under the same name) to
 	 * a sub-system through different struct_ops's maps.
 	 */
-	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
 
 reset_unlock:
+	bpf_struct_ops_map_free_image(st_map);
 	bpf_struct_ops_map_put_progs(st_map);
 	memset(uvalue, 0, map->value_size);
 	memset(kvalue, 0, map->value_size);
@@ -776,10 +839,7 @@  static void __bpf_struct_ops_map_free(struct bpf_map *map)
 	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
-	if (st_map->image) {
-		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
-		bpf_jit_uncharge_modmem(PAGE_SIZE);
-	}
+	bpf_struct_ops_map_free_image(st_map);
 	bpf_map_area_free(st_map->uvalue);
 	bpf_map_area_free(st_map);
 }
@@ -889,20 +949,6 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	st_map->st_ops_desc = st_ops_desc;
 	map = &st_map->map;
 
-	ret = bpf_jit_charge_modmem(PAGE_SIZE);
-	if (ret)
-		goto errout_free;
-
-	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-	if (!st_map->image) {
-		/* __bpf_struct_ops_map_free() uses st_map->image as flag
-		 * for "charged or not". In this case, we need to unchange
-		 * here.
-		 */
-		bpf_jit_uncharge_modmem(PAGE_SIZE);
-		ret = -ENOMEM;
-		goto errout_free;
-	}
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
 	st_map->links_cnt = btf_type_vlen(t);
 	st_map->links =