
[bpf-next,03/16] bpf: Parse bpf_dynptr in map key

Message ID 20241008091501.8302-4-houtao@huaweicloud.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series Support dynptr key for hash map

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 212 this patch: 206
netdev/build_tools success Errors and warnings before: 0 (+1) this patch: 0 (+1)
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 265 this patch: 257
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 6928 this patch: 6961
netdev/checkpatch warning WARNING: line length of 93 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Hou Tao Oct. 8, 2024, 9:14 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

To support variable-length keys or strings in a map key, use bpf_dynptr
to represent these variable-length objects and store the bpf_dynptr
fields in the map key. In the example below, a map key containing an
integer and a string is defined:

	struct pid_name {
		int pid;
		struct bpf_dynptr name;
	};

The bpf_dynptr in the map key can also be nested indirectly inside
another struct, as shown below:

	struct pid_name_time {
		struct pid_name process;
		unsigned long long time;
	};

If the whole map key is a bpf_dynptr, the key can be defined either as a
struct wrapping it or as struct bpf_dynptr directly:

	struct map_key {
		struct bpf_dynptr name;
	};

The bpf program can use bpf_dynptr_init() to initialize the dynptr part
of the map key, and a userspace application will use
bpf_dynptr_user_init() or a similar API to initialize the dynptr. Just
like kptrs in the map value, the bpf_dynptr in the map key can also be
defined in a nested struct contained in the map key struct.

The patch updates map_create() accordingly to parse these bpf_dynptr
fields in the map key, just as it does for the other special fields in
the map value. To enable bpf_dynptr support in the map key, map_type
must be BPF_MAP_TYPE_HASH and BPF_F_DYNPTR_IN_KEY must be set in
map_flags. For now, the maximum number of bpf_dynptrs in a map key is
arbitrarily chosen as 4; it may be changed later.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf.h     | 13 ++++++++++--
 kernel/bpf/btf.c        |  4 ++++
 kernel/bpf/map_in_map.c | 19 ++++++++++++-----
 kernel/bpf/syscall.c    | 47 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 7 deletions(-)
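
For illustration, here is a minimal sketch of how such a map might be
declared in a BPF program. The type and flag names (BPF_MAP_TYPE_HASH,
BPF_F_DYNPTR_IN_KEY, struct bpf_dynptr) come from this series, but the
snippet itself is not part of the patch and assumes the usual libbpf
map-declaration macros:

	/* Sketch only: assumes <vmlinux.h> (or the UAPI bpf headers) and
	 * <bpf/bpf_helpers.h> are included, and that BPF_F_DYNPTR_IN_KEY
	 * from this series is visible to the program.
	 */
	struct pid_name {
		int pid;
		struct bpf_dynptr name;	/* variable-length part of the key */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);	/* only hash maps support dynptr keys */
		__uint(map_flags, BPF_F_DYNPTR_IN_KEY);	/* opt in to dynptr-in-key parsing */
		__uint(max_entries, 1024);
		__type(key, struct pid_name);
		__type(value, __u64);
	} pid_names SEC(".maps");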

Comments

Eduard Zingerman Oct. 10, 2024, 6:02 p.m. UTC | #1
On Tue, 2024-10-08 at 17:14 +0800, Hou Tao wrote:

[...]

> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index 645bd30bc9a9..a072835dc645 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c

[...]

> @@ -45,9 +45,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>  		 * invalid/empty/valid, but ERR_PTR in case of errors. During
>  		 * equality NULL or IS_ERR is equivalent.
>  		 */
> -		struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
> -		kfree(inner_map_meta);
> -		return ret;
> +		ret = ERR_CAST(inner_map_meta->record);
> +		goto free;
> +	}
> +	inner_map_meta->key_record = btf_record_dup(inner_map->key_record);
> +	if (IS_ERR(inner_map_meta->key_record)) {
> +		ret = ERR_CAST(inner_map_meta->key_record);
> +		goto free;

The 'goto free' executes a call to bpf_map_meta_free(), which does
btf_put(map_meta->btf), but the corresponding btf_get(inner_map->btf)
only happens on the lines below => when the 'free' branch is taken we
'put' a BTF object that we never 'get'.

>  	}
>  	/* Note: We must use the same BTF, as we also used btf_record_dup above
>  	 * which relies on BTF being same for both maps, as some members like
> @@ -71,6 +75,10 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>  		inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
>  	}
>  	return inner_map_meta;
> +
> +free:
> +	bpf_map_meta_free(inner_map_meta);
> +	return ret;
>  }
>  
>  void bpf_map_meta_free(struct bpf_map *map_meta)

[...]
Alexei Starovoitov Oct. 11, 2024, 4:29 p.m. UTC | #2
On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> +#define MAX_DYNPTR_CNT_IN_MAP_KEY 4
> +
>  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
>  {
> @@ -1103,6 +1113,40 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>         if (!value_type || value_size != map->value_size)
>                 return -EINVAL;
>
> +       if (btf_type_is_dynptr(btf, key_type))
> +               map->key_record = btf_new_bpf_dynptr_record();
> +       else
> +               map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
> +       if (!IS_ERR_OR_NULL(map->key_record)) {
> +               if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
> +                       ret = -E2BIG;
> +                       goto free_map_tab;

Took me a while to grasp that map->key_record is only for dynptr fields
and map->record is for the rest except dynptr fields.

Maybe rename key_record to dynptr_fields ?
Or at least add a comment to struct bpf_map to explain
what each btf_record is for.

It's kinda arbitrary decision to support multiple dynptr-s per key
while other fields are not.
Maybe worth looking at generalizing it a bit so single btf_record
can have multiple of certain field kinds?
In addition to btf_record->cnt you'd need btf_record->dynptr_cnt
but that would be easier to extend in the future ?
Hou Tao Oct. 21, 2024, 1:48 p.m. UTC | #3
Hi,

On 10/11/2024 2:02 AM, Eduard Zingerman wrote:
> On Tue, 2024-10-08 at 17:14 +0800, Hou Tao wrote:
>
> [...]
>
>> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
>> index 645bd30bc9a9..a072835dc645 100644
>> --- a/kernel/bpf/map_in_map.c
>> +++ b/kernel/bpf/map_in_map.c
> [...]
>
>> @@ -45,9 +45,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>>  		 * invalid/empty/valid, but ERR_PTR in case of errors. During
>>  		 * equality NULL or IS_ERR is equivalent.
>>  		 */
>> -		struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
>> -		kfree(inner_map_meta);
>> -		return ret;
>> +		ret = ERR_CAST(inner_map_meta->record);
>> +		goto free;
>> +	}
>> +	inner_map_meta->key_record = btf_record_dup(inner_map->key_record);
>> +	if (IS_ERR(inner_map_meta->key_record)) {
>> +		ret = ERR_CAST(inner_map_meta->key_record);
>> +		goto free;
> The 'goto free' executes a call to bpf_map_meta_free(), which does
> btf_put(map_meta->btf), but the corresponding btf_get(inner_map->btf)
> only happens on the lines below => when the 'free' branch is taken we
> 'put' a BTF object that we never 'get'.

Yes, but map_meta->btf is still NULL at that point, so calling
btf_put(NULL) does no harm. My intent was to simplify the error
handling, but it seems to have caused confusion. I will only undo the
parts that were already done in the next revision.
>
>>  	}
>>  	/* Note: We must use the same BTF, as we also used btf_record_dup above
>>  	 * which relies on BTF being same for both maps, as some members like
>> @@ -71,6 +75,10 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>>  		inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
>>  	}
>>  	return inner_map_meta;
>> +
>> +free:
>> +	bpf_map_meta_free(inner_map_meta);
>> +	return ret;
>>  }
>>  
>>  void bpf_map_meta_free(struct bpf_map *map_meta)
> [...]
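
For reference, a hedged sketch of the "only undo the done part" error
handling mentioned in the reply above; this is an assumption about how
the next revision might look, not code taken from it:

	inner_map_meta->key_record = btf_record_dup(inner_map->key_record);
	if (IS_ERR(inner_map_meta->key_record)) {
		ret = ERR_CAST(inner_map_meta->key_record);
		/* inner_map_meta->record was duplicated successfully above,
		 * so free it explicitly; the BTF reference has not been
		 * taken yet, so there is no btf_put() here.
		 */
		btf_record_free(inner_map_meta->record);
		kfree(inner_map_meta);
		return ret;
	}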
Hou Tao Oct. 21, 2024, 2:02 p.m. UTC | #4
Hi,

On 10/12/2024 12:29 AM, Alexei Starovoitov wrote:
> On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> +#define MAX_DYNPTR_CNT_IN_MAP_KEY 4
>> +
>>  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>>                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
>>  {
>> @@ -1103,6 +1113,40 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>>         if (!value_type || value_size != map->value_size)
>>                 return -EINVAL;
>>
>> +       if (btf_type_is_dynptr(btf, key_type))
>> +               map->key_record = btf_new_bpf_dynptr_record();
>> +       else
>> +               map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
>> +       if (!IS_ERR_OR_NULL(map->key_record)) {
>> +               if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
>> +                       ret = -E2BIG;
>> +                       goto free_map_tab;
> Took me a while to grasp that map->key_record is only for dynptr fields
> and map->record is for the rest except dynptr fields.
>
> Maybe rename key_record to dynptr_fields ?
> Or at least add a comment to struct bpf_map to explain
> what each btf_record is for.

I was trying to rename map->record to map->value_record, however, I was
afraid that it may introduce too much churn, so I didn't do that. But I
think it is a good idea to add comments for both btf_record. And
considering that only bpf_dynptr is enabled for map key, renaming it to
dynptr_fields seems reasonable as well.
>
> It's kinda arbitrary decision to support multiple dynptr-s per key
> while other fields are not.
> Maybe worth looking at generalizing it a bit so single btf_record
> can have multiple of certain field kinds?
> In addition to btf_record->cnt you'd need btf_record->dynptr_cnt
> but that would be easier to extend in the future ?

Map value has already supported multiple kptrs or bpf_list_node. And in
the discussion [1], I thought multiple dynptr support in map key is
necessary, so I enabled it.

[1]:
https://lore.kernel.org/bpf/CAADnVQJWaBRB=P-ZNkppwm=0tZaT3qP8PKLLJ2S5SSA2-S8mxg@mail.gmail.com/
Alexei Starovoitov Oct. 22, 2024, 3:59 a.m. UTC | #5
On Mon, Oct 21, 2024 at 7:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 10/12/2024 12:29 AM, Alexei Starovoitov wrote:
> > On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> +#define MAX_DYNPTR_CNT_IN_MAP_KEY 4
> >> +
> >>  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
> >>                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
> >>  {
> >> @@ -1103,6 +1113,40 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
> >>         if (!value_type || value_size != map->value_size)
> >>                 return -EINVAL;
> >>
> >> +       if (btf_type_is_dynptr(btf, key_type))
> >> +               map->key_record = btf_new_bpf_dynptr_record();
> >> +       else
> >> +               map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
> >> +       if (!IS_ERR_OR_NULL(map->key_record)) {
> >> +               if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
> >> +                       ret = -E2BIG;
> >> +                       goto free_map_tab;
> > Took me a while to grasp that map->key_record is only for dynptr fields
> > and map->record is for the rest except dynptr fields.
> >
> > Maybe rename key_record to dynptr_fields ?
> > Or at least add a comment to struct bpf_map to explain
> > what each btf_record is for.
>
> I was trying to rename map->record to map->value_record, however, I was
> afraid that it may introduce too much churn, so I didn't do that. But I
> think it is a good idea to add comments for both btf_record. And
> considering that only bpf_dynptr is enabled for map key, renaming it to
> dynptr_fields seems reasonable as well.
> >
> > It's kinda arbitrary decision to support multiple dynptr-s per key
> > while other fields are not.
> > Maybe worth looking at generalizing it a bit so single btf_record
> > can have multiple of certain field kinds?
> > In addition to btf_record->cnt you'd need btf_record->dynptr_cnt
> > but that would be easier to extend in the future ?
>
> Map value has already supported multiple kptrs or bpf_list_node.

fwiw I believe we reached the dead end there.
The whole support for bpf_list and bpf_rb_tree may get deprecated
and removed. The expected users didn't materialize.

> And in
> the discussion [1], I thought multiple dynptr support in map key is
> necessary, so I enabled it.
>
> [1]:
> https://lore.kernel.org/bpf/CAADnVQJWaBRB=P-ZNkppwm=0tZaT3qP8PKLLJ2S5SSA2-S8mxg@mail.gmail.com/

Sure. That's a different reasoning and use case.
I'm proposing to use a single btf_record with different cnt-s.
The current btf_record->cnt will stay as-is indicating total number of fields
while btf_record->dynptr_cnt will be just for these dynptrs you're introducing.
Then you won't need two top level btf_record-s.
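
A rough sketch of the single-btf_record idea above; the reduced layout
and the dynptr_cnt field are assumptions made for illustration, not the
actual kernel definition:

	struct btf_record {
		u32 cnt;		/* total number of special fields, as today */
		u32 dynptr_cnt;		/* proposed: how many of them are BPF_DYNPTR */
		u32 field_mask;
		/* existing per-kind offsets (spin_lock, timer, ...) unchanged */
		struct btf_field fields[];
	};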
Hou Tao Oct. 22, 2024, 7:20 a.m. UTC | #6
Hi,

On 10/22/2024 11:59 AM, Alexei Starovoitov wrote:
> On Mon, Oct 21, 2024 at 7:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 10/12/2024 12:29 AM, Alexei Starovoitov wrote:
>>> On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> +#define MAX_DYNPTR_CNT_IN_MAP_KEY 4
>>>> +
>>>>  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>>>>                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
>>>>  {
>>>> @@ -1103,6 +1113,40 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>>>>         if (!value_type || value_size != map->value_size)
>>>>                 return -EINVAL;
>>>>
>>>> +       if (btf_type_is_dynptr(btf, key_type))
>>>> +               map->key_record = btf_new_bpf_dynptr_record();
>>>> +       else
>>>> +               map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
>>>> +       if (!IS_ERR_OR_NULL(map->key_record)) {
>>>> +               if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
>>>> +                       ret = -E2BIG;
>>>> +                       goto free_map_tab;
>>> Took me a while to grasp that map->key_record is only for dynptr fields
>>> and map->record is for the rest except dynptr fields.
>>>
>>> Maybe rename key_record to dynptr_fields ?
>>> Or at least add a comment to struct bpf_map to explain
>>> what each btf_record is for.
>> I was trying to rename map->record to map->value_record, however, I was
>> afraid that it may introduce too much churn, so I didn't do that. But I
>> think it is a good idea to add comments for both btf_record. And
>> considering that only bpf_dynptr is enabled for map key, renaming it to
>> dynptr_fields seems reasonable as well.
>>> It's kinda arbitrary decision to support multiple dynptr-s per key
>>> while other fields are not.
>>> Maybe worth looking at generalizing it a bit so single btf_record
>>> can have multiple of certain field kinds?
>>> In addition to btf_record->cnt you'd need btf_record->dynptr_cnt
>>> but that would be easier to extend in the future ?
>> Map value has already supported multiple kptrs or bpf_list_node.
> fwiw I believe we reached the dead end there.
> The whole support for bpf_list and bpf_rb_tree may get deprecated
> and removed. The expected users didn't materialize.

OK.
>
>> And in
>> the discussion [1], I thought multiple dynptr support in map key is
>> necessary, so I enabled it.
>>
>> [1]:
>> https://lore.kernel.org/bpf/CAADnVQJWaBRB=P-ZNkppwm=0tZaT3qP8PKLLJ2S5SSA2-S8mxg@mail.gmail.com/
> Sure. That's a different reasoning and use case.
> I'm proposing to use a single btf_record with different cnt-s.
> The current btf_record->cnt will stay as-is indicating total number of fields
> while btf_record->dynptr_cnt will be just for these dynptrs you're introducing.
> Then you won't need two top level btf_record-s.

I misunderstood your suggestion yesterday. Now I see what you are
suggesting. However, it seems that a separate counter for each kind of
btf_field would only benefit the dynptr field, because the other types
don't need to iterate over all fields; they just need to find one
through binary search. And I don't understand why only one btf_record
would be enough, because these two btf_records are derived from the map
key and the map value respectively.
Alexei Starovoitov Oct. 22, 2024, 6:44 p.m. UTC | #7
On Tue, Oct 22, 2024 at 12:20 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 10/22/2024 11:59 AM, Alexei Starovoitov wrote:
> > On Mon, Oct 21, 2024 at 7:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> Hi,
> >>
> >> On 10/12/2024 12:29 AM, Alexei Starovoitov wrote:
> >>> On Tue, Oct 8, 2024 at 2:02 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >>>> +#define MAX_DYNPTR_CNT_IN_MAP_KEY 4
> >>>> +
> >>>>  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
> >>>>                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
> >>>>  {
> >>>> @@ -1103,6 +1113,40 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
> >>>>         if (!value_type || value_size != map->value_size)
> >>>>                 return -EINVAL;
> >>>>
> >>>> +       if (btf_type_is_dynptr(btf, key_type))
> >>>> +               map->key_record = btf_new_bpf_dynptr_record();
> >>>> +       else
> >>>> +               map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
> >>>> +       if (!IS_ERR_OR_NULL(map->key_record)) {
> >>>> +               if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
> >>>> +                       ret = -E2BIG;
> >>>> +                       goto free_map_tab;
> >>> Took me a while to grasp that map->key_record is only for dynptr fields
> >>> and map->record is for the rest except dynptr fields.
> >>>
> >>> Maybe rename key_record to dynptr_fields ?
> >>> Or at least add a comment to struct bpf_map to explain
> >>> what each btf_record is for.
> >> I was trying to rename map->record to map->value_record, however, I was
> >> afraid that it may introduce too much churn, so I didn't do that. But I
> >> think it is a good idea to add comments for both btf_record. And
> >> considering that only bpf_dynptr is enabled for map key, renaming it to
> >> dynptr_fields seems reasonable as well.
> >>> It's kinda arbitrary decision to support multiple dynptr-s per key
> >>> while other fields are not.
> >>> Maybe worth looking at generalizing it a bit so single btf_record
> >>> can have multiple of certain field kinds?
> >>> In addition to btf_record->cnt you'd need btf_record->dynptr_cnt
> >>> but that would be easier to extend in the future ?
> >> Map value has already supported multiple kptrs or bpf_list_node.
> > fwiw I believe we reached the dead end there.
> > The whole support for bpf_list and bpf_rb_tree may get deprecated
> > and removed. The expected users didn't materialize.
>
> OK.
> >
> >> And in
> >> the discussion [1], I thought multiple dynptr support in map key is
> >> necessary, so I enabled it.
> >>
> >> [1]:
> >> https://lore.kernel.org/bpf/CAADnVQJWaBRB=P-ZNkppwm=0tZaT3qP8PKLLJ2S5SSA2-S8mxg@mail.gmail.com/
> > Sure. That's a different reasoning and use case.
> > I'm proposing to use a single btf_record with different cnt-s.
> > The current btf_record->cnt will stay as-is indicating total number of fields
> > while btf_record->dynptr_cnt will be just for these dynptrs you're introducing.
> > Then you won't need two top level btf_record-s.
>
> I misunderstood your suggestion yesterday. Now I see what you are
> suggesting. However, it seems that a separate counter for each kind of
> btf_field would only benefit the dynptr field, because the other types
> don't need to iterate over all fields; they just need to find one
> through binary search. And I don't understand why only one btf_record
> would be enough, because these two btf_records are derived from the map
> key and the map value respectively.

If we have two btf_records (one for the key and one for the value)
then it's fine.
For now the btf_record for the key will only have multiple
dynptrs in there as special fields, and that's fine too.
But we need to document it, and when the need arises to have
dynptrs in the value or kptrs/timers/whatever in a key,
we shouldn't add a 3rd and 4th btf_record to
separate the different kinds of special fields.
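
As suggested earlier in the thread, a short comment on the two
btf_record pointers in struct bpf_map would make their roles obvious. A
possible wording, shown on a reduced stand-in struct rather than the
real bpf_map definition:

	struct btf_record;

	struct bpf_map_sketch {
		/* special fields (timers, kptrs, spin locks, ...) found in the map value */
		struct btf_record *record;
		/* special fields found in the map key; only BPF_DYNPTR for now */
		struct btf_record *key_record;
	};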

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f61bf427e14e..3e25e94b7457 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -184,8 +184,8 @@  struct bpf_map_ops {
 };
 
 enum {
-	/* Support at most 11 fields in a BTF type */
-	BTF_FIELDS_MAX	   = 11,
+	/* Support at most 12 fields in a BTF type */
+	BTF_FIELDS_MAX	   = 12,
 };
 
 enum btf_field_type {
@@ -203,6 +203,7 @@  enum btf_field_type {
 	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
 	BPF_REFCOUNT   = (1 << 9),
 	BPF_WORKQUEUE  = (1 << 10),
+	BPF_DYNPTR     = (1 << 11),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -270,6 +271,7 @@  struct bpf_map {
 	u32 map_flags;
 	u32 id;
 	struct btf_record *record;
+	struct btf_record *key_record;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
@@ -337,6 +339,8 @@  static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_node";
 	case BPF_REFCOUNT:
 		return "bpf_refcount";
+	case BPF_DYNPTR:
+		return "bpf_dynptr";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -366,6 +370,8 @@  static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return sizeof(struct bpf_refcount);
+	case BPF_DYNPTR:
+		return sizeof(struct bpf_dynptr);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -395,6 +401,8 @@  static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return __alignof__(struct bpf_refcount);
+	case BPF_DYNPTR:
+		return __alignof__(struct bpf_dynptr);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -424,6 +432,7 @@  static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
+	case BPF_DYNPTR:
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 321356710924..2604cef53915 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3500,6 +3500,7 @@  static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
 	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
 	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
 	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
+	field_mask_test_name(BPF_DYNPTR,    "bpf_dynptr");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {
@@ -3537,6 +3538,7 @@  static int btf_repeat_fields(struct btf_field_info *info,
 		case BPF_KPTR_PERCPU:
 		case BPF_LIST_HEAD:
 		case BPF_RB_ROOT:
+		case BPF_DYNPTR:
 			break;
 		default:
 			return -EINVAL;
@@ -3659,6 +3661,7 @@  static int btf_find_field_one(const struct btf *btf,
 	case BPF_LIST_NODE:
 	case BPF_RB_NODE:
 	case BPF_REFCOUNT:
+	case BPF_DYNPTR:
 		ret = btf_find_struct(btf, var_type, off, sz, field_type,
 				      info_cnt ? &info[0] : &tmp);
 		if (ret < 0)
@@ -4014,6 +4017,7 @@  struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			break;
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_DYNPTR:
 			break;
 		default:
 			ret = -EFAULT;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 645bd30bc9a9..a072835dc645 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -9,7 +9,7 @@ 
 
 struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 {
-	struct bpf_map *inner_map, *inner_map_meta;
+	struct bpf_map *inner_map, *inner_map_meta, *ret;
 	u32 inner_map_meta_size;
 	CLASS(fd, f)(inner_map_ufd);
 
@@ -45,9 +45,13 @@  struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		 * invalid/empty/valid, but ERR_PTR in case of errors. During
 		 * equality NULL or IS_ERR is equivalent.
 		 */
-		struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
-		kfree(inner_map_meta);
-		return ret;
+		ret = ERR_CAST(inner_map_meta->record);
+		goto free;
+	}
+	inner_map_meta->key_record = btf_record_dup(inner_map->key_record);
+	if (IS_ERR(inner_map_meta->key_record)) {
+		ret = ERR_CAST(inner_map_meta->key_record);
+		goto free;
 	}
 	/* Note: We must use the same BTF, as we also used btf_record_dup above
 	 * which relies on BTF being same for both maps, as some members like
@@ -71,6 +75,10 @@  struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
 	}
 	return inner_map_meta;
+
+free:
+	bpf_map_meta_free(inner_map_meta);
+	return ret;
 }
 
 void bpf_map_meta_free(struct bpf_map *map_meta)
@@ -88,7 +96,8 @@  bool bpf_map_meta_equal(const struct bpf_map *meta0,
 		meta0->key_size == meta1->key_size &&
 		meta0->value_size == meta1->value_size &&
 		meta0->map_flags == meta1->map_flags &&
-		btf_record_equal(meta0->record, meta1->record);
+		btf_record_equal(meta0->record, meta1->record) &&
+		btf_record_equal(meta0->key_record, meta1->key_record);
 }
 
 void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bffd803c5977..aa0a500f8fad 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -561,6 +561,7 @@  void btf_record_free(struct btf_record *rec)
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
 		case BPF_WORKQUEUE:
+		case BPF_DYNPTR:
 			/* Nothing to release */
 			break;
 		default:
@@ -574,7 +575,9 @@  void btf_record_free(struct btf_record *rec)
 void bpf_map_free_record(struct bpf_map *map)
 {
 	btf_record_free(map->record);
+	btf_record_free(map->key_record);
 	map->record = NULL;
+	map->key_record = NULL;
 }
 
 struct btf_record *btf_record_dup(const struct btf_record *rec)
@@ -612,6 +615,7 @@  struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
 		case BPF_WORKQUEUE:
+		case BPF_DYNPTR:
 			/* Nothing to acquire */
 			break;
 		default:
@@ -728,6 +732,8 @@  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		case BPF_RB_NODE:
 		case BPF_REFCOUNT:
 			break;
+		case BPF_DYNPTR:
+			break;
 		default:
 			WARN_ON_ONCE(1);
 			continue;
@@ -737,6 +743,7 @@  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 
 static void bpf_map_free(struct bpf_map *map)
 {
+	struct btf_record *key_rec = map->key_record;
 	struct btf_record *rec = map->record;
 	struct btf *btf = map->btf;
 
@@ -751,6 +758,7 @@  static void bpf_map_free(struct bpf_map *map)
 	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a
 	 * template bpf_map struct used during verification.
 	 */
+	btf_record_free(key_rec);
 	btf_record_free(rec);
 	/* Delay freeing of btf for maps, as map_free callback may need
 	 * struct_meta info which will be freed with btf_put().
@@ -1081,6 +1089,8 @@  int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
+#define MAX_DYNPTR_CNT_IN_MAP_KEY 4
+
 static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
 {
@@ -1103,6 +1113,40 @@  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;
 
+	if (btf_type_is_dynptr(btf, key_type))
+		map->key_record = btf_new_bpf_dynptr_record();
+	else
+		map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
+	if (!IS_ERR_OR_NULL(map->key_record)) {
+		if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
+			ret = -E2BIG;
+			goto free_map_tab;
+		}
+		if (!bpf_map_has_dynptr_key(map)) {
+			ret = -EINVAL;
+			goto free_map_tab;
+		}
+		if (map->map_type != BPF_MAP_TYPE_HASH) {
+			ret = -EOPNOTSUPP;
+			goto free_map_tab;
+		}
+		if (!bpf_token_capable(token, CAP_BPF)) {
+			ret = -EPERM;
+			goto free_map_tab;
+		}
+		/* Disallow key with dynptr for special map */
+		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+			ret = -EACCES;
+			goto free_map_tab;
+		}
+	} else if (bpf_map_has_dynptr_key(map)) {
+		ret = -EINVAL;
+		goto free_map_tab;
+	} else {
+		/* Ensure key_record is either a valid btf_record or NULL */
+		map->key_record = NULL;
+	}
+
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
 				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
@@ -1230,6 +1274,9 @@  static int map_create(union bpf_attr *attr)
 		return -EINVAL;
 	}
 
+	if ((attr->map_flags & BPF_F_DYNPTR_IN_KEY) && !attr->btf_key_type_id)
+		return -EINVAL;
+
 	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
 	    attr->map_type != BPF_MAP_TYPE_ARENA &&
 	    !(attr->map_flags & BPF_F_DYNPTR_IN_KEY) &&