
[v2,bpf-next,1/4] bpf: Search for kptrs in prog BTF structs

Message ID: 20240803001145.635887-2-amery.hung@bytedance.com (mailing list archive)
State: Superseded
Delegated to: BPF
Series: Support bpf_kptr_xchg into local kptr

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 42 this patch: 42
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 9 maintainers not CCed: kpsingh@kernel.org haoluo@google.com john.fastabend@gmail.com jolsa@kernel.org yonghong.song@linux.dev martin.lau@linux.dev song@kernel.org eddyz87@gmail.com sdf@fomichev.me
netdev/build_clang success Errors and warnings before: 43 this patch: 43
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 45 this patch: 45
netdev/checkpatch warning WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Amery Hung Aug. 3, 2024, 12:11 a.m. UTC
From: Dave Marchevsky <davemarchevsky@fb.com>

Currently btf_parse_fields is used in two places to create struct
btf_record's for structs: when looking at mapval type, and when looking
at any struct in program BTF. The former looks for kptr fields while the
latter does not. This patch modifies the btf_parse_fields call made when
looking at prog BTF struct types to search for kptrs as well.

Before this series there was no reason to search for kptrs in non-mapval
types: a referenced kptr needs some owner to guarantee resource cleanup,
and map values were the only owner that supported this. If a struct with
a kptr field were to have some non-kptr-aware owner, the kptr field
might not be properly cleaned up, resulting in leaked resources. Only
searching for kptr fields in mapval was a simple way to avoid this
problem.

In practice, though, searching for BPF_KPTR when populating
struct_meta_tab does not expose us to this risk, as struct_meta_tab is
only accessed through btf_find_struct_meta helper, and that helper is
only called in contexts where recognizing the kptr field is safe:

  * PTR_TO_BTF_ID reg w/ MEM_ALLOC flag
    * Such a reg is a local kptr and must be freed via bpf_obj_drop(),
      which will correctly handle the kptr field

  * When handling specific kfuncs which either expect MEM_ALLOC input or
    return MEM_ALLOC output (obj_{new,drop}, percpu_obj_{new,drop},
    list+rbtree funcs, refcount_acquire)
     * Will correctly handle the kptr field for the same reasons as above

  * When looking at kptr pointee type
     * Called by functions which implement "correct kptr resource
       handling"

  * In btf_check_and_fixup_fields
     * Helper that ensures no ownership loops for lists and rbtrees,
       doesn't care about kptr field existence

So we should be able to find BPF_KPTR fields in all prog BTF structs
without leaking resources.

Further patches in the series will build on this change to support
kptr_xchg into non-mapval local kptr. Without this change there would be
no kptr field found in such a type.
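
To illustrate, here is a minimal sketch of the kind of program the
series aims to allow. The type names and the bpf_cpumask /
bpf_experimental.h kfunc usage below are assumptions made for the
example, not taken from this series' selftests:

struct node_data {
        long key;
        struct bpf_cpumask __kptr *mask;  /* kptr field in a prog BTF struct */
};

SEC("tc")
int xchg_into_local_kptr(void *ctx)
{
        struct bpf_cpumask *mask;
        struct node_data *n;

        n = bpf_obj_new(typeof(*n));      /* local kptr, type from prog BTF */
        if (!n)
                return 0;

        mask = bpf_cpumask_create();
        if (mask) {
                /* Needs the kptr field to be found in prog BTF (this
                 * patch) plus the rest of the series.
                 */
                mask = bpf_kptr_xchg(&n->mask, mask);
                if (mask)
                        bpf_cpumask_release(mask);
        }

        bpf_obj_drop(n);  /* also releases any kptr still stored in n->mask */
        return 0;
}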

On a side note, when building program BTF, the refcount of program BTF
is now initialized before btf_parse_struct_metas() to prevent a
refcount_inc() on zero warning. This happens when BPF_KPTR is present
in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
-> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
is not available yet outside btf_parse().

Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 kernel/bpf/btf.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

Comments

Hou Tao Aug. 5, 2024, 2:41 a.m. UTC | #1
Hi,

On 8/3/2024 8:11 AM, Amery Hung wrote:
> From: Dave Marchevsky <davemarchevsky@fb.com>
>
> Currently btf_parse_fields is used in two places to create struct
> btf_record's for structs: when looking at mapval type, and when looking
> at any struct in program BTF. The former looks for kptr fields while the
> latter does not. This patch modifies the btf_parse_fields call made when
> looking at prog BTF struct types to search for kptrs as well.
>

SNIP
> On a side note, when building program BTF, the refcount of program BTF
> is now initialized before btf_parse_struct_metas() to prevent a
> refcount_inc() on zero warning. This happens when BPF_KPTR is present
> in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
> -> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
> is not available yet outside btf_parse().

If btf_parse_kptr() pins the passed btf, there will be a memory leak of
the btf after the btf fd is closed: the btf_put() for the kptr record in
btf->struct_meta_tab is only invoked from btf_free_struct_meta_tab() in
btf_free(), but btf_free() itself only runs once the final refcnt of the
btf is released, so the btf will never be freed. The reason map values
don't have this problem is that the btf_put() for the kptr record
doesn't depend on the release of the map value btf; it is done by
bpf_map_free_record().
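
To make the cycle concrete, here is a schematic (not the actual kernel
code; btf_free() does more than shown) of the dependency described above:

/* Schematic only: the reference cycle created if btf_parse_kptr() pins
 * the prog BTF that owns the struct_meta_tab.
 */
static void btf_free(struct btf *btf)   /* runs only once refcnt hits 0 */
{
        /* btf_free_struct_meta_tab() -> btf_record_free() -> btf_put()
         * on each kptr field's btf; for a self-referencing kptr this is
         * the only place the extra reference taken in btf_parse_kptr()
         * is dropped ...
         */
        btf_free_struct_meta_tab(btf);
        /* ... but btf_free() itself is unreachable while that extra
         * reference is held, so after the btf fd is closed the prog BTF
         * can never be freed.
         */
        kvfree(btf);
}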

Maybe we should move the common btf used by kptr and graph_root into
btf_record and let the callers of btf_parse_fields() and
btf_record_free() decide the life cycle of the btf in btf_record.
> Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> ---
>  kernel/bpf/btf.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 95426d5b634e..7b8275e3e500 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -5585,7 +5585,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
>  		type = &tab->types[tab->cnt];
>  		type->btf_id = i;
>  		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
> -						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
> +						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
> +						  BPF_KPTR, t->size);
>  		/* The record cannot be unset, treat it as an error if so */
>  		if (IS_ERR_OR_NULL(record)) {
>  			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
> @@ -5737,6 +5738,8 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
>  	if (err)
>  		goto errout;
>  
> +	refcount_set(&btf->refcnt, 1);
> +
>  	struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
>  	if (IS_ERR(struct_meta_tab)) {
>  		err = PTR_ERR(struct_meta_tab);
> @@ -5759,7 +5762,6 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
>  		goto errout_free;
>  
>  	btf_verifier_env_free(env);
> -	refcount_set(&btf->refcnt, 1);
>  	return btf;
>  
>  errout_meta:
Amery Hung Aug. 5, 2024, 4:31 a.m. UTC | #2
On Sun, Aug 4, 2024 at 7:41 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 8/3/2024 8:11 AM, Amery Hung wrote:
> > From: Dave Marchevsky <davemarchevsky@fb.com>
> >
> > Currently btf_parse_fields is used in two places to create struct
> > btf_record's for structs: when looking at mapval type, and when looking
> > at any struct in program BTF. The former looks for kptr fields while the
> > latter does not. This patch modifies the btf_parse_fields call made when
> > looking at prog BTF struct types to search for kptrs as well.
> >
>
> SNIP
> > On a side note, when building program BTF, the refcount of program BTF
> > is now initialized before btf_parse_struct_metas() to prevent a
> > refcount_inc() on zero warning. This happens when BPF_KPTR is present
> > in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
> > -> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
> > is not available yet outside btf_parse().
>
> If btf_parse_kptr() pins the passed btf, there will be memory leak for
> the btf after closing the btf fd, because the invocation of btf_put()
> for kptr record in btf->struct_meta_tab depends on the invocation of
> btf_free_struct_meta_tab() in btf_free(), but the invocation of
> btf_free() depends the final refcnt of the btf is released, so the btf
> will not be freed forever. The reason why map value doesn't have such
> problem is that the invocation of btf_put() for kptr record doesn't
> depends on the release of map value btf and it is accomplished by
> bpf_map_free_record().
>

Thanks for pointing this out. It makes sense to me.

> Maybe we should move the common btf used by kptr and graph_root into
> btf_record and let the callers of btf_parse_fields() and
> btf_record_free() to decide the life cycle of btf in btf_record.

Could you maybe explain if and why moving the btf of btf_field_kptr and
btf_field_graph_root to btf_record is necessary? I think letting the
callers of btf_parse_fields() and btf_record_free() decide whether or
not to change the refcount should be enough. Besides, I personally would
like to keep the individual btf pointers in btf_field_kptr and
btf_field_graph_root, so that later we can have special fields
referencing different btfs.

Thanks,
Amery

> > Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
> > Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> > Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> > ---
> >  kernel/bpf/btf.c | 6 ++++--
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > index 95426d5b634e..7b8275e3e500 100644
> > --- a/kernel/bpf/btf.c
> > +++ b/kernel/bpf/btf.c
> > @@ -5585,7 +5585,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
> >               type = &tab->types[tab->cnt];
> >               type->btf_id = i;
> >               record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
> > -                                               BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
> > +                                               BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
> > +                                               BPF_KPTR, t->size);
> >               /* The record cannot be unset, treat it as an error if so */
> >               if (IS_ERR_OR_NULL(record)) {
> >                       ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
> > @@ -5737,6 +5738,8 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
> >       if (err)
> >               goto errout;
> >
> > +     refcount_set(&btf->refcnt, 1);
> > +
> >       struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
> >       if (IS_ERR(struct_meta_tab)) {
> >               err = PTR_ERR(struct_meta_tab);
> > @@ -5759,7 +5762,6 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
> >               goto errout_free;
> >
> >       btf_verifier_env_free(env);
> > -     refcount_set(&btf->refcnt, 1);
> >       return btf;
> >
> >  errout_meta:
>
Hou Tao Aug. 5, 2024, 7:32 a.m. UTC | #3
Hi,

On 8/5/2024 12:31 PM, Amery Hung wrote:
> On Sun, Aug 4, 2024 at 7:41 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 8/3/2024 8:11 AM, Amery Hung wrote:
>>> From: Dave Marchevsky <davemarchevsky@fb.com>
>>>
>>> Currently btf_parse_fields is used in two places to create struct
>>> btf_record's for structs: when looking at mapval type, and when looking
>>> at any struct in program BTF. The former looks for kptr fields while the
>>> latter does not. This patch modifies the btf_parse_fields call made when
>>> looking at prog BTF struct types to search for kptrs as well.
>>>
>> SNIP
>>> On a side note, when building program BTF, the refcount of program BTF
>>> is now initialized before btf_parse_struct_metas() to prevent a
>>> refcount_inc() on zero warning. This happens when BPF_KPTR is present
>>> in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
>>> -> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
>>> is not available yet outside btf_parse().
>> If btf_parse_kptr() pins the passed btf, there will be memory leak for
>> the btf after closing the btf fd, because the invocation of btf_put()
>> for kptr record in btf->struct_meta_tab depends on the invocation of
>> btf_free_struct_meta_tab() in btf_free(), but the invocation of
>> btf_free() depends the final refcnt of the btf is released, so the btf
>> will not be freed forever. The reason why map value doesn't have such
>> problem is that the invocation of btf_put() for kptr record doesn't
>> depends on the release of map value btf and it is accomplished by
>> bpf_map_free_record().
>>
> Thanks for pointing this out. It makes sense to me.
>
>> Maybe we should move the common btf used by kptr and graph_root into
>> btf_record and let the callers of btf_parse_fields() and
>> btf_record_free() to decide the life cycle of btf in btf_record.
> Could you maybe explain if and why moving btf of btf_field_kptr and
> btf_field_graph_root to btf_record is necessary? I think letting
> callers of btf_parse_fields() and btf_record_free() decide whether or
> not to change refcount should be enough. Besides, I personally would
> like to keep individual btf in btf_field_kptr and
> btf_field_graph_root, so that later we can have special fields
> referencing different btf.

Sorry, I didn't express the rough idea clearly enough. I didn't mean to
move the btf of btf_field_kptr and btf_field_graph_root into btf_record,
because there are other btf-s which are different from the btf that
creates the struct_meta_tab. What I was trying to suggest is to save one
btf in btf_record and hope that it will simplify the pinning and
unpinning of the btf in btf_record:

1) save the btf which owns the btf_record in btf_record.
2) during btf_parse_kptr() or similar, if the used btf is the same as
the btf in btf_record, there is no need to pin the btf
3) when freeing the btf_record, if the btf saved in btf_field is the
same as the btf in btf_record, there is no need to put it

For steps 2) and 3), however, I think it is also doable in other ways
(e.g., passing the btf to btf_record_free() or similar).
>
> Thanks,
> Amery
>
>>> Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>> Signed-off-by: Amery Hung <amery.hung@bytedance.com>
>>> ---
>>>  kernel/bpf/btf.c | 6 ++++--
>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
>>> index 95426d5b634e..7b8275e3e500 100644
>>> --- a/kernel/bpf/btf.c
>>> +++ b/kernel/bpf/btf.c
>>> @@ -5585,7 +5585,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
>>>               type = &tab->types[tab->cnt];
>>>               type->btf_id = i;
>>>               record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
>>> -                                               BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
>>> +                                               BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
>>> +                                               BPF_KPTR, t->size);
>>>               /* The record cannot be unset, treat it as an error if so */
>>>               if (IS_ERR_OR_NULL(record)) {
>>>                       ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
>>> @@ -5737,6 +5738,8 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
>>>       if (err)
>>>               goto errout;
>>>
>>> +     refcount_set(&btf->refcnt, 1);
>>> +
>>>       struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
>>>       if (IS_ERR(struct_meta_tab)) {
>>>               err = PTR_ERR(struct_meta_tab);
>>> @@ -5759,7 +5762,6 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
>>>               goto errout_free;
>>>
>>>       btf_verifier_env_free(env);
>>> -     refcount_set(&btf->refcnt, 1);
>>>       return btf;
>>>
>>>  errout_meta:
> .
Amery Hung Aug. 5, 2024, 8 p.m. UTC | #4
On Mon, Aug 5, 2024 at 12:32 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 8/5/2024 12:31 PM, Amery Hung wrote:
> > On Sun, Aug 4, 2024 at 7:41 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >> Hi,
> >>
> >> On 8/3/2024 8:11 AM, Amery Hung wrote:
> >>> From: Dave Marchevsky <davemarchevsky@fb.com>
> >>>
> >>> Currently btf_parse_fields is used in two places to create struct
> >>> btf_record's for structs: when looking at mapval type, and when looking
> >>> at any struct in program BTF. The former looks for kptr fields while the
> >>> latter does not. This patch modifies the btf_parse_fields call made when
> >>> looking at prog BTF struct types to search for kptrs as well.
> >>>
> >> SNIP
> >>> On a side note, when building program BTF, the refcount of program BTF
> >>> is now initialized before btf_parse_struct_metas() to prevent a
> >>> refcount_inc() on zero warning. This happens when BPF_KPTR is present
> >>> in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
> >>> -> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
> >>> is not available yet outside btf_parse().
> >> If btf_parse_kptr() pins the passed btf, there will be memory leak for
> >> the btf after closing the btf fd, because the invocation of btf_put()
> >> for kptr record in btf->struct_meta_tab depends on the invocation of
> >> btf_free_struct_meta_tab() in btf_free(), but the invocation of
> >> btf_free() depends the final refcnt of the btf is released, so the btf
> >> will not be freed forever. The reason why map value doesn't have such
> >> problem is that the invocation of btf_put() for kptr record doesn't
> >> depends on the release of map value btf and it is accomplished by
> >> bpf_map_free_record().
> >>
> > Thanks for pointing this out. It makes sense to me.
> >
> >> Maybe we should move the common btf used by kptr and graph_root into
> >> btf_record and let the callers of btf_parse_fields() and
> >> btf_record_free() to decide the life cycle of btf in btf_record.
> > Could you maybe explain if and why moving btf of btf_field_kptr and
> > btf_field_graph_root to btf_record is necessary? I think letting
> > callers of btf_parse_fields() and btf_record_free() decide whether or
> > not to change refcount should be enough. Besides, I personally would
> > like to keep individual btf in btf_field_kptr and
> > btf_field_graph_root, so that later we can have special fields
> > referencing different btf.
>
> Sorry, I didn't express the rough idea clearly enough. I didn't mean to
> move btf of btf_field_kptr and btf_field_graph_root to btf_record,
> because there are other btf-s which are different with the btf which
> creates the struct_meta_tab. What I was trying to suggest is to save one
> btf in btf_record and hope it will simplify the pin and the unpin of btf
> in btf_record:
>
> 1) save the btf which owns the btf_record in btf_record.
> 2) during btf_parse_kptr() or similar, if the used btf is the same as
> the btf in btf_record, there is no need to pin the btf
> 3) when freeing the btf_record, if the btf saved in btf_field is the
> same as the btf in btf_record, there is no need to put it
>
> For step 2) and step 3), however I think it is also doable through other
> ways (e.g., pass the btf to btf_record_free or similar).

Thanks for clarifying. I see what you mean in 1), and saving the
owner's btf in btf_record seems to be cleaner than adding additional
arguments to btf_record_free() and other related functions.

Thanks,
Amery

> >
> > Thanks,
> > Amery
> >
> >>> Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
> >>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> >>> Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> >>> ---
> >>>  kernel/bpf/btf.c | 6 ++++--
> >>>  1 file changed, 4 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> >>> index 95426d5b634e..7b8275e3e500 100644
> >>> --- a/kernel/bpf/btf.c
> >>> +++ b/kernel/bpf/btf.c
> >>> @@ -5585,7 +5585,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
> >>>               type = &tab->types[tab->cnt];
> >>>               type->btf_id = i;
> >>>               record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
> >>> -                                               BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
> >>> +                                               BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
> >>> +                                               BPF_KPTR, t->size);
> >>>               /* The record cannot be unset, treat it as an error if so */
> >>>               if (IS_ERR_OR_NULL(record)) {
> >>>                       ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
> >>> @@ -5737,6 +5738,8 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
> >>>       if (err)
> >>>               goto errout;
> >>>
> >>> +     refcount_set(&btf->refcnt, 1);
> >>> +
> >>>       struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
> >>>       if (IS_ERR(struct_meta_tab)) {
> >>>               err = PTR_ERR(struct_meta_tab);
> >>> @@ -5759,7 +5762,6 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
> >>>               goto errout_free;
> >>>
> >>>       btf_verifier_env_free(env);
> >>> -     refcount_set(&btf->refcnt, 1);
> >>>       return btf;
> >>>
> >>>  errout_meta:
> > .
>
Martin KaFai Lau Aug. 5, 2024, 8 p.m. UTC | #5
On 8/5/24 12:32 AM, Hou Tao wrote:
> Hi,
> 
> On 8/5/2024 12:31 PM, Amery Hung wrote:
>> On Sun, Aug 4, 2024 at 7:41 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>> Hi,
>>>
>>> On 8/3/2024 8:11 AM, Amery Hung wrote:
>>>> From: Dave Marchevsky <davemarchevsky@fb.com>
>>>>
>>>> Currently btf_parse_fields is used in two places to create struct
>>>> btf_record's for structs: when looking at mapval type, and when looking
>>>> at any struct in program BTF. The former looks for kptr fields while the
>>>> latter does not. This patch modifies the btf_parse_fields call made when
>>>> looking at prog BTF struct types to search for kptrs as well.
>>>>
>>> SNIP
>>>> On a side note, when building program BTF, the refcount of program BTF
>>>> is now initialized before btf_parse_struct_metas() to prevent a
>>>> refcount_inc() on zero warning. This happens when BPF_KPTR is present
>>>> in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
>>>> -> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
>>>> is not available yet outside btf_parse().
>>> If btf_parse_kptr() pins the passed btf, there will be memory leak for
>>> the btf after closing the btf fd, because the invocation of btf_put()
>>> for kptr record in btf->struct_meta_tab depends on the invocation of
>>> btf_free_struct_meta_tab() in btf_free(), but the invocation of
>>> btf_free() depends the final refcnt of the btf is released, so the btf
>>> will not be freed forever. The reason why map value doesn't have such
>>> problem is that the invocation of btf_put() for kptr record doesn't
>>> depends on the release of map value btf and it is accomplished by
>>> bpf_map_free_record().
>>>
>> Thanks for pointing this out. It makes sense to me.
>>
>>> Maybe we should move the common btf used by kptr and graph_root into
>>> btf_record and let the callers of btf_parse_fields() and
>>> btf_record_free() to decide the life cycle of btf in btf_record.
>> Could you maybe explain if and why moving btf of btf_field_kptr and
>> btf_field_graph_root to btf_record is necessary? I think letting
>> callers of btf_parse_fields() and btf_record_free() decide whether or
>> not to change refcount should be enough. Besides, I personally would
>> like to keep individual btf in btf_field_kptr and
>> btf_field_graph_root, so that later we can have special fields
>> referencing different btf.
> 
> Sorry, I didn't express the rough idea clearly enough. I didn't mean to
> move btf of btf_field_kptr and btf_field_graph_root to btf_record,
> because there are other btf-s which are different with the btf which
> creates the struct_meta_tab. What I was trying to suggest is to save one
> btf in btf_record and hope it will simplify the pin and the unpin of btf
> in btf_record:
> 
> 1) save the btf which owns the btf_record in btf_record.
> 2) during btf_parse_kptr() or similar, if the used btf is the same as
> the btf in btf_record, there is no need to pin the btf

I assume the used btf is the one that btf_parse is working on.

> 3) when freeing the btf_record, if the btf saved in btf_field is the
> same as the btf in btf_record, there is no need to put it

For btf_field_kptr.btf, is that the same as testing whether btf_field_kptr.btf
is a kernel btf (btf_is_kernel()) or not? How about only doing btf_get/put when
btf_is_kernel() is true?


> 
> For step 2) and step 3), however I think it is also doable through other
> ways (e.g., pass the btf to btf_record_free or similar).
Amery Hung Aug. 5, 2024, 8:44 p.m. UTC | #6
On Mon, Aug 5, 2024 at 1:00 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 8/5/24 12:32 AM, Hou Tao wrote:
> > Hi,
> >
> > On 8/5/2024 12:31 PM, Amery Hung wrote:
> >> On Sun, Aug 4, 2024 at 7:41 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >>> Hi,
> >>>
> >>> On 8/3/2024 8:11 AM, Amery Hung wrote:
> >>>> From: Dave Marchevsky <davemarchevsky@fb.com>
> >>>>
> >>>> Currently btf_parse_fields is used in two places to create struct
> >>>> btf_record's for structs: when looking at mapval type, and when looking
> >>>> at any struct in program BTF. The former looks for kptr fields while the
> >>>> latter does not. This patch modifies the btf_parse_fields call made when
> >>>> looking at prog BTF struct types to search for kptrs as well.
> >>>>
> >>> SNIP
> >>>> On a side note, when building program BTF, the refcount of program BTF
> >>>> is now initialized before btf_parse_struct_metas() to prevent a
> >>>> refcount_inc() on zero warning. This happens when BPF_KPTR is present
> >>>> in program BTF: btf_parse_struct_metas() -> btf_parse_fields()
> >>>> -> btf_parse_kptr() -> btf_get(). This should be okay as the program BTF
> >>>> is not available yet outside btf_parse().
> >>> If btf_parse_kptr() pins the passed btf, there will be memory leak for
> >>> the btf after closing the btf fd, because the invocation of btf_put()
> >>> for kptr record in btf->struct_meta_tab depends on the invocation of
> >>> btf_free_struct_meta_tab() in btf_free(), but the invocation of
> >>> btf_free() depends the final refcnt of the btf is released, so the btf
> >>> will not be freed forever. The reason why map value doesn't have such
> >>> problem is that the invocation of btf_put() for kptr record doesn't
> >>> depends on the release of map value btf and it is accomplished by
> >>> bpf_map_free_record().
> >>>
> >> Thanks for pointing this out. It makes sense to me.
> >>
> >>> Maybe we should move the common btf used by kptr and graph_root into
> >>> btf_record and let the callers of btf_parse_fields() and
> >>> btf_record_free() to decide the life cycle of btf in btf_record.
> >> Could you maybe explain if and why moving btf of btf_field_kptr and
> >> btf_field_graph_root to btf_record is necessary? I think letting
> >> callers of btf_parse_fields() and btf_record_free() decide whether or
> >> not to change refcount should be enough. Besides, I personally would
> >> like to keep individual btf in btf_field_kptr and
> >> btf_field_graph_root, so that later we can have special fields
> >> referencing different btf.
> >
> > Sorry, I didn't express the rough idea clearly enough. I didn't mean to
> > move btf of btf_field_kptr and btf_field_graph_root to btf_record,
> > because there are other btf-s which are different with the btf which
> > creates the struct_meta_tab. What I was trying to suggest is to save one
> > btf in btf_record and hope it will simplify the pin and the unpin of btf
> > in btf_record:
> >
> > 1) save the btf which owns the btf_record in btf_record.
> > 2) during btf_parse_kptr() or similar, if the used btf is the same as
> > the btf in btf_record, there is no need to pin the btf
>
> I assume the used btf is the one that btf_parse is working on.
>
> > 3) when freeing the btf_record, if the btf saved in btf_field is the
> > same as the btf in btf_record, there is no need to put it
>
> For btf_field_kptr.btf, is it the same as testing the btf_field_kptr.btf is
> btf_is_kernel() or not? How about only does btf_get/put for btf_is_kernel()?
>

IIUC, it will not be the same. For a map referencing prog btf, I
suppose we should still do btf_get().

I think the core idea is that since a btf_record and the prog btf
containing it have the same lifetime, we don't need to
btf_get()/btf_put() in btf_parse_kptr()/btf_record_free() when a
btf_field_kptr.btf is referencing itself.

However, since btf_parse_kptr() called from btf_parse() and from
map_check_btf() both use prog btf, we need a way to differentiate the
two. Hence Hou suggested saving the owner's btf in btf_record, and
then checking whether btf_record->btf is the same as btf_field_kptr.btf in
btf_parse_kptr()/btf_record_free().


>
> >
> > For step 2) and step 3), however I think it is also doable through other
> > ways (e.g., pass the btf to btf_record_free or similar).
Martin KaFai Lau Aug. 5, 2024, 10:25 p.m. UTC | #7
On 8/5/24 1:44 PM, Amery Hung wrote:
>>>>> Maybe we should move the common btf used by kptr and graph_root into
>>>>> btf_record and let the callers of btf_parse_fields() and
>>>>> btf_record_free() to decide the life cycle of btf in btf_record.
>>>> Could you maybe explain if and why moving btf of btf_field_kptr and
>>>> btf_field_graph_root to btf_record is necessary? I think letting
>>>> callers of btf_parse_fields() and btf_record_free() decide whether or
>>>> not to change refcount should be enough. Besides, I personally would
>>>> like to keep individual btf in btf_field_kptr and
>>>> btf_field_graph_root, so that later we can have special fields
>>>> referencing different btf.
>>>
>>> Sorry, I didn't express the rough idea clearly enough. I didn't mean to
>>> move btf of btf_field_kptr and btf_field_graph_root to btf_record,
>>> because there are other btf-s which are different with the btf which
>>> creates the struct_meta_tab. What I was trying to suggest is to save one
>>> btf in btf_record and hope it will simplify the pin and the unpin of btf
>>> in btf_record:
>>>
>>> 1) save the btf which owns the btf_record in btf_record.
>>> 2) during btf_parse_kptr() or similar, if the used btf is the same as
>>> the btf in btf_record, there is no need to pin the btf
>>
>> I assume the used btf is the one that btf_parse is working on.
>>
>>> 3) when freeing the btf_record, if the btf saved in btf_field is the
>>> same as the btf in btf_record, there is no need to put it
>>
>> For btf_field_kptr.btf, is it the same as testing the btf_field_kptr.btf is
>> btf_is_kernel() or not? How about only does btf_get/put for btf_is_kernel()?
>>
> 
> IIUC. It will not be the same. For a map referencing prog btf, I
> suppose we should still do btf_get().
> 
> I think the core idea is since a btf_record and the prog btf
> containing it has the same life time, we don't need to
> btf_get()/btf_put() in btf_parse_kptr()/btf_record_free() when a
> btf_field_kptr.btf is referencing itself.
> 
> However, since btf_parse_kptr() called from btf_parse() and
> map_check_btf() all use prog btf, we need a way to differentiate the
> two. Hence Hou suggested saving the owner's btf in btf_record, and

map_check_btf() calls btf_parse_kptr(map->btf).

I am missing how it is different from the 
btf_new_fd()=>btf_parse()=>btf_parse_kptr(new_btf).

AFAIK, the map->record has no issue now because bpf_map_free_deferred() does
btf_record_free(map->record) before btf_put(map->btf). In the map->record case,
does the map->record need to take a refcnt on the btf_field_kptr.btf if the
btf_field_kptr.btf is pointing back to itself (map->btf), which is not a kernel btf?

> then check if btf_record->btf is the same as the btf_field_kptr.btf in
> btf_parse_kptr()/btf_record_free().

I suspect it will have the same end result? The btf_field_kptr.btf is only the
same as the owner's btf when btf_parse_kptr() cannot find the kptr type in a
kernel btf (the id == -ENOENT case in btf_parse_kptr()).
Amery Hung Aug. 6, 2024, 12:31 a.m. UTC | #8
On Mon, Aug 5, 2024 at 3:25 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 8/5/24 1:44 PM, Amery Hung wrote:
> >>>>> Maybe we should move the common btf used by kptr and graph_root into
> >>>>> btf_record and let the callers of btf_parse_fields() and
> >>>>> btf_record_free() to decide the life cycle of btf in btf_record.
> >>>> Could you maybe explain if and why moving btf of btf_field_kptr and
> >>>> btf_field_graph_root to btf_record is necessary? I think letting
> >>>> callers of btf_parse_fields() and btf_record_free() decide whether or
> >>>> not to change refcount should be enough. Besides, I personally would
> >>>> like to keep individual btf in btf_field_kptr and
> >>>> btf_field_graph_root, so that later we can have special fields
> >>>> referencing different btf.
> >>>
> >>> Sorry, I didn't express the rough idea clearly enough. I didn't mean to
> >>> move btf of btf_field_kptr and btf_field_graph_root to btf_record,
> >>> because there are other btf-s which are different with the btf which
> >>> creates the struct_meta_tab. What I was trying to suggest is to save one
> >>> btf in btf_record and hope it will simplify the pin and the unpin of btf
> >>> in btf_record:
> >>>
> >>> 1) save the btf which owns the btf_record in btf_record.
> >>> 2) during btf_parse_kptr() or similar, if the used btf is the same as
> >>> the btf in btf_record, there is no need to pin the btf
> >>
> >> I assume the used btf is the one that btf_parse is working on.
> >>
> >>> 3) when freeing the btf_record, if the btf saved in btf_field is the
> >>> same as the btf in btf_record, there is no need to put it
> >>
> >> For btf_field_kptr.btf, is it the same as testing the btf_field_kptr.btf is
> >> btf_is_kernel() or not? How about only does btf_get/put for btf_is_kernel()?
> >>
> >
> > IIUC. It will not be the same. For a map referencing prog btf, I
> > suppose we should still do btf_get().
> >
> > I think the core idea is since a btf_record and the prog btf
> > containing it has the same life time, we don't need to
> > btf_get()/btf_put() in btf_parse_kptr()/btf_record_free() when a
> > btf_field_kptr.btf is referencing itself.
> >
> > However, since btf_parse_kptr() called from btf_parse() and
> > map_check_btf() all use prog btf, we need a way to differentiate the
> > two. Hence Hou suggested saving the owner's btf in btf_record, and
>
> map_check_btf() calls btf_parse_kptr(map->btf).
>
> I am missing how it is different from the
> btf_new_fd()=>btf_parse()=>btf_parse_kptr(new_btf).
>
> akaik, the map->record has no issue now because bpf_map_free_deferred() does
> btf_record_free(map->record) before btf_put(map->btf). In the map->record case,
> does the map->record need to take a refcnt of the btf_field_kptr.btf if the
> btf_field_kptr.btf is pointing back to itself (map->btf) which is not a kernel btf?
>
> > then check if btf_record->btf is the same as the btf_field_kptr.btf in
> > btf_parse_kptr()/btf_record_free().
>
> I suspect it will have the same end result? The btf_field_kptr.btf is only the
> same as the owner's btf when btf_parse_kptr() cannot found the kptr type from a
> kernel's btf (the id == -ENOENT case in btf_parse_kptr).

I added some code to better explain how I think it could work.

I am thinking about adding a struct btf* under struct btf_record, and
then adding an argument in btf_parse_fields:

btf_parse_fields(const struct btf *btf,..., bool btf_is_owner)
{
        ...
        /* Before the for loop that goes through info_arr */
        rec->btf = btf_is_owner ? btf : NULL;
        ...
}

The btf_is_owner flag indicates whether the btf_record returned by the
function will be part of the btf, so map_check_btf() will set it to
false while btf_parse() will set it to true. Maybe "owner" is what
makes it confusing? (The btf_record for a map belongs to the bpf_map, not
to map->btf. On the other hand, the btf_record for a prog btf is part of that btf.)

Then, in btf_record_free(), we can do:

case BPF_KPTR_XXX:
        ...
        if (rec->btf != rec->fields[i].kptr.btf)
                btf_put(rec->fields[i].kptr.btf);

In btf_parse_kptr(), we also do something similar; we just need to pass the
btf_record into it.
Hou Tao Aug. 6, 2024, 1:57 a.m. UTC | #9
Hi,

On 8/6/2024 8:31 AM, Amery Hung wrote:
> On Mon, Aug 5, 2024 at 3:25 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>> On 8/5/24 1:44 PM, Amery Hung wrote:
>>>>>>> Maybe we should move the common btf used by kptr and graph_root into
>>>>>>> btf_record and let the callers of btf_parse_fields() and
>>>>>>> btf_record_free() to decide the life cycle of btf in btf_record.
>>>>>> Could you maybe explain if and why moving btf of btf_field_kptr and
>>>>>> btf_field_graph_root to btf_record is necessary? I think letting
>>>>>> callers of btf_parse_fields() and btf_record_free() decide whether or
>>>>>> not to change refcount should be enough. Besides, I personally would
>>>>>> like to keep individual btf in btf_field_kptr and
>>>>>> btf_field_graph_root, so that later we can have special fields
>>>>>> referencing different btf.
>>>>> Sorry, I didn't express the rough idea clearly enough. I didn't mean to
>>>>> move btf of btf_field_kptr and btf_field_graph_root to btf_record,
>>>>> because there are other btf-s which are different with the btf which
>>>>> creates the struct_meta_tab. What I was trying to suggest is to save one
>>>>> btf in btf_record and hope it will simplify the pin and the unpin of btf
>>>>> in btf_record:
>>>>>
>>>>> 1) save the btf which owns the btf_record in btf_record.
>>>>> 2) during btf_parse_kptr() or similar, if the used btf is the same as
>>>>> the btf in btf_record, there is no need to pin the btf
>>>> I assume the used btf is the one that btf_parse is working on.
>>>>
>>>>> 3) when freeing the btf_record, if the btf saved in btf_field is the
>>>>> same as the btf in btf_record, there is no need to put it
>>>> For btf_field_kptr.btf, is it the same as testing the btf_field_kptr.btf is
>>>> btf_is_kernel() or not? How about only does btf_get/put for btf_is_kernel()?
>>>>
>>> IIUC. It will not be the same. For a map referencing prog btf, I
>>> suppose we should still do btf_get().
>>>
>>> I think the core idea is since a btf_record and the prog btf
>>> containing it has the same life time, we don't need to
>>> btf_get()/btf_put() in btf_parse_kptr()/btf_record_free() when a
>>> btf_field_kptr.btf is referencing itself.
>>>
>>> However, since btf_parse_kptr() called from btf_parse() and
>>> map_check_btf() all use prog btf, we need a way to differentiate the
>>> two. Hence Hou suggested saving the owner's btf in btf_record, and
>> map_check_btf() calls btf_parse_kptr(map->btf).
>>
>> I am missing how it is different from the
>> btf_new_fd()=>btf_parse()=>btf_parse_kptr(new_btf).
>>
>> akaik, the map->record has no issue now because bpf_map_free_deferred() does
>> btf_record_free(map->record) before btf_put(map->btf). In the map->record case,
>> does the map->record need to take a refcnt of the btf_field_kptr.btf if the
>> btf_field_kptr.btf is pointing back to itself (map->btf) which is not a kernel btf?

I think you are right. For both callers of btf_parse_kptr(), when the btf
saved in btf_field_kptr.btf is the same as the passed btf, there is no
need to call btf_get(). For the bpf map case, just as you pointed out, map->btf
has already pinned the btf passed to btf_parse_kptr() in map_create().
>>> then check if btf_record->btf is the same as the btf_field_kptr.btf in
>>> btf_parse_kptr()/btf_record_free().
>> I suspect it will have the same end result? The btf_field_kptr.btf is only the
>> same as the owner's btf when btf_parse_kptr() cannot found the kptr type from a
>> kernel's btf (the id == -ENOENT case in btf_parse_kptr).

It seems your suggestion of using btf_is_kernel() is much simpler (a rough
sketch follows the list):

(1) btf_parse_kptr(): don't invoke btf_get() when bpf_find_btf_id()
returns -ENOENT and let the caller ensure the life cycle of the passed btf.
(2) btf_record_free(): only invoke btf_put() if the saved btf is a
kernel btf.
(3) btf_record_dup(): only invoke btf_get() if the saved btf is a kernel
btf.
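
A rough sketch of (1)-(3); the snippets are abridged and illustrative
rather than the actual kernel code, and the variable names are placeholders:

/* (1) btf_parse_kptr(): on -ENOENT the kptr pointee lives in the prog
 * BTF itself, so rely on the caller keeping that BTF alive instead of
 * taking an extra reference.
 */
        if (id == -ENOENT) {
                kptr_btf = (struct btf *)btf;   /* no btf_get() here */
        }

/* (2)/(3) btf_record_free()/btf_record_dup(): only a kernel BTF was
 * pinned above, so only a kernel BTF needs the put/get here.
 */
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
                if (btf_is_kernel(rec->fields[i].kptr.btf))
                        btf_put(rec->fields[i].kptr.btf);
                break;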
> I added some code to better explain how I think it could work.
>
> I am thinking about adding a struct btf* under struct btf_record, and
> then adding an argument in btf_parse_fields:
>
> btf_parse_fields(const struct btf *btf,..., bool btf_is_owner)
> {
>         ...
>         /* Before the for loop that goes through info_arr */
>         rec->btf = btf_is_owner ? btf : NULL;
>         ...
> }
>
> The btf_is_owner indicates whether the btf_record returned by the
> function will be part of the btf. So map_check_btf() will set it to
> false while btf_parse() will set it to true. Maybe "owner" is what
> makes it confusing? (btf_record for a map belongs to bpf_map but not
> map->btf. On the other hand, btf_record for a prog btf is part of it.)
>
> Then, in btf_record_free(), we can do:
>
> case BPF_KPTR_XXX:
>         ...
>         if (rec->btf != rec->fields[i].kptr.btf)
>                 btf_put(rec->fields[i].kptr.btf);
>
> In btf_parse_kptr(), we also do similar things. We just need to pass
> btf_record into it.
Amery Hung Aug. 6, 2024, 4:07 a.m. UTC | #10
On Mon, Aug 5, 2024 at 6:57 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 8/6/2024 8:31 AM, Amery Hung wrote:
> > On Mon, Aug 5, 2024 at 3:25 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >> On 8/5/24 1:44 PM, Amery Hung wrote:
> >>>>>>> Maybe we should move the common btf used by kptr and graph_root into
> >>>>>>> btf_record and let the callers of btf_parse_fields() and
> >>>>>>> btf_record_free() to decide the life cycle of btf in btf_record.
> >>>>>> Could you maybe explain if and why moving btf of btf_field_kptr and
> >>>>>> btf_field_graph_root to btf_record is necessary? I think letting
> >>>>>> callers of btf_parse_fields() and btf_record_free() decide whether or
> >>>>>> not to change refcount should be enough. Besides, I personally would
> >>>>>> like to keep individual btf in btf_field_kptr and
> >>>>>> btf_field_graph_root, so that later we can have special fields
> >>>>>> referencing different btf.
> >>>>> Sorry, I didn't express the rough idea clearly enough. I didn't mean to
> >>>>> move btf of btf_field_kptr and btf_field_graph_root to btf_record,
> >>>>> because there are other btf-s which are different with the btf which
> >>>>> creates the struct_meta_tab. What I was trying to suggest is to save one
> >>>>> btf in btf_record and hope it will simplify the pin and the unpin of btf
> >>>>> in btf_record:
> >>>>>
> >>>>> 1) save the btf which owns the btf_record in btf_record.
> >>>>> 2) during btf_parse_kptr() or similar, if the used btf is the same as
> >>>>> the btf in btf_record, there is no need to pin the btf
> >>>> I assume the used btf is the one that btf_parse is working on.
> >>>>
> >>>>> 3) when freeing the btf_record, if the btf saved in btf_field is the
> >>>>> same as the btf in btf_record, there is no need to put it
> >>>> For btf_field_kptr.btf, is it the same as testing the btf_field_kptr.btf is
> >>>> btf_is_kernel() or not? How about only does btf_get/put for btf_is_kernel()?
> >>>>
> >>> IIUC. It will not be the same. For a map referencing prog btf, I
> >>> suppose we should still do btf_get().
> >>>
> >>> I think the core idea is since a btf_record and the prog btf
> >>> containing it has the same life time, we don't need to
> >>> btf_get()/btf_put() in btf_parse_kptr()/btf_record_free() when a
> >>> btf_field_kptr.btf is referencing itself.
> >>>
> >>> However, since btf_parse_kptr() called from btf_parse() and
> >>> map_check_btf() all use prog btf, we need a way to differentiate the
> >>> two. Hence Hou suggested saving the owner's btf in btf_record, and
> >> map_check_btf() calls btf_parse_kptr(map->btf).
> >>
> >> I am missing how it is different from the
> >> btf_new_fd()=>btf_parse()=>btf_parse_kptr(new_btf).
> >>
> >> akaik, the map->record has no issue now because bpf_map_free_deferred() does
> >> btf_record_free(map->record) before btf_put(map->btf). In the map->record case,
> >> does the map->record need to take a refcnt of the btf_field_kptr.btf if the
> >> btf_field_kptr.btf is pointing back to itself (map->btf) which is not a kernel btf?
>
> I think you are right. For both callees of btf_parse_kptr() when the btf
> saved in btf_field_kptr.btf is the same as the passed btf, there is no
> need to call btf_get(). For bpf map case, just as you remained, map->btf
> has already pin the btf passed to btf_parse_kptr() in map_create().
> >>> then check if btf_record->btf is the same as the btf_field_kptr.btf in
> >>> btf_parse_kptr()/btf_record_free().
> >> I suspect it will have the same end result? The btf_field_kptr.btf is only the
> >> same as the owner's btf when btf_parse_kptr() cannot found the kptr type from a
> >> kernel's btf (the id == -ENOENT case in btf_parse_kptr).
>
> It seems your suggestion of using btf_is_kernel() is much simpler:
>
> (1) btf_parse_kptr():  doesn't invoke btf_get() when bpf_find_btf_id()
> returns -ENOENT and let the caller ensure the life cycle of the passed btf.
> (2) btf_record_free(): only invoke btf_put() if the saved btf is a
> kernel btf.
> (3) btf_record_dup(): only invoke btf_get() if the saved btf is a kernel
> btf.

I see. It is much simpler. I will send a new version using this approach.

Thank you,
Amery

> > I added some code to better explain how I think it could work.
> >
> > I am thinking about adding a struct btf* under struct btf_record, and
> > then adding an argument in btf_parse_fields:
> >
> > btf_parse_fields(const struct btf *btf,..., bool btf_is_owner)
> > {
> >         ...
> >         /* Before the for loop that goes through info_arr */
> >         rec->btf = btf_is_owner ? btf : NULL;
> >         ...
> > }
> >
> > The btf_is_owner indicates whether the btf_record returned by the
> > function will be part of the btf. So map_check_btf() will set it to
> > false while btf_parse() will set it to true. Maybe "owner" is what
> > makes it confusing? (btf_record for a map belongs to bpf_map but not
> > map->btf. On the other hand, btf_record for a prog btf is part of it.)
> >
> > Then, in btf_record_free(), we can do:
> >
> > case BPF_KPTR_XXX:
> >         ...
> >         if (rec->btf != rec->fields[i].kptr.btf)
> >                 btf_put(rec->fields[i].kptr.btf);
> >
> > In btf_parse_kptr(), we also do similar things. We just need to pass
> > btf_record into it.
>

Patch

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 95426d5b634e..7b8275e3e500 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5585,7 +5585,8 @@  btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 		type = &tab->types[tab->cnt];
 		type->btf_id = i;
 		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
-						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+						  BPF_KPTR, t->size);
 		/* The record cannot be unset, treat it as an error if so */
 		if (IS_ERR_OR_NULL(record)) {
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
@@ -5737,6 +5738,8 @@  static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (err)
 		goto errout;
 
+	refcount_set(&btf->refcnt, 1);
+
 	struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
 	if (IS_ERR(struct_meta_tab)) {
 		err = PTR_ERR(struct_meta_tab);
@@ -5759,7 +5762,6 @@  static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 		goto errout_free;
 
 	btf_verifier_env_free(env);
-	refcount_set(&btf->refcnt, 1);
 	return btf;
 
 errout_meta: