
[v5,bpf-next,3/9] libbpf: split BTF relocation

Message ID 20240528122408.3154936-4-alan.maguire@oracle.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf: support resilient split BTF

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next-0

Commit Message

Alan Maguire May 28, 2024, 12:24 p.m. UTC
Map distilled base BTF type ids referenced in split BTF and their
references to the base BTF passed in, and if the mapping succeeds,
reparent the split BTF to the base BTF.

Relocation is done by first verifying that distilled base BTF
only consists of named INT, FLOAT, ENUM, FWD, STRUCT and
UNION kinds; then we sort these to speed lookups.  Once sorted,
the base BTF is iterated, and for each relevant kind we check
for an equivalent in distilled base BTF.  When found, the
mapping from distilled -> base BTF id and string offset is recorded.
In establishing mappings, we need to ensure we check STRUCT/UNION
size when the STRUCT/UNION is embedded in a split BTF STRUCT/UNION,
and when duplicate names exist for the same STRUCT/UNION.  Otherwise
size is ignored in matching STRUCT/UNIONs.

Once all mappings are established, we can update type ids
and string offsets in split BTF and reparent it to the new base.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/lib/bpf/Build             |   2 +-
 tools/lib/bpf/btf.c             |  17 ++
 tools/lib/bpf/btf.h             |  14 ++
 tools/lib/bpf/btf_relocate.c    | 430 ++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.map        |   1 +
 tools/lib/bpf/libbpf_internal.h |   3 +
 6 files changed, 466 insertions(+), 1 deletion(-)
 create mode 100644 tools/lib/bpf/btf_relocate.c
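
For context, a minimal usage sketch of the API added by this patch (this
assumes the symbol exported via libbpf.map is btf__relocate(); the function
name, file paths and error handling below are illustrative only):

#include <errno.h>
#include <stdio.h>
#include <bpf/btf.h>

/* Parse split BTF against its distilled base, then relocate it onto the
 * full (e.g. vmlinux) base BTF; on success the split BTF is reparented
 * to base_btf.
 */
static int relocate_split_btf(const char *path, struct btf *dist_base,
			      const struct btf *base_btf)
{
	struct btf *split_btf;
	int err;

	split_btf = btf__parse_split(path, dist_base);
	if (!split_btf)
		return -errno;

	err = btf__relocate(split_btf, base_btf);
	if (err)
		fprintf(stderr, "BTF relocation failed: %d\n", err);

	btf__free(split_btf);
	return err;
}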

Comments

Eduard Zingerman May 31, 2024, 2:22 a.m. UTC | #1
On Tue, 2024-05-28 at 13:24 +0100, Alan Maguire wrote:

[...]

> +/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate
> + * through base BTF looking up distilled type (using binary search) equivalents.
> + */
> +static int btf_relocate_map_distilled_base(struct btf_relocate *r)
> +{

I have several observations about this algorithm.

The algorithm does Base.Cnt * log2(Dist.Cnt) binary searches.
However, it might be better to switch searches around
and look for distilled {name,size} pairs in base btf,
doing Dist.Cnt * log2(Base.cnt) searches instead.
Suppose Base.Cnt = 2**20 and Dist.Cnt = 2**10, in such case:
  - Base.Cnt * log2(Dist.Cnt) = 2**20 * 10
  - Dist.Cnt * log2(Base.cnt) = 2**10 * 20, which is smaller

The algorithm might not handle name duplicates in the distilled BTF well,
e.g. in theory, the following is a valid C code

  struct foo { int f; }; // sizeof(struct foo) == 4
  typedef int foo;       // sizeof(foo) == 4

Suppose that these types are a part of the distilled BTF.
Depending on which one would end up first in 'dist_base_info_sorted'
bsearch might fail to find one or the other.

Also, algorithm does not report an error if there are several
types with the same name and size in the base BTF.

I suggest to modify the algorithm as follows:
- let 'base_info_sorted' be a set of tuples {kind,name,size,id}
  corresponding to the base BTF, sorted by kind, name and size;
- add a custom utility bsearch_unique, that behaves like bsearch,
  but returns NULL if entry is non-unique with regards to current
  predicate (e.g. use bsearch but also check neighbors);
- for each type D in the distilled base:
  - use bsearch_unique to find entry E in 'base_info_sorted'
    that matches D.{kind,name,size} sub-tuple;
  - if E exists, set id_map[D] := E.id;
  - if E does not exist:
    - if id_map[D] == BTF_IS_EMBEDDED, report an error;
    - if id_map[D] != BTF_IS_EMBEDDED:
      - use bsearch_unique to find entry E in 'base_info_sorted'
        that matches D.{kind,name} sub-tuple;
      - if E exists, set id_map[D] := E.id;
      - otherwise, report an error.

This allows to:
- flip the search order, potentially gaining some speed;
- drop the 'base_name_cnt' array and logic;
- handle the above hypothetical name conflict example.
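
For illustration, a minimal sketch of what such a bsearch_unique helper
could look like (hypothetical, not part of the patch): do a regular
bsearch(), then reject the hit if either neighbour also compares equal
under the same predicate:

#include <stdlib.h>

static void *bsearch_unique(const void *key, const void *base, size_t nmemb,
			    size_t size, int (*cmp)(const void *, const void *))
{
	const char *arr = base;
	const char *hit;

	hit = bsearch(key, base, nmemb, size, cmp);
	if (!hit)
		return NULL;
	/* non-unique if either neighbour matches too */
	if (hit > arr && cmp(key, hit - size) == 0)
		return NULL;
	if (hit < arr + (nmemb - 1) * size && cmp(key, hit + size) == 0)
		return NULL;
	return (void *)hit;
}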

Wdyt?

> +	struct btf_name_info *dist_base_info_sorted;
> +	struct btf_type *base_t, *dist_t, *split_t;
> +	__u8 *base_name_cnt = NULL;
> +	int err = 0;
> +	__u32 id;
> +
> +	/* generate a sort index array of name/type ids sorted by name for
> +	 * distilled base BTF to speed name-based lookups.
> +	 */
> +	dist_base_info_sorted = calloc(r->nr_dist_base_types, sizeof(*dist_base_info_sorted));
> +	if (!dist_base_info_sorted) {
> +		err = -ENOMEM;
> +		goto done;
> +	}
> +	for (id = 0; id < r->nr_dist_base_types; id++) {
> +		dist_t = btf_type_by_id(r->dist_base_btf, id);
> +		dist_base_info_sorted[id].name = btf__name_by_offset(r->dist_base_btf,
> +								     dist_t->name_off);
> +		dist_base_info_sorted[id].id = id;
> +		dist_base_info_sorted[id].size = dist_t->size;
> +		dist_base_info_sorted[id].needs_size = true;
> +	}
> +	qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
> +	      cmp_btf_name_size);
> +
> +	/* Mark distilled base struct/union members of split BTF structs/unions
> +	 * in id_map with BTF_IS_EMBEDDED; this signals that these types
> +	 * need to match both name and size, otherwise embedding the base
> +	 * struct/union in the split type is invalid.
> +	 */
> +	for (id = r->nr_dist_base_types; id < r->nr_split_types; id++) {
> +		split_t = btf_type_by_id(r->btf, id);
> +		if (btf_is_composite(split_t)) {
> +			err = btf_type_visit_type_ids(split_t, btf_mark_embedded_composite_type_ids,
> +						      r);
> +			if (err < 0)
> +				goto done;
> +		}
> +	}
> +
> +	/* Collect name counts for composite types in base BTF.  If multiple
> +	 * instances of a struct/union of the same name exist, we need to use
> +	 * size to determine which to map to since name alone is ambiguous.
> +	 */
> +	base_name_cnt = calloc(r->base_str_len, sizeof(*base_name_cnt));
> +	if (!base_name_cnt) {
> +		err = -ENOMEM;
> +		goto done;
> +	}
> +	for (id = 1; id < r->nr_base_types; id++) {
> +		base_t = btf_type_by_id(r->base_btf, id);
> +		if (!btf_is_composite(base_t) || !base_t->name_off)
> +			continue;
> +		if (base_name_cnt[base_t->name_off] < 255)
> +			base_name_cnt[base_t->name_off]++;
> +	}
> +
> +	/* Now search base BTF for matching distilled base BTF types. */
> +	for (id = 1; id < r->nr_base_types; id++) {
> +		struct btf_name_info *dist_name_info, base_name_info = {};
> +		int dist_kind, base_kind;
> +
> +		base_t = btf_type_by_id(r->base_btf, id);
> +		/* distilled base consists of named types only. */
> +		if (!base_t->name_off)
> +			continue;
> +		base_kind = btf_kind(base_t);
> +		base_name_info.id = id;
> +		base_name_info.name = btf__name_by_offset(r->base_btf, base_t->name_off);
> +		switch (base_kind) {
> +		case BTF_KIND_INT:
> +		case BTF_KIND_FLOAT:
> +		case BTF_KIND_ENUM:
> +		case BTF_KIND_ENUM64:
> +			/* These types should match both name and size */
> +			base_name_info.needs_size = true;
> +			base_name_info.size = base_t->size;
> +			break;
> +		case BTF_KIND_FWD:
> +			/* No size considerations for fwds. */
> +			break;
> +		case BTF_KIND_STRUCT:
> +		case BTF_KIND_UNION:
> +			/* Size only needs to be used for struct/union if there
> +			 * are multiple types in base BTF with the same name.
> +			 * If there are multiple _distilled_ types with the same
> +			 * name (a very unlikely scenario), that doesn't matter
> +			 * unless corresponding _base_ types to match them are
> +			 * missing.
> +			 */
> +			base_name_info.needs_size = base_name_cnt[base_t->name_off] > 1;
> +			base_name_info.size = base_t->size;
> +			break;
> +		default:
> +			continue;
> +		}
> +		dist_name_info = bsearch(&base_name_info, dist_base_info_sorted,
> +					 r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
> +					 cmp_btf_name_size);
> +		if (!dist_name_info)
> +			continue;
> +		if (!dist_name_info->id || dist_name_info->id > r->nr_dist_base_types) {
> +			pr_warn("base BTF id [%d] maps to invalid distilled base BTF id [%d]\n",
> +				id, dist_name_info->id);
> +			err = -EINVAL;
> +			goto done;
> +		}
> +		dist_t = btf_type_by_id(r->dist_base_btf, dist_name_info->id);
> +		dist_kind = btf_kind(dist_t);
> +
> +		/* Validate that the found distilled type is compatible.
> +		 * Do not error out on mismatch as another match may occur
> +		 * for an identically-named type.
> +		 */
> +		switch (dist_kind) {
> +		case BTF_KIND_FWD:
> +			switch (base_kind) {
> +			case BTF_KIND_FWD:
> +				if (btf_kflag(dist_t) != btf_kflag(base_t))
> +					continue;
> +				break;
> +			case BTF_KIND_STRUCT:
> +				if (btf_kflag(base_t))
> +					continue;
> +				break;
> +			case BTF_KIND_UNION:
> +				if (!btf_kflag(base_t))
> +					continue;
> +				break;
> +			default:
> +				continue;
> +			}
> +			break;
> +		case BTF_KIND_INT:
> +			if (dist_kind != base_kind ||
> +			    btf_int_encoding(base_t) != btf_int_encoding(dist_t))
> +				continue;
> +			break;
> +		case BTF_KIND_FLOAT:
> +			if (dist_kind != base_kind)
> +				continue;
> +			break;
> +		case BTF_KIND_ENUM:
> +			/* ENUM and ENUM64 are encoded as sized ENUM in
> +			 * distilled base BTF.
> +			 */
> +			if (dist_kind != base_kind && base_kind != BTF_KIND_ENUM64)
> +				continue;
> +			break;
> +		case BTF_KIND_STRUCT:
> +		case BTF_KIND_UNION:
> +			/* size verification is required for embedded
> +			 * struct/unions.
> +			 */
> +			if (r->id_map[dist_name_info->id] == BTF_IS_EMBEDDED &&
> +			    base_t->size != dist_t->size)
> +				continue;
> +			break;
> +		default:
> +			continue;
> +		}
> +		/* map id and name */
> +		r->id_map[dist_name_info->id] = id;
> +		r->str_map[dist_t->name_off] = base_t->name_off;
> +	}
> +	/* ensure all distilled BTF ids now have a mapping... */
> +	for (id = 1; id < r->nr_dist_base_types; id++) {
> +		const char *name;
> +
> +		if (r->id_map[id] && r->id_map[id] != BTF_IS_EMBEDDED)
> +			continue;
> +		dist_t = btf_type_by_id(r->dist_base_btf, id);
> +		name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off);
> +		pr_warn("distilled base BTF type '%s' [%d] is not mapped to base BTF id\n",
> +			name, id);
> +		err = -EINVAL;
> +		break;
> +	}
> +done:
> +	free(base_name_cnt);
> +	free(dist_base_info_sorted);
> +	return err;
> +}

[...]
Alan Maguire May 31, 2024, 3:38 p.m. UTC | #2
On 31/05/2024 03:22, Eduard Zingerman wrote:
> On Tue, 2024-05-28 at 13:24 +0100, Alan Maguire wrote:
> 
> [...]
> 
>> +/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate
>> + * through base BTF looking up distilled type (using binary search) equivalents.
>> + */
>> +static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>> +{
> 
> I have several observations about this algorithm.
> 
> The algorithm does Base.Cnt * log2(Dist.Cnt) binary searches.
> However, it might be better to switch searches around
> and look for distilled {name,size} pairs in base btf,
> doing Dist.Cnt * log2(Base.cnt) searches instead.
> Suppose Base.Cnt = 2**20 and Dist.Cnt = 2**10, in such case:
>   - Base.Cnt * log2(Dist.Cnt) = 2**20 * 10
>   - Dist.Cnt * log2(Base.cnt) = 2**10 * 20, which is smaller
> 

Hi Eduard,

I crunched some numbers on base and distilled base BTF to try and flesh
this out a bit more.

The number of struct, union, fwd, int, float, enum and enum64 types in
my vmlinux BTF is 14307.

This is 11% of the overall number of types, and it is for this 11% that we
will search distilled BTF for matches (we avoid searching for any other
types as we've previously verified that our distilled base BTF consists
only of these kinds).

In terms of distilled BTF sizes, I built a kernel forcing distilled base
BTF for all 2708 modules to get a sense of .BTF.base section sizes.  We can
sort the modules by .BTF.base size with the following:

modules=$(find . -name '*.ko' -depth -print)
for module in $modules ; do
	size=$(objdump -h $module | awk '/.BTF.base/ { print $3 }')
	printf '%s %d\n' $module "0x0$size"
done | sort -n -k2

With this process, we see the largest .BTF.base section is

./drivers/net/ethernet/chelsio/cxgb4/cxgb4.ko 15732

...but most others are much less than 1KB in size. The in-tree kernel
modules probably utilize more kernel types, so I suspect they give us a
good sense of the worst-case scenario for distilled BTF size.

Examining the .BTF.base section of cxgb4, we see 609 types.

So in this worst-case scenario I could generate, Base.Cnt = 14307,
Dist.Cnt = 600

Base.Cnt * log2(Dist.Cnt) = 14307 * 9	= 128763 comparisons
Dist.Cnt * log2(Base.cnt) = 600 * 14 	= 8400 comparisons

So that looks pretty cut-and-dried, but we also have to factor in the
initial sort comparisons.

The in-kernel sort reports n*log2(n) + 0.37*n + o(n) comparisons on
average; for base BTF that means sorting requires at least

Base: 14307*14+0.37*14307	= 205592 comparisons
Dist: 600*9+0.37*600		= 5622 comparisons

So we get an inversion of the above results, with (unless I'm
miscalculating something) sorting distilled base BTF requiring fewer
comparisons overall across both sort+search.

Sort Comparisons		Search comparisons		Total
======================================================================	
5622	(distilled)		128763	(to base)		134385
205592	(base)			8400	(to distilled)		213992

To validate the distilled numbers are roughly right, I DTraced[1]
comparison functions when loading cxgb4; as above we expect around
134385 across sort and search:

$ sudo dtrace -n 'fbt::cmp_btf_name_size:entry {@c=count(); }'
dtrace: description 'fbt::cmp_btf_name_size:entry ' matched 1 probe
^C

           107971

So the number (107971 calls to cmp_btf_name_size) seems to be in the
ballpark overall; next I tried aggregating by stack to see if the
numbers look right in sort versus search:

$ sudo dtrace -n 'fbt::cmp_btf_name_size:entry {@c[stack()]=count(); }'
dtrace: description 'fbt::cmp_btf_name_size:entry ' matched 1 probe
^C


              vmlinux`cmp_btf_name_size+0x5
              vmlinux`sort+0x34
              vmlinux`btf_relocate_map_distilled_base+0xce
              vmlinux`btf_relocate+0x1e3
              vmlinux`btf_parse_module+0x24b
              vmlinux`btf_module_notify+0xee
              vmlinux`notifier_call_chain+0x65
              vmlinux`blocking_notifier_call_chain_robust+0x67
              vmlinux`load_module+0x7fa
              vmlinux`init_module_from_file+0x97
              vmlinux`idempotent_init_module+0x109
              vmlinux`__x64_sys_finit_module+0x64
              vmlinux`x64_sys_call+0x1480
              vmlinux`do_syscall_64+0x68
              vmlinux`entry_SYSCALL_64_after_hwframe+0x76
             5882

              vmlinux`cmp_btf_name_size+0x5
              vmlinux`btf_relocate_map_distilled_base+0x29f
              vmlinux`btf_relocate+0x1e3
              vmlinux`btf_parse_module+0x24b
              vmlinux`btf_module_notify+0xee
              vmlinux`notifier_call_chain+0x65
              vmlinux`blocking_notifier_call_chain_robust+0x67
              vmlinux`load_module+0x7fa
              vmlinux`init_module_from_file+0x97
              vmlinux`idempotent_init_module+0x109
              vmlinux`__x64_sys_finit_module+0x64
              vmlinux`x64_sys_call+0x1480
              vmlinux`do_syscall_64+0x68
              vmlinux`entry_SYSCALL_64_after_hwframe+0x76
           102089

Yep, looks right - 5882 sort comparisons versus 102089 search comparisons.

I also traced how long btf_relocate() takes to complete in-kernel,
collecting the module name, the number of distilled base types, the number
of base types and the time taken. I loaded 6 modules with varying numbers
of distilled base BTF types.

$ sudo dtrace -n 'fbt::btf_relocate:entry
{ self->start = timestamp;
 self->btf = (struct btf *)arg0;
 self->nr_dist_types = self->btf->base_btf->nr_types
}
fbt::btf_relocate:return /self->start/
{
 @reloc[stringof(self->btf->name), self->nr_dist_types,
self->btf->base_btf->nr_types] = avg(timestamp-self->start);
}'
dtrace: description 'fbt::btf_relocate:entry ' matched 2 probes
^C

  hid_ite                                                  11   124271        4432193
  lib80211                                                 12   124271        4445910
  sctp                                                    109   124271        5547703
  ib_core                                                 153   124271        5803176
  bnxt_en                                                 147   124271        5846436
  cxgb4                                                   610   124271        8081113

So the overall relocation time - from 11 distilled types in hid_ite to
610 for cxgb4 - is within a range from 4.5msec (4432193ns above) to
8msec. The times for relocation represent less than 50% of overall
module load times - the latter vary from 11-18msec across these modules.
It would be great to find some performance wins here, but I don't
_think_ swapping the sort/search targets will buy us much unfortunately.

> The algorithm might not handle name duplicates in the distilled BTF well,
> e.g. in theory, the following is a valid C code
> 
>   struct foo { int f; }; // sizeof(struct foo) == 4
>   typedef int foo;       // sizeof(foo) == 4
> 
> Suppose that these types are a part of the distilled BTF.
> Depending on which one would end up first in 'dist_base_info_sorted'
> bsearch might fail to find one or the other.
> 

In the case of distilled base BTF, only struct, union, enum, enum64,
int, float and fwd can be present. Any such name/size match would have to
be between two of these kinds I think, but is still possible nevertheless.

> Also, algorithm does not report an error if there are several
> types with the same name and size in the base BTF.
>

Yep, while we have to handle this, it only becomes an ambiguity problem
if distilled base BTF refers to one such type. On my vmlinux I see
the following duplicate name/size STRUCTs

'd_partition' size=16
'elf_note_info' size=248
'getdents_callback' size=40
'instance_attribute' size=32
'intel_pinctrl_context' size=16
'intel_pinctrl' size=744
'perf_aux_event' size=16
'quirk_entry' size=8

Of these, 5 seem legit: d_partition, getdents_callback,
instance_attribute, perf_aux_event, quirk_entry.

A few seem to be identical, possibly dedup failures:

struct intel_pinctrl {
        struct device *dev;
        raw_spinlock_t lock;
        struct pinctrl_desc pctldesc;
        struct pinctrl_dev *pctldev;
        struct gpio_chip chip;
        const struct intel_pinctrl_soc_data *soc;
        struct intel_community *communities;
        size_t ncommunities;
        struct intel_pinctrl_context context;
        int irq;
};

struct intel_pinctrl___2 {
        struct device *dev;
        raw_spinlock_t lock;
        struct pinctrl_desc pctldesc;
        struct pinctrl_dev *pctldev;
        struct gpio_chip chip;
        const struct intel_pinctrl_soc_data *soc;
        struct intel_community *communities;
        size_t ncommunities;
        struct intel_pinctrl_context___2 context;
        int irq;
};


struct elf_thread_core_info;

struct elf_note_info {
        struct elf_thread_core_info *thread;
        struct memelfnote psinfo;
        struct memelfnote signote;
        struct memelfnote auxv;
        struct memelfnote files;
        siginfo_t csigdata;
        size_t size;
        int thread_notes;
};

struct elf_thread_core_info___2;

struct elf_note_info___2 {
        struct elf_thread_core_info___2 *thread;
        struct memelfnote psinfo;
        struct memelfnote signote;
        struct memelfnote auxv;
        struct memelfnote files;
        compat_siginfo_t csigdata;
        size_t size;
        int thread_notes;
};

Both of these share a self-reference, either directly or indirectly, so
maybe it's a corner case of dedup we're missing. I'll dig into these later.

> I suggest to modify the algorithm as follows:
> - let 'base_info_sorted' be a set of tuples {kind,name,size,id}
>   corresponding to the base BTF, sorted by kind, name and size;

That was my first thought, but we can't always search by kind; for
example it's possible the distilled base has a fwd and vmlinux only has
a struct kind for the same type name; in such a case we'd want to
support a match provided the fwd's kflag indicated a struct fwd.

In fact looking at the code we're missing logic for the opposite
condition (fwd only in base, struct in distilled base). I'll fix that.

The other case is an enum in distilled base matching an enum64
or an enum.

> - add a custom utility bsearch_unique, that behaves like bsearch,
>   but returns NULL if entry is non-unique with regards to current
>   predicate (e.g. use bsearch but also check neighbors);
> - for each type D in the distilled base:
>   - use bsearch_unique to find entry E in 'base_info_sorted'
>     that matches D.{kind,name,size} sub-tuple;
>   - if E exists, set id_map[D] := E.id;
>   - if E does not exist:
>     - if id_map[D] == BTF_IS_EMBEDDED, report an error;
>     - if id_map[D] != BTF_IS_EMBEDDED:
>       - use bsearch_unique to find entry E in 'base_info_sorted'
>         that matches D.{kind,name} sub-tuple;
>       - if E exists, set id_map[D] := E.id;
>       - otherwise, report an error.
> 
> This allows to:
> - flip the search order, potentially gaining some speed;
> - drop the 'base_name_cnt' array and logic;
> - handle the above hypothetical name conflict example.
>

I think flipping the search order could gain search speed, but only at
the expense of slowing things down overall due to the extra cost of
having to sort so many more elements. I suspect it will mostly be a
wash, though numbers above seem to suggest sorting distilled base may
have an edge when we consider both search and sort. The question is
probably which sort/search order is most amenable to handling the data
and helping us deal with the edge cases like duplicates.

With the existing scheme, I think catching cases of name duplicates in
distilled base BTF and name/size duplicates in base BTF for types we
want to relocate from distilled base BTF and erroring out would suffice;
basically the following applied to this patch (patch 3 in the series)

diff --git a/tools/lib/bpf/btf_relocate.c b/tools/lib/bpf/btf_relocate.c
index f2e91cdfb5cc..4e282ee8f183 100644
--- a/tools/lib/bpf/btf_relocate.c
+++ b/tools/lib/bpf/btf_relocate.c
@@ -113,6 +113,7 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
 {
        struct btf_name_info *dist_base_info_sorted;
        struct btf_type *base_t, *dist_t, *split_t;
+       const char *last_name = NULL;
        __u8 *base_name_cnt = NULL;
        int err = 0;
        __u32 id;
@@ -136,6 +137,19 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
        qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
              cmp_btf_name_size);

+       /* It is possible - though highly unlikely - that duplicate-named types
+        * end up in distilled base BTF; error out if this is the case.
+        */
+       for (id = 1; id < r->nr_dist_base_types; id++) {
+               if (last_name == dist_base_info_sorted[id].name) {
+                       pr_warn("Multiple distilled base types [%u], [%u] share name '%s'; cannot relocate with base BTF.\n",
+                               id - 1, id, last_name);
+                       err = -EINVAL;
+                       goto done;
+               }
+               last_name = dist_base_info_sorted[id].name;
+       }
+
        /* Mark distilled base struct/union members of split BTF structs/unions
         * in id_map with BTF_IS_EMBEDDED; this signals that these types
         * need to match both name and size, otherwise embedding the base
@@ -272,6 +286,21 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
                default:
                        continue;
                }
+               if (r->id_map[dist_name_info->id] &&
+                   r->id_map[dist_name_info->id] != BTF_IS_EMBEDDED) {
+                       /* we already have a match; this tells us that
+                        * multiple base types of the same name
+                        * have the same size, since for cases where
+                        * multiple types have the same name we match
+                        * on name and size.  In this case, we have
+                        * no way of determining which to relocate
+                        * to in base BTF, so error out.
+                        */
+                       pr_warn("distilled base BTF type '%s' [%u], size %u has multiple candidates of the same size (ids [%u, %u]) in base BTF\n",
+                               base_name_info.name, dist_name_info->id, base_t->size,
+                               id, r->id_map[dist_name_info->id]);
+                       err = -EINVAL;
+                       goto done;
+               }
                /* map id and name */
                r->id_map[dist_name_info->id] = id;
                r->str_map[dist_t->name_off] = base_t->name_off;


With this change, I then tried using "bpftool btf dump -B vmlinux file
$module" for each of the 2700-odd modules I force-generated .BTF.base
sections for, to see if these conditions ever get triggered in practice
(your BTF parsing changes allow us to test relocation this way).
They don't, it seems - all modules could relocate successfully with
vmlinux - which would suggest that, at least initially, it might not be
worth adding additional complexity to the algorithm to handle them, aside
from error checks like the above.

> Wdyt?
> 

My personal take is that it would suffice to error out in some of the
edge cases, but I'm open to other approaches too. Hopefully some of the
data above helps us understand the costs of this approach at least. Thanks!

Alan

[1] https://github.com/oracle/dtrace-utils

>> +	struct btf_name_info *dist_base_info_sorted;
>> +	struct btf_type *base_t, *dist_t, *split_t;
>> +	__u8 *base_name_cnt = NULL;
>> +	int err = 0;
>> +	__u32 id;
>> +
>> +	/* generate a sort index array of name/type ids sorted by name for
>> +	 * distilled base BTF to speed name-based lookups.
>> +	 */
>> +	dist_base_info_sorted = calloc(r->nr_dist_base_types, sizeof(*dist_base_info_sorted));
>> +	if (!dist_base_info_sorted) {
>> +		err = -ENOMEM;
>> +		goto done;
>> +	}
>> +	for (id = 0; id < r->nr_dist_base_types; id++) {
>> +		dist_t = btf_type_by_id(r->dist_base_btf, id);
>> +		dist_base_info_sorted[id].name = btf__name_by_offset(r->dist_base_btf,
>> +								     dist_t->name_off);
>> +		dist_base_info_sorted[id].id = id;
>> +		dist_base_info_sorted[id].size = dist_t->size;
>> +		dist_base_info_sorted[id].needs_size = true;
>> +	}
>> +	qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
>> +	      cmp_btf_name_size);
>> +
>> +	/* Mark distilled base struct/union members of split BTF structs/unions
>> +	 * in id_map with BTF_IS_EMBEDDED; this signals that these types
>> +	 * need to match both name and size, otherwise embedding the base
>> +	 * struct/union in the split type is invalid.
>> +	 */
>> +	for (id = r->nr_dist_base_types; id < r->nr_split_types; id++) {
>> +		split_t = btf_type_by_id(r->btf, id);
>> +		if (btf_is_composite(split_t)) {
>> +			err = btf_type_visit_type_ids(split_t, btf_mark_embedded_composite_type_ids,
>> +						      r);
>> +			if (err < 0)
>> +				goto done;
>> +		}
>> +	}
>> +
>> +	/* Collect name counts for composite types in base BTF.  If multiple
>> +	 * instances of a struct/union of the same name exist, we need to use
>> +	 * size to determine which to map to since name alone is ambiguous.
>> +	 */
>> +	base_name_cnt = calloc(r->base_str_len, sizeof(*base_name_cnt));
>> +	if (!base_name_cnt) {
>> +		err = -ENOMEM;
>> +		goto done;
>> +	}
>> +	for (id = 1; id < r->nr_base_types; id++) {
>> +		base_t = btf_type_by_id(r->base_btf, id);
>> +		if (!btf_is_composite(base_t) || !base_t->name_off)
>> +			continue;
>> +		if (base_name_cnt[base_t->name_off] < 255)
>> +			base_name_cnt[base_t->name_off]++;
>> +	}
>> +
>> +	/* Now search base BTF for matching distilled base BTF types. */
>> +	for (id = 1; id < r->nr_base_types; id++) {
>> +		struct btf_name_info *dist_name_info, base_name_info = {};
>> +		int dist_kind, base_kind;
>> +
>> +		base_t = btf_type_by_id(r->base_btf, id);
>> +		/* distilled base consists of named types only. */
>> +		if (!base_t->name_off)
>> +			continue;
>> +		base_kind = btf_kind(base_t);
>> +		base_name_info.id = id;
>> +		base_name_info.name = btf__name_by_offset(r->base_btf, base_t->name_off);
>> +		switch (base_kind) {
>> +		case BTF_KIND_INT:
>> +		case BTF_KIND_FLOAT:
>> +		case BTF_KIND_ENUM:
>> +		case BTF_KIND_ENUM64:
>> +			/* These types should match both name and size */
>> +			base_name_info.needs_size = true;
>> +			base_name_info.size = base_t->size;
>> +			break;
>> +		case BTF_KIND_FWD:
>> +			/* No size considerations for fwds. */
>> +			break;
>> +		case BTF_KIND_STRUCT:
>> +		case BTF_KIND_UNION:
>> +			/* Size only needs to be used for struct/union if there
>> +			 * are multiple types in base BTF with the same name.
>> +			 * If there are multiple _distilled_ types with the same
>> +			 * name (a very unlikely scenario), that doesn't matter
>> +			 * unless corresponding _base_ types to match them are
>> +			 * missing.
>> +			 */
>> +			base_name_info.needs_size = base_name_cnt[base_t->name_off] > 1;
>> +			base_name_info.size = base_t->size;
>> +			break;
>> +		default:
>> +			continue;
>> +		}
>> +		dist_name_info = bsearch(&base_name_info, dist_base_info_sorted,
>> +					 r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
>> +					 cmp_btf_name_size);
>> +		if (!dist_name_info)
>> +			continue;
>> +		if (!dist_name_info->id || dist_name_info->id > r->nr_dist_base_types) {
>> +			pr_warn("base BTF id [%d] maps to invalid distilled base BTF id [%d]\n",
>> +				id, dist_name_info->id);
>> +			err = -EINVAL;
>> +			goto done;
>> +		}
>> +		dist_t = btf_type_by_id(r->dist_base_btf, dist_name_info->id);
>> +		dist_kind = btf_kind(dist_t);
>> +
>> +		/* Validate that the found distilled type is compatible.
>> +		 * Do not error out on mismatch as another match may occur
>> +		 * for an identically-named type.
>> +		 */
>> +		switch (dist_kind) {
>> +		case BTF_KIND_FWD:
>> +			switch (base_kind) {
>> +			case BTF_KIND_FWD:
>> +				if (btf_kflag(dist_t) != btf_kflag(base_t))
>> +					continue;
>> +				break;
>> +			case BTF_KIND_STRUCT:
>> +				if (btf_kflag(base_t))
>> +					continue;
>> +				break;
>> +			case BTF_KIND_UNION:
>> +				if (!btf_kflag(base_t))
>> +					continue;
>> +				break;
>> +			default:
>> +				continue;
>> +			}
>> +			break;
>> +		case BTF_KIND_INT:
>> +			if (dist_kind != base_kind ||
>> +			    btf_int_encoding(base_t) != btf_int_encoding(dist_t))
>> +				continue;
>> +			break;
>> +		case BTF_KIND_FLOAT:
>> +			if (dist_kind != base_kind)
>> +				continue;
>> +			break;
>> +		case BTF_KIND_ENUM:
>> +			/* ENUM and ENUM64 are encoded as sized ENUM in
>> +			 * distilled base BTF.
>> +			 */
>> +			if (dist_kind != base_kind && base_kind != BTF_KIND_ENUM64)
>> +				continue;
>> +			break;
>> +		case BTF_KIND_STRUCT:
>> +		case BTF_KIND_UNION:
>> +			/* size verification is required for embedded
>> +			 * struct/unions.
>> +			 */
>> +			if (r->id_map[dist_name_info->id] == BTF_IS_EMBEDDED &&
>> +			    base_t->size != dist_t->size)
>> +				continue;
>> +			break;
>> +		default:
>> +			continue;
>> +		}
>> +		/* map id and name */
>> +		r->id_map[dist_name_info->id] = id;
>> +		r->str_map[dist_t->name_off] = base_t->name_off;
>> +	}
>> +	/* ensure all distilled BTF ids now have a mapping... */
>> +	for (id = 1; id < r->nr_dist_base_types; id++) {
>> +		const char *name;
>> +
>> +		if (r->id_map[id] && r->id_map[id] != BTF_IS_EMBEDDED)
>> +			continue;
>> +		dist_t = btf_type_by_id(r->dist_base_btf, id);
>> +		name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off);
>> +		pr_warn("distilled base BTF type '%s' [%d] is not mapped to base BTF id\n",
>> +			name, id);
>> +		err = -EINVAL;
>> +		break;
>> +	}
>> +done:
>> +	free(base_name_cnt);
>> +	free(dist_base_info_sorted);
>> +	return err;
>> +}
> 
> [...]
Andrii Nakryiko May 31, 2024, 6:21 p.m. UTC | #3
On Tue, May 28, 2024 at 5:26 AM Alan Maguire <alan.maguire@oracle.com> wrote:
>
> Map distilled base BTF type ids referenced in split BTF and their
> references to the base BTF passed in, and if the mapping succeeds,
> reparent the split BTF to the base BTF.
>
> Relocation is done by first verifying that distilled base BTF
> only consists of named INT, FLOAT, ENUM, FWD, STRUCT and
> UNION kinds; then we sort these to speed lookups.  Once sorted,
> the base BTF is iterated, and for each relevant kind we check
> for an equivalent in distilled base BTF.  When found, the
> mapping from distilled -> base BTF id and string offset is recorded.
> In establishing mappings, we need to ensure we check STRUCT/UNION
> size when the STRUCT/UNION is embedded in a split BTF STRUCT/UNION,
> and when duplicate names exist for the same STRUCT/UNION.  Otherwise
> size is ignored in matching STRUCT/UNIONs.
>
> Once all mappings are established, we can update type ids
> and string offsets in split BTF and reparent it to the new base.
>
> Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
> ---
>  tools/lib/bpf/Build             |   2 +-
>  tools/lib/bpf/btf.c             |  17 ++
>  tools/lib/bpf/btf.h             |  14 ++
>  tools/lib/bpf/btf_relocate.c    | 430 ++++++++++++++++++++++++++++++++
>  tools/lib/bpf/libbpf.map        |   1 +
>  tools/lib/bpf/libbpf_internal.h |   3 +
>  6 files changed, 466 insertions(+), 1 deletion(-)
>  create mode 100644 tools/lib/bpf/btf_relocate.c
>

[...]

> +/* Set temporarily in relocation id_map if distilled base struct/union is
> + * embedded in a split BTF struct/union; in such a case, size information must
> + * match between distilled base BTF and base BTF representation of type.
> + */
> +#define BTF_IS_EMBEDDED ((__u32)-1)
> +
> +/* <name, size, id> triple used in sorting/searching distilled base BTF. */
> +struct btf_name_info {
> +       const char *name;
> +       int size:31;
> +       /* set when search requires a size match */
> +       bool needs_size;

this was meant to be a 1-bit field, right? `: 1` is missing?

> +       __u32 id;
> +};
> +
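
(presumably something like the following was intended - shown only to
illustrate the suggestion, not taken from the patch:)

struct btf_name_info {
	const char *name;
	int size: 31;
	/* set when search requires a size match */
	bool needs_size: 1;
	__u32 id;
};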

[...]

> +               case BTF_KIND_INT:
> +                       if (dist_kind != base_kind ||
> +                           btf_int_encoding(base_t) != btf_int_encoding(dist_t))
> +                               continue;
> +                       break;
> +               case BTF_KIND_FLOAT:
> +                       if (dist_kind != base_kind)
> +                               continue;
> +                       break;
> +               case BTF_KIND_ENUM:
> +                       /* ENUM and ENUM64 are encoded as sized ENUM in
> +                        * distilled base BTF.
> +                        */
> +                       if (dist_kind != base_kind && base_kind != BTF_KIND_ENUM64)
> +                               continue;

probably unnecessarily strict, I'd check something like

if (!btf_is_enum_any(dist_t) || !btf_is_enum_any(base_t) ||
    dist_t->size != base_t->size)
	continue;

it's minor, unlikely to matter in practice

> +                       break;
> +               case BTF_KIND_STRUCT:
> +               case BTF_KIND_UNION:
> +                       /* size verification is required for embedded
> +                        * struct/unions.
> +                        */
> +                       if (r->id_map[dist_name_info->id] == BTF_IS_EMBEDDED &&
> +                           base_t->size != dist_t->size)
> +                               continue;
> +                       break;
> +               default:
> +                       continue;
> +               }
> +               /* map id and name */
> +               r->id_map[dist_name_info->id] = id;
> +               r->str_map[dist_t->name_off] = base_t->name_off;
> +       }

[...]
Andrii Nakryiko May 31, 2024, 6:44 p.m. UTC | #4
On Fri, May 31, 2024 at 8:39 AM Alan Maguire <alan.maguire@oracle.com> wrote:
>
> On 31/05/2024 03:22, Eduard Zingerman wrote:
> > On Tue, 2024-05-28 at 13:24 +0100, Alan Maguire wrote:
> >
> > [...]
> >
> >> +/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate
> >> + * through base BTF looking up distilled type (using binary search) equivalents.
> >> + */
> >> +static int btf_relocate_map_distilled_base(struct btf_relocate *r)
> >> +{
> >
> > I have several observations about this algorithm.
> >
> > The algorithm does Base.Cnt * log2(Dist.Cnt) binary searches.
> > However, it might be better to switch searches around
> > and look for distilled {name,size} pairs in base btf,
> > doing Dist.Cnt * log2(Base.cnt) searches instead.
> > Suppose Base.Cnt = 2**20 and Dist.Cnt = 2**10, in such case:
> >   - Base.Cnt * log2(Dist.Cnt) = 2**20 * 10
> >   - Dist.Cnt * log2(Base.cnt) = 2**10 * 20, which is smaller
> >
>
> Hi Eduard,
>
> I crunched some numbers on base, distilled base BTF to try and flesh
> this out a bit more.
>

Wow, you guys really went hardcore here with counting...

Eduard, as Alan mentioned, you ignored both CPU and *memory*
requirements for sorting a large base number of types. If N is base
type count, M is distilled type count (note, M << N), then

  - with Alan's current approach we have
    - O(MlogM + N*logM) = O((M+N)logM) for sorting/search
    - O(M) memory for index

  - with your proposal
    - O(NlogN + M*logN) = O((M+N)logN) for sorting/search
    - O(N) memory for index.

Just on memory usage it's clear that the current approach wins
significantly, and that is reason enough to prefer it. But even on
overall CPU usage we have (N+M)logM < (N+M)logN, which
(asymptotically) is still better.

But I think the memory argument in this case wins; we should avoid
allocating 1MB of extra memory to shave off 1ms (if at all) of kernel
load time, IMO.

[...]

>
> So the overall relocation time - from 11 distilled types in hid_ite to
> 610 for cxgb4 - is within a range from 4.5msec (4432193ns above) to
> 8msec. The times for relocation represent less than 50% of overall
> module load times - the later vary from 11-18msec across these modules.
> It would be great to find some performance wins here, but I don't
> _think_ swapping the sort/search targets will buy us much unfortunately.
>

As long as it's NlogN-ish (mix N/M here), I think it's fine. It's
O(N*M) that would be horrible.

> > The algorithm might not handle name duplicates in the distilled BTF well,
> > e.g. in theory, the following is a valid C code
> >
> >   struct foo { int f; }; // sizeof(struct foo) == 4
> >   typedef int foo;       // sizeof(foo) == 4
> >

I think only typedef has its own "namespace", right? And typedefs are
not in distilled base. All other types share the same namespace (so at
least we don't have to support two separate namespaces).

But you are right, it's C, and across multiple compile units one can
technically have two different kinds with the same name co-existing
overall in vmlinux BTF.

But instead of doing bsearch_unique(), I propose we implement
lower-bound-like binary search, which will find the first instance of
a given name, and then we iterate linearly across all duplicates,
ignoring incompatible kinds. If we do find two possible matches (when
taking size/kind into account), then we should error out due to
ambiguity.

Alan, please see find_linfo() in kernel/bpf/log.c. You basically need
exactly that implementation, except you'd be using strcmp() <= 0
condition to find the index. After that just iterate starting from
that index and do strcmp() == 0, marking all matches. Check number of
matches (0 -- bad, 1 -- great, >1 -- bad).
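
For illustration, a rough sketch of that lower-bound + linear-scan shape
(hypothetical helper; find_linfo() in kernel/bpf/log.c is the pattern
being referenced, and the caller would still check kind/size compatibility
on each candidate):

#include <string.h>

/* Count entries in the name-sorted array whose name equals 'name',
 * starting the linear walk from the lower bound.
 */
static int count_name_matches(const struct btf_name_info *sorted, __u32 nr,
			      const char *name)
{
	__u32 lo = 0, hi = nr, mid;
	int matches = 0;

	/* lower bound: first index with sorted[mid].name >= name */
	while (lo < hi) {
		mid = lo + (hi - lo) / 2;
		if (strcmp(sorted[mid].name, name) < 0)
			lo = mid + 1;
		else
			hi = mid;
	}
	for (; lo < nr && strcmp(sorted[lo].name, name) == 0; lo++)
		matches++;

	return matches; /* 0 -- bad, 1 -- great, >1 -- ambiguous */
}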

Eduard, Alan, makes sense or did I miss anything?

> > Suppose that these types are a part of the distilled BTF.
> > Depending on which one would end up first in 'dist_base_info_sorted'
> > bsearch might fail to find one or the other.
> >
>
> In the case of distilled base BTF, only struct, union, enum, enum64,
> int, float and fwd can be present. Size matches would have to be between
> one of these kinds I think, but are still possible nevertheless.
>

+1

> > Also, algorithm does not report an error if there are several
> > types with the same name and size in the base BTF.
> >
>

[...]

>
>
> struct elf_thread_core_info;
>
> struct elf_note_info {
>         struct elf_thread_core_info *thread;
>         struct memelfnote psinfo;
>         struct memelfnote signote;
>         struct memelfnote auxv;
>         struct memelfnote files;
>         siginfo_t csigdata;
>         size_t size;
>         int thread_notes;
> };
>
> struct elf_thread_core_info___2;
>
> struct elf_note_info___2 {
>         struct elf_thread_core_info___2 *thread;
>         struct memelfnote psinfo;
>         struct memelfnote signote;
>         struct memelfnote auxv;
>         struct memelfnote files;
>         compat_siginfo_t csigdata;
>         size_t size;
>         int thread_notes;
> };
>
> Both of these share self-reference, either directly or indirectly so
> maybe it's a corner-case of dedup we're missing. I'll dig into these later.

I think it's some differing type deeper in the
elf_thread_core_info/elf_thread_core_info___2 (and note that this is
not a self-reference, it's a different type).

I don't think it's a bug in the dedup algo; this is a known case due to
some type that is supposed to be equivalent actually not being such.

>
> > I suggest to modify the algorithm as follows:
> > - let 'base_info_sorted' be a set of tuples {kind,name,size,id}
> >   corresponding to the base BTF, sorted by kind, name and size;
>
> That was my first thought, but we can't always search by kind; for
> example it's possible the distilled base has a fwd and vmlinux only has
> a struct kind for the same type name; in such a case we'd want to
> support a match provided the fwd's kflag indicated a struct fwd.

yep, makes sense (though highly unlikely)

>
> In fact looking at the code we're missing logic for the opposite
> condition (fwd only in base, struct in distilled base). I'll fix that.

I'd error out on this; it looks like a weird, nonsensical case (but
technically it can happen with BTF, of course).

>
> The other case is an enum in distilled base matching an enum64
> or an enum.

I mentioned that in a previous email: we should just ignore the enum
vs enum64 difference as long as their sizes match.

>
> > - add a custom utility bsearch_unique, that behaves like bsearch,
> >   but returns NULL if entry is non-unique with regards to current
> >   predicate (e.g. use bsearch but also check neighbors);
> > - for each type D in the distilled base:
> >   - use bsearch_unique to find entry E in 'base_info_sorted'
> >     that matches D.{kind,name,size} sub-tuple;
> >   - if E exists, set id_map[D] := E.id;
> >   - if E does not exist:
> >     - if id_map[D] == BTF_IS_EMBEDDED, report an error;
> >     - if id_map[D] != BTF_IS_EMBEDDED:
> >       - use bsearch_unique to find entry E in 'base_info_sorted'
> >         that matches D.{kind,name} sub-tuple;
> >       - if E exists, set id_map[D] := E.id;
> >       - otherwise, report an error.
> >
> > This allows to:
> > - flip the search order, potentially gaining some speed;
> > - drop the 'base_name_cnt' array and logic;
> > - handle the above hypothetical name conflict example.
> >
>
> I think flipping the search order could gain search speed, but only at
> the expense of slowing things down overall due to the extra cost of
> having to sort so many more elements. I suspect it will mostly be a
> wash, though numbers above seem to suggest sorting distilled base may
> have an edge when we consider both search and sort. The question is
> probably which sort/search order is most amenable to handling the data
> and helping us deal with the edge cases like duplicates.
>
> With the existing scheme, I think catching cases of name duplicates in
> distilled base BTF and name/size duplicates in base BTF for types we
> want to relocate from distilled base BTF and erroring out would suffice;
> basically the following applied to this patch (patch 3 in the series)
>
> diff --git a/tools/lib/bpf/btf_relocate.c b/tools/lib/bpf/btf_relocate.c
> index f2e91cdfb5cc..4e282ee8f183 100644
> --- a/tools/lib/bpf/btf_relocate.c
> +++ b/tools/lib/bpf/btf_relocate.c
> @@ -113,6 +113,7 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>  {
>         struct btf_name_info *dist_base_info_sorted;
>         struct btf_type *base_t, *dist_t, *split_t;
> +       const char *last_name = NULL;
>         __u8 *base_name_cnt = NULL;
>         int err = 0;
>         __u32 id;
> @@ -136,6 +137,19 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>         qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
>               cmp_btf_name_size);
>
> +       /* It is possible - though highly unlikely - that duplicate-named types
> +        * end up in distilled base BTF; error out if this is the case.
> +        */
> +        */
> +       for (id = 1; id < r->nr_dist_base_types; id++) {
> +               if (last_name == dist_base_info_sorted[id].name) {

technically this a) has to take into account kind and b) even with the
same kind is legal

On some older kernel version we'd have that with ring_buffer, for
example. There were two completely independent struct ring_buffer
types.

I thought we would do all the relocation-time size check shenanigans
to resolve all this. I might have missed some nuance, though.

> +                       pr_warn("Multiple distilled base types [%u], [%u] share name '%s'; cannot relocate with base BTF.\n",
> +                               id - 1, id, last_name);
> +                       err = -EINVAL;
> +                       goto done;
> +               }
> +               last_name = dist_base_info_sorted[id].name;
> +       }
> +

[...]

> > Wdyt?
> >
>
> My personal take is that it would suffice to error out in some of the
> edge cases, but I'm open to other approaches too. Hopefully some of the
> data above helps us understand the costs of this approach at least. Thanks!
>

I wouldn't change the overall relocation algorithm, but better
detection of ambiguities and reporting would definitely be useful.

> Alan
>
> [1] https://github.com/oracle/dtrace-utils
>
> >> +    struct btf_name_info *dist_base_info_sorted;
> >> +    struct btf_type *base_t, *dist_t, *split_t;
> >> +    __u8 *base_name_cnt = NULL;
> >> +    int err = 0;
> >> +    __u32 id;
> >> +

[...]

Please try to trim larger chunks of code/discussion that are not
relevant anymore.
Eduard Zingerman June 1, 2024, 7:56 a.m. UTC | #5
On Fri, 2024-05-31 at 16:38 +0100, Alan Maguire wrote:

Hi Alan,

> The in-kernel sort reports n*log2(n) + 0.37*n + o(n) comparisons on
> average; for base BTF that means sorting requires at least
> 
> Base: 14307*14+0.37*14307	= 205592 comparisons
> Dist: 600*9+0.37*600		= 5622 comparisons
> 
> So we get an inversion of the above results, with (unless I'm
> miscalculating something), sorting distilled base BTF requiring less
> comparisons overall across both sort+search.
> 
> Sort Comparisons		Search comparisons		Total
> ======================================================================	
> 5622	(distilled)		128763	(to base)		134385
> 205592	(base)			8400	(to distilled)		213992

It was absolutely stupid of me to not include the base sort cost into
the calculations, really embarrassing. Thank you for pointing this out
and my apologies for suggesting such nonsense.

[...]

> > The algorithm might not handle name duplicates in the distilled BTF well,
> > e.g. in theory, the following is a valid C code
> > 
> >   struct foo { int f; }; // sizeof(struct foo) == 4
> >   typedef int foo;       // sizeof(foo) == 4
> > 
> > Suppose that these types are a part of the distilled BTF.
> > Depending on which one would end up first in 'dist_base_info_sorted'
> > bsearch might fail to find one or the other.
> 
> In the case of distilled base BTF, only struct, union, enum, enum64,
> int, float and fwd can be present. Size matches would have to be between
> one of these kinds I think, but are still possible nevertheless.

As Andrii noted in a sibling reply, there is still a slim possibility
for name duplicates in the distilled base. Imo, if we can catch the
corner case we should.

> > Also, algorithm does not report an error if there are several
> > types with the same name and size in the base BTF.
> 
> Yep, while we have to handle this, it only becomes an ambiguity problem
> if distilled base BTF refers to one of such types. On my vmlinux I see
> the following duplicate name/size STRUCTs

As you noted, this situation is really easy to catch by checking if the
id_map slot is already occupied, so it should be checked.

[...]

> struct elf_thread_core_info___2;
> 
> struct elf_note_info___2 {
>         struct elf_thread_core_info___2 *thread;
>         struct memelfnote psinfo;
>         struct memelfnote signote;
>         struct memelfnote auxv;
>         struct memelfnote files;
>         compat_siginfo_t csigdata;
>         size_t size;
>         int thread_notes;
> };
> 
> Both of these share self-reference, either directly or indirectly so
> maybe it's a corner-case of dedup we're missing. I'll dig into these later.

This is interesting indeed.

> > I suggest to modify the algorithm as follows:
> > - let 'base_info_sorted' be a set of tuples {kind,name,size,id}
> >   corresponding to the base BTF, sorted by kind, name and size;
> 
> That was my first thought, but we can't always search by kind; for
> example it's possible the distilled base has a fwd and vmlinux only has
> a struct kind for the same type name; in such a case we'd want to
> support a match provided the fwd's kflag indicated a struct fwd.
> 
> In fact looking at the code we're missing logic for the opposite
> condition (fwd only in base, struct in distilled base). I'll fix that.
> 
> The other case is an enum in distilled base matching an enum64
> or an enum.

I think it could be possible to do some kinds normalization
(e.g. represent fwd's as zero sized structs or unions in
btf_name_info).
I'll try to implement this and get back to you on Monday.
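
(For illustration, roughly what that normalization might look like - just
a sketch, and it assumes btf_name_info gains a 'kind' member:)

static void btf_name_info_set_kind(struct btf_name_info *info,
				   const struct btf_type *t)
{
	int kind = btf_kind(t);

	switch (kind) {
	case BTF_KIND_FWD:
		/* treat fwd as a zero-sized struct/union; kflag says which */
		info->kind = btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT;
		info->size = 0;
		break;
	case BTF_KIND_ENUM64:
		/* fold enum64 into enum so both can match by name/size */
		info->kind = BTF_KIND_ENUM;
		info->size = t->size;
		break;
	default:
		info->kind = kind;
		info->size = t->size;
		break;
	}
}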

[...]

> I think flipping the search order could gain search speed, but only at
> the expense of slowing things down overall due to the extra cost of
> having to sort so many more elements. I suspect it will mostly be a
> wash, though numbers above seem to suggest sorting distilled base may
> have an edge when we consider both search and sort. The question is
> probably which sort/search order is most amenable to handling the data
> and helping us deal with the edge cases like duplicates.

Yes, you are absolutely correct.

[...]

> @@ -136,6 +137,19 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>         qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
>               cmp_btf_name_size);
> 
> +       /* It is possible - though highly unlikely - that duplicate-named types
> +        * end up in distilled base BTF; error out if this is the case.
> +        */
> +       for (id = 1; id < r->nr_dist_base_types; id++) {
> +               if (last_name == dist_base_info_sorted[id].name) {
> +                       pr_warn("Multiple distilled base types [%u], [%u] share name '%s'; cannot relocate with base BTF.\n",
> +                               id - 1, id, last_name);
> +                       err = -EINVAL;
> +                       goto done;
> +               }
> +               last_name = dist_base_info_sorted[id].name;
> +       }
> +

Nit: this rejects a case when both distilled types are embedded and a
     counterpart for each could be found in base. But that's a bit
     inconvenient to check for in the current framework. Probably not
     important.

>         /* Mark distilled base struct/union members of split BTF structs/unions
>          * in id_map with BTF_IS_EMBEDDED; this signals that these types
>          * need to match both name and size, otherwise embedding the base
> @@ -272,6 +286,21 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>                 default:
>                         continue;
>                 }
> +               if (r->id_map[dist_name_info->id] &&
> +                   r->id_map[dist_name_info->id] != BTF_IS_EMBEDDED) {
> +                       /* we already have a match; this tells us that
> +                        * multiple base types of the same name
> +                        * have the same size, since for cases where
> +                        * multiple types have the same name we match
> +                        * on name and size.  In this case, we have
> +                        * no way of determining which to relocate
> +                        * to in base BTF, so error out.
> +                        */
> +                       pr_warn("distilled base BTF type '%s' [%u], size %u has multiple candidates of the same size (ids [%u, %u]) in base BTF\n",
> +                               base_name_info.name, dist_name_info->id, base_t->size,
> +                               id, r->id_map[dist_name_info->id]);
> +                       err = -EINVAL;
> +                       goto done;
> +               }

I think this hunk should be added.

[...]

Best regards,
Eduard
Alan Maguire June 2, 2024, 1:47 p.m. UTC | #6
On 01/06/2024 08:56, Eduard Zingerman wrote:
> On Fri, 2024-05-31 at 16:38 +0100, Alan Maguire wrote:
> 
> Hi Alan,
> 
>> The in-kernel sort reports n*log2(n) + 0.37*n + o(n) comparisons on
>> average; for base BTF that means sorting requires at least
>>
>> Base: 14307*14+0.37*14307	= 205592 comparisons
>> Dist: 600*9+0.37*600		= 5622 comparisons
>>
>> So we get an inversion of the above results, with (unless I'm
>> miscalculating something), sorting distilled base BTF requiring less
>> comparisons overall across both sort+search.
>>
>> Sort Comparisons		Search comparisons		Total
>> ======================================================================	
>> 5622	(distilled)		128763	(to base)		134385
>> 205592	(base)			8400	(to distilled)		213992
> 
> It was absolutely stupid of me to not include the base sort cost into
> the calculations, really embarrassing. Thank you for pointing this out
> and my apologies for suggesting such nonsense.
>

No apologies necessary! Your feedback was a great reminder to actually
check the relative overheads properly (which I had neglected to do in
depth), and it just turns out in this case that the relative proportions
between base and distilled base make sorting the distilled base the
right approach. So this was absolutely a valid theoretical question, and
I'm very glad you asked it.
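As a side note, the estimates quoted above can be reproduced with a small standalone calculation. This is a sketch only, not libbpf code; it uses exact log2() where the figures above round to 9 and 14, so its totals differ slightly from the table:

#include <math.h>
#include <stdio.h>

/* Estimate comparisons: the kernel sort averages n*log2(n) + 0.37*n, and each
 * of the m binary searches into the sorted array costs roughly log2(n).
 */
static double est_comparisons(double n_sorted, double m_searched)
{
	return n_sorted * log2(n_sorted) + 0.37 * n_sorted +
	       m_searched * log2(n_sorted);
}

int main(void)
{
	/* 600 distilled base types vs 14307 named base types, as above */
	printf("sort distilled, search from base:  %.0f\n",
	       est_comparisons(600, 14307));
	printf("sort base, search from distilled:  %.0f\n",
	       est_comparisons(14307, 600));
	return 0;	/* build with: cc est.c -lm */
}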

> [...]
> 
>>> The algorithm might not handle name duplicates in the distilled BTF well,
>>> e.g. in theory, the following is a valid C code
>>>
>>>   struct foo { int f; }; // sizeof(struct foo) == 4
>>>   typedef int foo;       // sizeof(foo) == 4
>>>
>>> Suppose that these types are a part of the distilled BTF.
>>> Depending on which one would end up first in 'dist_base_info_sorted'
>>> bsearch might fail to find one or the other.
>>
>> In the case of distilled base BTF, only struct, union, enum, enum64,
>> int, float and fwd can be present. Size matches would have to be between
>> one of these kinds I think, but are still possible nevertheless.
> 
> As Andrii noted in a sibling reply, there is still a slim possibility
> for name duplicates in the distilled base. Imo, if we can catch the
> corner case we should.
> 
>>> Also, algorithm does not report an error if there are several
>>> types with the same name and size in the base BTF.
>>
>> Yep, while we have to handle this, it only becomes an ambiguity problem
>> if distilled base BTF refers to one of such types. On my vmlinux I see
>> the following duplicate name/size STRUCTs
> 
> As you noted, this situation is really easy to catch by checking if
> id_map slot is already occupied, so it should be checked.
> 
> [...]
> 
>> struct elf_thread_core_info___2;
>>
>> struct elf_note_info___2 {
>>         struct elf_thread_core_info___2 *thread;
>>         struct memelfnote psinfo;
>>         struct memelfnote signote;
>>         struct memelfnote auxv;
>>         struct memelfnote files;
>>         compat_siginfo_t csigdata;
>>         size_t size;
>>         int thread_notes;
>> };
>>
>> Both of these share self-reference, either directly or indirectly so
>> maybe it's a corner-case of dedup we're missing. I'll dig into these later.
> 
> This is interesting indeed.
> 
>>> I suggest to modify the algorithm as follows:
>>> - let 'base_info_sorted' be a set of tuples {kind,name,size,id}
>>>   corresponding to the base BTF, sorted by kind, name and size;
>>
>> That was my first thought, but we can't always search by kind; for
>> example it's possible the distilled base has a fwd and vmlinux only has
>> a struct kind for the same type name; in such a case we'd want to
>> support a match provided the fwd's kflag indicated a struct fwd.
>>
>> In fact looking at the code we're missing logic for the opposite
>> condition (fwd only in base, struct in distilled base). I'll fix that.
>>
>> The other case is an enum in distilled base matching an enum64
>> or an enum.
> 
> I think it could be possible to do some normalization of kinds
> (e.g. represent fwds as zero-sized structs or unions in
> btf_name_info).
> I'll try to implement this and get back to you on Monday.
> 
> [...]
> 
>> I think flipping the search order could gain search speed, but only at
>> the expense of slowing things down overall due to the extra cost of
>> having to sort so many more elements. I suspect it will mostly be a
>> wash, though numbers above seem to suggest sorting distilled base may
>> have an edge when we consider both search and sort. The question is
>> probably which sort/search order is most amenable to handling the data
>> and helping us deal with the edge cases like duplicates.
> 
> Yes, you are absolutely correct.
> 
> [...]
> 
>> @@ -136,6 +137,19 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>>         qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
>>               cmp_btf_name_size);
>>
>> +       /* It is possible - though highly unlikely - that duplicate-named types
>> +        * end up in distilled base BTF; error out if this is the case.
>> +        */
>> +       for (id = 1; id < r->nr_dist_base_types; id++) {
>> +               if (last_name == dist_base_info_sorted[id].name) {
>> +                       pr_warn("Multiple distilled base types [%u], [%u] share name '%s'; cannot relocate with base BTF.\n",
>> +                               id - 1, id, last_name);
>> +                       err = -EINVAL;
>> +                       goto done;
>> +               }
>> +               last_name = dist_base_info_sorted[id].name;
>> +       }
>> +
> 
> Nit: this rejects a case when both distilled types are embedded and a
>      counterpart for each could be found in base. But that's a bit
>      inconvenient to check for in the current framework. Probably not
>      important.
> 
>>         /* Mark distilled base struct/union members of split BTF structs/unions
>>          * in id_map with BTF_IS_EMBEDDED; this signals that these types
>>          * need to match both name and size, otherwise embedding the base
>> @@ -272,6 +286,21 @@ static int btf_relocate_map_distilled_base(struct btf_relocate *r)
>>                 default:
>>                         continue;
>>                 }
>> +               if (r->id_map[dist_name_info->id] &&
>> +                   r->id_map[dist_name_info->id] != BTF_IS_EMBEDDED) {
>> +                       /* we already have a match; this tells us that
>> +                        * multiple base types of the same name
>> +                        * have the same size, since for cases where
>> +                        * multiple types have the same name we match
>> +                        * on name and size.  In this case, we have
>> +                        * no way of determining which to relocate
>> +                        * to in base BTF, so error out.
>> +                        */
>> +                       pr_warn("distilled base BTF type '%s' [%u], size %u has multiple candidates of the same size (ids [%u, %u]) in base BTF\n",
>> +                               base_name_info.name, dist_name_info->id, base_t->size,
>> +                               id, r->id_map[dist_name_info->id]);
>> +                       err = -EINVAL;
>> +                       goto done;
>> +               }
> 
> I think this hunk should be added.
> 

Sure, will do! Thanks again!


Alan
> [...]
> 
> Best regards,
> Eduard
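For reference, a typical caller of the new API might look as follows. This is a sketch only, not taken from the patch or its selftests; btf__parse(), btf__parse_split(), btf__load_vmlinux_btf() and btf__free() are existing libbpf APIs, the function name and paths are illustrative:

#include <errno.h>
#include <bpf/btf.h>

/* Sketch: split BTF was generated against a distilled base; relocate it so it
 * can be used with the running kernel's vmlinux BTF.  A real caller would keep
 * 'split' (and its new base 'vmlinux') alive for later use; everything is
 * freed here because this only demonstrates the call sequence.
 */
static int relocate_split_btf(const char *dist_base_path, const char *split_path)
{
	struct btf *dist_base, *split = NULL, *vmlinux = NULL;
	int err = 0;

	dist_base = btf__parse(dist_base_path, NULL);
	if (!dist_base)
		return -errno;
	/* split BTF still references the distilled base it was built against */
	split = btf__parse_split(split_path, dist_base);
	if (!split) {
		err = -errno;
		goto out;
	}
	vmlinux = btf__load_vmlinux_btf();
	if (!vmlinux) {
		err = -errno;
		goto out;
	}
	/* re-parent split BTF from the distilled base onto vmlinux BTF */
	err = btf__relocate(split, vmlinux);
out:
	btf__free(vmlinux);
	btf__free(split);
	btf__free(dist_base);
	return err;
}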
diff mbox series

Patch

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index b6619199a706..336da6844d42 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1,4 +1,4 @@ 
 libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
 	    netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \
 	    btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \
-	    usdt.o zip.o elf.o features.o
+	    usdt.o zip.o elf.o features.o btf_relocate.o
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 9f68268e659a..cb762d7a5dd7 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -5502,3 +5502,20 @@  int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf,
 
 	return 0;
 }
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+	return btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+	btf->base_btf = (struct btf *)base_btf;
+	btf->start_id = btf__type_cnt(base_btf);
+	btf->start_str_off = base_btf->hdr->str_len;
+}
+
+int btf__relocate(struct btf *btf, const struct btf *base_btf)
+{
+	return libbpf_err(btf_relocate(btf, base_btf, NULL));
+}
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index cb08ee9a5a10..8a93120b7385 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -252,6 +252,20 @@  struct btf_dedup_opts {
 
 LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
 
+/**
+ * @brief **btf__relocate()** will check the split BTF *btf* for references
+ * to base BTF kinds, and verify those references are compatible with
+ * *base_btf*; if they are, *btf* is adjusted such that it is re-parented to
+ * *base_btf* and type ids and strings are adjusted to accommodate this.
+ *
+ * If successful, 0 is returned and *btf* now has *base_btf* as its
+ * base.
+ *
+ * A negative value is returned on error and the thread-local `errno` variable
+ * is set to the error code as well.
+ */
+LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf);
+
 struct btf_dump;
 
 struct btf_dump_opts {
diff --git a/tools/lib/bpf/btf_relocate.c b/tools/lib/bpf/btf_relocate.c
new file mode 100644
index 000000000000..f2e91cdfb5cc
--- /dev/null
+++ b/tools/lib/bpf/btf_relocate.c
@@ -0,0 +1,430 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024, Oracle and/or its affiliates. */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "btf.h"
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_internal.h"
+
+struct btf;
+
+struct btf_relocate {
+	struct btf *btf;
+	const struct btf *base_btf;
+	const struct btf *dist_base_btf;
+	unsigned int nr_base_types;
+	unsigned int nr_split_types;
+	unsigned int nr_dist_base_types;
+	int dist_str_len;
+	int base_str_len;
+	__u32 *id_map;
+	__u32 *str_map;
+};
+
+/* Set temporarily in relocation id_map if distilled base struct/union is
+ * embedded in a split BTF struct/union; in such a case, size information must
+ * match between distilled base BTF and base BTF representation of type.
+ */
+#define BTF_IS_EMBEDDED ((__u32)-1)
+
+/* <name, size, id> triple used in sorting/searching distilled base BTF. */
+struct btf_name_info {
+	const char *name;
+	int size:31;
+	/* set when search requires a size match */
+	bool needs_size;
+	__u32 id;
+};
+
+static int btf_relocate_rewrite_type_id(__u32 *id, void *ctx)
+{
+	struct btf_relocate *r = ctx;
+
+	*id = r->id_map[*id];
+	return 0;
+}
+
+/* Simple string comparison used for sorting within BTF, since all distilled
+ * types are named.  If strings match and both elements require a size match,
+ * fall back to using size for ordering.
+ */
+static int cmp_btf_name_size(const void *n1, const void *n2)
+{
+	const struct btf_name_info *ni1 = n1;
+	const struct btf_name_info *ni2 = n2;
+	int name_diff = strcmp(ni1->name, ni2->name);
+
+	if (!name_diff && ni1->needs_size && ni2->needs_size)
+		return ni2->size - ni1->size;
+	return name_diff;
+}
+
+/* If a member of a split BTF struct/union refers to a base BTF
+ * struct/union, mark that struct/union id temporarily in the id_map
+ * with BTF_IS_EMBEDDED.  Members can be const/restrict/volatile/typedef
+ * reference types, but if a pointer is encountered, the type is no longer
+ * considered embedded.
+ */
+static int btf_mark_embedded_composite_type_ids(__u32 *id, void *ctx)
+{
+	struct btf_relocate *r = ctx;
+	struct btf_type *t;
+	__u32 next_id = *id;
+
+	while (true) {
+		if (next_id == 0)
+			return 0;
+		t = btf_type_by_id(r->btf, next_id);
+		switch (btf_kind(t)) {
+		case BTF_KIND_CONST:
+		case BTF_KIND_RESTRICT:
+		case BTF_KIND_VOLATILE:
+		case BTF_KIND_TYPEDEF:
+		case BTF_KIND_TYPE_TAG:
+			next_id = t->type;
+			break;
+		case BTF_KIND_ARRAY: {
+			struct btf_array *a = btf_array(t);
+
+			next_id = a->type;
+			break;
+		}
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION:
+			if (next_id < r->nr_dist_base_types)
+				r->id_map[next_id] = BTF_IS_EMBEDDED;
+			return 0;
+		default:
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate
+ * through base BTF looking up distilled type (using binary search) equivalents.
+ */
+static int btf_relocate_map_distilled_base(struct btf_relocate *r)
+{
+	struct btf_name_info *dist_base_info_sorted;
+	struct btf_type *base_t, *dist_t, *split_t;
+	__u8 *base_name_cnt = NULL;
+	int err = 0;
+	__u32 id;
+
+	/* generate a sort index array of name/type ids sorted by name for
+	 * distilled base BTF to speed name-based lookups.
+	 */
+	dist_base_info_sorted = calloc(r->nr_dist_base_types, sizeof(*dist_base_info_sorted));
+	if (!dist_base_info_sorted) {
+		err = -ENOMEM;
+		goto done;
+	}
+	for (id = 0; id < r->nr_dist_base_types; id++) {
+		dist_t = btf_type_by_id(r->dist_base_btf, id);
+		dist_base_info_sorted[id].name = btf__name_by_offset(r->dist_base_btf,
+								     dist_t->name_off);
+		dist_base_info_sorted[id].id = id;
+		dist_base_info_sorted[id].size = dist_t->size;
+		dist_base_info_sorted[id].needs_size = true;
+	}
+	qsort(dist_base_info_sorted, r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
+	      cmp_btf_name_size);
+
+	/* Mark distilled base struct/union members of split BTF structs/unions
+	 * in id_map with BTF_IS_EMBEDDED; this signals that these types
+	 * need to match both name and size, otherwise embedding the base
+	 * struct/union in the split type is invalid.
+	 */
+	for (id = r->nr_dist_base_types; id < r->nr_dist_base_types + r->nr_split_types; id++) {
+		split_t = btf_type_by_id(r->btf, id);
+		if (btf_is_composite(split_t)) {
+			err = btf_type_visit_type_ids(split_t, btf_mark_embedded_composite_type_ids,
+						      r);
+			if (err < 0)
+				goto done;
+		}
+	}
+
+	/* Collect name counts for composite types in base BTF.  If multiple
+	 * instances of a struct/union of the same name exist, we need to use
+	 * size to determine which to map to since name alone is ambiguous.
+	 */
+	base_name_cnt = calloc(r->base_str_len, sizeof(*base_name_cnt));
+	if (!base_name_cnt) {
+		err = -ENOMEM;
+		goto done;
+	}
+	for (id = 1; id < r->nr_base_types; id++) {
+		base_t = btf_type_by_id(r->base_btf, id);
+		if (!btf_is_composite(base_t) || !base_t->name_off)
+			continue;
+		if (base_name_cnt[base_t->name_off] < 255)
+			base_name_cnt[base_t->name_off]++;
+	}
+
+	/* Now search base BTF for matching distilled base BTF types. */
+	for (id = 1; id < r->nr_base_types; id++) {
+		struct btf_name_info *dist_name_info, base_name_info = {};
+		int dist_kind, base_kind;
+
+		base_t = btf_type_by_id(r->base_btf, id);
+		/* distilled base consists of named types only. */
+		if (!base_t->name_off)
+			continue;
+		base_kind = btf_kind(base_t);
+		base_name_info.id = id;
+		base_name_info.name = btf__name_by_offset(r->base_btf, base_t->name_off);
+		switch (base_kind) {
+		case BTF_KIND_INT:
+		case BTF_KIND_FLOAT:
+		case BTF_KIND_ENUM:
+		case BTF_KIND_ENUM64:
+			/* These types should match both name and size */
+			base_name_info.needs_size = true;
+			base_name_info.size = base_t->size;
+			break;
+		case BTF_KIND_FWD:
+			/* No size considerations for fwds. */
+			break;
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION:
+			/* Size only needs to be used for struct/union if there
+			 * are multiple types in base BTF with the same name.
+			 * If there are multiple _distilled_ types with the same
+			 * name (a very unlikely scenario), that doesn't matter
+			 * unless corresponding _base_ types to match them are
+			 * missing.
+			 */
+			base_name_info.needs_size = base_name_cnt[base_t->name_off] > 1;
+			base_name_info.size = base_t->size;
+			break;
+		default:
+			continue;
+		}
+		dist_name_info = bsearch(&base_name_info, dist_base_info_sorted,
+					 r->nr_dist_base_types, sizeof(*dist_base_info_sorted),
+					 cmp_btf_name_size);
+		if (!dist_name_info)
+			continue;
+		if (!dist_name_info->id || dist_name_info->id > r->nr_dist_base_types) {
+			pr_warn("base BTF id [%d] maps to invalid distilled base BTF id [%d]\n",
+				id, dist_name_info->id);
+			err = -EINVAL;
+			goto done;
+		}
+		dist_t = btf_type_by_id(r->dist_base_btf, dist_name_info->id);
+		dist_kind = btf_kind(dist_t);
+
+		/* Validate that the found distilled type is compatible.
+		 * Do not error out on mismatch as another match may occur
+		 * for an identically-named type.
+		 */
+		switch (dist_kind) {
+		case BTF_KIND_FWD:
+			switch (base_kind) {
+			case BTF_KIND_FWD:
+				if (btf_kflag(dist_t) != btf_kflag(base_t))
+					continue;
+				break;
+			case BTF_KIND_STRUCT:
+				if (btf_kflag(dist_t))
+					continue;
+				break;
+			case BTF_KIND_UNION:
+				if (!btf_kflag(dist_t))
+					continue;
+				break;
+			default:
+				continue;
+			}
+			break;
+		case BTF_KIND_INT:
+			if (dist_kind != base_kind ||
+			    btf_int_encoding(base_t) != btf_int_encoding(dist_t))
+				continue;
+			break;
+		case BTF_KIND_FLOAT:
+			if (dist_kind != base_kind)
+				continue;
+			break;
+		case BTF_KIND_ENUM:
+			/* ENUM and ENUM64 are encoded as sized ENUM in
+			 * distilled base BTF.
+			 */
+			if (dist_kind != base_kind && base_kind != BTF_KIND_ENUM64)
+				continue;
+			break;
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION:
+			/* size verification is required for embedded
+			 * struct/unions.
+			 */
+			if (r->id_map[dist_name_info->id] == BTF_IS_EMBEDDED &&
+			    base_t->size != dist_t->size)
+				continue;
+			break;
+		default:
+			continue;
+		}
+		/* map id and name */
+		r->id_map[dist_name_info->id] = id;
+		r->str_map[dist_t->name_off] = base_t->name_off;
+	}
+	/* ensure all distilled BTF ids now have a mapping... */
+	for (id = 1; id < r->nr_dist_base_types; id++) {
+		const char *name;
+
+		if (r->id_map[id] && r->id_map[id] != BTF_IS_EMBEDDED)
+			continue;
+		dist_t = btf_type_by_id(r->dist_base_btf, id);
+		name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off);
+		pr_warn("distilled base BTF type '%s' [%d] is not mapped to base BTF id\n",
+			name, id);
+		err = -EINVAL;
+		break;
+	}
+done:
+	free(base_name_cnt);
+	free(dist_base_info_sorted);
+	return err;
+}
+
+/* distilled base should only have named int/float/enum/fwd/struct/union types. */
+static int btf_relocate_validate_distilled_base(struct btf_relocate *r)
+{
+	unsigned int i;
+
+	for (i = 1; i < r->nr_dist_base_types; i++) {
+		struct btf_type *t = btf_type_by_id(r->dist_base_btf, i);
+		int kind = btf_kind(t);
+
+		switch (kind) {
+		case BTF_KIND_INT:
+		case BTF_KIND_FLOAT:
+		case BTF_KIND_ENUM:
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION:
+		case BTF_KIND_FWD:
+			if (t->name_off)
+				break;
+			pr_warn("type [%d], kind [%d] is invalid for distilled base BTF; it is anonymous\n",
+				i, kind);
+			return -EINVAL;
+		default:
+			pr_warn("type [%d] in distilled base BTF has unexpected kind [%d]\n",
+				i, kind);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int btf_relocate_rewrite_strs(__u32 *str_off, void *ctx)
+{
+	struct btf_relocate *r = ctx;
+	int off;
+
+	if (!*str_off)
+		return 0;
+	if (*str_off >= r->dist_str_len) {
+		*str_off += r->base_str_len - r->dist_str_len;
+	} else {
+		off = r->str_map[*str_off];
+		if (!off) {
+			pr_warn("string '%s' [offset %u] is not mapped to base BTF\n",
+				btf__str_by_offset(r->btf, *str_off), *str_off);
+			return -ENOENT;
+		}
+		*str_off = off;
+	}
+	return 0;
+}
+
+/* If successful, output of relocation is updated BTF with base BTF pointing
+ * at base_btf, and type ids, strings adjusted accordingly.
+ */
+int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map)
+{
+	unsigned int nr_types = btf__type_cnt(btf);
+	const struct btf_header *dist_base_hdr;
+	const struct btf_header *base_hdr;
+	struct btf_relocate r = {};
+	struct btf_type *t;
+	int err = 0;
+	__u32 id, i;
+
+	r.dist_base_btf = btf__base_btf(btf);
+	if (!base_btf || r.dist_base_btf == base_btf)
+		return -EINVAL;
+
+	r.nr_dist_base_types = btf__type_cnt(r.dist_base_btf);
+	r.nr_base_types = btf__type_cnt(base_btf);
+	r.nr_split_types = nr_types - r.nr_dist_base_types;
+	r.btf = btf;
+	r.base_btf = base_btf;
+
+	r.id_map = calloc(nr_types, sizeof(*r.id_map));
+	r.str_map = calloc(btf_header(r.dist_base_btf)->str_len, sizeof(*r.str_map));
+	dist_base_hdr = btf_header(r.dist_base_btf);
+	base_hdr = btf_header(r.base_btf);
+	r.dist_str_len = dist_base_hdr->str_len;
+	r.base_str_len = base_hdr->str_len;
+	if (!r.id_map || !r.str_map) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	err = btf_relocate_validate_distilled_base(&r);
+	if (err)
+		goto err_out;
+
+	/* Split BTF ids need to be adjusted as base and distilled base
+	 * have different numbers of types, changing the start id of split
+	 * BTF.
+	 */
+	for (id = r.nr_dist_base_types; id < nr_types; id++)
+		r.id_map[id] = id + r.nr_base_types - r.nr_dist_base_types;
+
+	/* Build a map from distilled base ids to actual base BTF ids; it is used
+	 * to update split BTF id references.  Also build a str_map mapping from
+	 * distilled base BTF names to base BTF names.
+	 */
+	err = btf_relocate_map_distilled_base(&r);
+	if (err)
+		goto err_out;
+
+	/* Next, rewrite type ids in split BTF, replacing split ids with updated
+	 * ids based on number of types in base BTF, and base ids with
+	 * relocated ids from base_btf.
+	 */
+	for (i = 0, id = r.nr_dist_base_types; i < r.nr_split_types; i++, id++) {
+		t = btf_type_by_id(btf, id);
+		err = btf_type_visit_type_ids(t, btf_relocate_rewrite_type_id, &r);
+		if (err)
+			goto err_out;
+	}
+	/* String offsets now need to be updated using the str_map. */
+	for (i = 0; i < r.nr_split_types; i++) {
+		t = btf_type_by_id(btf, i + r.nr_dist_base_types);
+		err = btf_type_visit_str_offs(t, btf_relocate_rewrite_strs, &r);
+		if (err)
+			goto err_out;
+	}
+	/* Finally reset base BTF to be base_btf */
+	btf_set_base_btf(btf, base_btf);
+
+	if (id_map) {
+		*id_map = r.id_map;
+		r.id_map = NULL;
+	}
+err_out:
+	free(r.id_map);
+	free(r.str_map);
+	return err;
+}
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 9e69d6e2a512..b333e78bcd96 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -420,6 +420,7 @@  LIBBPF_1.4.0 {
 LIBBPF_1.5.0 {
 	global:
 		btf__distill_base;
+		btf__relocate;
 		bpf_program__attach_sockmap;
 		ring__consume_n;
 		ring_buffer__consume_n;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index a0dcfb82e455..a8ce5fb5a8c4 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -234,6 +234,9 @@  struct btf_type;
 struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id);
 const char *btf_kind_str(const struct btf_type *t);
 const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id);
+const struct btf_header *btf_header(const struct btf *btf);
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf);
+int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map);
 
 static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t)
 {