diff mbox series

[bpf-next,v4,7/8] libbpf: Support creating light skeleton of either endianness

Message ID ed17daaceb6fd12a62c7286958239bb34fc3f55b.1724976539.git.tony.ambardar@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series libbpf, selftests/bpf: Support cross-endian usage | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7 this patch: 7
netdev/build_tools success Errors and warnings before: 4 this patch: 4
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 7 this patch: 7
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7 this patch: 7
netdev/checkpatch fail CHECK: Alignment should match open parenthesis ERROR: code indent should use tabs where possible ERROR: trailing statements should be on next line WARNING: line length of 82 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Tony Ambardar Aug. 30, 2024, 7:29 a.m. UTC
Track target endianness in 'struct bpf_gen' and process in-memory data in
native byte-order, but on finalization convert the embedded loader BPF
insns to target endianness.

The light skeleton also includes a target-accessed data blob which is
heterogeneous and thus difficult to convert to target byte-order on
finalization. Add support functions to convert data to target endianness
as it is added to the blob.

Also add additional debug logging for data blob structure details and
skeleton loading.

Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
---
 tools/lib/bpf/bpf_gen_internal.h |   1 +
 tools/lib/bpf/gen_loader.c       | 187 +++++++++++++++++++++++--------
 tools/lib/bpf/libbpf.c           |   1 +
 tools/lib/bpf/skel_internal.h    |   3 +-
 4 files changed, 147 insertions(+), 45 deletions(-)

Comments

Andrii Nakryiko Aug. 30, 2024, 9:30 p.m. UTC | #1
On Fri, Aug 30, 2024 at 12:30 AM Tony Ambardar <tony.ambardar@gmail.com> wrote:
>
> Track target endianness in 'struct bpf_gen' and process in-memory data in
> native byte-order, but on finalization convert the embedded loader BPF
> insns to target endianness.
>
> The light skeleton also includes a target-accessed data blob which is
> heterogeneous and thus difficult to convert to target byte-order on
> finalization. Add support functions to convert data to target endianness
> as it is added to the blob.
>
> Also add additional debug logging for data blob structure details and
> skeleton loading.
>
> Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
> ---
>  tools/lib/bpf/bpf_gen_internal.h |   1 +
>  tools/lib/bpf/gen_loader.c       | 187 +++++++++++++++++++++++--------
>  tools/lib/bpf/libbpf.c           |   1 +
>  tools/lib/bpf/skel_internal.h    |   3 +-
>  4 files changed, 147 insertions(+), 45 deletions(-)
>
> diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
> index fdf44403ff36..6ff963a491d9 100644
> --- a/tools/lib/bpf/bpf_gen_internal.h
> +++ b/tools/lib/bpf/bpf_gen_internal.h
> @@ -34,6 +34,7 @@ struct bpf_gen {
>         void *data_cur;
>         void *insn_start;
>         void *insn_cur;
> +       bool swapped_endian;
>         ssize_t cleanup_label;
>         __u32 nr_progs;
>         __u32 nr_maps;
> diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
> index cf3323fd47b8..4374399bc3f8 100644
> --- a/tools/lib/bpf/gen_loader.c
> +++ b/tools/lib/bpf/gen_loader.c
> @@ -401,6 +401,15 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
>                 opts->insns_sz = gen->insn_cur - gen->insn_start;
>                 opts->data = gen->data_start;
>                 opts->data_sz = gen->data_cur - gen->data_start;
> +
> +               /* use target endianness for embedded loader */
> +               if (gen->swapped_endian) {
> +                       struct bpf_insn *insn = (struct bpf_insn *)opts->insns;
> +                       int insn_cnt = opts->insns_sz / sizeof(struct bpf_insn);
> +
> +                       for (i = 0; i < insn_cnt; i++)
> +                               bpf_insn_bswap(insn++);
> +               }
>         }
>         return gen->error;
>  }
> @@ -414,6 +423,31 @@ void bpf_gen__free(struct bpf_gen *gen)
>         free(gen);
>  }
>
> +/*
> + * Fields of bpf_attr are set to values in native byte-order before being
> + * written to the target-bound data blob, and may need endian conversion.
> + * This macro allows providing the correct value in situ more simply than
> + * writing a separate converter for *all fields* of *all records* included
> + * in union bpf_attr. Note that sizeof(rval) should match the assignment
> + * target to avoid runtime problems.
> + */
> +#define tgt_endian(rval) ({                                    \
> +       typeof(rval) _val;                                      \
> +       if (!gen->swapped_endian)                               \

if/else has to have balanced branches w.r.t. {}. Either both should
have it or both shouldn't. In this case both should have it.

> +               _val = (rval);                                  \
> +       else {                                                  \
> +               switch (sizeof(rval)) {                         \
> +               case 1: _val = (rval); break;                   \
> +               case 2: _val = bswap_16(rval); break;           \
> +               case 4: _val = bswap_32(rval); break;           \
> +               case 8: _val = bswap_64(rval); break;           \
> +               default:_val = (rval);                          \
> +                       pr_warn("unsupported bswap size!\n");   \

this is a weird formatting,  but you can also just unconditionally
assign _val, and only swap it if gen->swapped_endian

typeof(rval) _val = (rval);

if (gen->swapped_endian) {
    switch (...) {
        case 1: ...
        ...
        case 8: ...
        default: pr_warn("...");
    }
}

_val;


seems simpler and cleaner, imo

> +               }                                               \
> +       }                                                       \
> +       _val;                                                   \
> +})
> +


for the rest, Alexei, can you please review and give your ack?
Alexei Starovoitov Aug. 31, 2024, 1:24 a.m. UTC | #2
On Fri, Aug 30, 2024 at 2:31 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
>
>
> for the rest, Alexei, can you please review and give your ack?

It looks fine.
All of the additional pr_debug()s look a bit excessive.
Will take another look at respin.
Tony Ambardar Sept. 1, 2024, 6 a.m. UTC | #3
On Fri, Aug 30, 2024 at 02:30:46PM -0700, Andrii Nakryiko wrote:
> On Fri, Aug 30, 2024 at 12:30 AM Tony Ambardar <tony.ambardar@gmail.com> wrote:
> >
> > Track target endianness in 'struct bpf_gen' and process in-memory data in
> > native byte-order, but on finalization convert the embedded loader BPF
> > insns to target endianness.
> >
> > The light skeleton also includes a target-accessed data blob which is
> > heterogeneous and thus difficult to convert to target byte-order on
> > finalization. Add support functions to convert data to target endianness
> > as it is added to the blob.
> >
> > Also add additional debug logging for data blob structure details and
> > skeleton loading.
> >
> > Signed-off-by: Tony Ambardar <tony.ambardar@gmail.com>
> > ---
> >  tools/lib/bpf/bpf_gen_internal.h |   1 +
> >  tools/lib/bpf/gen_loader.c       | 187 +++++++++++++++++++++++--------
> >  tools/lib/bpf/libbpf.c           |   1 +
> >  tools/lib/bpf/skel_internal.h    |   3 +-
> >  4 files changed, 147 insertions(+), 45 deletions(-)
> >
> > diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
> > index fdf44403ff36..6ff963a491d9 100644
> > --- a/tools/lib/bpf/bpf_gen_internal.h
> > +++ b/tools/lib/bpf/bpf_gen_internal.h
> > @@ -34,6 +34,7 @@ struct bpf_gen {
> >         void *data_cur;
> >         void *insn_start;
> >         void *insn_cur;
> > +       bool swapped_endian;
> >         ssize_t cleanup_label;
> >         __u32 nr_progs;
> >         __u32 nr_maps;
> > diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
> > index cf3323fd47b8..4374399bc3f8 100644
> > --- a/tools/lib/bpf/gen_loader.c
> > +++ b/tools/lib/bpf/gen_loader.c
> > @@ -401,6 +401,15 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
> >                 opts->insns_sz = gen->insn_cur - gen->insn_start;
> >                 opts->data = gen->data_start;
> >                 opts->data_sz = gen->data_cur - gen->data_start;
> > +
> > +               /* use target endianness for embedded loader */
> > +               if (gen->swapped_endian) {
> > +                       struct bpf_insn *insn = (struct bpf_insn *)opts->insns;
> > +                       int insn_cnt = opts->insns_sz / sizeof(struct bpf_insn);
> > +
> > +                       for (i = 0; i < insn_cnt; i++)
> > +                               bpf_insn_bswap(insn++);
> > +               }
> >         }
> >         return gen->error;
> >  }
> > @@ -414,6 +423,31 @@ void bpf_gen__free(struct bpf_gen *gen)
> >         free(gen);
> >  }
> >
> > +/*
> > + * Fields of bpf_attr are set to values in native byte-order before being
> > + * written to the target-bound data blob, and may need endian conversion.
> > + * This macro allows providing the correct value in situ more simply than
> > + * writing a separate converter for *all fields* of *all records* included
> > + * in union bpf_attr. Note that sizeof(rval) should match the assignment
> > + * target to avoid runtime problems.
> > + */
> > +#define tgt_endian(rval) ({                                    \
> > +       typeof(rval) _val;                                      \
> > +       if (!gen->swapped_endian)                               \
> 
> if/else has to have balanced branches w.r.t. {}. Either both should
> have it or both shouldn't. In this case both should have it.
> 
> > +               _val = (rval);                                  \
> > +       else {                                                  \
> > +               switch (sizeof(rval)) {                         \
> > +               case 1: _val = (rval); break;                   \
> > +               case 2: _val = bswap_16(rval); break;           \
> > +               case 4: _val = bswap_32(rval); break;           \
> > +               case 8: _val = bswap_64(rval); break;           \
> > +               default:_val = (rval);                          \
> > +                       pr_warn("unsupported bswap size!\n");   \
> 
> this is a weird formatting,  but you can also just unconditionally
> assign _val, and only swap it if gen->swapped_endian
> 
> typeof(rval) _val = (rval);
> 
> if (gen->swapped_endian) {
>     switch (...) {
>         case 1: ...
>         ...
>         case 8: ...
>         default: pr_warn("...");
>     }
> }
> 
> _val;
> 
> 
> seems simpler and cleaner, imo
> 

Yes, agreed. Will update.

> > +               }                                               \
> > +       }                                                       \
> > +       _val;                                                   \
> > +})
> > +
> 
> 
> for the rest, Alexei, can you please review and give your ack?
Tony Ambardar Sept. 1, 2024, 6:02 a.m. UTC | #4
On Fri, Aug 30, 2024 at 06:24:47PM -0700, Alexei Starovoitov wrote:
> On Fri, Aug 30, 2024 at 2:31 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> >
> >
> > for the rest, Alexei, can you please review and give your ack?
> 
> It looks fine.
> All of the additional pr_debug()s look a bit excessive.
> Will take another look at respin.

Thanks for taking a look. My experience is the added pr_debug() were
essential to troubleshooting problems with the loader blobs, and only
provide indexes to embedded data. I'd hate for someone else to debug
without these. I think several of them could be consolidated however,
so let me have a try.

There's another area where I'd appreciate your help. The loader code
includes some inline debug statements, but I wasn't ever able to make
these work or see them in trace logs. Could you share an example of how
to enable these (e.g. for test_progs tests)?

Thanks,
Tony
diff mbox series

Patch

diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
index fdf44403ff36..6ff963a491d9 100644
--- a/tools/lib/bpf/bpf_gen_internal.h
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -34,6 +34,7 @@  struct bpf_gen {
 	void *data_cur;
 	void *insn_start;
 	void *insn_cur;
+	bool swapped_endian;
 	ssize_t cleanup_label;
 	__u32 nr_progs;
 	__u32 nr_maps;
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index cf3323fd47b8..4374399bc3f8 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -401,6 +401,15 @@  int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
 		opts->insns_sz = gen->insn_cur - gen->insn_start;
 		opts->data = gen->data_start;
 		opts->data_sz = gen->data_cur - gen->data_start;
+
+		/* use target endianness for embedded loader */
+		if (gen->swapped_endian) {
+			struct bpf_insn *insn = (struct bpf_insn *)opts->insns;
+			int insn_cnt = opts->insns_sz / sizeof(struct bpf_insn);
+
+			for (i = 0; i < insn_cnt; i++)
+				bpf_insn_bswap(insn++);
+		}
 	}
 	return gen->error;
 }
@@ -414,6 +423,31 @@  void bpf_gen__free(struct bpf_gen *gen)
 	free(gen);
 }
 
+/*
+ * Fields of bpf_attr are set to values in native byte-order before being
+ * written to the target-bound data blob, and may need endian conversion.
+ * This macro allows providing the correct value in situ more simply than
+ * writing a separate converter for *all fields* of *all records* included
+ * in union bpf_attr. Note that sizeof(rval) should match the assignment
+ * target to avoid runtime problems.
+ */
+#define tgt_endian(rval) ({					\
+	typeof(rval) _val;					\
+	if (!gen->swapped_endian)				\
+		_val = (rval);					\
+	else {							\
+		switch (sizeof(rval)) {				\
+		case 1: _val = (rval); break;			\
+		case 2: _val = bswap_16(rval); break;		\
+		case 4: _val = bswap_32(rval); break;		\
+		case 8: _val = bswap_64(rval); break;		\
+		default:_val = (rval);				\
+			pr_warn("unsupported bswap size!\n");	\
+		}						\
+	}							\
+	_val;							\
+})
+
 void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data,
 		       __u32 btf_raw_size)
 {
@@ -422,11 +456,13 @@  void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data,
 	union bpf_attr attr;
 
 	memset(&attr, 0, attr_size);
-	pr_debug("gen: load_btf: size %d\n", btf_raw_size);
 	btf_data = add_data(gen, btf_raw_data, btf_raw_size);
+	pr_debug("gen: load_btf: off %d size %d\n", btf_data, btf_raw_size);
 
-	attr.btf_size = btf_raw_size;
+	attr.btf_size = tgt_endian(btf_raw_size);
 	btf_load_attr = add_data(gen, &attr, attr_size);
+	pr_debug("gen: load_btf: btf_load_attr: off %d size %d\n",
+		 btf_load_attr, attr_size);
 
 	/* populate union bpf_attr with user provided log details */
 	move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_level), 4,
@@ -457,28 +493,30 @@  void bpf_gen__map_create(struct bpf_gen *gen,
 	union bpf_attr attr;
 
 	memset(&attr, 0, attr_size);
-	attr.map_type = map_type;
-	attr.key_size = key_size;
-	attr.value_size = value_size;
-	attr.map_flags = map_attr->map_flags;
-	attr.map_extra = map_attr->map_extra;
+	attr.map_type = tgt_endian(map_type);
+	attr.key_size = tgt_endian(key_size);
+	attr.value_size = tgt_endian(value_size);
+	attr.map_flags = tgt_endian(map_attr->map_flags);
+	attr.map_extra = tgt_endian(map_attr->map_extra);
 	if (map_name)
 		libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));
-	attr.numa_node = map_attr->numa_node;
-	attr.map_ifindex = map_attr->map_ifindex;
-	attr.max_entries = max_entries;
-	attr.btf_key_type_id = map_attr->btf_key_type_id;
-	attr.btf_value_type_id = map_attr->btf_value_type_id;
+	attr.numa_node = tgt_endian(map_attr->numa_node);
+	attr.map_ifindex = tgt_endian(map_attr->map_ifindex);
+	attr.max_entries = tgt_endian(max_entries);
+	attr.btf_key_type_id = tgt_endian(map_attr->btf_key_type_id);
+	attr.btf_value_type_id = tgt_endian(map_attr->btf_value_type_id);
 
 	pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n",
-		 attr.map_name, map_idx, map_type, attr.btf_value_type_id);
+		 map_name, map_idx, map_type, map_attr->btf_value_type_id);
 
 	map_create_attr = add_data(gen, &attr, attr_size);
-	if (attr.btf_value_type_id)
+	pr_debug("gen: map_create: map_create_attr: off %d size %d\n",
+		 map_create_attr, attr_size);
+	if (map_attr->btf_value_type_id)
 		/* populate union bpf_attr with btf_fd saved in the stack earlier */
 		move_stack2blob(gen, attr_field(map_create_attr, btf_fd), 4,
 				stack_off(btf_fd));
-	switch (attr.map_type) {
+	switch (map_type) {
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		move_stack2blob(gen, attr_field(map_create_attr, inner_map_fd), 4,
@@ -498,8 +536,8 @@  void bpf_gen__map_create(struct bpf_gen *gen,
 	/* emit MAP_CREATE command */
 	emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
 	debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d",
-		  attr.map_name, map_idx, map_type, value_size,
-		  attr.btf_value_type_id);
+		  map_name, map_idx, map_type, value_size,
+		  map_attr->btf_value_type_id);
 	emit_check_err(gen);
 	/* remember map_fd in the stack, if successful */
 	if (map_idx < 0) {
@@ -784,12 +822,12 @@  static void emit_relo_ksym_typeless(struct bpf_gen *gen,
 	emit_ksym_relo_log(gen, relo, kdesc->ref);
 }
 
-static __u32 src_reg_mask(void)
+static __u32 src_reg_mask(struct bpf_gen *gen)
 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	return 0x0f; /* src_reg,dst_reg,... */
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	return 0xf0; /* dst_reg,src_reg,... */
+#if defined(__LITTLE_ENDIAN_BITFIELD) /* src_reg,dst_reg,... */
+	return gen->swapped_endian ? 0xf0 : 0x0f;
+#elif defined(__BIG_ENDIAN_BITFIELD) /* dst_reg,src_reg,... */
+	return gen->swapped_endian ? 0x0f : 0xf0;
 #else
 #error "Unsupported bit endianness, cannot proceed"
 #endif
@@ -840,7 +878,7 @@  static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo,
 	emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 3));
 clear_src_reg:
 	/* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */
-	reg_mask = src_reg_mask();
+	reg_mask = src_reg_mask(gen);
 	emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code)));
 	emit(gen, BPF_ALU32_IMM(BPF_AND, BPF_REG_9, reg_mask));
 	emit(gen, BPF_STX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, offsetofend(struct bpf_insn, code)));
@@ -931,11 +969,33 @@  static void cleanup_relos(struct bpf_gen *gen, int insns)
 	cleanup_core_relo(gen);
 }
 
+/* Convert func, line, or core relo info records to target endianness,
+ * checking the blob size is consistent with 32-bit fields.
+ */
+static void info_blob_bswap(struct bpf_gen *gen, int info_off, int info_cnt,
+			    anon_info_bswap_fn_t bswap)
+{
+	void *p = gen->data_start + info_off;
+	int i;
+
+	if (!gen->swapped_endian)
+		return;
+
+	for (i = 0; i < info_cnt; i++)
+		p += bswap(p);
+}
+
 void bpf_gen__prog_load(struct bpf_gen *gen,
 			enum bpf_prog_type prog_type, const char *prog_name,
 			const char *license, struct bpf_insn *insns, size_t insn_cnt,
 			struct bpf_prog_load_opts *load_attr, int prog_idx)
 {
+	int func_info_tot_sz = load_attr->func_info_cnt *
+			       load_attr->func_info_rec_size;
+	int line_info_tot_sz = load_attr->line_info_cnt *
+			       load_attr->line_info_rec_size;
+	int core_relo_tot_sz = gen->core_relo_cnt *
+			       sizeof(struct bpf_core_relo);
 	int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos;
 	int attr_size = offsetofend(union bpf_attr, core_relo_rec_size);
 	union bpf_attr attr;
@@ -947,32 +1007,63 @@  void bpf_gen__prog_load(struct bpf_gen *gen,
 	license_off = add_data(gen, license, strlen(license) + 1);
 	/* add insns to blob of bytes */
 	insns_off = add_data(gen, insns, insn_cnt * sizeof(struct bpf_insn));
+	pr_debug("gen: prog_load: license off %d insn off %d\n",
+		 license_off, insns_off);
 
-	attr.prog_type = prog_type;
-	attr.expected_attach_type = load_attr->expected_attach_type;
-	attr.attach_btf_id = load_attr->attach_btf_id;
-	attr.prog_ifindex = load_attr->prog_ifindex;
-	attr.kern_version = 0;
-	attr.insn_cnt = (__u32)insn_cnt;
-	attr.prog_flags = load_attr->prog_flags;
-
-	attr.func_info_rec_size = load_attr->func_info_rec_size;
-	attr.func_info_cnt = load_attr->func_info_cnt;
-	func_info = add_data(gen, load_attr->func_info,
-			     attr.func_info_cnt * attr.func_info_rec_size);
+	/* convert blob insns to target endianness */
+	if (gen->swapped_endian) {
+		struct bpf_insn *insn = gen->data_start + insns_off;
+		int i;
 
-	attr.line_info_rec_size = load_attr->line_info_rec_size;
-	attr.line_info_cnt = load_attr->line_info_cnt;
-	line_info = add_data(gen, load_attr->line_info,
-			     attr.line_info_cnt * attr.line_info_rec_size);
+		for (i = 0; i < insn_cnt; i++, insn++)
+			bpf_insn_bswap(insn);
+	}
 
-	attr.core_relo_rec_size = sizeof(struct bpf_core_relo);
-	attr.core_relo_cnt = gen->core_relo_cnt;
-	core_relos = add_data(gen, gen->core_relos,
-			     attr.core_relo_cnt * attr.core_relo_rec_size);
+	attr.prog_type = tgt_endian(prog_type);
+	attr.expected_attach_type = tgt_endian(load_attr->expected_attach_type);
+	attr.attach_btf_id = tgt_endian(load_attr->attach_btf_id);
+	attr.prog_ifindex = tgt_endian(load_attr->prog_ifindex);
+	attr.kern_version = 0;
+	attr.insn_cnt = tgt_endian((__u32)insn_cnt);
+	attr.prog_flags = tgt_endian(load_attr->prog_flags);
+
+	attr.func_info_rec_size = tgt_endian(load_attr->func_info_rec_size);
+	attr.func_info_cnt = tgt_endian(load_attr->func_info_cnt);
+	func_info = add_data(gen, load_attr->func_info, func_info_tot_sz);
+	pr_debug("gen: prog_load: func_info: off %d cnt %d rec size %d\n",
+		 func_info, load_attr->func_info_cnt,
+		 load_attr->func_info_rec_size);
+
+	/* convert func_info blob to target endianness */
+	info_blob_bswap(gen, func_info, load_attr->func_info_cnt,
+		        (anon_info_bswap_fn_t)bpf_func_info_bswap);
+
+	attr.line_info_rec_size = tgt_endian(load_attr->line_info_rec_size);
+	attr.line_info_cnt = tgt_endian(load_attr->line_info_cnt);
+	line_info = add_data(gen, load_attr->line_info, line_info_tot_sz);
+	pr_debug("gen: prog_load: line_info: off %d cnt %d rec size %d\n",
+		 line_info, load_attr->line_info_cnt,
+		 load_attr->line_info_rec_size);
+
+	/* convert line_info blob to target endianness */
+	info_blob_bswap(gen, line_info, load_attr->line_info_cnt,
+		        (anon_info_bswap_fn_t)bpf_line_info_bswap);
+
+	attr.core_relo_rec_size = tgt_endian((__u32)sizeof(struct bpf_core_relo));
+	attr.core_relo_cnt = tgt_endian(gen->core_relo_cnt);
+	core_relos = add_data(gen, gen->core_relos, core_relo_tot_sz);
+	pr_debug("gen: prog_load: core_relos: off %d cnt %d rec size %zd\n",
+		 core_relos, gen->core_relo_cnt,
+		 sizeof(struct bpf_core_relo));
+
+	/* convert core_relo info blob to target endianness */
+	info_blob_bswap(gen, core_relos, gen->core_relo_cnt,
+		        (anon_info_bswap_fn_t)bpf_core_relo_bswap);
 
 	libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
 	prog_load_attr = add_data(gen, &attr, attr_size);
+	pr_debug("gen: prog_load: prog_load_attr: off %d size %d\n",
+		 prog_load_attr, attr_size);
 
 	/* populate union bpf_attr with a pointer to license */
 	emit_rel_store(gen, attr_field(prog_load_attr, license), license_off);
@@ -1068,6 +1159,8 @@  void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue,
 	emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel));
 
 	map_update_attr = add_data(gen, &attr, attr_size);
+	pr_debug("gen: map_update_elem: map_update_attr: off %d size %d\n",
+		 map_update_attr, attr_size);
 	move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4,
 		       blob_fd_array_off(gen, map_idx));
 	emit_rel_store(gen, attr_field(map_update_attr, key), key);
@@ -1084,14 +1177,18 @@  void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slo
 	int attr_size = offsetofend(union bpf_attr, flags);
 	int map_update_attr, key;
 	union bpf_attr attr;
+	int tgt_slot;
 
 	memset(&attr, 0, attr_size);
 	pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n",
 		 outer_map_idx, slot, inner_map_idx);
 
-	key = add_data(gen, &slot, sizeof(slot));
+	tgt_slot = tgt_endian(slot);
+	key = add_data(gen, &tgt_slot, sizeof(tgt_slot));
 
 	map_update_attr = add_data(gen, &attr, attr_size);
+	pr_debug("gen: populate_outer_map: map_update_attr: off %d size %d\n",
+		 map_update_attr, attr_size);
 	move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4,
 		       blob_fd_array_off(gen, outer_map_idx));
 	emit_rel_store(gen, attr_field(map_update_attr, key), key);
@@ -1114,6 +1211,8 @@  void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx)
 	memset(&attr, 0, attr_size);
 	pr_debug("gen: map_freeze: idx %d\n", map_idx);
 	map_freeze_attr = add_data(gen, &attr, attr_size);
+	pr_debug("gen: map_freeze: map_update_attr: off %d size %d\n",
+		 map_freeze_attr, attr_size);
 	move_blob2blob(gen, attr_field(map_freeze_attr, map_fd), 4,
 		       blob_fd_array_off(gen, map_idx));
 	/* emit MAP_FREEZE command */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index aa52870b1967..1ba73c27973c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9124,6 +9124,7 @@  int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts)
 	if (!gen)
 		return -ENOMEM;
 	gen->opts = opts;
+	gen->swapped_endian = !is_native_endianness(obj);
 	obj->gen_loader = gen;
 	return 0;
 }
diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h
index 1e82ab06c3eb..67e8477ecb5b 100644
--- a/tools/lib/bpf/skel_internal.h
+++ b/tools/lib/bpf/skel_internal.h
@@ -351,10 +351,11 @@  static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
 	attr.test.ctx_size_in = opts->ctx->sz;
 	err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz);
 	if (err < 0 || (int)attr.test.retval < 0) {
-		opts->errstr = "failed to execute loader prog";
 		if (err < 0) {
+			opts->errstr = "failed to execute loader prog";
 			set_err;
 		} else {
+			opts->errstr = "error returned by loader prog";
 			err = (int)attr.test.retval;
 #ifndef __KERNEL__
 			errno = -err;