diff mbox series

[bpf-next,1/2] bpf, arm64: Jit BPF_CALL to direct call when possible

Message ID 20220902152043.721806-2-xukuohai@huaweicloud.com (mailing list archive)
State New, archived
Delegated to: BPF
Headers show
Series Jit BPF_CALL to direct call when possible | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 16 of 16 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Comparison to NULL could be written "ctx->image"
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_verifier on x86_64 with llvm-16

Commit Message

Xu Kuohai Sept. 2, 2022, 3:20 p.m. UTC
From: Xu Kuohai <xukuohai@huawei.com>

Currently a BPF_CALL is always jited to an indirect call, but when the
target is within the range of a direct call (+/-128MB from the call
site), the BPF_CALL can be jited to a direct call instead.

For example, the following BPF_CALL

    call __htab_map_lookup_elem

is always jited to an indirect call:

    mov     x10, #0xffffffffffff18f4
    movk    x10, #0x821, lsl #16
    movk    x10, #0x8000, lsl #32
    blr     x10

When the target is in the range of a direct call, it can be jited to:

    bl      0xfffffffffd33bc98

This patch emits the direct call whenever the target is in range.

1. Before allocating jit image memory, jit every BPF_CALL to an indirect
   call, so that we get the maximum image size at the end.

2. Allocate jit image memory with the maximum image size.

3. Now that the jit image memory is allocated, the address of every
   jited instruction is known, and so is the distance to each call
   target. This determines whether a BPF_CALL can be jited to a direct
   call.

4. Since step 3 may adjust the position of jited instructions, update the
   offset of jump instructions whose target is within the jit image.

Tested with test_bpf.ko and the selftests that work on arm64; nothing
failed.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
---
 arch/arm64/net/bpf_jit_comp.c | 71 ++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 13 deletions(-)
diff mbox series

Patch

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 389623ae5a91..69eb29f397e5 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -72,6 +72,7 @@  static const int bpf2a64[] = {
 struct jit_ctx {
 	const struct bpf_prog *prog;
 	int idx;
+	bool write;
 	int epilogue_offset;
 	int *offset;
 	int exentry_idx;
@@ -91,7 +92,7 @@  struct bpf_plt {
 
 static inline void emit(const u32 insn, struct jit_ctx *ctx)
 {
-	if (ctx->image != NULL)
+	if (ctx->image != NULL && ctx->write)
 		ctx->image[ctx->idx] = cpu_to_le32(insn);
 
 	ctx->idx++;
@@ -178,10 +179,29 @@  static inline void emit_addr_mov_i64(const int reg, const u64 val,
 
 static inline void emit_call(u64 target, struct jit_ctx *ctx)
 {
-	u8 tmp = bpf2a64[TMP_REG_1];
+	u8 tmp;
+	long offset;
+	unsigned long pc;
+	u32 insn = AARCH64_BREAK_FAULT;
+
+	/* if ctx->image == NULL or target == 0, the jump distance is unknown,
+	 * emit indirect call.
+	 */
+	if (ctx->image && target) {
+		pc = (unsigned long)&ctx->image[ctx->idx];
+		offset = (long)target - (long)pc;
+		if (offset >= -SZ_128M && offset < SZ_128M)
+			insn = aarch64_insn_gen_branch_imm(pc, target,
+					AARCH64_INSN_BRANCH_LINK);
+	}
 
-	emit_addr_mov_i64(tmp, target, ctx);
-	emit(A64_BLR(tmp), ctx);
+	if (insn == AARCH64_BREAK_FAULT) {
+		tmp = bpf2a64[TMP_REG_1];
+		emit_addr_mov_i64(tmp, target, ctx);
+		emit(A64_BLR(tmp), ctx);
+	} else {
+		emit(insn, ctx);
+	}
 }
 
 static inline int bpf2a64_offset(int bpf_insn, int off,
@@ -1392,13 +1412,11 @@  static int build_body(struct jit_ctx *ctx, bool extra_pass)
 		const struct bpf_insn *insn = &prog->insnsi[i];
 		int ret;
 
-		if (ctx->image == NULL)
-			ctx->offset[i] = ctx->idx;
+		ctx->offset[i] = ctx->idx;
 		ret = build_insn(insn, ctx, extra_pass);
 		if (ret > 0) {
 			i++;
-			if (ctx->image == NULL)
-				ctx->offset[i] = ctx->idx;
+			ctx->offset[i] = ctx->idx;
 			continue;
 		}
 		if (ret)
@@ -1409,8 +1427,7 @@  static int build_body(struct jit_ctx *ctx, bool extra_pass)
 	 * the last element with the offset after the last
 	 * instruction (end of program)
 	 */
-	if (ctx->image == NULL)
-		ctx->offset[i] = ctx->idx;
+	ctx->offset[i] = ctx->idx;
 
 	return 0;
 }
@@ -1461,6 +1478,8 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	bool extra_pass = false;
 	struct jit_ctx ctx;
 	u8 *image_ptr;
+	int body_offset;
+	int exentry_idx;
 
 	if (!prog->jit_requested)
 		return orig_prog;
@@ -1515,6 +1534,7 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_off;
 	}
 
+	/* Get the max image size */
 	if (build_body(&ctx, extra_pass)) {
 		prog = orig_prog;
 		goto out_off;
@@ -1528,7 +1548,7 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	extable_size = prog->aux->num_exentries *
 		sizeof(struct exception_table_entry);
 
-	/* Now we know the actual image size. */
+	/* Now we know the max image size. */
 	prog_size = sizeof(u32) * ctx.idx;
 	/* also allocate space for plt target */
 	extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align);
@@ -1548,15 +1568,37 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 skip_init_ctx:
 	ctx.idx = 0;
 	ctx.exentry_idx = 0;
+	ctx.write = true;
 
 	build_prologue(&ctx, was_classic);
 
+	/* Record exentry_idx and ctx.idx before first build_body */
+	exentry_idx = ctx.exentry_idx;
+	body_offset = ctx.idx;
+	/* Don't write instruction to memory for now */
+	ctx.write = false;
+
+	/* Determine call distance and instruction position */
 	if (build_body(&ctx, extra_pass)) {
 		bpf_jit_binary_free(header);
 		prog = orig_prog;
 		goto out_off;
 	}
 
+	ctx.epilogue_offset = ctx.idx;
+
+	ctx.exentry_idx = exentry_idx;
+	ctx.idx = body_offset;
+	ctx.write = true;
+
+	/* Determine jump offset and write result to memory */
+	if (build_body(&ctx, extra_pass) ||
+		WARN_ON_ONCE(ctx.idx != ctx.epilogue_offset)) {
+		bpf_jit_binary_free(header);
+		prog = orig_prog;
+		goto out_off;
+	}
+
 	build_epilogue(&ctx);
 	build_plt(&ctx);
 
@@ -1567,6 +1609,8 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_off;
 	}
 
+	/* Update prog size */
+	prog_size = sizeof(u32) * ctx.idx;
 	/* And we're done. */
 	if (bpf_jit_enable > 1)
 		bpf_jit_dump(prog->len, prog_size, 2, ctx.image);
@@ -1574,8 +1618,8 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	bpf_flush_icache(header, ctx.image + ctx.idx);
 
 	if (!prog->is_func || extra_pass) {
-		if (extra_pass && ctx.idx != jit_data->ctx.idx) {
-			pr_err_once("multi-func JIT bug %d != %d\n",
+		if (extra_pass && ctx.idx > jit_data->ctx.idx) {
+			pr_err_once("multi-func JIT bug %d > %d\n",
 				    ctx.idx, jit_data->ctx.idx);
 			bpf_jit_binary_free(header);
 			prog->bpf_func = NULL;
@@ -1976,6 +2020,7 @@  int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	struct jit_ctx ctx = {
 		.image = NULL,
 		.idx = 0,
+		.write = true,
 	};
 
 	/* the first 8 arguments are passed by registers */