diff mbox series

[bpf-next,v2,1/4] bpf, x64: Fix tailcall infinite loop caused by freplace

Message ID 20240901133856.64367-2-leon.hwang@linux.dev (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Fix tailcall infinite loop caused by freplace | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 16 this patch: 16
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers fail 1 blamed authors not CCed: martin.lau@linux.dev; 15 maintainers not CCed: bp@alien8.de netdev@vger.kernel.org sdf@fomichev.me hpa@zytor.com haoluo@google.com jolsa@kernel.org dave.hansen@linux.intel.com song@kernel.org tglx@linutronix.de x86@kernel.org mingo@redhat.com dsahern@kernel.org kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 16 this patch: 16
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 26 this patch: 26
netdev/checkpatch warning WARNING: line length of 98 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18

Commit Message

Leon Hwang Sept. 1, 2024, 1:38 p.m. UTC
This patch fixes a tailcall infinite loop issue caused by freplace.

Since commit 1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility"),
freplace prog is allowed to tail call its target prog. Then, when a
freplace prog attaches to its target prog's subprog and tail calls its
target prog, kernel will panic.

For example:

tc_bpf2bpf.c:

// SPDX-License-Identifier: GPL-2.0
\#include <linux/bpf.h>
\#include <bpf/bpf_helpers.h>

__noinline
int subprog_tc(struct __sk_buff *skb)
{
	return skb->len * 2;
}

SEC("tc")
int entry_tc(struct __sk_buff *skb)
{
	return subprog_tc(skb);
}

char __license[] SEC("license") = "GPL";

tailcall_bpf2bpf_hierarchy_freplace.c:

// SPDX-License-Identifier: GPL-2.0
\#include <linux/bpf.h>
\#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 1);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

int count = 0;

static __noinline
int subprog_tail(struct __sk_buff *skb)
{
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 0;
}

SEC("freplace")
int entry_freplace(struct __sk_buff *skb)
{
	count++;
	subprog_tail(skb);
	subprog_tail(skb);
	return count;
}

char __license[] SEC("license") = "GPL";

The attach target of entry_freplace is subprog_tc, and the tail callee
in subprog_tail is entry_tc.

Then, the infinite loop will be entry_tc -> subprog_tc -> entry_freplace
-> subprog_tail --tailcall-> entry_tc, because tail_call_cnt in
entry_freplace will count from zero for every time of entry_freplace
execution. Kernel will panic:

[   15.310490] BUG: TASK stack guard page was hit at (____ptrval____)
(stack is (____ptrval____)..(____ptrval____))
[   15.310490] Oops: stack guard page: 0000 [#1] PREEMPT SMP NOPTI
[   15.310490] CPU: 1 PID: 89 Comm: test_progs Tainted: G           OE
   6.10.0-rc6-g026dcdae8d3e-dirty #72
[   15.310490] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX,
1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   15.310490] RIP: 0010:bpf_prog_3a140cef239a4b4f_subprog_tail+0x14/0x53
[   15.310490] Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
cc cc cc cc cc f3 0f 1e fa 0f 1f 44 00 00 0f 1f 00 55 48 89 e5 f3 0f 1e
fa <50> 50 53 41 55 48 89 fb 49 bd 00 2a 46 82 98 9c ff ff 48 89 df 4c
[   15.310490] RSP: 0018:ffffb500c0aa0000 EFLAGS: 00000202
[   15.310490] RAX: ffffb500c0aa0028 RBX: ffff9c98808b7e00 RCX:
0000000000008cb5
[   15.310490] RDX: 0000000000000000 RSI: ffff9c9882462a00 RDI:
ffff9c98808b7e00
[   15.310490] RBP: ffffb500c0aa0000 R08: 0000000000000000 R09:
0000000000000000
[   15.310490] R10: 0000000000000001 R11: 0000000000000000 R12:
ffffb500c01af000
[   15.310490] R13: ffffb500c01cd000 R14: 0000000000000000 R15:
0000000000000000
[   15.310490] FS:  00007f133b665140(0000) GS:ffff9c98bbd00000(0000)
knlGS:0000000000000000
[   15.310490] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   15.310490] CR2: ffffb500c0a9fff8 CR3: 0000000102478000 CR4:
00000000000006f0
[   15.310490] Call Trace:
[   15.310490]  <#DF>
[   15.310490]  ? die+0x36/0x90
[   15.310490]  ? handle_stack_overflow+0x4d/0x60
[   15.310490]  ? exc_double_fault+0x117/0x1a0
[   15.310490]  ? asm_exc_double_fault+0x23/0x30
[   15.310490]  ? bpf_prog_3a140cef239a4b4f_subprog_tail+0x14/0x53
[   15.310490]  </#DF>
[   15.310490]  <TASK>
[   15.310490]  bpf_prog_85781a698094722f_entry+0x4c/0x64
[   15.310490]  bpf_prog_1c515f389a9059b4_entry2+0x19/0x1b
[   15.310490]  ...
[   15.310490]  bpf_prog_85781a698094722f_entry+0x4c/0x64
[   15.310490]  bpf_prog_1c515f389a9059b4_entry2+0x19/0x1b
[   15.310490]  bpf_test_run+0x210/0x370
[   15.310490]  ? bpf_test_run+0x128/0x370
[   15.310490]  bpf_prog_test_run_skb+0x388/0x7a0
[   15.310490]  __sys_bpf+0xdbf/0x2c40
[   15.310490]  ? clockevents_program_event+0x52/0xf0
[   15.310490]  ? lock_release+0xbf/0x290
[   15.310490]  __x64_sys_bpf+0x1e/0x30
[   15.310490]  do_syscall_64+0x68/0x140
[   15.310490]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   15.310490] RIP: 0033:0x7f133b52725d
[   15.310490] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8b bb 0d 00 f7 d8 64 89 01 48
[   15.310490] RSP: 002b:00007ffddbc10258 EFLAGS: 00000206 ORIG_RAX:
0000000000000141
[   15.310490] RAX: ffffffffffffffda RBX: 00007ffddbc10828 RCX:
00007f133b52725d
[   15.310490] RDX: 0000000000000050 RSI: 00007ffddbc102a0 RDI:
000000000000000a
[   15.310490] RBP: 00007ffddbc10270 R08: 0000000000000000 R09:
00007ffddbc102a0
[   15.310490] R10: 0000000000000064 R11: 0000000000000206 R12:
0000000000000004
[   15.310490] R13: 0000000000000000 R14: 0000558ec4c24890 R15:
00007f133b6ed000
[   15.310490]  </TASK>
[   15.310490] Modules linked in: bpf_testmod(OE)
[   15.310490] ---[ end trace 0000000000000000 ]---
[   15.310490] RIP: 0010:bpf_prog_3a140cef239a4b4f_subprog_tail+0x14/0x53
[   15.310490] Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
cc cc cc cc cc f3 0f 1e fa 0f 1f 44 00 00 0f 1f 00 55 48 89 e5 f3 0f 1e
fa <50> 50 53 41 55 48 89 fb 49 bd 00 2a 46 82 98 9c ff ff 48 89 df 4c
[   15.310490] RSP: 0018:ffffb500c0aa0000 EFLAGS: 00000202
[   15.310490] RAX: ffffb500c0aa0028 RBX: ffff9c98808b7e00 RCX:
0000000000008cb5
[   15.310490] RDX: 0000000000000000 RSI: ffff9c9882462a00 RDI:
ffff9c98808b7e00
[   15.310490] RBP: ffffb500c0aa0000 R08: 0000000000000000 R09:
0000000000000000
[   15.310490] R10: 0000000000000001 R11: 0000000000000000 R12:
ffffb500c01af000
[   15.310490] R13: ffffb500c01cd000 R14: 0000000000000000 R15:
0000000000000000
[   15.310490] FS:  00007f133b665140(0000) GS:ffff9c98bbd00000(0000)
knlGS:0000000000000000
[   15.310490] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   15.310490] CR2: ffffb500c0a9fff8 CR3: 0000000102478000 CR4:
00000000000006f0
[   15.310490] Kernel panic - not syncing: Fatal exception in interrupt
[   15.310490] Kernel Offset: 0x30000000 from 0xffffffff81000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)

This patch fixes the issue by initializing tail_call_cnt at the prologue
of entry_tc.

Next, when call subprog_tc, the tail_call_cnt_ptr is propagated to
subprog_tc by rax.
Next, when jump to entry_freplace, the tail_call_cnt_ptr will be reused to
count tailcall in freplace prog.
Next, when call subprog_tail, the tail_call_cnt_ptr is propagated to
subprog_tail by rax.
Next, while tail calling to entry_tc, the tail_call_cnt on the stack of
entry_tc increments via the tail_call_cnt_ptr.

The whole procedure shows as the following JITed prog dumping.

bpftool p d j n entry_tc:

int entry_tc(struct __sk_buff * skb):
bpf_prog_1c515f389a9059b4_entry_tc:
; return subprog_tc(skb);
   0:	endbr64
   4:	xorq	%rax, %rax		;; rax = 0 (tail_call_cnt)
   7:	nopl	(%rax,%rax)
   c:	pushq	%rbp
   d:	movq	%rsp, %rbp
  10:	endbr64
  14:	cmpq	$33, %rax		;; if rax > 33, rax = tcc_ptr
  18:	ja	0x20			;; if rax > 33 goto 0x20 ---+
  1a:	pushq	%rax			;; [rbp - 8] = rax = 0      |
  1b:	movq	%rsp, %rax		;; rax = rbp - 8            |
  1e:	jmp	0x21			;; ---------+               |
  20:	pushq	%rax			;; <--------|---------------+
  21:	pushq	%rax			;; <--------+ [rbp - 16] = rax
  22:	movq	-16(%rbp), %rax		;; rax = tcc_ptr
  29:	callq	0x70			;; call subprog_tc()
; return subprog_tc(skb);
  2e:	leave
  2f:	retq

int subprog_tc(struct __sk_buff * skb):
bpf_prog_1e8f76e2374a0607_subprog_tc:
; return skb->len * 2;
   0:	endbr64
   4:	nopl	(%rax)			;; do not touch tail_call_cnt
   7:	jmp	0x108			;; jump to entry_freplace()
   c:	pushq	%rbp
   d:	movq	%rsp, %rbp
  10:	endbr64
  14:	pushq	%rax
  15:	pushq	%rax
  16:	movl	112(%rdi), %eax
; return skb->len * 2;
  19:	shll	%eax
; return skb->len * 2;
  1b:	leave
  1c:	retq

bpftool p d j n entry_freplace:

int entry_freplace(struct __sk_buff * skb):
bpf_prog_85781a698094722f_entry_freplace:
; int entry_freplace(struct __sk_buff *skb)
   0:	endbr64
   4:	nopl	(%rax)			;; do not touch tail_call_cnt
   7:	nopl	(%rax,%rax)
   c:	pushq	%rbp
   d:	movq	%rsp, %rbp
  10:	endbr64
  14:	cmpq	$33, %rax		;; if rax > 33, rax = tcc_ptr
  18:	ja	0x20			;; if rax > 33 goto 0x20 ---+
  1a:	pushq	%rax			;; [rbp - 8] = rax = 0      |
  1b:	movq	%rsp, %rax		;; rax = rbp - 8            |
  1e:	jmp	0x21			;; ---------+               |
  20:	pushq	%rax			;; <--------|---------------+
  21:	pushq	%rax			;; <--------+ [rbp - 16] = rax
  22:	pushq	%rbx			;; callee saved
  23:	pushq	%r13			;; callee saved
  25:	movq	%rdi, %rbx		;; rbx = skb (callee saved)
; count++;
  28:	movabsq	$-123406219759616, %r13
  32:	movl	(%r13), %edi
  36:	addl	$1, %edi
  39:	movl	%edi, (%r13)
; subprog_tail(skb);
  3d:	movq	%rbx, %rdi		;; rdi = skb
  40:	movq	-16(%rbp), %rax		;; rax = tcc_ptr
  47:	callq	0x80			;; call subprog_tail()
; subprog_tail(skb);
  4c:	movq	%rbx, %rdi		;; rdi = skb
  4f:	movq	-16(%rbp), %rax		;; rax = tcc_ptr
  56:	callq	0x80			;; call subprog_tail()
; return count;
  5b:	movl	(%r13), %eax
; return count;
  5f:	popq	%r13
  61:	popq	%rbx
  62:	leave
  63:	retq

int subprog_tail(struct __sk_buff * skb):
bpf_prog_3a140cef239a4b4f_subprog_tail:
; int subprog_tail(struct __sk_buff *skb)
   0:	endbr64
   4:	nopl	(%rax)			;; do not touch tail_call_cnt
   7:	nopl	(%rax,%rax)
   c:	pushq	%rbp
   d:	movq	%rsp, %rbp
  10:	endbr64
  14:	pushq	%rax			;; [rbp - 8]  = rax (tcc_ptr)
  15:	pushq	%rax			;; [rbp - 16] = rax (tcc_ptr)
  16:	pushq	%rbx			;; callee saved
  17:	pushq	%r13			;; callee saved
  19:	movq	%rdi, %rbx		;; rbx = skb
; asm volatile("r1 = %[ctx]\n\t"
  1c:	movabsq	$-128494642337280, %r13	;; r13 = jmp_table
  26:	movq	%rbx, %rdi		;; 1st arg, skb
  29:	movq	%r13, %rsi		;; 2nd arg, jmp_table
  2c:	xorl	%edx, %edx		;; 3rd arg, index = 0
  2e:	movq	-16(%rbp), %rax		;; rax = [rbp - 16] (tcc_ptr)
  35:	cmpq	$33, (%rax)
  39:	jae	0x4e			;; if *tcc_ptr >= 33 goto 0x4e --------+
  3b:	nopl	(%rax,%rax)		;; jmp bypass, toggled by poking       |
  40:	addq	$1, (%rax)		;; (*tcc_ptr)++                        |
  44:	popq	%r13			;; callee saved                        |
  46:	popq	%rbx			;; callee saved                        |
  47:	popq	%rax			;; undo rbp-16 push                    |
  48:	popq	%rax			;; undo rbp-8  push                    |
  49:	jmp	0xfffffffffffffe18	;; tail call target, toggled by poking |
; return 0;				;;                                     |
  4e:	popq	%r13			;; restore callee saved <--------------+
  50:	popq	%rbx			;; restore callee saved
  51:	leave
  52:	retq

As a result, the tail_call_cnt is stored on the stack of entry_tc. And
the tail_call_cnt_ptr is propagated between subprog_tc, entry_freplace,
subprog_tail and entry_tc.

But wait, what if entry_freplace has tailcall and entry_tc has no
tailcall? It's to disallow attaching this entry_freplace to this
entry_tc in verifier.

And, what if entry_freplace has tailcall and entry_tc has tailcall and
entry_freplace attaches to entry_tc? In this patch, the tailcall info of
entry_freplace inherits from its target. Therefore, it swaps the
positions of nop5 and xor/nop3 in order to initialize tail_call_cnt at
the prologue of entry_tc and then propagate the tail_call_cnt to
entry_freplace.

Fixes: 1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility")
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 arch/x86/net/bpf_jit_comp.c | 26 ++++++++++++++++++--------
 kernel/bpf/verifier.c       |  6 ++++++
 2 files changed, 24 insertions(+), 8 deletions(-)

Comments

Alexei Starovoitov Sept. 13, 2024, 7:28 p.m. UTC | #1
On Sun, Sep 1, 2024 at 6:41 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> @@ -573,10 +575,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
>
>         /*
>          * See emit_prologue(), for IBT builds the trampoline hook is preceded
> -        * with an ENDBR instruction.
> +        * with an ENDBR instruction and 3 bytes tail_call_cnt initialization
> +        * instruction.
>          */
>         if (is_endbr(*(u32 *)ip))
>                 ip += ENDBR_INSN_SIZE;
> +       if (is_bpf_text_address((long)ip))
> +               ip += X86_POKE_EXTRA;

This is a foot gun.
bpf_arch_text_poke() is used not only at the beginning of the function.
So unconditional ip += 3 is not just puzzling with 'what is this for',
but dangerous and wasteful...

> @@ -2923,6 +2930,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>                  */
>                 if (is_endbr(*(u32 *)orig_call))
>                         orig_call += ENDBR_INSN_SIZE;
> +               if (is_bpf_text_address((long)orig_call))
> +                       orig_call += X86_POKE_EXTRA;
>                 orig_call += X86_PATCH_SIZE;
>         }

..this bit needs to be hacked too...

> @@ -3025,6 +3034,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>                 /* remember return value in a stack for bpf prog to access */
>                 emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
>                 im->ip_after_call = image + (prog - (u8 *)rw_image);
> +               emit_nops(&prog, X86_POKE_EXTRA);
>                 emit_nops(&prog, X86_PATCH_SIZE);

And this is just pure waste of kernel code and cpu run-time.

You're adding 3 byte nop for no reason at all.

See commit e21aa341785c ("bpf: Fix fexit trampoline.")
that added:
                int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
                                             NULL, im->ip_epilogue);
logic that is patching bpf trampoline in the middle of it.
(not at the start).

Because of unconditional +=3 in bpf_arch_text_poke() every trampoline
will have to waste nop3 ?
No.

Please fix freplace and tail call combination without
this kind of unacceptable shortcuts.

I very much prefer to stop hacking into JITs and trampolines because
tailcalls and freplace don't work well together.

We cannot completely disable that combination because libxdp
is using freplace to populate chain of progs the main prog is calling
and these freplace progs might be doing tailcall,
but we can still prevent such infinite loop that you describe in commit log:
entry_tc -> subprog_tc -> entry_freplace -> subprog_tail --tailcall-> entry_tc
in the verifier without resorting to JIT hacks.

pw-bot: cr
Leon Hwang Sept. 15, 2024, 1 p.m. UTC | #2
On 2024/9/14 03:28, Alexei Starovoitov wrote:
> On Sun, Sep 1, 2024 at 6:41 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>> @@ -573,10 +575,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
>>
>>         /*
>>          * See emit_prologue(), for IBT builds the trampoline hook is preceded
>> -        * with an ENDBR instruction.
>> +        * with an ENDBR instruction and 3 bytes tail_call_cnt initialization
>> +        * instruction.
>>          */
>>         if (is_endbr(*(u32 *)ip))
>>                 ip += ENDBR_INSN_SIZE;
>> +       if (is_bpf_text_address((long)ip))
>> +               ip += X86_POKE_EXTRA;
> 
> This is a foot gun.
> bpf_arch_text_poke() is used not only at the beginning of the function.
> So unconditional ip += 3 is not just puzzling with 'what is this for',
> but dangerous and wasteful...
> 
>> @@ -2923,6 +2930,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>>                  */
>>                 if (is_endbr(*(u32 *)orig_call))
>>                         orig_call += ENDBR_INSN_SIZE;
>> +               if (is_bpf_text_address((long)orig_call))
>> +                       orig_call += X86_POKE_EXTRA;
>>                 orig_call += X86_PATCH_SIZE;
>>         }
> 
> ..this bit needs to be hacked too...
> 
>> @@ -3025,6 +3034,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>>                 /* remember return value in a stack for bpf prog to access */
>>                 emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
>>                 im->ip_after_call = image + (prog - (u8 *)rw_image);
>> +               emit_nops(&prog, X86_POKE_EXTRA);
>>                 emit_nops(&prog, X86_PATCH_SIZE);
> 
> And this is just pure waste of kernel code and cpu run-time.
> 
> You're adding 3 byte nop for no reason at all.
> 
> See commit e21aa341785c ("bpf: Fix fexit trampoline.")
> that added:
>                 int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
>                                              NULL, im->ip_epilogue);
> logic that is patching bpf trampoline in the middle of it.
> (not at the start).
> 
> Because of unconditional +=3 in bpf_arch_text_poke() every trampoline
> will have to waste nop3 ?
> No.
> 
> Please fix freplace and tail call combination without
> this kind of unacceptable shortcuts.
> 
> I very much prefer to stop hacking into JITs and trampolines because
> tailcalls and freplace don't work well together.
> 
> We cannot completely disable that combination because libxdp
> is using freplace to populate chain of progs the main prog is calling
> and these freplace progs might be doing tailcall,
> but we can still prevent such infinite loop that you describe in commit log:
> entry_tc -> subprog_tc -> entry_freplace -> subprog_tail --tailcall-> entry_tc
> in the verifier without resorting to JIT hacks.
> 
> pw-bot: cr

IIUC, it's going to prevent this niche case at update/attach time, that
a prog cannot be updated to prog_array map and be extended by freplace
prog at the same time.

More specific explanation:

1. If a prog has been updated to prog_array, it and its subprog cannot
   be extended by freplace prog.
2. If a prog or its subprog has been extended by freplace prog, it
   cannot be updated to prog_array map.

Then, I do a POC[0] to prevent this niche case. And the POC does not
break any selftest.

[0] https://github.com/kernel-patches/bpf/pull/7732

Here's the diff of the POC.

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0c3893c471711..b864b37e67c17 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1483,6 +1483,8 @@ struct bpf_prog_aux {
 	bool xdp_has_frags;
 	bool exception_cb;
 	bool exception_boundary;
+	bool is_extended; /* true if extended by freplace program */
+	atomic_t tail_callee_cnt;
 	struct bpf_arena *arena;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 79660e3fca4c1..be41240d4fb3a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -951,18 +951,27 @@ static void *prog_fd_array_get_ptr(struct bpf_map
*map,
 	if (IS_ERR(prog))
 		return prog;

-	if (!bpf_prog_map_compatible(map, prog)) {
+	if (!bpf_prog_map_compatible(map, prog) || prog->aux->is_extended) {
+		/* Extended prog can not be tail callee. It's to prevent a
+		 * potential infinite loop like:
+		 * tail callee prog entry -> tail callee prog subprog ->
+		 * freplace prog entry --tailcall-> tail callee prog entry.
+		 */
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}

+	atomic_inc(&prog->aux->tail_callee_cnt);
 	return prog;
 }

 static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool
need_defer)
 {
+	struct bpf_prog *prog = ptr;
+
 	/* bpf_prog is freed after one RCU or tasks trace grace period */
-	bpf_prog_put(ptr);
+	atomic_dec(&prog->aux->tail_callee_cnt);
+	bpf_prog_put(prog);
 }

 static u32 prog_fd_array_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a4117f6d7610..be829016d8182 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3292,8 +3292,11 @@ static void bpf_tracing_link_release(struct
bpf_link *link)
 	bpf_trampoline_put(tr_link->trampoline);

 	/* tgt_prog is NULL if target is a kernel function */
-	if (tr_link->tgt_prog)
+	if (tr_link->tgt_prog) {
+		if (link->prog->type == BPF_PROG_TYPE_EXT)
+			tr_link->tgt_prog->aux->is_extended = false;
 		bpf_prog_put(tr_link->tgt_prog);
+	}
 }

 static void bpf_tracing_link_dealloc(struct bpf_link *link)
@@ -3498,6 +3501,18 @@ static int bpf_tracing_prog_attach(struct
bpf_prog *prog,
 		tgt_prog = prog->aux->dst_prog;
 	}

+	if (prog->type == BPF_PROG_TYPE_EXT &&
+	    atomic_read(&tgt_prog->aux->tail_callee_cnt)) {
+		/* Program extensions can not extend target prog when the target
+		 * prog has been updated to any prog_array map as tail callee.
+		 * It's to prevent a potential infinite loop like:
+		 * tgt prog entry -> tgt prog subprog -> freplace prog entry
+		 * --tailcall-> tgt prog entry.
+		 */
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	err = bpf_link_prime(&link->link.link, &link_primer);
 	if (err)
 		goto out_unlock;
@@ -3523,6 +3538,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog
*prog,
 	if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
 		/* we allocated a new trampoline, so free the old one */
 		bpf_trampoline_put(prog->aux->dst_trampoline);
+	if (prog->type == BPF_PROG_TYPE_EXT)
+		tgt_prog->aux->is_extended = true;

 	prog->aux->dst_prog = NULL;
 	prog->aux->dst_trampoline = NULL;

Thanks,
Leon
diff mbox series

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 074b41fafbe3f..143974b3e953b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -274,6 +274,8 @@  struct jit_context {
 #define X86_PATCH_SIZE		5
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(12 + ENDBR_INSN_SIZE)
+/* Number of extra bytes that will be skipped on poke */
+#define X86_POKE_EXTRA		3
 
 static void push_r12(u8 **pprog)
 {
@@ -441,17 +443,13 @@  static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			  bool tail_call_reachable, bool is_subprog,
-			  bool is_exception_cb)
+			  bool is_exception_cb, bool is_extension)
 {
 	u8 *prog = *pprog;
 
 	emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
-	/* BPF trampoline can be made to work without these nops,
-	 * but let's waste 5 bytes for now and optimize later
-	 */
-	emit_nops(&prog, X86_PATCH_SIZE);
 	if (!ebpf_from_cbpf) {
-		if (tail_call_reachable && !is_subprog)
+		if (tail_call_reachable && !is_extension && !is_subprog)
 			/* When it's the entry of the whole tailcall context,
 			 * zeroing rax means initialising tail_call_cnt.
 			 */
@@ -460,6 +458,10 @@  static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			/* Keep the same instruction layout. */
 			emit_nops(&prog, 3);     /* nop3 */
 	}
+	/* BPF trampoline can be made to work without these nops,
+	 * but let's waste 5 bytes for now and optimize later
+	 */
+	emit_nops(&prog, X86_PATCH_SIZE);
 	/* Exception callback receives FP as third parameter */
 	if (is_exception_cb) {
 		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
@@ -573,10 +575,13 @@  int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 
 	/*
 	 * See emit_prologue(), for IBT builds the trampoline hook is preceded
-	 * with an ENDBR instruction.
+	 * with an ENDBR instruction and 3 bytes tail_call_cnt initialization
+	 * instruction.
 	 */
 	if (is_endbr(*(u32 *)ip))
 		ip += ENDBR_INSN_SIZE;
+	if (is_bpf_text_address((long)ip))
+		ip += X86_POKE_EXTRA;
 
 	return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
 }
@@ -1366,6 +1371,7 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
 	bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
+	bool is_extension = bpf_prog->type == BPF_PROG_TYPE_EXT;
 	struct bpf_insn *insn = bpf_prog->insnsi;
 	bool callee_regs_used[4] = {};
 	int insn_cnt = bpf_prog->len;
@@ -1384,7 +1390,8 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb,
+		      is_extension);
 	/* Exception callback will clobber callee regs for its own use, and
 	 * restore the original callee regs from main prog's stack frame.
 	 */
@@ -2923,6 +2930,8 @@  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		 */
 		if (is_endbr(*(u32 *)orig_call))
 			orig_call += ENDBR_INSN_SIZE;
+		if (is_bpf_text_address((long)orig_call))
+			orig_call += X86_POKE_EXTRA;
 		orig_call += X86_PATCH_SIZE;
 	}
 
@@ -3025,6 +3034,7 @@  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		/* remember return value in a stack for bpf prog to access */
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
 		im->ip_after_call = image + (prog - (u8 *)rw_image);
+		emit_nops(&prog, X86_POKE_EXTRA);
 		emit_nops(&prog, X86_PATCH_SIZE);
 	}
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 217eb0eafa2a6..0a92fd4e0401e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21898,6 +21898,12 @@  int bpf_check_attach_target(struct bpf_verifier_log *log,
 			bpf_log(log, "Cannot extend fentry/fexit\n");
 			return -EINVAL;
 		}
+		if (prog_extension && prog->aux->tail_call_reachable &&
+		    !(aux->func ? aux->func[subprog]->aux->tail_call_reachable :
+		      aux->tail_call_reachable)) {
+			bpf_log(log, "Cannot extend no-tailcall target with tailcall ext prog\n");
+			return -EINVAL;
+		}
 	} else {
 		if (prog_extension) {
 			bpf_log(log, "Cannot replace kernel functions\n");