
[bpf-next,v2,1/2] bpf: Support private stack for bpf progs

Message ID 20240718205158.3651529-1-yonghong.song@linux.dev (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next,v2,1/2] bpf: Support private stack for bpf progs

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 898 this patch: 898
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 16 maintainers not CCed: kpsingh@kernel.org dave.hansen@linux.intel.com haoluo@google.com dsahern@kernel.org bp@alien8.de john.fastabend@gmail.com hpa@zytor.com song@kernel.org mingo@redhat.com x86@kernel.org netdev@vger.kernel.org jolsa@kernel.org martin.lau@linux.dev eddyz87@gmail.com tglx@linutronix.de sdf@fomichev.me
netdev/build_clang success Errors and warnings before: 971 this patch: 971
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 7630 this patch: 7631
netdev/checkpatch warning CHECK: No space is necessary after a cast WARNING: line length of 91 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Yonghong Song July 18, 2024, 8:51 p.m. UTC
The main motivation for the private stack comes from the nested
scheduler in sched-ext from Tejun. The basic idea is that
 - each cgroup will have its own associated bpf program, and
 - the bpf program of a parent cgroup will call the bpf programs
   of its immediate child cgroups.

Let us say we have the following cgroup hierarchy:
  root_cg (prog0):
    cg1 (prog1):
      cg11 (prog11):
        cg111 (prog111)
        cg112 (prog112)
      cg12 (prog12):
        cg121 (prog121)
        cg122 (prog122)
    cg2 (prog2):
      cg21 (prog21)
      cg22 (prog22)
      cg23 (prog23)

In the above example, prog0 will call a kfunc which calls prog1 and
prog2 to get sched info for cg1 and cg2; the information is then
summarized and sent back to prog0. Similarly, prog11 and prog12 will
be invoked from the kfunc and their results will be summarized and
sent back to prog1, etc.

Currently, the x86 kernel allocates an 8KB stack for each thread.
Each bpf program (including its subprograms) is limited to a
maximum stack size of 512B to avoid potential stack overflow,
and nested bpf programs increase the risk of overflowing that
stack. To avoid potential stack overflow caused by bpf programs,
this patch implements a private stack so that bpf program stack
space is allocated dynamically when the program is jited.
The private stack is applied to tracing programs like
kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
tracing.

But more than one instance of the same bpf program may
run in the system. To keep things simple, a percpu private
stack is allocated for each program, so if the same program
is running on different cpus concurrently, we won't have
any issue. Note that the kernel already has logic to prevent
recursion of the same bpf program on the same cpu
(kprobe, fentry, etc.).

The patch implements a percpu private stack based approach
for the x86 arch (a rough sketch of the mechanism follows the list):
  - The stack size will be 0 and all stack accesses go to
    jit-time allocated percpu storage.
  - At the beginning of the jited code, r9 is used to hold the
    percpu private stack pointer.
  - Each rbp in the bpf asm insns is replaced by r9.
  - For each call, push r9 before the call and pop r9
    after the call to preserve the r9 value.
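
As a rough sketch of the mechanism (not the exact code in this patch:
alloc_private_stack() and the headers below are illustrative
assumptions), the jit-time allocation and the runtime r9 setup look
conceptually like this:

    #include <linux/gfp.h>
    #include <linux/percpu.h>
    #include <linux/types.h>

    /* illustrative sketch only: one private stack region per CPU,
     * sized from the verifier-computed stack depth
     */
    static void __percpu *alloc_private_stack(u32 stack_depth)
    {
            return __alloc_percpu_gfp(stack_depth, 8, GFP_KERNEL);
    }

    /* The jited prologue then resolves this CPU's copy of that region:
     *
     *   mov r9d, <percpu base of the allocation>  ; 0x8c1c860 in the
     *                                             ; listing below
     *   add r9,  QWORD PTR gs:<this_cpu_off>      ; per-CPU offset
     *
     * i.e. the asm form of this_cpu_ptr() on x86-64, so r9 ends up
     * pointing at this CPU's private stack and replaces rbp for all
     * stack accesses.
     */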

Compared to the previous RFC patch [1], this patch adds
some conditions to enable the private stack, e.g.,
verifier-calculated stack size, prog type, etc. The new patch
also adds a performance test to compare private stack
vs. no private stack.

The following is a code example to illustrate the idea
for the selftest cgroup_skb_sk_lookup:

   the existing code                        the private-stack approach code
   endbr64                                  endbr64
   nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
   xchg   ax,ax                             xchg   ax,ax
   push   rbp                               push   rbp
   mov    rbp,rsp                           mov    rbp,rsp
   endbr64                                  endbr64
   sub    rsp,0x68
   push   rbx                               push   rbx
   ...                                      ...
   ...                                      mov    r9d,0x8c1c860
   ...                                      add    r9,QWORD PTR gs:0x21a00
   ...                                      ...
   mov    rdx,rbp                           mov    rdx, r9
   add    rdx,0xffffffffffffffb4            add    rdx,0xffffffffffffffb4
   ...                                      ...
   mov    ecx,0x28                          mov    ecx,0x28
                                            push   r9
   call   0xffffffffe305e474                call   0xffffffffe305e524
                                            pop    r9
   mov    rdi,rax                           mov    rdi,rax
   ...                                      ...
   movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
   ...                                      ...

So the number of insns is increased by 1 + num_of_calls * 2:
the 'sub rsp,0x68' is replaced by the two-insn r9 setup (net +1),
and each call gains a push r9/pop r9 pair. Here the number of
calls is the number of calls in the final jited binary. Compared
to the function call itself, the push/pop overhead should be
minimal in most common cases.

Our original use case is the sched-ext nested scheduler. That
integration will be done in the future.

  [1] https://lore.kernel.org/bpf/707970c5-6bba-450a-be08-adf24d8b9276@linux.dev/T/

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 arch/x86/net/bpf_jit_comp.c | 63 ++++++++++++++++++++++++++++++++++---
 include/linux/bpf.h         |  2 ++
 kernel/bpf/core.c           | 20 ++++++++++++
 kernel/bpf/syscall.c        |  1 +
 4 files changed, 82 insertions(+), 4 deletions(-)

Comments

Andrii Nakryiko July 20, 2024, 3:28 a.m. UTC | #1
On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> The main motivation for private stack comes from nested
> scheduler in sched-ext from Tejun. The basic idea is that
>  - each cgroup will its own associated bpf program,
>  - bpf program with parent cgroup will call bpf programs
>    in immediate child cgroups.
>
> Let us say we have the following cgroup hierarchy:
>   root_cg (prog0):
>     cg1 (prog1):
>       cg11 (prog11):
>         cg111 (prog111)
>         cg112 (prog112)
>       cg12 (prog12):
>         cg121 (prog121)
>         cg122 (prog122)
>     cg2 (prog2):
>       cg21 (prog21)
>       cg22 (prog22)
>       cg23 (prog23)
>
> In the above example, prog0 will call a kfunc which will
> call prog1 and prog2 to get sched info for cg1 and cg2 and
> then the information is summarized and sent back to prog0.
> Similarly, prog11 and prog12 will be invoked in the kfunc
> and the result will be summarized and sent back to prog1, etc.
>
> Currently, for each thread, the x86 kernel allocate 8KB stack.
> The each bpf program (including its subprograms) has maximum
> 512B stack size to avoid potential stack overflow.
> And nested bpf programs increase the risk of stack overflow.
> To avoid potential stack overflow caused by bpf programs,
> this patch implemented a private stack so bpf program stack
> space is allocated dynamically when the program is jited.
> Such private stack is applied to tracing programs like
> kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
> tracing.
>
> But more than one instance of the same bpf program may
> run in the system. To make things simple, percpu private
> stack is allocated for each program, so if the same program
> is running on different cpus concurrently, we won't have
> any issue. Note that the kernel already have logic to prevent
> the recursion for the same bpf program on the same cpu
> (kprobe, fentry, etc.).
>
> The patch implemented a percpu private stack based approach
> for x86 arch.
>   - The stack size will be 0 and any stack access is from
>     jit-time allocated percpu storage.
>   - In the beginning of jit, r9 is used to save percpu
>     private stack pointer.
>   - Each rbp in the bpf asm insn is replaced by r9.
>   - For each call, push r9 before the call and pop r9
>     after the call to preserve r9 value.
>
> Compared to previous RFC patch [1], this patch added
> some conditions to enable private stack, e.g., verifier
> calculated stack size, prog type, etc. The new patch
> also added a performance test to compare private stack
> vs. no private stack.
>
> The following are some code example to illustrate the idea
> for selftest cgroup_skb_sk_lookup:
>
>    the existing code                        the private-stack approach code
>    endbr64                                  endbr64
>    nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
>    xchg   ax,ax                             xchg   ax,ax
>    push   rbp                               push   rbp
>    mov    rbp,rsp                           mov    rbp,rsp
>    endbr64                                  endbr64
>    sub    rsp,0x68
>    push   rbx                               push   rbx
>    ...                                      ...
>    ...                                      mov    r9d,0x8c1c860
>    ...                                      add    r9,QWORD PTR gs:0x21a00
>    ...                                      ...
>    mov    rdx,rbp                           mov    rdx, r9
>    add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
>    ...                                      ...
>    mov    ecx,0x28                          mov    ecx,0x28
>                                             push   r9
>    call   0xffffffffe305e474                call   0xffffffffe305e524
>                                             pop    r9
>    mov    rdi,rax                           mov    rdi,rax
>    ...                                      ...
>    movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
>    ...                                      ...
>

Eduard nerd-sniped me today with this a bit... :)

I have a few questions and suggestions.

So it seems like each *subprogram* (not the entire BPF program) gets
its own per-CPU private stack allocation. Is that intentional? That
seems a bit unnecessary. It also prevents any sort of actual
recursion. Not sure if it's possible to write a recursive BPF subprogram
today (the verifier seems to reject obvious limited recursion cases), but
still, eventually we might need/want to support that, and this will be
just another hurdle to overcome (so it's best to avoid adding it in
the first place).

I'm sure Eduard is going to try something like below and it will
probably break badly (I haven't tried, sorry):

int entry(void *ctx);

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 1);
        __uint(key_size, sizeof(__u32));
        __array(values, int (void *));
} prog_array_init SEC(".maps") = {
        .values = {
                [0] = (void *)&entry,
        },
};

static __noinline int subprog1(void *ctx)
{
    <some state on the stack>

    /* here entry will replace subprog1, and so we'll have
     * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
     */
    bpf_tail_call(ctx, &prog_array_init, 0);

    return 0;
}


SEC("raw_tp/sys_enter")
int entry(void *ctx)
{
     <some state on the stack>

     return subprog1(ctx);
}

And we effectively have limited recursion where the entry's stack
state is clobbered, no?

So it seems like we need to support recursion.


So, the question I have is: why not do the following:
a) only set up r9 *once* in the entry program's prologue (before the tail
call jump target)
b) before each call, adjust r9 by the current prog/subprog's
maximum *own* stack, something like:

push r9;
r9 += 128; // 128 is subprog's stack usage
call <some-subprog>
pop r9;

The idea being that on a tail call or in a subprog call we assume r9 is
already pointing to the right place. We can probably also figure out
how to avoid the push/pop of r9 if we make sure that the subprogram always
restores r9 (taking tail calls into account and all that, of course)?

Is this feasible?
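
For illustration, here is a minimal user-space toy of that convention
(not JIT code; the frame sizes, the names and the grow-down direction
are made-up assumptions). The callee simply inherits the frame pointer,
while the caller carves out its own frame around the call and restores
the pointer afterwards, so each nesting level lands in a distinct
region of the same private stack:

    #include <stdio.h>

    /* toy "per-CPU private stack" and the pointer playing the role of r9 */
    static long priv_stack[64];
    static char *fp = (char *)&priv_stack[64];   /* frames grow down */

    static void subprog(void)
    {
            /* callee inherits fp and addresses its frame at negative offsets */
            int *slot = (int *)(fp - sizeof(int));

            *slot = 42;
            printf("subprog slot %p = %d\n", (void *)slot, *slot);
    }

    static void entry(void)
    {
            char *saved = fp;                       /* "push r9" */
            int *slot = (int *)(fp - sizeof(int));

            *slot = 7;                              /* entry's own stack state */
            fp -= 16;                               /* adjust by entry's *own*
                                                     * stack usage before the call */
            subprog();
            fp = saved;                             /* "pop r9" */
            printf("entry slot still %d\n", *slot); /* prints 7, not clobbered */
    }

    int main(void)
    {
            entry();
            return 0;
    }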

Another question I have is whether it would be possible to just plain
set rbp to private stack and keep using rbp in such a way that stack
traces are preserved? I.e., save the return address on the private stack so
unwinders can correctly jump back to the kernel's stack?

How stupid is what I propose above?


> So the number of insns is increased by 1 + num_of_calls * 2.
> Here the number of calls are those calls in the final jited binary.
> Comparing function call itself, the push/pop overhead should be
> minimum in most common cases.
>
> Our original use case is for sched-ext nested scheduler. This will be done
> in the future.
>
>   [1] https://lore.kernel.org/bpf/707970c5-6bba-450a-be08-adf24d8b9276@linux.dev/T/
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  arch/x86/net/bpf_jit_comp.c | 63 ++++++++++++++++++++++++++++++++++---
>  include/linux/bpf.h         |  2 ++
>  kernel/bpf/core.c           | 20 ++++++++++++
>  kernel/bpf/syscall.c        |  1 +
>  4 files changed, 82 insertions(+), 4 deletions(-)
>

[...]
Eduard Zingerman July 22, 2024, 3:33 a.m. UTC | #2
Hi Yonghong,

In general I think that changes in this patch are logical and make sense.
I have a suggestion regarding testing JIT related changes.

We currently lack a convenient way to verify jit behaviour modulo
runtime tests. I think we should have a capability to write tests like below:

    SEC("tp")
    __jited_x86("f:	endbr64")
    __jited_x86("13:	movabs $0x.*,%r9")
    __jited_x86("1d:	add    %gs:0x.*,%r9")
    __jited_x86("26:	mov    $0x1,%edi")
    __jited_x86("2b:	mov    %rdi,-0x8(%r9)")
    __jited_x86("2f:	mov    -0x8(%r9),%rdi")
    __jited_x86("33:	xor    %eax,%eax")
    __jited_x86("35:	lock xchg %rax,-0x8(%r9)")
    __jited_x86("3a:	lock xadd %rax,-0x8(%r9)")
    __naked void stack_access_insns(void)
    {
    	asm volatile (
    	"r1 = 1;"
    	"*(u64 *)(r10 - 8) = r1;"
    	"r1 = *(u64 *)(r10 - 8);"
    	"r0 = 0;"
    	"r0 = xchg_64(r10 - 8, r0);"
    	"r0 = atomic_fetch_add((u64 *)(r10 - 8), r0);"
    	"exit;"
    	::: __clobber_all);
    }

In the following branch I explored a way to add such capability:
https://github.com/eddyz87/bpf/tree/yhs-private-stack-plus-jit-testing

Besides testing the exact translation, such tests also provide a good
starting point for people trying to figure out how some jit features work.

The below two commits are the gist of the feature:
8f9361be2fb3 ("selftests/bpf: __jited_x86 test tag to check x86 assembly after jit")
0156b148b5b4 ("selftests/bpf: utility function to get program disassembly after jit")

For "0156b148b5b4" I opted to do a popen() call and execute bpftool process,
an alternative would be to:
a. either link tools/bpf/bpftool/jit_disasm.c as a part of the
   test_progs executable;
b. call libbfd (binutils dis-assembler) directly from the bpftool.

Currently bpftool can use two dis-assemblers: libbfd and llvm library,
depending on the build environment. For CI builds libbfd is used.
I don't know if llvm and libbfd always produce same output for
identical binary code. Imo, if people are Ok with adding libbfd
dependency to test_progs, option (b) is the best. If folks on the
mailing list agree with this, I can work on updating the patches.
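
For reference, a popen()-based helper can be as simple as the following
(a sketch only; get_jited_disasm() and its error handling are
illustrative, not the code in the branch above):

    #include <errno.h>
    #include <stdio.h>

    /* dump the jited code of a loaded program via bpftool */
    static int get_jited_disasm(int prog_id, char *buf, size_t buf_sz)
    {
            char cmd[64];
            size_t n;
            FILE *f;

            snprintf(cmd, sizeof(cmd), "bpftool prog dump jited id %d", prog_id);
            f = popen(cmd, "r");
            if (!f)
                    return -errno;
            n = fread(buf, 1, buf_sz - 1, f);
            buf[n] = '\0';
            return pclose(f) == 0 ? 0 : -EINVAL;
    }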

-------------

Aside from testing, I agree with Andrii regarding rbp usage;
it seems like it should be possible to do the following in the prologue:

    movabs $0x...,%rsp
    add %gs:0x...,%rsp
    push %rbp

and there would be no need to modify the translation for instructions
accessing r10, plus the debugger stack unwinding logic should still work?
Or am I mistaken?

Thanks,
Eduard
Yonghong Song July 22, 2024, 4:43 p.m. UTC | #3
On 7/19/24 8:28 PM, Andrii Nakryiko wrote:
> On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> The main motivation for private stack comes from nested
>> scheduler in sched-ext from Tejun. The basic idea is that
>>   - each cgroup will its own associated bpf program,
>>   - bpf program with parent cgroup will call bpf programs
>>     in immediate child cgroups.
>>
>> Let us say we have the following cgroup hierarchy:
>>    root_cg (prog0):
>>      cg1 (prog1):
>>        cg11 (prog11):
>>          cg111 (prog111)
>>          cg112 (prog112)
>>        cg12 (prog12):
>>          cg121 (prog121)
>>          cg122 (prog122)
>>      cg2 (prog2):
>>        cg21 (prog21)
>>        cg22 (prog22)
>>        cg23 (prog23)
>>
>> In the above example, prog0 will call a kfunc which will
>> call prog1 and prog2 to get sched info for cg1 and cg2 and
>> then the information is summarized and sent back to prog0.
>> Similarly, prog11 and prog12 will be invoked in the kfunc
>> and the result will be summarized and sent back to prog1, etc.
>>
>> Currently, for each thread, the x86 kernel allocate 8KB stack.
>> The each bpf program (including its subprograms) has maximum
>> 512B stack size to avoid potential stack overflow.
>> And nested bpf programs increase the risk of stack overflow.
>> To avoid potential stack overflow caused by bpf programs,
>> this patch implemented a private stack so bpf program stack
>> space is allocated dynamically when the program is jited.
>> Such private stack is applied to tracing programs like
>> kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
>> tracing.
>>
>> But more than one instance of the same bpf program may
>> run in the system. To make things simple, percpu private
>> stack is allocated for each program, so if the same program
>> is running on different cpus concurrently, we won't have
>> any issue. Note that the kernel already have logic to prevent
>> the recursion for the same bpf program on the same cpu
>> (kprobe, fentry, etc.).
>>
>> The patch implemented a percpu private stack based approach
>> for x86 arch.
>>    - The stack size will be 0 and any stack access is from
>>      jit-time allocated percpu storage.
>>    - In the beginning of jit, r9 is used to save percpu
>>      private stack pointer.
>>    - Each rbp in the bpf asm insn is replaced by r9.
>>    - For each call, push r9 before the call and pop r9
>>      after the call to preserve r9 value.
>>
>> Compared to previous RFC patch [1], this patch added
>> some conditions to enable private stack, e.g., verifier
>> calculated stack size, prog type, etc. The new patch
>> also added a performance test to compare private stack
>> vs. no private stack.
>>
>> The following are some code example to illustrate the idea
>> for selftest cgroup_skb_sk_lookup:
>>
>>     the existing code                        the private-stack approach code
>>     endbr64                                  endbr64
>>     nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
>>     xchg   ax,ax                             xchg   ax,ax
>>     push   rbp                               push   rbp
>>     mov    rbp,rsp                           mov    rbp,rsp
>>     endbr64                                  endbr64
>>     sub    rsp,0x68
>>     push   rbx                               push   rbx
>>     ...                                      ...
>>     ...                                      mov    r9d,0x8c1c860
>>     ...                                      add    r9,QWORD PTR gs:0x21a00
>>     ...                                      ...
>>     mov    rdx,rbp                           mov    rdx, r9
>>     add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
>>     ...                                      ...
>>     mov    ecx,0x28                          mov    ecx,0x28
>>                                              push   r9
>>     call   0xffffffffe305e474                call   0xffffffffe305e524
>>                                              pop    r9
>>     mov    rdi,rax                           mov    rdi,rax
>>     ...                                      ...
>>     movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
>>     ...                                      ...
>>
> Eduard nerd-sniped me today with this a bit... :)
>
> I have a few questions and suggestions.
>
> So it seems like each *subprogram* (not the entire BPF program) gets
> its own per-CPU private stack allocation. Is that intentional? That

Currently yes. The reason is the same prog could be run on different
cpus at the same time.

> seems a bit unnecessary. It also prevents any sort of actual
> recursion. Not sure if it's possible to write recursive BPF subprogram
> today, verifier seems to reject obvious limited recursion cases, but
> still, eventually we might need/want to support that, and this will be
> just another hurdle to overcome (so it's best to avoid adding it in
> the first place).
>
> I'm sure Eduard is going to try something like below and it will
> probably break badly (I haven't tried, sorry):
>
> int entry(void *ctx);
>
> struct {
>          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>          __uint(max_entries, 1);
>          __uint(key_size, sizeof(__u32));
>          __array(values, int (void *));
> } prog_array_init SEC(".maps") = {
>          .values = {
>                  [0] = (void *)&entry,
>          },
> };
>
> static __noinline int subprog1(void)
> {
>      <some state on the stack>
>
>      /* here entry will replace subprog1, and so we'll have
>       * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
>       */
>      bpf_tail_call(ctx, &prog_array_init, 0);
>
>      return 0;
> }
>
>
> SEC("raw_tp/sys_enter")
> int entry(void *ctx)
> {
>       <some state on the stack>
>
>       subprog1();
> }
>
> And we effectively have limited recursion where the entry's stack
> state is clobbered, no?
>
> So it seems like we need to support recursion.
>
>
> So, the question I have is. Why not do the following:
> a) only setup r9 *once* in entry program's prologue (before tail call
> jump target)
> b) before each call we can adjust r9 with current prog/subprog's
> maximum *own* stack, something like:
>
> push r9;
> r9 += 128; // 128 is subprog's stack usage
> call <some-subprog>
> pop r9;
>
> The idea being that on tail call or in subprog call we assume r9 is
> already pointing to the right place. We can probably also figure out
> how to avoid push/pop r9 if we make sure that subprogram always
> restores r9 (taking tail calls into account and all that, of course)?
>
> Is this feasible?

This is possible. I actually hacked up such an idea quickly. The basic
idea is to push the frame pointer as an additional argument to the bpf
static sub-prog. This is a little bit complicated. It will probably
save some stack space, but I am not sure how much.

>
> Another question I have is whether it would be possible to just plain
> set rbp to private stack and keep using rbp in such a way that stack
> traces are preserved? I.e., save return address on private stack to
> unwinders can correctly jump back to kernel's stack?

I also tried this approach earlier. But it is very tricky: we need to
modify rbp/rsp and additional jit code will be added.
If an interrupt happens, we will not be able to get a reliable stack trace.

>
> How stupid is what I propose above?
>
>
>> So the number of insns is increased by 1 + num_of_calls * 2.
>> Here the number of calls are those calls in the final jited binary.
>> Comparing function call itself, the push/pop overhead should be
>> minimum in most common cases.
>>
>> Our original use case is for sched-ext nested scheduler. This will be done
>> in the future.
>>
>>    [1] https://lore.kernel.org/bpf/707970c5-6bba-450a-be08-adf24d8b9276@linux.dev/T/
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   arch/x86/net/bpf_jit_comp.c | 63 ++++++++++++++++++++++++++++++++++---
>>   include/linux/bpf.h         |  2 ++
>>   kernel/bpf/core.c           | 20 ++++++++++++
>>   kernel/bpf/syscall.c        |  1 +
>>   4 files changed, 82 insertions(+), 4 deletions(-)
>>
> [...]
Yonghong Song July 22, 2024, 4:54 p.m. UTC | #4
On 7/21/24 8:33 PM, Eduard Zingerman wrote:
> Hi Yonghong,
>
> In general I think that changes in this patch are logical and make sense.
> I have a suggestion regarding testing JIT related changes.
>
> We currently lack a convenient way to verify jit behaviour modulo
> runtime tests. I think we should have a capability to write tests like below:
>
>      SEC("tp")
>      __jited_x86("f:	endbr64")
>      __jited_x86("13:	movabs $0x.*,%r9")
>      __jited_x86("1d:	add    %gs:0x.*,%r9")
>      __jited_x86("26:	mov    $0x1,%edi")
>      __jited_x86("2b:	mov    %rdi,-0x8(%r9)")
>      __jited_x86("2f:	mov    -0x8(%r9),%rdi")
>      __jited_x86("33:	xor    %eax,%eax")
>      __jited_x86("35:	lock xchg %rax,-0x8(%r9)")
>      __jited_x86("3a:	lock xadd %rax,-0x8(%r9)")
>      __naked void stack_access_insns(void)
>      {
>      	asm volatile (
>      	"r1 = 1;"
>      	"*(u64 *)(r10 - 8) = r1;"
>      	"r1 = *(u64 *)(r10 - 8);"
>      	"r0 = 0;"
>      	"r0 = xchg_64(r10 - 8, r0);"
>      	"r0 = atomic_fetch_add((u64 *)(r10 - 8), r0);"
>      	"exit;"
>      	::: __clobber_all);
>      }
>
> In the following branch I explored a way to add such capability:
> https://github.com/eddyz87/bpf/tree/yhs-private-stack-plus-jit-testing
>
> Beside testing exact translation, such tests also provide good
> starting point for people trying to figure out how some jit features work.
>
> The below two commits are the gist of the feature:
> 8f9361be2fb3 ("selftests/bpf: __jited_x86 test tag to check x86 assembly after jit")
> 0156b148b5b4 ("selftests/bpf: utility function to get program disassembly after jit")
>
> For "0156b148b5b4" I opted to do a popen() call and execute bpftool process,
> an alternative would be to:
> a. either link tools/bpf/bpftool/jit_disasm.c as a part of the
>     test_progs executable;
> b. call libbfd (binutils dis-assembler) directly from the bpftool.
>
> Currently bpftool can use two dis-assemblers: libbfd and llvm library,
> depending on the build environment. For CI builds libbfd is used.
> I don't know if llvm and libbfd always produce same output for
> identical binary code. Imo, if people are Ok with adding libbfd
> dependency to test_progs, option (b) is the best. If folks on the
> mailing list agree with this, I can work on updating the patches.

I think this is a good idea in the long term.
Let me try with your patch.

>
> -------------
>
> Aside from testing I agree with Andrii regarding rbp usage,
> it seems like it should be possible to do the following in prologue:
>
>      movabs $0x...,%rsp
>      add %gs:0x...,%rsp
>      push %rbp
>
> and there would be no need to modify translation for instructions
> accessing r10, plus debugger stack unrolling logic should still work?.
> Or am I mistaken?

This may not work. The 'push %rbp' does not change the %rbp value, which is
still the original %rbp.

>
> Thanks,
> Eduard
Alexei Starovoitov July 22, 2024, 5:51 p.m. UTC | #5
On Sun, Jul 21, 2024 at 8:33 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>

> Aside from testing I agree with Andrii regarding rbp usage,
> it seems like it should be possible to do the following in prologue:
>
>     movabs $0x...,%rsp
>     add %gs:0x...,%rsp
>     push %rbp
>
> and there would be no need to modify translation for instructions
> accessing r10, plus debugger stack unrolling logic should still work?.
> Or am I mistaken?

It's not that simple.
The above sequence violates -mno-red-zone.
The part of the fix may look like:
movabs .., rax
add %gs.., rax
mov rbp, qword ptr [rax - ...]

mov rax, rsp
mov rax, rbp
sub rsp, ...

It's probably correct from the -mno-red-zone pov and
the end result is maybe correct for stack unwind,
but if an irq happens in the middle it won't crash,
yet the unwind will not work.
The main reason to use r9 is to have a valid unwind
at any point of the prog.
Eduard Zingerman July 22, 2024, 5:53 p.m. UTC | #6
On Mon, 2024-07-22 at 09:54 -0700, Yonghong Song wrote:

[...]

> > For "0156b148b5b4" I opted to do a popen() call and execute bpftool process,
> > an alternative would be to:
> > a. either link tools/bpf/bpftool/jit_disasm.c as a part of the
> >     test_progs executable;
> > b. call libbfd (binutils dis-assembler) directly from the bpftool.
> > 
> > Currently bpftool can use two dis-assemblers: libbfd and llvm library,
> > depending on the build environment. For CI builds libbfd is used.
> > I don't know if llvm and libbfd always produce same output for
> > identical binary code. Imo, if people are Ok with adding libbfd
> > dependency to test_progs, option (b) is the best. If folks on the
> > mailing list agree with this, I can work on updating the patches.
> 
> I think this is a good idea in the long time.
> Let me try with your patch.

What do you think about direct dependency on libbfd for test_progs,
should I update the disassembly function or popen'ing bpftool is fine?
I'd prefer libbfd dependency, tbh.

[...]
Eduard Zingerman July 22, 2024, 6:22 p.m. UTC | #7
On Mon, 2024-07-22 at 10:51 -0700, Alexei Starovoitov wrote:

[...]

> It's not that simple.
> Above sequence violates -mno-red-zone.
> The part of the fix may look like:
> movabs .., rax
> add %gs.., rax
> mov rbp, qword ptr [rax - ...]
> 
> mov rax, rsp
> mox rax, rbp
> sub rsp, ...
> 
> it's probably correct from mno-red-zone pov and
> end result is maybe correct for stack unwind,
> but if irq happens in the middle it won't crash,
> but unwind will not work.
> The main reason to use r9 is to have valid unwind
> at any point of the prog.

Oh, I see, bad things would happen if this sequence:

      movabs $0x...,%rsp
      add %gs:0x...,%rsp

would be split by an interrupt.
However, I don't understand why 'push %rbp' violates the red zone.
In any case, the interrupt argument is sufficient,
thank you for explaining.
Alexei Starovoitov July 22, 2024, 8:08 p.m. UTC | #8
On Mon, Jul 22, 2024 at 11:22 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Mon, 2024-07-22 at 10:51 -0700, Alexei Starovoitov wrote:
>
> [...]
>
> > It's not that simple.
> > Above sequence violates -mno-red-zone.
> > The part of the fix may look like:
> > movabs .., rax
> > add %gs.., rax
> > mov rbp, qword ptr [rax - ...]
> >
> > mov rax, rsp
> > mox rax, rbp
> > sub rsp, ...
> >
> > it's probably correct from mno-red-zone pov and
> > end result is maybe correct for stack unwind,
> > but if irq happens in the middle it won't crash,
> > but unwind will not work.
> > The main reason to use r9 is to have valid unwind
> > at any point of the prog.
>
> Oh, I see, bad things would happen if this sequence:
>
>       movabs $0x...,%rsp
>       add %gs:0x...,%rsp
>
> Would be split by an interrupt.
> However, I don't understand why 'push %rbp' violates red zone.
> In any case, the interrupt argument is sufficient,
> thank you for explaining.

'push rbp' itself doesn't break the red zone.
If we do:
movabs .., rax
add %gs.., rax
mov rax, rsp
push rbp


at the time of setting rsp the unwind is broken.
Hence the idea to mov rbp, qword ptr [rax - ...]
before setting rsp.
We have to maintain an unwindable stack at all times.
But if we overwrite rsp, it means everything will go into
this new memory.
We'd need another 16k of space in there for everything
that the bpf prog can call, since kernel code will be using
that area from the moment of the switch.
At the end we'd have to restore the original stack somehow.
Instead of a single 'leave' insn, the sequence has to preserve
unwinding. It all looks very tricky and fragile.
Andrii Nakryiko July 22, 2024, 8:57 p.m. UTC | #9
On Fri, Jul 19, 2024 at 8:28 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >
> > The main motivation for private stack comes from nested
> > scheduler in sched-ext from Tejun. The basic idea is that
> >  - each cgroup will its own associated bpf program,
> >  - bpf program with parent cgroup will call bpf programs
> >    in immediate child cgroups.
> >
> > Let us say we have the following cgroup hierarchy:
> >   root_cg (prog0):
> >     cg1 (prog1):
> >       cg11 (prog11):
> >         cg111 (prog111)
> >         cg112 (prog112)
> >       cg12 (prog12):
> >         cg121 (prog121)
> >         cg122 (prog122)
> >     cg2 (prog2):
> >       cg21 (prog21)
> >       cg22 (prog22)
> >       cg23 (prog23)
> >
> > In the above example, prog0 will call a kfunc which will
> > call prog1 and prog2 to get sched info for cg1 and cg2 and
> > then the information is summarized and sent back to prog0.
> > Similarly, prog11 and prog12 will be invoked in the kfunc
> > and the result will be summarized and sent back to prog1, etc.
> >
> > Currently, for each thread, the x86 kernel allocate 8KB stack.
> > The each bpf program (including its subprograms) has maximum
> > 512B stack size to avoid potential stack overflow.
> > And nested bpf programs increase the risk of stack overflow.
> > To avoid potential stack overflow caused by bpf programs,
> > this patch implemented a private stack so bpf program stack
> > space is allocated dynamically when the program is jited.
> > Such private stack is applied to tracing programs like
> > kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
> > tracing.
> >
> > But more than one instance of the same bpf program may
> > run in the system. To make things simple, percpu private
> > stack is allocated for each program, so if the same program
> > is running on different cpus concurrently, we won't have
> > any issue. Note that the kernel already have logic to prevent
> > the recursion for the same bpf program on the same cpu
> > (kprobe, fentry, etc.).
> >
> > The patch implemented a percpu private stack based approach
> > for x86 arch.
> >   - The stack size will be 0 and any stack access is from
> >     jit-time allocated percpu storage.
> >   - In the beginning of jit, r9 is used to save percpu
> >     private stack pointer.
> >   - Each rbp in the bpf asm insn is replaced by r9.
> >   - For each call, push r9 before the call and pop r9
> >     after the call to preserve r9 value.
> >
> > Compared to previous RFC patch [1], this patch added
> > some conditions to enable private stack, e.g., verifier
> > calculated stack size, prog type, etc. The new patch
> > also added a performance test to compare private stack
> > vs. no private stack.
> >
> > The following are some code example to illustrate the idea
> > for selftest cgroup_skb_sk_lookup:
> >
> >    the existing code                        the private-stack approach code
> >    endbr64                                  endbr64
> >    nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
> >    xchg   ax,ax                             xchg   ax,ax
> >    push   rbp                               push   rbp
> >    mov    rbp,rsp                           mov    rbp,rsp
> >    endbr64                                  endbr64
> >    sub    rsp,0x68
> >    push   rbx                               push   rbx
> >    ...                                      ...
> >    ...                                      mov    r9d,0x8c1c860
> >    ...                                      add    r9,QWORD PTR gs:0x21a00
> >    ...                                      ...
> >    mov    rdx,rbp                           mov    rdx, r9
> >    add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
> >    ...                                      ...
> >    mov    ecx,0x28                          mov    ecx,0x28
> >                                             push   r9
> >    call   0xffffffffe305e474                call   0xffffffffe305e524
> >                                             pop    r9
> >    mov    rdi,rax                           mov    rdi,rax
> >    ...                                      ...
> >    movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
> >    ...                                      ...
> >
>
> Eduard nerd-sniped me today with this a bit... :)
>
> I have a few questions and suggestions.
>
> So it seems like each *subprogram* (not the entire BPF program) gets
> its own per-CPU private stack allocation. Is that intentional? That
> seems a bit unnecessary. It also prevents any sort of actual
> recursion. Not sure if it's possible to write recursive BPF subprogram
> today, verifier seems to reject obvious limited recursion cases, but
> still, eventually we might need/want to support that, and this will be
> just another hurdle to overcome (so it's best to avoid adding it in
> the first place).
>
> I'm sure Eduard is going to try something like below and it will
> probably break badly (I haven't tried, sorry):
>
> int entry(void *ctx);
>
> struct {
>         __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>         __uint(max_entries, 1);
>         __uint(key_size, sizeof(__u32));
>         __array(values, int (void *));
> } prog_array_init SEC(".maps") = {
>         .values = {
>                 [0] = (void *)&entry,
>         },
> };
>
> static __noinline int subprog1(void)
> {
>     <some state on the stack>
>
>     /* here entry will replace subprog1, and so we'll have
>      * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
>      */
>     bpf_tail_call(ctx, &prog_array_init, 0);
>
>     return 0;
> }
>
>
> SEC("raw_tp/sys_enter")
> int entry(void *ctx)
> {
>      <some state on the stack>
>
>      subprog1();
> }
>
> And we effectively have limited recursion where the entry's stack
> state is clobbered, no?
>
> So it seems like we need to support recursion.
>

How come everyone just completely ignored the main point of my entire
email and a real problem that has to be solved?...

Anyway, I did write the program below:

$ cat minimal.bpf.c
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

int my_pid = 0;

int handle_tp(void *ctx);

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 1);
        __uint(key_size, sizeof(__u32));
        __array(values, int (void *));
} prog_array_init SEC(".maps") = {
        .values = {
                [0] = (void *)&handle_tp,
        },
};

static __noinline int subprog(void *ctx)
{
        static int cnt;

        cnt++;

        bpf_printk("SUBPROG - BEFORE %d", cnt);

        bpf_tail_call(ctx, &prog_array_init, 0);

        bpf_printk("SUBPROG - AFTER %d", cnt);

        return 0;
}

SEC("tp/syscalls/sys_enter_write")
int handle_tp(void *ctx)
{
        static int cnt;
        int pid = bpf_get_current_pid_tgid() >> 32;

        if (pid != my_pid)
                return 0;

        cnt++;

        bpf_printk("ENTRY - BEFORE %d", cnt);

        subprog(ctx);

        bpf_printk("ENTRY - AFTER %d", cnt);

        return 0;
}


And I triggered one write syscall, getting the log below. You can see
that only the subprog is replaced (we only get "SUBPROG - AFTER 34" due
to the tail call limit). And we do indeed get lots of entry program
recursion.

My main point is that we *need to support recursion*. rbp is a distraction, sorry.

$ sudo cat /sys/kernel/tracing/trace_pipe
         minimal-1219321 [025] ....1 8119446.322300: bpf_trace_printk:
ENTRY - BEFORE 1
         minimal-1219321 [025] ....1 8119446.322303: bpf_trace_printk:
SUBPROG - BEFORE 1
         minimal-1219321 [025] ....1 8119446.322304: bpf_trace_printk:
ENTRY - BEFORE 2
         minimal-1219321 [025] ....1 8119446.322304: bpf_trace_printk:
SUBPROG - BEFORE 2
         minimal-1219321 [025] ....1 8119446.322304: bpf_trace_printk:
ENTRY - BEFORE 3
         minimal-1219321 [025] ....1 8119446.322305: bpf_trace_printk:
SUBPROG - BEFORE 3
         minimal-1219321 [025] ....1 8119446.322305: bpf_trace_printk:
ENTRY - BEFORE 4
         minimal-1219321 [025] ....1 8119446.322305: bpf_trace_printk:
SUBPROG - BEFORE 4
         minimal-1219321 [025] ....1 8119446.322305: bpf_trace_printk:
ENTRY - BEFORE 5
         minimal-1219321 [025] ....1 8119446.322306: bpf_trace_printk:
SUBPROG - BEFORE 5
         minimal-1219321 [025] ....1 8119446.322306: bpf_trace_printk:
ENTRY - BEFORE 6
         minimal-1219321 [025] ....1 8119446.322306: bpf_trace_printk:
SUBPROG - BEFORE 6
         minimal-1219321 [025] ....1 8119446.322307: bpf_trace_printk:
ENTRY - BEFORE 7
         minimal-1219321 [025] ....1 8119446.322307: bpf_trace_printk:
SUBPROG - BEFORE 7
         minimal-1219321 [025] ....1 8119446.322307: bpf_trace_printk:
ENTRY - BEFORE 8
         minimal-1219321 [025] ....1 8119446.322307: bpf_trace_printk:
SUBPROG - BEFORE 8
         minimal-1219321 [025] ....1 8119446.322308: bpf_trace_printk:
ENTRY - BEFORE 9
         minimal-1219321 [025] ....1 8119446.322308: bpf_trace_printk:
SUBPROG - BEFORE 9
         minimal-1219321 [025] ....1 8119446.322308: bpf_trace_printk:
ENTRY - BEFORE 10
         minimal-1219321 [025] ....1 8119446.322308: bpf_trace_printk:
SUBPROG - BEFORE 10
         minimal-1219321 [025] ....1 8119446.322309: bpf_trace_printk:
ENTRY - BEFORE 11
         minimal-1219321 [025] ....1 8119446.322309: bpf_trace_printk:
SUBPROG - BEFORE 11
         minimal-1219321 [025] ....1 8119446.322309: bpf_trace_printk:
ENTRY - BEFORE 12
         minimal-1219321 [025] ....1 8119446.322309: bpf_trace_printk:
SUBPROG - BEFORE 12
         minimal-1219321 [025] ....1 8119446.322310: bpf_trace_printk:
ENTRY - BEFORE 13
         minimal-1219321 [025] ....1 8119446.322310: bpf_trace_printk:
SUBPROG - BEFORE 13
         minimal-1219321 [025] ....1 8119446.322310: bpf_trace_printk:
ENTRY - BEFORE 14
         minimal-1219321 [025] ....1 8119446.322312: bpf_trace_printk:
SUBPROG - BEFORE 14
         minimal-1219321 [025] ....1 8119446.322313: bpf_trace_printk:
ENTRY - BEFORE 15
         minimal-1219321 [025] ....1 8119446.322313: bpf_trace_printk:
SUBPROG - BEFORE 15
         minimal-1219321 [025] ....1 8119446.322313: bpf_trace_printk:
ENTRY - BEFORE 16
         minimal-1219321 [025] ....1 8119446.322313: bpf_trace_printk:
SUBPROG - BEFORE 16
         minimal-1219321 [025] ....1 8119446.322314: bpf_trace_printk:
ENTRY - BEFORE 17
         minimal-1219321 [025] ....1 8119446.322314: bpf_trace_printk:
SUBPROG - BEFORE 17
         minimal-1219321 [025] ....1 8119446.322314: bpf_trace_printk:
ENTRY - BEFORE 18
         minimal-1219321 [025] ....1 8119446.322314: bpf_trace_printk:
SUBPROG - BEFORE 18
         minimal-1219321 [025] ....1 8119446.322315: bpf_trace_printk:
ENTRY - BEFORE 19
         minimal-1219321 [025] ....1 8119446.322315: bpf_trace_printk:
SUBPROG - BEFORE 19
         minimal-1219321 [025] ....1 8119446.322315: bpf_trace_printk:
ENTRY - BEFORE 20
         minimal-1219321 [025] ....1 8119446.322315: bpf_trace_printk:
SUBPROG - BEFORE 20
         minimal-1219321 [025] ....1 8119446.322316: bpf_trace_printk:
ENTRY - BEFORE 21
         minimal-1219321 [025] ....1 8119446.322316: bpf_trace_printk:
SUBPROG - BEFORE 21
         minimal-1219321 [025] ....1 8119446.322316: bpf_trace_printk:
ENTRY - BEFORE 22
         minimal-1219321 [025] ....1 8119446.322316: bpf_trace_printk:
SUBPROG - BEFORE 22
         minimal-1219321 [025] ....1 8119446.322316: bpf_trace_printk:
ENTRY - BEFORE 23
         minimal-1219321 [025] ....1 8119446.322317: bpf_trace_printk:
SUBPROG - BEFORE 23
         minimal-1219321 [025] ....1 8119446.322318: bpf_trace_printk:
ENTRY - BEFORE 24
         minimal-1219321 [025] ....1 8119446.322318: bpf_trace_printk:
SUBPROG - BEFORE 24
         minimal-1219321 [025] ....1 8119446.322319: bpf_trace_printk:
ENTRY - BEFORE 25
         minimal-1219321 [025] ....1 8119446.322319: bpf_trace_printk:
SUBPROG - BEFORE 25
         minimal-1219321 [025] ....1 8119446.322319: bpf_trace_printk:
ENTRY - BEFORE 26
         minimal-1219321 [025] ....1 8119446.322320: bpf_trace_printk:
SUBPROG - BEFORE 26
         minimal-1219321 [025] ....1 8119446.322321: bpf_trace_printk:
ENTRY - BEFORE 27
         minimal-1219321 [025] ....1 8119446.322321: bpf_trace_printk:
SUBPROG - BEFORE 27
         minimal-1219321 [025] ....1 8119446.322321: bpf_trace_printk:
ENTRY - BEFORE 28
         minimal-1219321 [025] ....1 8119446.322322: bpf_trace_printk:
SUBPROG - BEFORE 28
         minimal-1219321 [025] ....1 8119446.322322: bpf_trace_printk:
ENTRY - BEFORE 29
         minimal-1219321 [025] ....1 8119446.322323: bpf_trace_printk:
SUBPROG - BEFORE 29
         minimal-1219321 [025] ....1 8119446.322323: bpf_trace_printk:
ENTRY - BEFORE 30
         minimal-1219321 [025] ....1 8119446.322324: bpf_trace_printk:
SUBPROG - BEFORE 30
         minimal-1219321 [025] ....1 8119446.322324: bpf_trace_printk:
ENTRY - BEFORE 31
         minimal-1219321 [025] ....1 8119446.322324: bpf_trace_printk:
SUBPROG - BEFORE 31
         minimal-1219321 [025] ....1 8119446.322324: bpf_trace_printk:
ENTRY - BEFORE 32
         minimal-1219321 [025] ....1 8119446.322325: bpf_trace_printk:
SUBPROG - BEFORE 32
         minimal-1219321 [025] ....1 8119446.322325: bpf_trace_printk:
ENTRY - BEFORE 33
         minimal-1219321 [025] ....1 8119446.322326: bpf_trace_printk:
SUBPROG - BEFORE 33
         minimal-1219321 [025] ....1 8119446.322327: bpf_trace_printk:
ENTRY - BEFORE 34
         minimal-1219321 [025] ....1 8119446.322327: bpf_trace_printk:
SUBPROG - BEFORE 34
         minimal-1219321 [025] ....1 8119446.322327: bpf_trace_printk:
SUBPROG - AFTER 34
         minimal-1219321 [025] ....1 8119446.322328: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322328: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322328: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322328: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322329: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322329: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322329: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322329: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322329: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322330: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322330: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322330: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322330: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322331: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322331: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322331: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322331: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322331: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322332: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322332: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322332: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322332: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322332: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322333: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322333: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322333: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322333: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322334: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322334: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322334: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322334: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322334: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322335: bpf_trace_printk:
ENTRY - AFTER 34
         minimal-1219321 [025] ....1 8119446.322335: bpf_trace_printk:
ENTRY - AFTER 34


>
> So, the question I have is. Why not do the following:
> a) only setup r9 *once* in entry program's prologue (before tail call
> jump target)
> b) before each call we can adjust r9 with current prog/subprog's
> maximum *own* stack, something like:
>
> push r9;
> r9 += 128; // 128 is subprog's stack usage
> call <some-subprog>
> pop r9;
>
> The idea being that on tail call or in subprog call we assume r9 is
> already pointing to the right place. We can probably also figure out
> how to avoid push/pop r9 if we make sure that subprogram always
> restores r9 (taking tail calls into account and all that, of course)?
>
> Is this feasible?
>
> Another question I have is whether it would be possible to just plain
> set rbp to private stack and keep using rbp in such a way that stack
> traces are preserved? I.e., save return address on private stack to
> unwinders can correctly jump back to kernel's stack?
>
> How stupid is what I propose above?
>
>
> > So the number of insns is increased by 1 + num_of_calls * 2.
> > Here the number of calls are those calls in the final jited binary.
> > Comparing function call itself, the push/pop overhead should be
> > minimum in most common cases.
> >
> > Our original use case is for sched-ext nested scheduler. This will be done
> > in the future.
> >
> >   [1] https://lore.kernel.org/bpf/707970c5-6bba-450a-be08-adf24d8b9276@linux.dev/T/
> >
> > Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> > ---
> >  arch/x86/net/bpf_jit_comp.c | 63 ++++++++++++++++++++++++++++++++++---
> >  include/linux/bpf.h         |  2 ++
> >  kernel/bpf/core.c           | 20 ++++++++++++
> >  kernel/bpf/syscall.c        |  1 +
> >  4 files changed, 82 insertions(+), 4 deletions(-)
> >
>
> [...]
Alexei Starovoitov July 23, 2024, 1:05 a.m. UTC | #10
On Mon, Jul 22, 2024 at 1:58 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Jul 19, 2024 at 8:28 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> > >
> > > The main motivation for private stack comes from nested
> > > scheduler in sched-ext from Tejun. The basic idea is that
> > >  - each cgroup will its own associated bpf program,
> > >  - bpf program with parent cgroup will call bpf programs
> > >    in immediate child cgroups.
> > >
> > > Let us say we have the following cgroup hierarchy:
> > >   root_cg (prog0):
> > >     cg1 (prog1):
> > >       cg11 (prog11):
> > >         cg111 (prog111)
> > >         cg112 (prog112)
> > >       cg12 (prog12):
> > >         cg121 (prog121)
> > >         cg122 (prog122)
> > >     cg2 (prog2):
> > >       cg21 (prog21)
> > >       cg22 (prog22)
> > >       cg23 (prog23)
> > >
> > > In the above example, prog0 will call a kfunc which will
> > > call prog1 and prog2 to get sched info for cg1 and cg2 and
> > > then the information is summarized and sent back to prog0.
> > > Similarly, prog11 and prog12 will be invoked in the kfunc
> > > and the result will be summarized and sent back to prog1, etc.
> > >
> > > Currently, for each thread, the x86 kernel allocate 8KB stack.
> > > The each bpf program (including its subprograms) has maximum
> > > 512B stack size to avoid potential stack overflow.
> > > And nested bpf programs increase the risk of stack overflow.
> > > To avoid potential stack overflow caused by bpf programs,
> > > this patch implemented a private stack so bpf program stack
> > > space is allocated dynamically when the program is jited.
> > > Such private stack is applied to tracing programs like
> > > kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
> > > tracing.
> > >
> > > But more than one instance of the same bpf program may
> > > run in the system. To make things simple, percpu private
> > > stack is allocated for each program, so if the same program
> > > is running on different cpus concurrently, we won't have
> > > any issue. Note that the kernel already have logic to prevent
> > > the recursion for the same bpf program on the same cpu
> > > (kprobe, fentry, etc.).
> > >
> > > The patch implemented a percpu private stack based approach
> > > for x86 arch.
> > >   - The stack size will be 0 and any stack access is from
> > >     jit-time allocated percpu storage.
> > >   - In the beginning of jit, r9 is used to save percpu
> > >     private stack pointer.
> > >   - Each rbp in the bpf asm insn is replaced by r9.
> > >   - For each call, push r9 before the call and pop r9
> > >     after the call to preserve r9 value.
> > >
> > > Compared to previous RFC patch [1], this patch added
> > > some conditions to enable private stack, e.g., verifier
> > > calculated stack size, prog type, etc. The new patch
> > > also added a performance test to compare private stack
> > > vs. no private stack.
> > >
> > > The following are some code example to illustrate the idea
> > > for selftest cgroup_skb_sk_lookup:
> > >
> > >    the existing code                        the private-stack approach code
> > >    endbr64                                  endbr64
> > >    nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
> > >    xchg   ax,ax                             xchg   ax,ax
> > >    push   rbp                               push   rbp
> > >    mov    rbp,rsp                           mov    rbp,rsp
> > >    endbr64                                  endbr64
> > >    sub    rsp,0x68
> > >    push   rbx                               push   rbx
> > >    ...                                      ...
> > >    ...                                      mov    r9d,0x8c1c860
> > >    ...                                      add    r9,QWORD PTR gs:0x21a00
> > >    ...                                      ...
> > >    mov    rdx,rbp                           mov    rdx, r9
> > >    add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
> > >    ...                                      ...
> > >    mov    ecx,0x28                          mov    ecx,0x28
> > >                                             push   r9
> > >    call   0xffffffffe305e474                call   0xffffffffe305e524
> > >                                             pop    r9
> > >    mov    rdi,rax                           mov    rdi,rax
> > >    ...                                      ...
> > >    movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
> > >    ...                                      ...
> > >
> >
> > Eduard nerd-sniped me today with this a bit... :)
> >
> > I have a few questions and suggestions.
> >
> > So it seems like each *subprogram* (not the entire BPF program) gets
> > its own per-CPU private stack allocation. Is that intentional? That
> > seems a bit unnecessary. It also prevents any sort of actual
> > recursion. Not sure if it's possible to write recursive BPF subprogram
> > today, verifier seems to reject obvious limited recursion cases, but
> > still, eventually we might need/want to support that, and this will be
> > just another hurdle to overcome (so it's best to avoid adding it in
> > the first place).
> >
> > I'm sure Eduard is going to try something like below and it will
> > probably break badly (I haven't tried, sorry):
> >
> > int entry(void *ctx);
> >
> > struct {
> >         __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
> >         __uint(max_entries, 1);
> >         __uint(key_size, sizeof(__u32));
> >         __array(values, int (void *));
> > } prog_array_init SEC(".maps") = {
> >         .values = {
> >                 [0] = (void *)&entry,
> >         },
> > };
> >
> > static __noinline int subprog1(void)
> > {
> >     <some state on the stack>
> >
> >     /* here entry will replace subprog1, and so we'll have
> >      * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
> >      */
> >     bpf_tail_call(ctx, &prog_array_init, 0);
> >
> >     return 0;
> > }
> >
> >
> > SEC("raw_tp/sys_enter")
> > int entry(void *ctx)
> > {
> >      <some state on the stack>
> >
> >      subprog1();
> > }
> >
> > And we effectively have limited recursion where the entry's stack
> > state is clobbered, no?
> >
> > So it seems like we need to support recursion.
> >
>
> How come everyone just completely ignored the main point of my entire
> email and a real problem that has to be solved?...
>
> Anyways, I did write a below program:
>
> $ cat minimal.bpf.c
> // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
> /* Copyright (c) 2020 Facebook */
> #include <linux/bpf.h>
> #include <bpf/bpf_helpers.h>
>
> char LICENSE[] SEC("license") = "Dual BSD/GPL";
>
> int my_pid = 0;
>
> int handle_tp(void *ctx);
>
> struct {
>         __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>         __uint(max_entries, 1);
>         __uint(key_size, sizeof(__u32));
>         __array(values, int (void *));
> } prog_array_init SEC(".maps") = {
>         .values = {
>                 [0] = (void *)&handle_tp,
>         },
> };
>
> static __noinline int subprog(void *ctx)
> {
>         static int cnt;
>
>         cnt++;
>
>         bpf_printk("SUBPROG - BEFORE %d", cnt);
>
>         bpf_tail_call(ctx, &prog_array_init, 0);
>
>         bpf_printk("SUBPROG - AFTER %d", cnt);
>
>     return 0;
> }
>
> SEC("tp/syscalls/sys_enter_write")
> int handle_tp(void *ctx)
> {
>         static int cnt;
>         int pid = bpf_get_current_pid_tgid() >> 32;
>
>         if (pid != my_pid)
>                 return 0;
>
>         cnt++;
>
>         bpf_printk("ENTRY - BEFORE %d", cnt);
>
>         subprog(ctx);
>
>         bpf_printk("ENTRY - AFTER %d", cnt);
>
>         return 0;
> }
>
>
> And triggered one write syscall, getting the log above. You can see
> that only subprogs are replaced (we only get "SUBPROG - AFTER 34" due
> to the tail call limit). And we do indeed get lots of entry program
> recurrence.
>
> We *need to support recursion* is my main point.

Not quite.
It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
static int cnt counts stuff because it's static.

So we don't need to support recursion with private stack,
but tail_calls and private stack are buggy indeed.

emit_bpf_tail_call*() shouldn't be adjusting 'rsp' when the private
stack is used.
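
As a rough sketch of that point (illustrative only, not the actual
arch/x86/net/bpf_jit_comp.c code): a program on a private stack never
subtracted its BPF stack from rsp in the prologue, so the tail-call path
has nothing to undo, i.e. the fixup amount has to be forced to 0:

#include <stdint.h>

/* Illustrative helper: how much to add back to rsp before jumping to the
 * tail-call target. With a private stack the prologue did no
 * "sub rsp, stack_depth", so the answer is 0.
 */
static uint32_t tail_call_rsp_fixup(uint32_t stack_depth, int uses_priv_stack)
{
        return uses_priv_stack ? 0 : stack_depth;
}

(The follow-ups below note that the current patch already passes
stack_depth == 0 to the tail-call emitters when the private stack is used.)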
Andrii Nakryiko July 23, 2024, 3:26 a.m. UTC | #11
On Mon, Jul 22, 2024 at 6:06 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 22, 2024 at 1:58 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Jul 19, 2024 at 8:28 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> > > >
> > > > The main motivation for private stack comes from nested
> > > > scheduler in sched-ext from Tejun. The basic idea is that
> > > >  - each cgroup will its own associated bpf program,
> > > >  - bpf program with parent cgroup will call bpf programs
> > > >    in immediate child cgroups.
> > > >
> > > > Let us say we have the following cgroup hierarchy:
> > > >   root_cg (prog0):
> > > >     cg1 (prog1):
> > > >       cg11 (prog11):
> > > >         cg111 (prog111)
> > > >         cg112 (prog112)
> > > >       cg12 (prog12):
> > > >         cg121 (prog121)
> > > >         cg122 (prog122)
> > > >     cg2 (prog2):
> > > >       cg21 (prog21)
> > > >       cg22 (prog22)
> > > >       cg23 (prog23)
> > > >
> > > > In the above example, prog0 will call a kfunc which will
> > > > call prog1 and prog2 to get sched info for cg1 and cg2 and
> > > > then the information is summarized and sent back to prog0.
> > > > Similarly, prog11 and prog12 will be invoked in the kfunc
> > > > and the result will be summarized and sent back to prog1, etc.
> > > >
> > > > Currently, for each thread, the x86 kernel allocate 8KB stack.
> > > > The each bpf program (including its subprograms) has maximum
> > > > 512B stack size to avoid potential stack overflow.
> > > > And nested bpf programs increase the risk of stack overflow.
> > > > To avoid potential stack overflow caused by bpf programs,
> > > > this patch implemented a private stack so bpf program stack
> > > > space is allocated dynamically when the program is jited.
> > > > Such private stack is applied to tracing programs like
> > > > kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
> > > > tracing.
> > > >
> > > > But more than one instance of the same bpf program may
> > > > run in the system. To make things simple, percpu private
> > > > stack is allocated for each program, so if the same program
> > > > is running on different cpus concurrently, we won't have
> > > > any issue. Note that the kernel already have logic to prevent
> > > > the recursion for the same bpf program on the same cpu
> > > > (kprobe, fentry, etc.).
> > > >
> > > > The patch implemented a percpu private stack based approach
> > > > for x86 arch.
> > > >   - The stack size will be 0 and any stack access is from
> > > >     jit-time allocated percpu storage.
> > > >   - In the beginning of jit, r9 is used to save percpu
> > > >     private stack pointer.
> > > >   - Each rbp in the bpf asm insn is replaced by r9.
> > > >   - For each call, push r9 before the call and pop r9
> > > >     after the call to preserve r9 value.
> > > >
> > > > Compared to previous RFC patch [1], this patch added
> > > > some conditions to enable private stack, e.g., verifier
> > > > calculated stack size, prog type, etc. The new patch
> > > > also added a performance test to compare private stack
> > > > vs. no private stack.
> > > >
> > > > The following are some code example to illustrate the idea
> > > > for selftest cgroup_skb_sk_lookup:
> > > >
> > > >    the existing code                        the private-stack approach code
> > > >    endbr64                                  endbr64
> > > >    nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
> > > >    xchg   ax,ax                             xchg   ax,ax
> > > >    push   rbp                               push   rbp
> > > >    mov    rbp,rsp                           mov    rbp,rsp
> > > >    endbr64                                  endbr64
> > > >    sub    rsp,0x68
> > > >    push   rbx                               push   rbx
> > > >    ...                                      ...
> > > >    ...                                      mov    r9d,0x8c1c860
> > > >    ...                                      add    r9,QWORD PTR gs:0x21a00
> > > >    ...                                      ...
> > > >    mov    rdx,rbp                           mov    rdx, r9
> > > >    add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
> > > >    ...                                      ...
> > > >    mov    ecx,0x28                          mov    ecx,0x28
> > > >                                             push   r9
> > > >    call   0xffffffffe305e474                call   0xffffffffe305e524
> > > >                                             pop    r9
> > > >    mov    rdi,rax                           mov    rdi,rax
> > > >    ...                                      ...
> > > >    movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
> > > >    ...                                      ...
> > > >
> > >
> > > Eduard nerd-sniped me today with this a bit... :)
> > >
> > > I have a few questions and suggestions.
> > >
> > > So it seems like each *subprogram* (not the entire BPF program) gets
> > > its own per-CPU private stack allocation. Is that intentional? That
> > > seems a bit unnecessary. It also prevents any sort of actual
> > > recursion. Not sure if it's possible to write recursive BPF subprogram
> > > today, verifier seems to reject obvious limited recursion cases, but
> > > still, eventually we might need/want to support that, and this will be
> > > just another hurdle to overcome (so it's best to avoid adding it in
> > > the first place).
> > >
> > > I'm sure Eduard is going to try something like below and it will
> > > probably break badly (I haven't tried, sorry):
> > >
> > > int entry(void *ctx);
> > >
> > > struct {
> > >         __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
> > >         __uint(max_entries, 1);
> > >         __uint(key_size, sizeof(__u32));
> > >         __array(values, int (void *));
> > > } prog_array_init SEC(".maps") = {
> > >         .values = {
> > >                 [0] = (void *)&entry,
> > >         },
> > > };
> > >
> > > static __noinline int subprog1(void)
> > > {
> > >     <some state on the stack>
> > >
> > >     /* here entry will replace subprog1, and so we'll have
> > >      * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
> > >      */
> > >     bpf_tail_call(ctx, &prog_array_init, 0);
> > >
> > >     return 0;
> > > }
> > >
> > >
> > > SEC("raw_tp/sys_enter")
> > > int entry(void *ctx)
> > > {
> > >      <some state on the stack>
> > >
> > >      subprog1();
> > > }
> > >
> > > And we effectively have limited recursion where the entry's stack
> > > state is clobbered, no?
> > >
> > > So it seems like we need to support recursion.
> > >
> >
> > How come everyone just completely ignored the main point of my entire
> > email and a real problem that has to be solved?...
> >
> > Anyways, I did write a below program:
> >
> > $ cat minimal.bpf.c
> > // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
> > /* Copyright (c) 2020 Facebook */
> > #include <linux/bpf.h>
> > #include <bpf/bpf_helpers.h>
> >
> > char LICENSE[] SEC("license") = "Dual BSD/GPL";
> >
> > int my_pid = 0;
> >
> > int handle_tp(void *ctx);
> >
> > struct {
> >         __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
> >         __uint(max_entries, 1);
> >         __uint(key_size, sizeof(__u32));
> >         __array(values, int (void *));
> > } prog_array_init SEC(".maps") = {
> >         .values = {
> >                 [0] = (void *)&handle_tp,
> >         },
> > };
> >
> > static __noinline int subprog(void *ctx)
> > {
> >         static int cnt;
> >
> >         cnt++;
> >
> >         bpf_printk("SUBPROG - BEFORE %d", cnt);
> >
> >         bpf_tail_call(ctx, &prog_array_init, 0);
> >
> >         bpf_printk("SUBPROG - AFTER %d", cnt);
> >
> >     return 0;
> > }
> >
> > SEC("tp/syscalls/sys_enter_write")
> > int handle_tp(void *ctx)
> > {
> >         static int cnt;
> >         int pid = bpf_get_current_pid_tgid() >> 32;
> >
> >         if (pid != my_pid)
> >                 return 0;
> >
> >         cnt++;
> >
> >         bpf_printk("ENTRY - BEFORE %d", cnt);
> >
> >         subprog(ctx);
> >
> >         bpf_printk("ENTRY - AFTER %d", cnt);
> >
> >         return 0;
> > }
> >
> >
> > And triggered one write syscall, getting the log above. You can see
> > that only subprogs are replaced (we only get "SUBPROG - AFTER 34" due
> > to the tail call limit). And we do indeed get lots of entry program
> > recurrence.
> >
> > We *need to support recursion* is my main point.
>
> Not quite.
> It's not a recursion. The stack collapsed/gone/wiped out before tail_call.

Only of subprog(), not of handle_tp(). See all those "ENTRY - AFTER"
messages. We do return to all the nested handle_tp() calls and
continue just fine.

I put the log into [0] for a bit easier visual inspection.

  [0] https://gist.github.com/anakryiko/6ccdfc62188f8ad4991641fb637d954c

> static int cnt counts stuff because it's static.
>
> So we don't need to support recursion with private stack,
> but tail_calls and private stack are buggy indeed.
>
> emit_bpf_tail_call*() shouldn't be adjusting 'rsp' when the private
> stack is used.
Yonghong Song July 23, 2024, 5:30 a.m. UTC | #12
On 7/22/24 6:05 PM, Alexei Starovoitov wrote:
> On Mon, Jul 22, 2024 at 1:58 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
>> On Fri, Jul 19, 2024 at 8:28 PM Andrii Nakryiko
>> <andrii.nakryiko@gmail.com> wrote:
>>> On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>> The main motivation for private stack comes from nested
>>>> scheduler in sched-ext from Tejun. The basic idea is that
>>>>   - each cgroup will its own associated bpf program,
>>>>   - bpf program with parent cgroup will call bpf programs
>>>>     in immediate child cgroups.
>>>>
>>>> Let us say we have the following cgroup hierarchy:
>>>>    root_cg (prog0):
>>>>      cg1 (prog1):
>>>>        cg11 (prog11):
>>>>          cg111 (prog111)
>>>>          cg112 (prog112)
>>>>        cg12 (prog12):
>>>>          cg121 (prog121)
>>>>          cg122 (prog122)
>>>>      cg2 (prog2):
>>>>        cg21 (prog21)
>>>>        cg22 (prog22)
>>>>        cg23 (prog23)
>>>>
>>>> In the above example, prog0 will call a kfunc which will
>>>> call prog1 and prog2 to get sched info for cg1 and cg2 and
>>>> then the information is summarized and sent back to prog0.
>>>> Similarly, prog11 and prog12 will be invoked in the kfunc
>>>> and the result will be summarized and sent back to prog1, etc.
>>>>
>>>> Currently, for each thread, the x86 kernel allocate 8KB stack.
>>>> The each bpf program (including its subprograms) has maximum
>>>> 512B stack size to avoid potential stack overflow.
>>>> And nested bpf programs increase the risk of stack overflow.
>>>> To avoid potential stack overflow caused by bpf programs,
>>>> this patch implemented a private stack so bpf program stack
>>>> space is allocated dynamically when the program is jited.
>>>> Such private stack is applied to tracing programs like
>>>> kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
>>>> tracing.
>>>>
>>>> But more than one instance of the same bpf program may
>>>> run in the system. To make things simple, percpu private
>>>> stack is allocated for each program, so if the same program
>>>> is running on different cpus concurrently, we won't have
>>>> any issue. Note that the kernel already have logic to prevent
>>>> the recursion for the same bpf program on the same cpu
>>>> (kprobe, fentry, etc.).
>>>>
>>>> The patch implemented a percpu private stack based approach
>>>> for x86 arch.
>>>>    - The stack size will be 0 and any stack access is from
>>>>      jit-time allocated percpu storage.
>>>>    - In the beginning of jit, r9 is used to save percpu
>>>>      private stack pointer.
>>>>    - Each rbp in the bpf asm insn is replaced by r9.
>>>>    - For each call, push r9 before the call and pop r9
>>>>      after the call to preserve r9 value.
>>>>
>>>> Compared to previous RFC patch [1], this patch added
>>>> some conditions to enable private stack, e.g., verifier
>>>> calculated stack size, prog type, etc. The new patch
>>>> also added a performance test to compare private stack
>>>> vs. no private stack.
>>>>
>>>> The following are some code example to illustrate the idea
>>>> for selftest cgroup_skb_sk_lookup:
>>>>
>>>>     the existing code                        the private-stack approach code
>>>>     endbr64                                  endbr64
>>>>     nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
>>>>     xchg   ax,ax                             xchg   ax,ax
>>>>     push   rbp                               push   rbp
>>>>     mov    rbp,rsp                           mov    rbp,rsp
>>>>     endbr64                                  endbr64
>>>>     sub    rsp,0x68
>>>>     push   rbx                               push   rbx
>>>>     ...                                      ...
>>>>     ...                                      mov    r9d,0x8c1c860
>>>>     ...                                      add    r9,QWORD PTR gs:0x21a00
>>>>     ...                                      ...
>>>>     mov    rdx,rbp                           mov    rdx, r9
>>>>     add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
>>>>     ...                                      ...
>>>>     mov    ecx,0x28                          mov    ecx,0x28
>>>>                                              push   r9
>>>>     call   0xffffffffe305e474                call   0xffffffffe305e524
>>>>                                              pop    r9
>>>>     mov    rdi,rax                           mov    rdi,rax
>>>>     ...                                      ...
>>>>     movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
>>>>     ...                                      ...
>>>>
>>> Eduard nerd-sniped me today with this a bit... :)
>>>
>>> I have a few questions and suggestions.
>>>
>>> So it seems like each *subprogram* (not the entire BPF program) gets
>>> its own per-CPU private stack allocation. Is that intentional? That
>>> seems a bit unnecessary. It also prevents any sort of actual
>>> recursion. Not sure if it's possible to write recursive BPF subprogram
>>> today, verifier seems to reject obvious limited recursion cases, but
>>> still, eventually we might need/want to support that, and this will be
>>> just another hurdle to overcome (so it's best to avoid adding it in
>>> the first place).
>>>
>>> I'm sure Eduard is going to try something like below and it will
>>> probably break badly (I haven't tried, sorry):
>>>
>>> int entry(void *ctx);
>>>
>>> struct {
>>>          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>>>          __uint(max_entries, 1);
>>>          __uint(key_size, sizeof(__u32));
>>>          __array(values, int (void *));
>>> } prog_array_init SEC(".maps") = {
>>>          .values = {
>>>                  [0] = (void *)&entry,
>>>          },
>>> };
>>>
>>> static __noinline int subprog1(void)
>>> {
>>>      <some state on the stack>
>>>
>>>      /* here entry will replace subprog1, and so we'll have
>>>       * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
>>>       */
>>>      bpf_tail_call(ctx, &prog_array_init, 0);
>>>
>>>      return 0;
>>> }
>>>
>>>
>>> SEC("raw_tp/sys_enter")
>>> int entry(void *ctx)
>>> {
>>>       <some state on the stack>
>>>
>>>       subprog1();
>>> }
>>>
>>> And we effectively have limited recursion where the entry's stack
>>> state is clobbered, no?
>>>
>>> So it seems like we need to support recursion.
>>>
>> How come everyone just completely ignored the main point of my entire
>> email and a real problem that has to be solved?...
>>
>> Anyways, I did write a below program:
>>
>> $ cat minimal.bpf.c
>> // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
>> /* Copyright (c) 2020 Facebook */
>> #include <linux/bpf.h>
>> #include <bpf/bpf_helpers.h>
>>
>> char LICENSE[] SEC("license") = "Dual BSD/GPL";
>>
>> int my_pid = 0;
>>
>> int handle_tp(void *ctx);
>>
>> struct {
>>          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>>          __uint(max_entries, 1);
>>          __uint(key_size, sizeof(__u32));
>>          __array(values, int (void *));
>> } prog_array_init SEC(".maps") = {
>>          .values = {
>>                  [0] = (void *)&handle_tp,
>>          },
>> };
>>
>> static __noinline int subprog(void *ctx)
>> {
>>          static int cnt;
>>
>>          cnt++;
>>
>>          bpf_printk("SUBPROG - BEFORE %d", cnt);
>>
>>          bpf_tail_call(ctx, &prog_array_init, 0);
>>
>>          bpf_printk("SUBPROG - AFTER %d", cnt);
>>
>>      return 0;
>> }
>>
>> SEC("tp/syscalls/sys_enter_write")
>> int handle_tp(void *ctx)
>> {
>>          static int cnt;
>>          int pid = bpf_get_current_pid_tgid() >> 32;
>>
>>          if (pid != my_pid)
>>                  return 0;
>>
>>          cnt++;
>>
>>          bpf_printk("ENTRY - BEFORE %d", cnt);
>>
>>          subprog(ctx);
>>
>>          bpf_printk("ENTRY - AFTER %d", cnt);
>>
>>          return 0;
>> }
>>
>>
>> And triggered one write syscall, getting the log above. You can see
>> that only subprogs are replaced (we only get "SUBPROG - AFTER 34" due
>> to the tail call limit). And we do indeed get lots of entry program
>> recurrence.
>>
>> We *need to support recursion* is my main point.
> Not quite.
> It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
> static int cnt counts stuff because it's static.
>
> So we don't need to support recursion with private stack,
> but tail_calls and private stack are buggy indeed.
>
> emit_bpf_tail_call*() shouldn't be adjusting 'rsp' when the private
> stack is used.

Right, stack_depth argument in emit_bpf_tail_call_direct()/emit_bpf_tail_call_indirect()
should be 0 if private stack is used. Will fix in next revision.
Yonghong Song July 23, 2024, 7:02 a.m. UTC | #13
On 7/22/24 10:30 PM, Yonghong Song wrote:
>
> On 7/22/24 6:05 PM, Alexei Starovoitov wrote:
>> On Mon, Jul 22, 2024 at 1:58 PM Andrii Nakryiko
>> <andrii.nakryiko@gmail.com> wrote:
>>> On Fri, Jul 19, 2024 at 8:28 PM Andrii Nakryiko
>>> <andrii.nakryiko@gmail.com> wrote:
>>>> On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song 
>>>> <yonghong.song@linux.dev> wrote:
>>>>> The main motivation for private stack comes from nested
>>>>> scheduler in sched-ext from Tejun. The basic idea is that
>>>>>   - each cgroup will its own associated bpf program,
>>>>>   - bpf program with parent cgroup will call bpf programs
>>>>>     in immediate child cgroups.
>>>>>
>>>>> Let us say we have the following cgroup hierarchy:
>>>>>    root_cg (prog0):
>>>>>      cg1 (prog1):
>>>>>        cg11 (prog11):
>>>>>          cg111 (prog111)
>>>>>          cg112 (prog112)
>>>>>        cg12 (prog12):
>>>>>          cg121 (prog121)
>>>>>          cg122 (prog122)
>>>>>      cg2 (prog2):
>>>>>        cg21 (prog21)
>>>>>        cg22 (prog22)
>>>>>        cg23 (prog23)
>>>>>
>>>>> In the above example, prog0 will call a kfunc which will
>>>>> call prog1 and prog2 to get sched info for cg1 and cg2 and
>>>>> then the information is summarized and sent back to prog0.
>>>>> Similarly, prog11 and prog12 will be invoked in the kfunc
>>>>> and the result will be summarized and sent back to prog1, etc.
>>>>>
>>>>> Currently, for each thread, the x86 kernel allocate 8KB stack.
>>>>> The each bpf program (including its subprograms) has maximum
>>>>> 512B stack size to avoid potential stack overflow.
>>>>> And nested bpf programs increase the risk of stack overflow.
>>>>> To avoid potential stack overflow caused by bpf programs,
>>>>> this patch implemented a private stack so bpf program stack
>>>>> space is allocated dynamically when the program is jited.
>>>>> Such private stack is applied to tracing programs like
>>>>> kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
>>>>> tracing.
>>>>>
>>>>> But more than one instance of the same bpf program may
>>>>> run in the system. To make things simple, percpu private
>>>>> stack is allocated for each program, so if the same program
>>>>> is running on different cpus concurrently, we won't have
>>>>> any issue. Note that the kernel already have logic to prevent
>>>>> the recursion for the same bpf program on the same cpu
>>>>> (kprobe, fentry, etc.).
>>>>>
>>>>> The patch implemented a percpu private stack based approach
>>>>> for x86 arch.
>>>>>    - The stack size will be 0 and any stack access is from
>>>>>      jit-time allocated percpu storage.
>>>>>    - In the beginning of jit, r9 is used to save percpu
>>>>>      private stack pointer.
>>>>>    - Each rbp in the bpf asm insn is replaced by r9.
>>>>>    - For each call, push r9 before the call and pop r9
>>>>>      after the call to preserve r9 value.
>>>>>
>>>>> Compared to previous RFC patch [1], this patch added
>>>>> some conditions to enable private stack, e.g., verifier
>>>>> calculated stack size, prog type, etc. The new patch
>>>>> also added a performance test to compare private stack
>>>>> vs. no private stack.
>>>>>
>>>>> The following are some code example to illustrate the idea
>>>>> for selftest cgroup_skb_sk_lookup:
>>>>>
>>>>>     the existing code                        the private-stack approach code
>>>>>     endbr64                                  endbr64
>>>>>     nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
>>>>>     xchg   ax,ax                             xchg   ax,ax
>>>>>     push   rbp                               push   rbp
>>>>>     mov    rbp,rsp                           mov    rbp,rsp
>>>>>     endbr64                                  endbr64
>>>>>     sub    rsp,0x68
>>>>>     push   rbx                               push   rbx
>>>>>     ...                                      ...
>>>>>     ...                                      mov    r9d,0x8c1c860
>>>>>     ...                                      add    r9,QWORD PTR gs:0x21a00
>>>>>     ...                                      ...
>>>>>     mov    rdx,rbp                           mov    rdx, r9
>>>>>     add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
>>>>>     ...                                      ...
>>>>>     mov    ecx,0x28                          mov    ecx,0x28
>>>>>                                              push   r9
>>>>>     call   0xffffffffe305e474                call   0xffffffffe305e524
>>>>>                                              pop    r9
>>>>>     mov    rdi,rax                           mov    rdi,rax
>>>>>     ...                                      ...
>>>>>     movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
>>>>>     ...                                      ...
>>>>>
>>>> Eduard nerd-sniped me today with this a bit... :)
>>>>
>>>> I have a few questions and suggestions.
>>>>
>>>> So it seems like each *subprogram* (not the entire BPF program) gets
>>>> its own per-CPU private stack allocation. Is that intentional? That
>>>> seems a bit unnecessary. It also prevents any sort of actual
>>>> recursion. Not sure if it's possible to write recursive BPF subprogram
>>>> today, verifier seems to reject obvious limited recursion cases, but
>>>> still, eventually we might need/want to support that, and this will be
>>>> just another hurdle to overcome (so it's best to avoid adding it in
>>>> the first place).
>>>>
>>>> I'm sure Eduard is going to try something like below and it will
>>>> probably break badly (I haven't tried, sorry):
>>>>
>>>> int entry(void *ctx);
>>>>
>>>> struct {
>>>>          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>>>>          __uint(max_entries, 1);
>>>>          __uint(key_size, sizeof(__u32));
>>>>          __array(values, int (void *));
>>>> } prog_array_init SEC(".maps") = {
>>>>          .values = {
>>>>                  [0] = (void *)&entry,
>>>>          },
>>>> };
>>>>
>>>> static __noinline int subprog1(void)
>>>> {
>>>>      <some state on the stack>
>>>>
>>>>      /* here entry will replace subprog1, and so we'll have
>>>>       * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
>>>>       */
>>>>      bpf_tail_call(ctx, &prog_array_init, 0);
>>>>
>>>>      return 0;
>>>> }
>>>>
>>>>
>>>> SEC("raw_tp/sys_enter")
>>>> int entry(void *ctx)
>>>> {
>>>>       <some state on the stack>
>>>>
>>>>       subprog1();
>>>> }
>>>>
>>>> And we effectively have limited recursion where the entry's stack
>>>> state is clobbered, no?
>>>>
>>>> So it seems like we need to support recursion.
>>>>
>>> How come everyone just completely ignored the main point of my entire
>>> email and a real problem that has to be solved?...
>>>
>>> Anyways, I did write a below program:
>>>
>>> $ cat minimal.bpf.c
>>> // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
>>> /* Copyright (c) 2020 Facebook */
>>> #include <linux/bpf.h>
>>> #include <bpf/bpf_helpers.h>
>>>
>>> char LICENSE[] SEC("license") = "Dual BSD/GPL";
>>>
>>> int my_pid = 0;
>>>
>>> int handle_tp(void *ctx);
>>>
>>> struct {
>>>          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>>>          __uint(max_entries, 1);
>>>          __uint(key_size, sizeof(__u32));
>>>          __array(values, int (void *));
>>> } prog_array_init SEC(".maps") = {
>>>          .values = {
>>>                  [0] = (void *)&handle_tp,
>>>          },
>>> };
>>>
>>> static __noinline int subprog(void *ctx)
>>> {
>>>          static int cnt;
>>>
>>>          cnt++;
>>>
>>>          bpf_printk("SUBPROG - BEFORE %d", cnt);
>>>
>>>          bpf_tail_call(ctx, &prog_array_init, 0);
>>>
>>>          bpf_printk("SUBPROG - AFTER %d", cnt);
>>>
>>>      return 0;
>>> }
>>>
>>> SEC("tp/syscalls/sys_enter_write")
>>> int handle_tp(void *ctx)
>>> {
>>>          static int cnt;
>>>          int pid = bpf_get_current_pid_tgid() >> 32;
>>>
>>>          if (pid != my_pid)
>>>                  return 0;
>>>
>>>          cnt++;
>>>
>>>          bpf_printk("ENTRY - BEFORE %d", cnt);
>>>
>>>          subprog(ctx);
>>>
>>>          bpf_printk("ENTRY - AFTER %d", cnt);
>>>
>>>          return 0;
>>> }
>>>
>>>
>>> And triggered one write syscall, getting the log above. You can see
>>> that only subprogs are replaced (we only get "SUBPROG - AFTER 34" due
>>> to the tail call limit). And we do indeed get lots of entry program
>>> recurrence.
>>>
>>> We *need to support recursion* is my main point.
>> Not quite.
>> It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
>> static int cnt counts stuff because it's static.
>>
>> So we don't need to support recursion with private stack,
>> but tail_calls and private stack are buggy indeed.
>>
>> emit_bpf_tail_call*() shouldn't be adjusting 'rsp' when the private
>> stack is used.
>
> Right, stack_depth argument in 
> emit_bpf_tail_call_direct()/emit_bpf_tail_call_indirect()
> should be 0 if private stack is used. Will fix in next revision.

Actually, the current implementation is correct. We already set stack_depth to 0
when the private stack is used, so we should be fine here.
Alexei Starovoitov July 24, 2024, 3:17 a.m. UTC | #14
On Mon, Jul 22, 2024 at 8:27 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> > > We *need to support recursion* is my main point.
> >
> > Not quite.
> > It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
>
> Only of subprog(), not of handle_tp(). See all those "ENTRY - AFTER"
> messages. We do return to all the nested handle_tp() calls and
> continue just fine.
>
> I put the log into [0] for a bit easier visual inspection.
>
>   [0] https://gist.github.com/anakryiko/6ccdfc62188f8ad4991641fb637d954c

Argh. So the pathological prog can consume 512*33 of stack.
We have to reject it somehow in the verifier or tailor private stack
to support it. Then private stack will be a feature and a fix for this issue.
But then it would need to preallocate 512*33 per cpu per program.
Which is too much.
Maybe we can preallocate _aligned_ 512 or 1k per cpu per prog,
then adjust r9 before call or tail_call and if r9 is about to cross
alignment before tail_call fail the tail call (like tail call cnt was
over limit).
Hopefully there are better ideas, since it's all quite messy.
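
A rough model of that boundary check (assuming a chunk-aligned per-cpu
allocation and, for the sake of the example, a stack that advances upward;
the chunk size and names below are made up):

#include <stdbool.h>
#include <stdint.h>

#define PRIV_STACK_CHUNK 1024   /* assumed size of the aligned per-cpu chunk */

/* r9 is assumed to start at the aligned chunk base and to be advanced by
 * each program's stack depth, so masking recovers the chunk base at run
 * time even though the verifier cannot know the tail-call nesting depth
 * statically. A real implementation would still have to settle the grow
 * direction and the exact-boundary case; this only shows the masking trick.
 */
static bool tail_call_fits(uintptr_t r9, uint32_t target_stack_depth)
{
        uintptr_t chunk_base = r9 & ~(uintptr_t)(PRIV_STACK_CHUNK - 1);
        uintptr_t chunk_end = chunk_base + PRIV_STACK_CHUNK;

        return r9 + target_stack_depth <= chunk_end;
}

If tail_call_fits() is false, the tail call would fail exactly like hitting
the tail-call count limit does today.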
Andrii Nakryiko July 24, 2024, 4:06 a.m. UTC | #15
On Tue, Jul 23, 2024 at 8:17 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 22, 2024 at 8:27 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > > > We *need to support recursion* is my main point.
> > >
> > > Not quite.
> > > It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
> >
> > Only of subprog(), not of handle_tp(). See all those "ENTRY - AFTER"
> > messages. We do return to all the nested handle_tp() calls and
> > continue just fine.
> >
> > I put the log into [0] for a bit easier visual inspection.
> >
> >   [0] https://gist.github.com/anakryiko/6ccdfc62188f8ad4991641fb637d954c
>
> Argh. So the pathological prog can consume 512*33 of stack.
> We have to reject it somehow in the verifier or tailor private stack
> to support it. Then private stack will be a feature and a fix for this issue.
> But then it would need to preallocate 512*33 per cpu per program.
> Which is too much.
> Maybe we can preallocate _aligned_ 512 or 1k per cpu per prog,
> then adjust r9 before call or tail_call and if r9 is about to cross
> alignment before tail_call fail the tail call (like tail call cnt was
> over limit).

This is close to what I proposed to Yonghong offline. One approach I
had in mind was as follows. If we know that a BPF program can do a
tail call, then allocate some larger private stack (1KB, 4KB, 8KB,
don't know), compared to what the BPF program itself would need. Then in
bpf_tail_call() helper's inlining itself check whether R9 +
<max_prog_stack_size> is larger than the private stack's size. And if
yes, then don't do tail call (as if we reached max number of tail
calls). Tail call interface allows for that.

This way we don't slow down typical non-tail call cases and don't pay
unnecessary memory price, but we still make tail call work just fine
in most cases, except some pathological ones like my example. I think
the expected situation for tail call is to replace main program with
another main program, so the typical case will work perfectly fine.
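
The "as if we reached max number of tail calls" part leans on the existing
bpf_tail_call() semantics: on success it never returns, and on any failure
execution just continues after the call. A minimal BPF-side sketch of that
fallback, reusing the prog_array_init map from the example earlier in the
thread:

SEC("tp/syscalls/sys_enter_write")
int entry(void *ctx)
{
        /* If the kernel refuses the tail call -- bad index, tail-call
         * limit, or, with the proposal above, not enough private stack
         * left -- we simply fall through to the code below.
         */
        bpf_tail_call(ctx, &prog_array_init, 0);

        bpf_printk("tail call not taken");
        return 0;
}

So programs written against today's interface already handle the
"tail call refused" case.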

> Hopefully there are better ideas, since it's all quite messy.
Yonghong Song July 24, 2024, 4:32 a.m. UTC | #16
On 7/23/24 8:17 PM, Alexei Starovoitov wrote:
> On Mon, Jul 22, 2024 at 8:27 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
>>>> We *need to support recursion* is my main point.
>>> Not quite.
>>> It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
>> Only of subprog(), not of handle_tp(). See all those "ENTRY - AFTER"
>> messages. We do return to all the nested handle_tp() calls and
>> continue just fine.
>>
>> I put the log into [0] for a bit easier visual inspection.
>>
>>    [0] https://gist.github.com/anakryiko/6ccdfc62188f8ad4991641fb637d954c
> Argh. So the pathological prog can consume 512*33 of stack.

We have a check in the verifier like below:

         if (idx && subprog[idx].has_tail_call && depth >= 256) {
                 verbose(env,
                         "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
                         depth);
                 return -EACCES;
         }

So the maximum stack size could be around 256 * 33 = 8448 bytes, which is a little more than 8KB.

> We have to reject it somehow in the verifier or tailor private stack
> to support it. Then private stack will be a feature and a fix for this issue.
> But then it would need to preallocate 512*33 per cpu per program.
> Which is too much.
> Maybe we can preallocate _aligned_ 512 or 1k per cpu per prog,
> then adjust r9 before call or tail_call and if r9 is about to cross
> alignment before tail_call fail the tail call (like tail call cnt was
> over limit).
> Hopefully there are better ideas, since it's all quite messy.
Yonghong Song July 24, 2024, 4:46 a.m. UTC | #17
On 7/23/24 9:06 PM, Andrii Nakryiko wrote:
> On Tue, Jul 23, 2024 at 8:17 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>> On Mon, Jul 22, 2024 at 8:27 PM Andrii Nakryiko
>> <andrii.nakryiko@gmail.com> wrote:
>>>>> We *need to support recursion* is my main point.
>>>> Not quite.
>>>> It's not a recursion. The stack collapsed/gone/wiped out before tail_call.
>>> Only of subprog(), not of handle_tp(). See all those "ENTRY - AFTER"
>>> messages. We do return to all the nested handle_tp() calls and
>>> continue just fine.
>>>
>>> I put the log into [0] for a bit easier visual inspection.
>>>
>>>    [0] https://gist.github.com/anakryiko/6ccdfc62188f8ad4991641fb637d954c
>> Argh. So the pathological prog can consume 512*33 of stack.
>> We have to reject it somehow in the verifier or tailor private stack
>> to support it. Then private stack will be a feature and a fix for this issue.
>> But then it would need to preallocate 512*33 per cpu per program.
>> Which is too much.
>> Maybe we can preallocate _aligned_ 512 or 1k per cpu per prog,
>> then adjust r9 before call or tail_call and if r9 is about to cross
>> alignment before tail_call fail the tail call (like tail call cnt was
>> over limit).
> This is close to what I proposed to Yonghong offline. One approach I
> had in mind was as follows. If we know that a BPF program can do a
> tail call, then allocate some larger private stack (1KB, 4KB, 8KB,
> don't know), compared to what the BPF program itself would need. Then in
> bpf_tail_call() helper's inlining itself check whether R9 +
> <max_prog_stack_size> is larger than the private stack's size. And if
> yes, then don't do tail call (as if we reached max number of tail
> calls). Tail call interface allows for that.
>
> This way we don't slow down typical non-tail call cases and don't pay
> unnecessary memory price, but we still make tail call work just fine
> in most cases, except some pathological ones like my example. I think
> the expected situation for tail call is to replace main program with
> another main program, so the typical case will work perfectly fine.

Indeed, this is an approach we could use. Based on the recursion 'cnt',
we could calculate the frame pointer properly from the originally
allocated frame pointer. Currently, the private stack only supports
tracing programs, and it should be extremely rare for a tracing program
to recurse into itself through a tail call. So we could allocate a smaller
amount of memory, e.g. 1KB or 2KB, and add a runtime check against the
private stack size to prevent stack overflow.

>
>> Hopefully there are better ideas, since it's all quite messy.
Yonghong Song July 24, 2024, 5:08 a.m. UTC | #18
On 7/22/24 9:43 AM, Yonghong Song wrote:
>
> On 7/19/24 8:28 PM, Andrii Nakryiko wrote:
>> On Thu, Jul 18, 2024 at 1:52 PM Yonghong Song 
>> <yonghong.song@linux.dev> wrote:
>>> The main motivation for private stack comes from nested
>>> scheduler in sched-ext from Tejun. The basic idea is that
>>>   - each cgroup will its own associated bpf program,
>>>   - bpf program with parent cgroup will call bpf programs
>>>     in immediate child cgroups.
>>>
>>> Let us say we have the following cgroup hierarchy:
>>>    root_cg (prog0):
>>>      cg1 (prog1):
>>>        cg11 (prog11):
>>>          cg111 (prog111)
>>>          cg112 (prog112)
>>>        cg12 (prog12):
>>>          cg121 (prog121)
>>>          cg122 (prog122)
>>>      cg2 (prog2):
>>>        cg21 (prog21)
>>>        cg22 (prog22)
>>>        cg23 (prog23)
>>>
>>> In the above example, prog0 will call a kfunc which will
>>> call prog1 and prog2 to get sched info for cg1 and cg2 and
>>> then the information is summarized and sent back to prog0.
>>> Similarly, prog11 and prog12 will be invoked in the kfunc
>>> and the result will be summarized and sent back to prog1, etc.
>>>
>>> Currently, for each thread, the x86 kernel allocate 8KB stack.
>>> The each bpf program (including its subprograms) has maximum
>>> 512B stack size to avoid potential stack overflow.
>>> And nested bpf programs increase the risk of stack overflow.
>>> To avoid potential stack overflow caused by bpf programs,
>>> this patch implemented a private stack so bpf program stack
>>> space is allocated dynamically when the program is jited.
>>> Such private stack is applied to tracing programs like
>>> kprobe/uprobe, perf_event, tracepoint, raw tracepoint and
>>> tracing.
>>>
>>> But more than one instance of the same bpf program may
>>> run in the system. To make things simple, percpu private
>>> stack is allocated for each program, so if the same program
>>> is running on different cpus concurrently, we won't have
>>> any issue. Note that the kernel already have logic to prevent
>>> the recursion for the same bpf program on the same cpu
>>> (kprobe, fentry, etc.).
>>>
>>> The patch implemented a percpu private stack based approach
>>> for x86 arch.
>>>    - The stack size will be 0 and any stack access is from
>>>      jit-time allocated percpu storage.
>>>    - In the beginning of jit, r9 is used to save percpu
>>>      private stack pointer.
>>>    - Each rbp in the bpf asm insn is replaced by r9.
>>>    - For each call, push r9 before the call and pop r9
>>>      after the call to preserve r9 value.
>>>
>>> Compared to previous RFC patch [1], this patch added
>>> some conditions to enable private stack, e.g., verifier
>>> calculated stack size, prog type, etc. The new patch
>>> also added a performance test to compare private stack
>>> vs. no private stack.
>>>
>>> The following are some code example to illustrate the idea
>>> for selftest cgroup_skb_sk_lookup:
>>>
>>>     the existing code                        the private-stack approach code
>>>     endbr64                                  endbr64
>>>     nop    DWORD PTR [rax+rax*1+0x0]         nop    DWORD PTR [rax+rax*1+0x0]
>>>     xchg   ax,ax                             xchg   ax,ax
>>>     push   rbp                               push   rbp
>>>     mov    rbp,rsp                           mov    rbp,rsp
>>>     endbr64                                  endbr64
>>>     sub    rsp,0x68
>>>     push   rbx                               push   rbx
>>>     ...                                      ...
>>>     ...                                      mov    r9d,0x8c1c860
>>>     ...                                      add    r9,QWORD PTR gs:0x21a00
>>>     ...                                      ...
>>>     mov    rdx,rbp                           mov    rdx, r9
>>>     add    rdx,0xffffffffffffffb4            rdx,0xffffffffffffffb4
>>>     ...                                      ...
>>>     mov    ecx,0x28                          mov    ecx,0x28
>>>                                              push   r9
>>>     call   0xffffffffe305e474                call   0xffffffffe305e524
>>>                                              pop    r9
>>>     mov    rdi,rax                           mov    rdi,rax
>>>     ...                                      ...
>>>     movzx  rdi,BYTE PTR [rbp-0x46]           movzx  rdi,BYTE PTR [r9-0x46]
>>>     ...                                      ...
>>>
>> Eduard nerd-sniped me today with this a bit... :)
>>
>> I have a few questions and suggestions.
>>
>> So it seems like each *subprogram* (not the entire BPF program) gets
>> its own per-CPU private stack allocation. Is that intentional? That
>
> Currently yes. The reason is the same prog could be run on different
> cpus at the same time.
>
>> seems a bit unnecessary. It also prevents any sort of actual
>> recursion. Not sure if it's possible to write recursive BPF subprogram
>> today, verifier seems to reject obvious limited recursion cases, but
>> still, eventually we might need/want to support that, and this will be
>> just another hurdle to overcome (so it's best to avoid adding it in
>> the first place).
>>
>> I'm sure Eduard is going to try something like below and it will
>> probably break badly (I haven't tried, sorry):
>>
>> int entry(void *ctx);
>>
>> struct {
>>          __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
>>          __uint(max_entries, 1);
>>          __uint(key_size, sizeof(__u32));
>>          __array(values, int (void *));
>> } prog_array_init SEC(".maps") = {
>>          .values = {
>>                  [0] = (void *)&entry,
>>          },
>> };
>>
>> static __noinline int subprog1(void)
>> {
>>      <some state on the stack>
>>
>>      /* here entry will replace subprog1, and so we'll have
>>       * entry -> entry -> entry -> ..... <tail call limit> -> subprog1
>>       */
>>      bpf_tail_call(ctx, &prog_array_init, 0);
>>
>>      return 0;
>> }
>>
>>
>> SEC("raw_tp/sys_enter")
>> int entry(void *ctx)
>> {
>>       <some state on the stack>
>>
>>       subprog1();
>> }
>>
>> And we effectively have limited recursion where the entry's stack
>> state is clobbered, no?
>>
>> So it seems like we need to support recursion.
>>
>>
>> So, the question I have is. Why not do the following:
>> a) only setup r9 *once* in entry program's prologue (before tail call
>> jump target)
>> b) before each call we can adjust r9 with current prog/subprog's
>> maximum *own* stack, something like:
>>
>> push r9;
>> r9 += 128; // 128 is subprog's stack usage
>> call <some-subprog>
>> pop r9;
>>
>> The idea being that on tail call or in subprog call we assume r9 is
>> already pointing to the right place. We can probably also figure out
>> how to avoid push/pop r9 if we make sure that subprogram always
>> restores r9 (taking tail calls into account and all that, of course)?
>>
>> Is this feasible?
>
> This is possible. I actually hacked such an idea easily. The basic
> idea is push frame pointer as an additional argument to the bpf
> static sub-prog. This is a little bit complicated. It will probably
> save some stack size but I am not sure how much it is.

Discussed with Andrii. I think the following approach should work.
For each non-static prog, the private stack is allocated to cover
that non-static prog plus the static subprogs it calls. For example,
     main_prog
        static_prog_1
          static_prog_11
          global_prog
             static_prog_12
        static_prog_2

So in verifier we calculate stack size for
     main_prog
        static_prog_1
           static_prog_11
        static_prog_2
  and
     global_prog
       static_prog_12

Let us say each (sub)prog in main_prog's call tree has the stack size below:
     main_prog // stack size 100
        static_prog_1 // stack size 100
          static_prog_11 // stack size 100
        static_prog_2 // stack size 100
so the deepest call chain (main_prog -> static_prog_1 -> static_prog_11) needs
300 bytes and the private stack size will be 300.
So R9 is calculated like below
     main_prog
       R9 = ... // for tailcall reachable, R9 may be original R9 + offset
                // for non-tailcall reachable, R9 equals the original R9 (based on jit-time allocation).
       ...  R9 ...
       R9 += 100
       static_prog_1
          ... R9 ...
          R9 += 100
          static_prog_11
            ... R9 ...
          R9 -= 100
       R9 -= 100
       ... R9 ...
       R9 += 100
       static_prog_2
          ... R9 ...
       R9 -= 100

Similary, we can calculate R9 offset for
     global_prog
       static_prog_12
as well.
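
To make the accounting above concrete, here is a small user-space model
(not verifier code; the struct and function names are made up, and the
global_prog/static_prog_12 sizes of 100 each are assumed for illustration):
the private stack size of an entry prog is its own depth plus the deepest
chain of statically-called subprogs, with global subprogs skipped because
they get their own allocation. For the numbers above it prints 300 for
main_prog and 200 for global_prog.

#include <stdio.h>

struct subprog {
        const char *name;
        int stack_depth;
        int is_global;                  /* global subprogs are accounted separately */
        const struct subprog *callees[4];
};

/* Own depth plus the deepest statically-called chain below this subprog. */
static int priv_stack_size(const struct subprog *p)
{
        int deepest = 0;

        for (int i = 0; i < 4 && p->callees[i]; i++) {
                const struct subprog *c = p->callees[i];
                int need;

                if (c->is_global)
                        continue;       /* starts its own private stack accounting */
                need = priv_stack_size(c);
                if (need > deepest)
                        deepest = need;
        }
        return p->stack_depth + deepest;
}

int main(void)
{
        const struct subprog static_prog_12 = { "static_prog_12", 100, 0, { 0 } };
        const struct subprog global_prog = { "global_prog", 100, 1, { &static_prog_12 } };
        const struct subprog static_prog_11 = { "static_prog_11", 100, 0, { 0 } };
        const struct subprog static_prog_1 = { "static_prog_1", 100, 0, { &static_prog_11, &global_prog } };
        const struct subprog static_prog_2 = { "static_prog_2", 100, 0, { 0 } };
        const struct subprog main_prog = { "main_prog", 100, 0, { &static_prog_1, &static_prog_2 } };

        printf("main_prog: %d\n", priv_stack_size(&main_prog));        /* 300 */
        printf("global_prog: %d\n", priv_stack_size(&global_prog));    /* 200 */
        return 0;
}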
Alexei Starovoitov July 24, 2024, 4:54 p.m. UTC | #19
On Tue, Jul 23, 2024 at 10:09 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> Discussed with Andrii. I think the following approach should work.
> For each non-static prog, the private stack is allocated including
> that non-static prog and the called static progs. For example,
>      main_prog
>         static_prog_1
>           static_prog_11
>           global_prog
>              static_prog_12
>         static_prog_2
>
> So in verifier we calculate stack size for
>      main_prog
>         static_prog_1
>            static_prog_11
>         static_prog_2
>   and
>      global_prog
>        static_prog_12
>
> Let us say the stack size for main_prog like below for each (sub)prog
>      main_prog // stack size 100
>         static_prog_1 // stack size 100
>           static_prog_11 // stack size 100
>         static_prog_2 // static size 100
> so total static size is 300 so the private stack size will be 300.
> So R9 is calculated like below
>      main_prog
>        R9 = ... // for tailcall reachable, R9 may be original R9 + offset
>                 // for non-tailcall reachable, R9 equals the original R9 (based on jit-time allocation).
>        ...  R9 ...
>        R9 += 100
>        static_prog_1
>           ... R9 ...
>           R9 += 100
>           static_prog_11
>             ... R9 ...
>           R9 -= 100
>        R9 -= 100
>        ... R9 ...
>        R9 += 100
>        static_prog_2
>           ... R9 ...
>        R9 -= 100
>
> Similary, we can calculate R9 offset for
>      global_prog
>        static_prog_12
> as well.

I don't understand why differentiate static and global subprogs.

But, mainly, += and -= around the call is suboptimal.
Can we do it as a normal stack does ?
Each prog knows how much stack it needs,
so it can do:
r9 += stack_depth in the prologue
and all accesses are done as r9 - off.
Then to do a call nothing extra is needed.
The callee will do r9 += its own stack depth.
Whether the private stack grows up or down is tbd.

The challenge is how to supply proper r9 on entry
into the main prog. Potentially a task for bpf trampoline,
and kprobe/tp/etc will need special hack in bpf_dispatcher_nop_func.
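
A toy user-space model of that calling convention (not JIT output; the
cursor below plays the role of r9): every (sub)prog bumps it by its own
stack depth in the prologue, addresses its slots at negative offsets from
it, and restores it in the epilogue, so call sites need nothing extra.
Whether it should really grow upward like this is exactly the open
question above.

#include <assert.h>

static unsigned char priv_stack[8192];  /* stands in for the per-cpu area */
static unsigned char *r9 = priv_stack;  /* set up once on entry into the prog */

static void subprog(void)               /* stack depth 64 in this model */
{
        r9 += 64;                       /* prologue: r9 += own stack depth */
        r9[-8] = 0x2a;                  /* a slot accessed as "r9 - 8" */
        r9 -= 64;                       /* epilogue: undo the bump */
}

static void main_prog(void)             /* stack depth 128 in this model */
{
        r9 += 128;
        r9[-16] = 0x01;
        subprog();                      /* nothing extra at the call site */
        r9 -= 128;
}

int main(void)
{
        main_prog();
        assert(r9 == priv_stack);       /* cursor restored to its base */
        return 0;
}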
Yonghong Song July 24, 2024, 5:56 p.m. UTC | #20
On 7/24/24 9:54 AM, Alexei Starovoitov wrote:
> On Tue, Jul 23, 2024 at 10:09 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> Discussed with Andrii. I think the following approach should work.
>> For each non-static prog, the private stack is allocated including
>> that non-static prog and the called static progs. For example,
>>       main_prog
>>          static_prog_1
>>            static_prog_11
>>            global_prog
>>               static_prog_12
>>          static_prog_2
>>
>> So in verifier we calculate stack size for
>>       main_prog
>>          static_prog_1
>>             static_prog_11
>>          static_prog_2
>>    and
>>       global_prog
>>         static_prog_12
>>
>> Let us say the stack size for main_prog like below for each (sub)prog
>>       main_prog // stack size 100
>>          static_prog_1 // stack size 100
>>            static_prog_11 // stack size 100
>>          static_prog_2 // static size 100
>> so total static size is 300 so the private stack size will be 300.
>> So R9 is calculated like below
>>       main_prog
>>         R9 = ... // for tailcall reachable, R9 may be original R9 + offset
>>                  // for non-tailcall reachable, R9 equals the original R9 (based on jit-time allocation).
>>         ...  R9 ...
>>         R9 += 100
>>         static_prog_1
>>            ... R9 ...
>>            R9 += 100
>>            static_prog_11
>>              ... R9 ...
>>            R9 -= 100
>>         R9 -= 100
>>         ... R9 ...
>>         R9 += 100
>>         static_prog_2
>>            ... R9 ...
>>         R9 -= 100
>>
>> Similary, we can calculate R9 offset for
>>       global_prog
>>         static_prog_12
>> as well.
> I don't understand why differentiate static and global subprogs.

Handling global subprogs specially is for a potential BPF_PROG_TYPE_EXT
prog which may replace a global subprog.

Therefore, for the private stack, a global subprog terminates the
stack accounting, to minimize stack usage. If we treated
static/global subprogs the same and freplace did happen,
we might allocate a larger-than-necessary private stack.

freplace is probably not a common use case. If it does happen,
the original global subprog may be a stub func which does
not have any stack usage of its own, and the freplace prog is the one
implementing the business logic. So from that perspective,
we do not need to differentiate static and global subprogs.

>
> But, mainly, += and -= around the call is suboptimal.
> Can we do it as a normal stack does ?
> Each prog knows how much stack it needs,
> so it can do:
> r9 += stack_depth in the prologue
> and all accesses are done as r9 - off.
> Then to do a call nothing extra is needed.
> The callee will do r9 += its own stack depth.

I thought the += and -= at the call site were easier to understand.
But certainly, doing r9 += stack_depth and
r9 -= stack_depth inside the subprog works too.

> Whether the private stack grows up or down is tbd.

My current approach is that the private stack grows down,
similar to the normal stack. But we have the flexibility
to grow up at the frame level.

>
> The challenge is how to supply proper r9 on entry
> into the main prog. Potentially a task for bpf trampoline,
> and kprobe/tp/etc will need special hack in bpf_dispatcher_nop_func.

I have an early hack for the bpf trampoline and
bpf_dispatcher_nop_func to pass the private stack pointer
as the third argument to the bpf program.
In this particular case, we can just pass the private
stack pointer in R9. I will pick this up.
Yonghong Song July 24, 2024, 9:28 p.m. UTC | #21
On 7/21/24 8:33 PM, Eduard Zingerman wrote:
> Hi Yonghong,
>
> In general I think that changes in this patch are logical and make sense.
> I have a suggestion regarding testing JIT related changes.
>
> We currently lack a convenient way to verify jit behaviour modulo
> runtime tests. I think we should have a capability to write tests like below:
>
>      SEC("tp")
>      __jited_x86("f:	endbr64")
>      __jited_x86("13:	movabs $0x.*,%r9")
>      __jited_x86("1d:	add    %gs:0x.*,%r9")
>      __jited_x86("26:	mov    $0x1,%edi")
>      __jited_x86("2b:	mov    %rdi,-0x8(%r9)")
>      __jited_x86("2f:	mov    -0x8(%r9),%rdi")
>      __jited_x86("33:	xor    %eax,%eax")
>      __jited_x86("35:	lock xchg %rax,-0x8(%r9)")
>      __jited_x86("3a:	lock xadd %rax,-0x8(%r9)")
>      __naked void stack_access_insns(void)
>      {
>      	asm volatile (
>      	"r1 = 1;"
>      	"*(u64 *)(r10 - 8) = r1;"
>      	"r1 = *(u64 *)(r10 - 8);"
>      	"r0 = 0;"
>      	"r0 = xchg_64(r10 - 8, r0);"
>      	"r0 = atomic_fetch_add((u64 *)(r10 - 8), r0);"
>      	"exit;"
>      	::: __clobber_all);
>      }
>
> In the following branch I explored a way to add such a capability:
> https://github.com/eddyz87/bpf/tree/yhs-private-stack-plus-jit-testing
>
> Besides testing the exact translation, such tests also provide a good
> starting point for people trying to figure out how some jit features work.
>
> The below two commits are the gist of the feature:
> 8f9361be2fb3 ("selftests/bpf: __jited_x86 test tag to check x86 assembly after jit")
> 0156b148b5b4 ("selftests/bpf: utility function to get program disassembly after jit")
>
> For "0156b148b5b4" I opted to do a popen() call and execute bpftool process,
> an alternative would be to:
> a. either link tools/bpf/bpftool/jit_disasm.c as a part of the
>     test_progs executable;
> b. call libbfd (binutils dis-assembler) directly from the bpftool.
>
> Currently bpftool can use two dis-assemblers: libbfd and the llvm library,
> depending on the build environment. For CI builds libbfd is used.
> I don't know if llvm and libbfd always produce the same output for
> identical binary code. Imo, if people are Ok with adding a libbfd

I tried a simple example like below.
$ cat test.c
#include <stdint.h>
typedef struct {
     unsigned char x[8];
} buf_t;
void f(buf_t *buf, uint64_t y, uint64_t z) {
     if (z > 8) z = 8;
     unsigned char *y_bytes = (unsigned char *)&y;
     for (int i = 0; i < z; ++i) {
         buf->x[i] = y_bytes[i];
     }
}
$ clang -c test.c
$ objdump -d test.o <==== gcc binutils based objdump

test.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <f>:
    0:   55                      push   %rbp
    1:   48 89 e5                mov    %rsp,%rbp
    4:   48 89 7d f8             mov    %rdi,-0x8(%rbp)
    8:   48 89 75 f0             mov    %rsi,-0x10(%rbp)
    c:   48 89 55 e8             mov    %rdx,-0x18(%rbp)
   10:   48 83 7d e8 08          cmpq   $0x8,-0x18(%rbp)
   15:   76 08                   jbe    1f <f+0x1f>
   17:   48 c7 45 e8 08 00 00    movq   $0x8,-0x18(%rbp)
   1e:   00
   1f:   48 8d 45 f0             lea    -0x10(%rbp),%rax
   23:   48 89 45 e0             mov    %rax,-0x20(%rbp)
   27:   c7 45 dc 00 00 00 00    movl   $0x0,-0x24(%rbp)
   2e:   48 63 45 dc             movslq -0x24(%rbp),%rax
   32:   48 3b 45 e8             cmp    -0x18(%rbp),%rax
   36:   73 21                   jae    59 <f+0x59>
   38:   48 8b 45 e0             mov    -0x20(%rbp),%rax
   3c:   48 63 4d dc             movslq -0x24(%rbp),%rcx
   40:   8a 14 08                mov    (%rax,%rcx,1),%dl
   43:   48 8b 45 f8             mov    -0x8(%rbp),%rax
   47:   48 63 4d dc             movslq -0x24(%rbp),%rcx
   4b:   88 14 08                mov    %dl,(%rax,%rcx,1)
   4e:   8b 45 dc                mov    -0x24(%rbp),%eax
   51:   83 c0 01                add    $0x1,%eax
   54:   89 45 dc                mov    %eax,-0x24(%rbp)
   57:   eb d5                   jmp    2e <f+0x2e>
   59:   5d                      pop    %rbp
   5a:   c3                      ret

$ llvm-objdump -d test.o  <== clang based objdump

test.o: file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <f>:
        0: 55                            pushq   %rbp
        1: 48 89 e5                      movq    %rsp, %rbp
        4: 48 89 7d f8                   movq    %rdi, -0x8(%rbp)
        8: 48 89 75 f0                   movq    %rsi, -0x10(%rbp)
        c: 48 89 55 e8                   movq    %rdx, -0x18(%rbp)
       10: 48 83 7d e8 08                cmpq    $0x8, -0x18(%rbp)
       15: 76 08                         jbe     0x1f <f+0x1f>
       17: 48 c7 45 e8 08 00 00 00       movq    $0x8, -0x18(%rbp)
       1f: 48 8d 45 f0                   leaq    -0x10(%rbp), %rax
       23: 48 89 45 e0                   movq    %rax, -0x20(%rbp)
       27: c7 45 dc 00 00 00 00          movl    $0x0, -0x24(%rbp)
       2e: 48 63 45 dc                   movslq  -0x24(%rbp), %rax
       32: 48 3b 45 e8                   cmpq    -0x18(%rbp), %rax
       36: 73 21                         jae     0x59 <f+0x59>
       38: 48 8b 45 e0                   movq    -0x20(%rbp), %rax
       3c: 48 63 4d dc                   movslq  -0x24(%rbp), %rcx
       40: 8a 14 08                      movb    (%rax,%rcx), %dl
       43: 48 8b 45 f8                   movq    -0x8(%rbp), %rax
       47: 48 63 4d dc                   movslq  -0x24(%rbp), %rcx
       4b: 88 14 08                      movb    %dl, (%rax,%rcx)
       4e: 8b 45 dc                      movl    -0x24(%rbp), %eax
       51: 83 c0 01                      addl    $0x1, %eax
       54: 89 45 dc                      movl    %eax, -0x24(%rbp)
       57: eb d5                         jmp     0x2e <f+0x2e>
       59: 5d                            popq    %rbp
       5a: c3                            retq

There are some differences in constant representation, e.g. for jump offset hex numbers
the binutils objdump does not emit a '0x' prefix while llvm-objdump does. The insn at 4b
is also rendered differently. But overall the differences are small.

> dependency to test_progs, option (b) is the best. If folks on the
> mailing list agree with this, I can work on updating the patches.
>
> -------------
>
> Aside from testing, I agree with Andrii regarding rbp usage;
> it seems like it should be possible to do the following in the prologue:
>
>      movabs $0x...,%rsp
>      add %gs:0x...,%rsp
>      push %rbp
>
> and there would be no need to modify the translation for instructions
> accessing r10, plus debugger stack unwinding logic should still work?
> Or am I mistaken?
>
> Thanks,
> Eduard
Alexei Starovoitov July 25, 2024, 4:55 a.m. UTC | #22
On Wed, Jul 24, 2024 at 2:28 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> > In the following branch I explored a way to add such a capability:
> > https://github.com/eddyz87/bpf/tree/yhs-private-stack-plus-jit-testing
...
> > 0156b148b5b4 ("selftests/bpf: utility function to get program disassembly after jit")
..

> There are some differences in constant representation, e.g. for jump offset hex numbers
> the binutils objdump does not emit a '0x' prefix while llvm-objdump does. The insn at 4b
> is also rendered differently. But overall the differences are small.

There are certainly differences between the disasm libs that will cause
mismatches with the expected text.

Also not everyone has disasm enabled in bpftool.
In my setup:
$ bpftool prog dump jited id 1
Error: No JIT disassembly support

So we can enable such a feature in selftests,
but it would have to skip the tests if bpftool is not built
with the right disasm library, hence the value of such
tests would be small.

It's probably better to make test_progs use
LLVMDisasm* directly and converge on that disasm style
assuming distros have this lib easily available.
Eduard Zingerman July 25, 2024, 5:20 p.m. UTC | #23
On Wed, 2024-07-24 at 21:55 -0700, Alexei Starovoitov wrote:

[...]

> So we can enable such a feature in selftests,
> but it would have to skip the tests if bpftool is not built
> with the right disasm library, hence the value of such
> tests would be small.
> 
> It's probably better to make test_progs use
> LLVMDisasm* directly and converge on that disasm style
> assuming distros have this lib easily available.

I agree that the differences in the disassembly are too big.
As Yonghong suggested, I checked why bpftool has two disassemblers;
this is explained in commit [0]:

> ... To disassemble instructions for JIT-ed programs, bpftool has
> relied on the libbfd library. This has been problematic in the past:
> libbfd's interface is not meant to be stable and has changed several
> times ...

I'll update the disassembly patch to use the LLVM library
(or skip the test if the library is not available).

[0] eb9d1acf634b ("bpftool: Add LLVM as default library for disassembling JIT-ed programs")
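
For reference, a minimal sketch (illustrative only, not the actual
selftest plumbing) of driving the LLVM-C disassembler directly over a
raw x86-64 byte buffer; the real patch would feed the jited image of a
bpf prog instead:

/* Disassemble a small x86-64 buffer with the LLVM-C API. */
#include <stdio.h>
#include <stdint.h>
#include <llvm-c/Disassembler.h>
#include <llvm-c/Target.h>

int main(void)
{
	/* push %rbp; mov %rsp,%rbp; pop %rbp; ret */
	uint8_t code[] = { 0x55, 0x48, 0x89, 0xe5, 0x5d, 0xc3 };
	char line[128];
	uint64_t pc = 0;
	LLVMDisasmContextRef ctx;

	LLVMInitializeAllTargetInfos();
	LLVMInitializeAllTargetMCs();
	LLVMInitializeAllDisassemblers();

	ctx = LLVMCreateDisasm("x86_64-pc-linux-gnu", NULL, 0, NULL, NULL);
	if (!ctx)
		return 1;

	while (pc < sizeof(code)) {
		size_t len = LLVMDisasmInstruction(ctx, code + pc,
						   sizeof(code) - pc, pc,
						   line, sizeof(line));
		if (!len)
			break;
		printf("%4llx:%s\n", (unsigned long long)pc, line);
		pc += len;
	}
	LLVMDisasmDispose(ctx);
	return 0;
}

Linking would need the LLVM libs, e.g. something like
clang sketch.c $(llvm-config --cflags --ldflags --libs) -o sketch,
with the exact flags depending on how the distro packages LLVM.
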
diff mbox series

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d25d81c8ecc0..60f5d86fb6aa 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1309,6 +1309,22 @@  static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 	*pprog = prog;
 }
 
+static void emit_private_frame_ptr(u8 **pprog, void *private_frame_ptr)
+{
+	u8 *prog = *pprog;
+
+	/* movabs r9, private_frame_ptr */
+	emit_mov_imm64(&prog, X86_REG_R9, (long) private_frame_ptr >> 32,
+		       (u32) (long) private_frame_ptr);
+
+	/* add <r9>, gs:[<off>] */
+	EMIT2(0x65, 0x4c);
+	EMIT3(0x03, 0x0c, 0x25);
+	EMIT((u32)(unsigned long)&this_cpu_off, 4);
+
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
@@ -1324,18 +1340,25 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int insn_cnt = bpf_prog->len;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	u32 stack_depth = bpf_prog->aux->stack_depth;
+	void __percpu *private_frame_ptr = NULL;
 	u64 arena_vm_start, user_vm_start;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *prog = temp;
 	int err;
 
+	if (bpf_prog->private_stack_ptr) {
+		private_frame_ptr = bpf_prog->private_stack_ptr + round_up(stack_depth, 8);
+		stack_depth = 0;
+	}
+
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
 	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
 
 	detect_reg_usage(insn, insn_cnt, callee_regs_used);
 
-	emit_prologue(&prog, bpf_prog->aux->stack_depth,
+	emit_prologue(&prog, stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
 		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
 	/* Exception callback will clobber callee regs for its own use, and
@@ -1357,6 +1380,9 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		emit_mov_imm64(&prog, X86_REG_R12,
 			       arena_vm_start >> 32, (u32) arena_vm_start);
 
+	if (private_frame_ptr)
+		emit_private_frame_ptr(&prog, private_frame_ptr);
+
 	ilen = prog - temp;
 	if (rw_image)
 		memcpy(rw_image + proglen, temp, ilen);
@@ -1376,6 +1402,14 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		u8 *func;
 		int nops;
 
+		if (private_frame_ptr) {
+			if (src_reg == BPF_REG_FP)
+				src_reg = X86_REG_R9;
+
+			if (dst_reg == BPF_REG_FP)
+				dst_reg = X86_REG_R9;
+		}
+
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -2007,6 +2041,7 @@  st:			if (is_imm8(insn->off))
 				emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0);
 				/* Restore R0 after clobbering RAX */
 				emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
+
 				break;
 			}
 
@@ -2031,14 +2066,20 @@  st:			if (is_imm8(insn->off))
 
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
-				RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
+				RESTORE_TAIL_CALL_CNT(stack_depth);
 				ip += 7;
 			}
 			if (!imm32)
 				return -EINVAL;
+			if (private_frame_ptr) {
+				EMIT2(0x41, 0x51); /* push r9 */
+				ip += 2;
+			}
 			ip += x86_call_depth_emit_accounting(&prog, func, ip);
 			if (emit_call(&prog, func, ip))
 				return -EINVAL;
+			if (private_frame_ptr)
+				EMIT2(0x41, 0x59); /* pop r9 */
 			break;
 		}
 
@@ -2048,13 +2089,13 @@  st:			if (is_imm8(insn->off))
 							  &bpf_prog->aux->poke_tab[imm32 - 1],
 							  &prog, image + addrs[i - 1],
 							  callee_regs_used,
-							  bpf_prog->aux->stack_depth,
+							  stack_depth,
 							  ctx);
 			else
 				emit_bpf_tail_call_indirect(bpf_prog,
 							    &prog,
 							    callee_regs_used,
-							    bpf_prog->aux->stack_depth,
+							    stack_depth,
 							    image + addrs[i - 1],
 							    ctx);
 			break;
@@ -3218,6 +3259,7 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *rw_header = NULL;
 	struct bpf_binary_header *header = NULL;
+	void __percpu *private_stack_ptr = NULL;
 	struct bpf_prog *tmp, *orig_prog = prog;
 	struct x64_jit_data *jit_data;
 	int proglen, oldproglen = 0;
@@ -3284,6 +3326,15 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	ctx.cleanup_addr = proglen;
 skip_init_addrs:
 
+	if (bpf_enable_private_stack(prog) && !prog->private_stack_ptr) {
+		private_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 8, GFP_KERNEL);
+		if (!private_stack_ptr) {
+			prog = orig_prog;
+			goto out_addrs;
+		}
+		prog->private_stack_ptr = private_stack_ptr;
+	}
+
 	/*
 	 * JITed image shrinks with every pass and the loop iterates
 	 * until the image stops shrinking. Very large BPF programs
@@ -3309,6 +3360,10 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 				prog->jited = 0;
 				prog->jited_len = 0;
 			}
+			if (private_stack_ptr) {
+				free_percpu(private_stack_ptr);
+				prog->private_stack_ptr = NULL;
+			}
 			goto out_addrs;
 		}
 		if (image) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4f1d4a97b9d1..19a3f5355363 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1563,6 +1563,7 @@  struct bpf_prog {
 					    const struct bpf_insn *insn);
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
+	void __percpu		*private_stack_ptr;
 	/* Instructions for interpreter */
 	union {
 		DECLARE_FLEX_ARRAY(struct sock_filter, insns);
@@ -1819,6 +1820,7 @@  static inline void bpf_module_put(const void *data, struct module *owner)
 		module_put(owner);
 }
 int bpf_struct_ops_link_create(union bpf_attr *attr);
+bool bpf_enable_private_stack(struct bpf_prog *prog);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7ee62e38faf0..f69eb0c5fe03 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2813,6 +2813,26 @@  void bpf_prog_free(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
 
+bool bpf_enable_private_stack(struct bpf_prog *prog)
+{
+	if (prog->aux->stack_depth <= 64)
+		return false;
+
+	switch (prog->aux->prog->type) {
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return true;
+	case BPF_PROG_TYPE_TRACING:
+		if (prog->expected_attach_type != BPF_TRACE_ITER)
+			return true;
+		fallthrough;
+	default:
+		return false;
+	}
+}
+
 /* RNG for unprivileged user space with separated state from prandom_u32(). */
 static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 869265852d51..89162ddb4747 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2244,6 +2244,7 @@  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
+	free_percpu(aux->prog->private_stack_ptr);
 	free_uid(aux->user);
 	security_bpf_prog_free(aux->prog);
 	bpf_prog_free(aux->prog);