[bpf-next,v3,2/5] bpf: Collect stack depth information

Message ID 20240926234516.1770154-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Series: bpf: Support private stack for bpf progs

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 213 this patch: 213
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 8 maintainers not CCed: sdf@fomichev.me eddyz87@gmail.com haoluo@google.com jolsa@kernel.org song@kernel.org kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 272 this patch: 272
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6964 this patch: 6964
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat

Commit Message

Yonghong Song Sept. 26, 2024, 11:45 p.m. UTC
Private stack memory allocation is based on call subtrees. For example,

  main_prog     // stack size 50
    subprog1    // stack size 50
      subprog2  // stack size 50
    subprog3    // stack size 50

The overall allocation size should be 150 bytes: the maximum stack
usage along any call chain, here main_prog -> subprog1 -> subprog2
(the main_prog -> subprog3 chain needs only 100 bytes).

To simplify the JIT, the root of a subtree is either the main prog
or any callback func. For example,

  main_prog
    subprog1    // callback subprog10
      ...
        subprog10
          subprog11

In this case, two subtrees exist. One root is main_prog and the other
root is subprog10.
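
For illustration, the accounting on the first example can be reduced
to a minimal standalone sketch (illustrative only; the names are made
up and the real walk operates on the verifier's subprog info):

  /* Subtree allocation = deepest accumulated stack usage along any
   * call chain from the subtree root, not the sum of all subprogs.
   */
  #include <stdio.h>

  #define MAX_PROGS 4

  static const int stack_size[MAX_PROGS] = { 50, 50, 50, 50 };
  /* main_prog calls subprog1 and subprog3; subprog1 calls subprog2 */
  static const int calls[MAX_PROGS][MAX_PROGS] = {
          { 1, 3, -1 }, { 2, -1 }, { -1 }, { -1 },
  };

  static int subtree_depth(int idx)
  {
          int max_child = 0;

          for (int i = 0; calls[idx][i] != -1; i++) {
                  int d = subtree_depth(calls[idx][i]);

                  if (d > max_child)
                          max_child = d;
          }
          return stack_size[idx] + max_child;
  }

  int main(void)
  {
          /* main_prog -> subprog1 -> subprog2: 50 + 50 + 50 = 150 */
          printf("%d\n", subtree_depth(0));
          return 0;
  }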

The private stack is used only if
 - the subtree stack size is at least 128 bytes and
   no larger than U16_MAX, and
 - the prog type is kprobe, tracepoint, perf_event, raw_tracepoint
   or tracing, and
 - the jit supports private stack, and
 - there is no tail call in the main prog or any subprog

The no-tail-call restriction exists for two reasons:
 - to avoid potentially large memory consumption. The current maximum
   tail call count is MAX_TAIL_CALL_CNT=33, and private stack memory
   is allocated per cpu, so supporting the current MAX_TAIL_CALL_CNT
   would consume a very large amount of memory.
 - if the tail call is in a callback function, it is not easy to pass
   the tail call count to the callback function, and that count is
   needed to find the proper offset into the private stack.
So to avoid complexity, private stack does not support tail calls
for now.
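
Putting the conditions together, the gate is roughly the following
predicate (a sketch with placeholder parameters, not the patch's code;
the real checks are spread across the verifier and the JIT):

  #include <stdbool.h>

  #define BPF_PSTACK_MIN_SUBTREE_SIZE 128
  #define U16_MAX 0xffff

  /* sketch only: each flag stands in for a check done elsewhere */
  static bool priv_stack_usable(int subtree_stack_size, bool prog_type_ok,
                                bool jit_supports_priv_stack,
                                bool has_tail_call)
  {
          return subtree_stack_size >= BPF_PSTACK_MIN_SUBTREE_SIZE &&
                 subtree_stack_size <= U16_MAX &&
                 prog_type_ok && jit_supports_priv_stack &&
                 !has_tail_call;
  }

As a back-of-envelope illustration of the memory concern (assumed
numbers): private stacks are per-cpu and each tail call level would
need its own region, so e.g. 4KB per subtree * MAX_TAIL_CALL_CNT (33)
* 128 CPUs is already ~16.5MB for a single program.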

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h          |  3 +-
 include/linux/bpf_verifier.h |  3 ++
 kernel/bpf/verifier.c        | 81 ++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)

Comments

Alexei Starovoitov Sept. 30, 2024, 2:42 p.m. UTC | #1
On Thu, Sep 26, 2024 at 4:45 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Private stack memory allocation is based on call subtrees. For example,
>
>   main_prog     // stack size 50
>     subprog1    // stack size 50
>       subprog2  // stack size 50
>     subprog3    // stack size 50
>
> The overall allocation size should be 150 bytes: the maximum stack
> usage along any call chain, here main_prog -> subprog1 -> subprog2
> (the main_prog -> subprog3 chain needs only 100 bytes).
>
> To simplify the JIT, the root of a subtree is either the main prog
> or any callback func. For example,
>
>   main_prog
>     subprog1    // callback subprog10
>       ...
>         subprog10
>           subprog11

This is an odd example.
We have MAX_CALL_FRAMES = 8
So there cannot be more than 512 * 8 = 4k of stack.

>
> In this case, two subtrees exist. One root is main_prog and the other
> root is subprog10.
>
> The private stack is used only if
>  - the subtree stack size is at least 128 bytes and
>    no larger than U16_MAX, and

U16 limit looks odd too.
Since we're not bumping MAX_CALL_FRAMES at the moment
let's limit to 4k.

>  - the prog type is kprobe, tracepoint, perf_event, raw_tracepoint
>    or tracing, and
>  - the jit supports private stack, and
>  - there is no tail call in the main prog or any subprog
>
> The no-tail-call restriction exists for two reasons:
>  - to avoid potentially large memory consumption. The current maximum
>    tail call count is MAX_TAIL_CALL_CNT=33, and private stack memory
>    is allocated per cpu, so supporting the current MAX_TAIL_CALL_CNT
>    would consume a very large amount of memory.
>  - if the tail call is in a callback function, it is not easy to pass
>    the tail call count to the callback function, and that count is
>    needed to find the proper offset into the private stack.
> So to avoid complexity, private stack does not support tail calls
> for now.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h          |  3 +-
>  include/linux/bpf_verifier.h |  3 ++
>  kernel/bpf/verifier.c        | 81 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 86 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 62909fbe9e48..156b9516d9f6 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1566,7 +1566,8 @@ struct bpf_prog {
>                                 call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
>                                 call_get_func_ip:1, /* Do we call get_func_ip() */
>                                 tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
> -                               sleepable:1;    /* BPF program is sleepable */
> +                               sleepable:1,    /* BPF program is sleepable */
> +                               pstack_eligible:1; /* Candidate for private stacks */

I'm struggling with this abbreviation.
pstack is just too ambiguous.
It means 'pointer stack' in perf.
'man pstack' means 'print stack of a process'.
Let's use something more concrete.

How about priv_stack ?
And use it this way in all other names.
Instead of:
calc_private_stack_alloc_subprog
do:
calc_priv_stack_alloc_subprog

>         enum bpf_prog_type      type;           /* Type of BPF program */
>         enum bpf_attach_type    expected_attach_type; /* For some prog types */
>         u32                     len;            /* Number of filter blocks */
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 4513372c5bc8..63df10f4129e 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>          * are used for bpf_fastcall spills and fills.
>          */
>         s16 fastcall_stack_off;
> +       u16 subtree_stack_depth;
> +       u16 subtree_top_idx;
>         bool has_tail_call: 1;
>         bool tail_call_reachable: 1;
>         bool has_ld_abs: 1;
> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>         bool args_cached: 1;
>         /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>         bool keep_fastcall_stack: 1;
> +       bool pstack_eligible:1;
>
>         u8 arg_cnt;
>         struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 97700e32e085..69e17cb22037 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>
>  #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>
> +#define BPF_PSTACK_MIN_SUBTREE_SIZE    128
> +
>  static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>  static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>  static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
> @@ -6192,6 +6194,82 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>         return 0;
>  }
>
> +static int calc_private_stack_alloc_subprog(struct bpf_verifier_env *env, int idx)
> +{
> +       struct bpf_subprog_info *subprog = env->subprog_info;
> +       struct bpf_insn *insn = env->prog->insnsi;
> +       int depth = 0, frame = 0, i, subprog_end;
> +       int ret_insn[MAX_CALL_FRAMES];
> +       int ret_prog[MAX_CALL_FRAMES];
> +       int ps_eligible = 0;
> +       int orig_idx = idx;
> +
> +       subprog[idx].subtree_top_idx = idx;
> +       i = subprog[idx].start;
> +
> +process_func:
> +       depth += round_up_stack_depth(env, subprog[idx].stack_depth);
> +       if (depth > U16_MAX)
> +               return -EACCES;
> +
> +       if (!ps_eligible && depth >= BPF_PSTACK_MIN_SUBTREE_SIZE) {
> +               subprog[orig_idx].pstack_eligible = true;
> +               ps_eligible = true;
> +       }
> +       subprog[orig_idx].subtree_stack_depth =
> +               max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
> +
> +continue_func:
> +       subprog_end = subprog[idx + 1].start;
> +       for (; i < subprog_end; i++) {
> +               int next_insn, sidx;
> +
> +               if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
> +                       continue;
> +               /* remember insn and function to return to */
> +               ret_insn[frame] = i + 1;
> +               ret_prog[frame] = idx;
> +
> +               /* find the callee */
> +               next_insn = i + insn[i].imm + 1;
> +               sidx = find_subprog(env, next_insn);
> +               if (subprog[sidx].is_cb) {
> +                       if (!bpf_pseudo_call(insn + i))
> +                               continue;
> +               }
> +               i = next_insn;
> +               idx = sidx;
> +               subprog[idx].subtree_top_idx = orig_idx;
> +
> +               frame++;
> +               goto process_func;
> +       }
> +       if (frame == 0)
> +               return ps_eligible;
> +       depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
> +       frame--;
> +       i = ret_insn[frame];
> +       idx = ret_prog[frame];
> +       goto continue_func;
> +}

This looks very similar to check_max_stack_depth_subprog()
Can you share the code?
Yonghong Song Sept. 30, 2024, 4:23 p.m. UTC | #2
On 9/30/24 7:42 AM, Alexei Starovoitov wrote:
> On Thu, Sep 26, 2024 at 4:45 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> Private stack memory allocation is based on call subtrees. For example,
>>
>>    main_prog     // stack size 50
>>      subprog1    // stack size 50
>>        subprog2  // stack size 50
>>      subprog3    // stack size 50
>>
>> The overall allocation size should be 150 bytes: the maximum stack
>> usage along any call chain, here main_prog -> subprog1 -> subprog2
>> (the main_prog -> subprog3 chain needs only 100 bytes).
>>
>> To simplify the JIT, the root of a subtree is either the main prog
>> or any callback func. For example,
>>
>>    main_prog
>>      subprog1    // callback subprog10
>>        ...
>>          subprog10
>>            subprog11
> This is an odd example.
> We have MAX_CALL_FRAMES = 8
> So there cannot be more than 512 * 8 = 4k of stack.
>
>> In this case, two subtrees exist. One root is main_prog and the other
>> root is subprog10.
>>
>> The private stack is used only if
>>   - the subtree stack size is at least 128 bytes and
>>     no larger than U16_MAX, and
> U16 limit looks odd too.
> Since we're not bumping MAX_CALL_FRAMES at the moment
> let's limit to 4k.

Makes sense. I missed this. Will make the change.

>
>>   - the prog type is kprobe, tracepoint, perf_event, raw_tracepoint
>>     or tracing, and
>>   - the jit supports private stack, and
>>   - there is no tail call in the main prog or any subprog
>>
>> The no-tail-call restriction exists for two reasons:
>>   - to avoid potentially large memory consumption. The current maximum
>>     tail call count is MAX_TAIL_CALL_CNT=33, and private stack memory
>>     is allocated per cpu, so supporting the current MAX_TAIL_CALL_CNT
>>     would consume a very large amount of memory.
>>   - if the tail call is in a callback function, it is not easy to pass
>>     the tail call count to the callback function, and that count is
>>     needed to find the proper offset into the private stack.
>> So to avoid complexity, private stack does not support tail calls
>> for now.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h          |  3 +-
>>   include/linux/bpf_verifier.h |  3 ++
>>   kernel/bpf/verifier.c        | 81 ++++++++++++++++++++++++++++++++++++
>>   3 files changed, 86 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 62909fbe9e48..156b9516d9f6 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1566,7 +1566,8 @@ struct bpf_prog {
>>                                  call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
>>                                  call_get_func_ip:1, /* Do we call get_func_ip() */
>>                                  tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
>> -                               sleepable:1;    /* BPF program is sleepable */
>> +                               sleepable:1,    /* BPF program is sleepable */
>> +                               pstack_eligible:1; /* Candidate for private stacks */
> I'm struggling with this abbreviation.
> pstack is just too ambiguous.
> It means 'pointer stack' in perf.
> 'man pstack' means 'print stack of a process'.
> Let's use something more concrete.
>
> How about priv_stack ?
> And use it this way in all other names.
> Instead of:
> calc_private_stack_alloc_subprog
> do:
> calc_priv_stack_alloc_subprog

I was using pstack to make the name shorter, but it may not convey
the right information. So agreed, let me use priv_stack then.

>
>>          enum bpf_prog_type      type;           /* Type of BPF program */
>>          enum bpf_attach_type    expected_attach_type; /* For some prog types */
>>          u32                     len;            /* Number of filter blocks */
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 4513372c5bc8..63df10f4129e 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>>           * are used for bpf_fastcall spills and fills.
>>           */
>>          s16 fastcall_stack_off;
>> +       u16 subtree_stack_depth;
>> +       u16 subtree_top_idx;
>>          bool has_tail_call: 1;
>>          bool tail_call_reachable: 1;
>>          bool has_ld_abs: 1;
>> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>>          bool args_cached: 1;
>>          /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>>          bool keep_fastcall_stack: 1;
>> +       bool pstack_eligible:1;
>>
>>          u8 arg_cnt;
>>          struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 97700e32e085..69e17cb22037 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>>
>>   #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>>
>> +#define BPF_PSTACK_MIN_SUBTREE_SIZE    128
>> +
>>   static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>>   static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>>   static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
>> @@ -6192,6 +6194,82 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>>          return 0;
>>   }
>>
>> +static int calc_private_stack_alloc_subprog(struct bpf_verifier_env *env, int idx)
>> +{
>> +       struct bpf_subprog_info *subprog = env->subprog_info;
>> +       struct bpf_insn *insn = env->prog->insnsi;
>> +       int depth = 0, frame = 0, i, subprog_end;
>> +       int ret_insn[MAX_CALL_FRAMES];
>> +       int ret_prog[MAX_CALL_FRAMES];
>> +       int ps_eligible = 0;
>> +       int orig_idx = idx;
>> +
>> +       subprog[idx].subtree_top_idx = idx;
>> +       i = subprog[idx].start;
>> +
>> +process_func:
>> +       depth += round_up_stack_depth(env, subprog[idx].stack_depth);
>> +       if (depth > U16_MAX)
>> +               return -EACCES;
>> +
>> +       if (!ps_eligible && depth >= BPF_PSTACK_MIN_SUBTREE_SIZE) {
>> +               subprog[orig_idx].pstack_eligible = true;
>> +               ps_eligible = true;
>> +       }
>> +       subprog[orig_idx].subtree_stack_depth =
>> +               max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
>> +
>> +continue_func:
>> +       subprog_end = subprog[idx + 1].start;
>> +       for (; i < subprog_end; i++) {
>> +               int next_insn, sidx;
>> +
>> +               if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
>> +                       continue;
>> +               /* remember insn and function to return to */
>> +               ret_insn[frame] = i + 1;
>> +               ret_prog[frame] = idx;
>> +
>> +               /* find the callee */
>> +               next_insn = i + insn[i].imm + 1;
>> +               sidx = find_subprog(env, next_insn);
>> +               if (subprog[sidx].is_cb) {
>> +                       if (!bpf_pseudo_call(insn + i))
>> +                               continue;
>> +               }
>> +               i = next_insn;
>> +               idx = sidx;
>> +               subprog[idx].subtree_top_idx = orig_idx;
>> +
>> +               frame++;
>> +               goto process_func;
>> +       }
>> +       if (frame == 0)
>> +               return ps_eligible;
>> +       depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
>> +       frame--;
>> +       i = ret_insn[frame];
>> +       idx = ret_prog[frame];
>> +       goto continue_func;
>> +}
> This looks very similar to check_max_stack_depth_subprog()
> Can you share the code?

Yes. It is similar, except that it drops some code from the original
check_max_stack_depth_subprog(). I thought this was cleaner, but I
agree it duplicates the code. Let me try to share at least some
code with check_max_stack_depth_subprog().
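
For illustration, one possible shape of that sharing, reduced to a
self-contained toy (names invented here; a real refactor would thread
a hook through the existing process_func/continue_func walk):

  #include <stdio.h>

  #define NPROGS 3

  static const int stack_size[NPROGS] = { 50, 50, 50 };
  static const int callee[NPROGS] = { 1, 2, -1 }; /* main -> sub1 -> sub2 */

  struct walk_state {
          int max_depth;     /* for the max-stack-depth check */
          int priv_eligible; /* for private stack accounting */
  };

  typedef void (*frame_hook)(int idx, int depth, struct walk_state *st);

  /* one walk, shared by both users via a per-frame hook */
  static void walk(int idx, int depth, frame_hook hook, struct walk_state *st)
  {
          depth += stack_size[idx];
          hook(idx, depth, st);
          if (callee[idx] != -1)
                  walk(callee[idx], depth, hook, st);
  }

  static void check_frame(int idx, int depth, struct walk_state *st)
  {
          (void)idx;
          if (depth > st->max_depth)
                  st->max_depth = depth;
          if (depth >= 128)
                  st->priv_eligible = 1;
  }

  int main(void)
  {
          struct walk_state st = { 0, 0 };

          walk(0, 0, check_frame, &st);
          printf("max depth %d, eligible %d\n",
                 st.max_depth, st.priv_eligible);
          return 0;
  }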

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 62909fbe9e48..156b9516d9f6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1566,7 +1566,8 @@  struct bpf_prog {
 				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
 				call_get_func_ip:1, /* Do we call get_func_ip() */
 				tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
-				sleepable:1;	/* BPF program is sleepable */
+				sleepable:1,	/* BPF program is sleepable */
+				pstack_eligible:1; /* Candidate for private stacks */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4513372c5bc8..63df10f4129e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -659,6 +659,8 @@  struct bpf_subprog_info {
 	 * are used for bpf_fastcall spills and fills.
 	 */
 	s16 fastcall_stack_off;
+	u16 subtree_stack_depth;
+	u16 subtree_top_idx;
 	bool has_tail_call: 1;
 	bool tail_call_reachable: 1;
 	bool has_ld_abs: 1;
@@ -668,6 +670,7 @@  struct bpf_subprog_info {
 	bool args_cached: 1;
 	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
 	bool keep_fastcall_stack: 1;
+	bool pstack_eligible:1;
 
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 97700e32e085..69e17cb22037 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,8 @@  struct bpf_verifier_stack_elem {
 
 #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
 
+#define BPF_PSTACK_MIN_SUBTREE_SIZE	128
+
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -6192,6 +6194,82 @@  static int check_max_stack_depth(struct bpf_verifier_env *env)
 	return 0;
 }
 
+static int calc_private_stack_alloc_subprog(struct bpf_verifier_env *env, int idx)
+{
+	struct bpf_subprog_info *subprog = env->subprog_info;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int depth = 0, frame = 0, i, subprog_end;
+	int ret_insn[MAX_CALL_FRAMES];
+	int ret_prog[MAX_CALL_FRAMES];
+	int ps_eligible = 0;
+	int orig_idx = idx;
+
+	subprog[idx].subtree_top_idx = idx;
+	i = subprog[idx].start;
+
+process_func:
+	depth += round_up_stack_depth(env, subprog[idx].stack_depth);
+	if (depth > U16_MAX)
+		return -EACCES;
+
+	if (!ps_eligible && depth >= BPF_PSTACK_MIN_SUBTREE_SIZE) {
+		subprog[orig_idx].pstack_eligible = true;
+		ps_eligible = true;
+	}
+	subprog[orig_idx].subtree_stack_depth =
+		max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
+
+continue_func:
+	subprog_end = subprog[idx + 1].start;
+	for (; i < subprog_end; i++) {
+		int next_insn, sidx;
+
+		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
+			continue;
+		/* remember insn and function to return to */
+		ret_insn[frame] = i + 1;
+		ret_prog[frame] = idx;
+
+		/* find the callee */
+		next_insn = i + insn[i].imm + 1;
+		sidx = find_subprog(env, next_insn);
+		if (subprog[sidx].is_cb) {
+			if (!bpf_pseudo_call(insn + i))
+				continue;
+		}
+		i = next_insn;
+		idx = sidx;
+		subprog[idx].subtree_top_idx = orig_idx;
+
+		frame++;
+		goto process_func;
+	}
+	if (frame == 0)
+		return ps_eligible;
+	depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+	frame--;
+	i = ret_insn[frame];
+	idx = ret_prog[frame];
+	goto continue_func;
+}
+
+static int calc_private_stack_alloc_size(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *si = env->subprog_info;
+	int ret;
+
+	for (int i = 0; i < env->subprog_cnt; i++) {
+		if (!i || si[i].is_cb) {
+			ret = calc_private_stack_alloc_subprog(env, i);
+			if (ret < 0)
+				return ret;
+			if (ret)
+				env->prog->pstack_eligible = true;
+		}
+	}
+	return 0;
+}
+
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 static int get_callee_stack_depth(struct bpf_verifier_env *env,
 				  const struct bpf_insn *insn, int idx)
@@ -22502,6 +22580,9 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 								     : false;
 	}
 
+	if (ret == 0 && env->prog->aux->pstack_enabled)
+		ret = calc_private_stack_alloc_size(env);
+
 	if (ret == 0)
 		ret = fixup_call_args(env);