
[bpf-next,v3,3/5] bpf: Inline calls to bpf_loop when callback is known

Message ID 20220603141047.2163170-4-eddyz87@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf_loop inlining

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 49 this patch: 49
netdev/cc_maintainers warning 6 maintainers not CCed: netdev@vger.kernel.org songliubraving@fb.com yhs@fb.com john.fastabend@gmail.com kafai@fb.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 6 this patch: 6
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 49 this patch: 49
netdev/checkpatch warning CHECK: multiple assignments should be avoided WARNING: line length of 81 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15

Commit Message

Eduard Zingerman June 3, 2022, 2:10 p.m. UTC
Calls to `bpf_loop` are replaced with direct loops to avoid
indirection. E.g. the following:

  bpf_loop(10, foo, NULL, 0);

is replaced by the equivalent of the following:

  for (int i = 0; i < 10; ++i)
    foo(i, NULL);

This transformation can be applied when:
- the callback is known and does not change during program execution;
- the flags passed to `bpf_loop` are always zero.

The inlining logic works as follows:

- During execution simulation, the function `update_loop_inline_state`
  tracks the following information for each `bpf_loop` call
  instruction:
  - is the callback known and constant?
  - are the flags constant and zero?
- The function `adjust_stack_depth_for_loop_inlining` increases the
  stack depth of functions where `bpf_loop` calls could be inlined.
  This is needed to spill registers R6, R7 and R8, which are used as
  the loop counter, the maximum loop bound and the callback context
  parameter;
- The function `inline_bpf_loop`, called from `do_misc_fixups`, replaces
  `bpf_loop` calls that are fit for inlining with the corresponding loop
  instructions (sketched below).
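
For reference, the emitted sequence behaves roughly like the C sketch
below (illustrative only: the actual BPF instructions are built by
`inline_bpf_loop`, and registers R6, R7 and R8 are spilled to the stack
before the loop and restored after it):

  if (nr_loops > BPF_MAX_LOOPS)
          return -E2BIG;
  for (i = 0; i < nr_loops; i++) {
          ret = callback(i, callback_ctx);
          if (ret)        /* non-zero callback return stops the loop */
                  return i + 1;
  }
  return i;               /* R0 is the number of iterations performed */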

Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
---
 include/linux/bpf_verifier.h |  16 +++
 kernel/bpf/bpf_iter.c        |   9 +-
 kernel/bpf/verifier.c        | 199 ++++++++++++++++++++++++++++++++++-
 3 files changed, 215 insertions(+), 9 deletions(-)

Comments

Song Liu June 3, 2022, 10:36 p.m. UTC | #1
On Fri, Jun 3, 2022 at 7:11 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>

[...]

>
> Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
> i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.
>
> Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
> ---
[...]
>
> -/* maximum number of loops */
> -#define MAX_LOOPS      BIT(23)
> -
>  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>            u64, flags)
>  {
> @@ -733,9 +731,12 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>         u64 ret;
>         u32 i;
>
> +       /* note: these safety checks are also verified when bpf_loop is inlined,
> +        * be careful to modify this code in sync

Let's call out inline_bpf_loop() here to be more clear.

[...]

>

Let's also call out here that this is the inlined version of bpf_iter.c:bpf_loop

> +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
> +                                       int position, u32 *cnt)
> +{
> +       struct bpf_insn_aux_data *aux = &env->insn_aux_data[position];
> +       s32 stack_base = aux->loop_inline_state.stack_base;
> +       s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
> +       s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
> +       s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
> +       int reg_loop_max = BPF_REG_6;
> +       int reg_loop_cnt = BPF_REG_7;
> +       int reg_loop_ctx = BPF_REG_8;
> +
> +       struct bpf_prog *new_prog;
> +       u32 callback_subprogno = aux->loop_inline_state.callback_subprogno;
> +       u32 callback_start;
> +       u32 call_insn_offset;
> +       s32 callback_offset;
> +       struct bpf_insn insn_buf[19];
> +       struct bpf_insn *next = insn_buf;
> +       struct bpf_insn *call, *jump_to_end, *loop_header;
> +       struct bpf_insn *jump_to_header, *loop_exit;
> +
> +       /* Return error and jump to the end of the patch if
> +        * expected number of iterations is too big.  This
> +        * repeats the check done in bpf_loop helper function,
> +        * be careful to modify this code in sync.
> +        */
> +       (*next++) = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
> +       (*next++) = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
> +       jump_to_end = next;
> +       (*next++) = BPF_JMP_IMM(BPF_JA, 0, 0, 0 /* set below */);
> +       /* spill R6, R7, R8 to use these as loop vars */
> +       (*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
> +       (*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
> +       (*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
> +       /* initialize loop vars */
> +       (*next++) = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
> +       (*next++) = BPF_MOV32_IMM(reg_loop_cnt, 0);
> +       (*next++) = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
> +       /* loop header;
> +        * if reg_loop_cnt >= reg_loop_max skip the loop body
> +        */
> +       loop_header = next;
> +       (*next++) = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max,
> +                               0 /* set below */);
> +       /* callback call */
> +       (*next++) = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
> +       (*next++) = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> +       call = next;
> +       (*next++) = BPF_CALL_REL(0 /* set below after patching */);
> +       /* increment loop counter */
> +       (*next++) = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
> +       /* jump to loop header if callback returned 0 */
> +       jump_to_header = next;
> +       (*next++) = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0 /* set below */);
> +       /* return value of bpf_loop;
> +        * set R0 to the number of iterations
> +        */
> +       loop_exit = next;
> +       (*next++) = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
> +       /* restore original values of R6, R7, R8 */
> +       (*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
> +       (*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
> +       (*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
> +
> +       *cnt = next - insn_buf;
> +       if (*cnt > ARRAY_SIZE(insn_buf)) {
> +               WARN_ONCE(1, "BUG %s: 'next' exceeds bounds for 'insn_buf'\n",
> +                         __func__);
> +               return NULL;
> +       }
> +       jump_to_end->off = next - jump_to_end - 1;
> +       loop_header->off = loop_exit - loop_header - 1;
> +       jump_to_header->off = loop_header - jump_to_header - 1;

I know these changes were made based on my feedback on v1. But it seems
v1 is actually cleaner. How about we use v1, but add comments on the offsets
that need to be redone when we make changes to insn_buf[]?
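
For example, something along these lines (purely illustrative; the
offsets below are just meant to show the kind of comment I have in mind):

        /* NOTE: jump offsets below are relative positions inside
         * insn_buf[] and must be updated whenever instructions are
         * added to or removed from the patch.
         */
        insn_buf[0] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
        insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
        insn_buf[2] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); /* to the end of the patch */
        ...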



> +
> +       new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
> +       if (!new_prog)
> +               return new_prog;
> +
> +       /* callback start is known only after patching */
> +       callback_start = env->subprog_info[callback_subprogno].start;
> +       call_insn_offset = position + (call - insn_buf);
> +       callback_offset = callback_start - call_insn_offset - 1;
> +       env->prog->insnsi[call_insn_offset].imm = callback_offset;
> +
> +       return new_prog;
> +}
> +
>  /* Do various post-verification rewrites in a single program pass.
>   * These rewrites simplify JIT and interpreter implementations.
>   */
> @@ -14258,6 +14432,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         continue;
>                 }
>
> +               if (insn->imm == BPF_FUNC_loop &&
> +                   fit_for_bpf_loop_inline(&env->insn_aux_data[i + delta])) {
> +                       new_prog = inline_bpf_loop(env, i + delta, &cnt);
> +                       if (!new_prog)
> +                               return -ENOMEM;
> +
> +                       delta    += cnt - 1;
> +                       env->prog = prog = new_prog;
> +                       insn      = new_prog->insnsi + i + delta;
> +                       continue;
> +               }
> +
>  patch_call_imm:
>                 fn = env->ops->get_func_proto(insn->imm, env->prog);
>                 /* all functions that have prototype and verifier allowed
> @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
>         if (ret == 0)
>                 ret = check_max_stack_depth(env);
>
> +       if (ret == 0)
> +               adjust_stack_depth_for_loop_inlining(env);
> +
>         /* instruction rewrites happen after this point */
>         if (is_priv) {
>                 if (ret == 0)
> --
> 2.25.1
>
Joanne Koong June 4, 2022, 12:06 a.m. UTC | #2
On Fri, Jun 3, 2022 at 11:51 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> Calls to `bpf_loop` are replaced with direct loops to avoid
> indirection. E.g. the following:
>
>   bpf_loop(10, foo, NULL, 0);
>
> Is replaced by equivalent of the following:
>
>   for (int i = 0; i < 10; ++i)
>     foo(i, NULL);
>
> This transformation could be applied when:
> - callback is known and does not change during program execution;
> - flags passed to `bpf_loop` are always zero.
>
> Inlining logic works as follows:
>
> - During execution simulation function `update_loop_inline_state`
>   tracks the following information for each `bpf_loop` call
>   instruction:
>   - is callback known and constant?
>   - are flags constant and zero?
> - Function `adjust_stack_depth_for_loop_inlining` increases stack
>   depth for functions where `bpf_loop` calls could be inlined. This is
>   needed to spill registers R6, R7 and R8. These registers are used as
>   loop counter, loop maximal bound and callback context parameter;
> - Function `inline_bpf_loop` called from `do_misc_fixups` replaces
>   `bpf_loop` calls fit for inlining with corresponding loop
>   instructions.
>
> Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
> i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.
>
> Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
> ---
>  include/linux/bpf_verifier.h |  16 +++
>  kernel/bpf/bpf_iter.c        |   9 +-
>  kernel/bpf/verifier.c        | 199 ++++++++++++++++++++++++++++++++++-
>  3 files changed, 215 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index e8439f6cbe57..80279616a76b 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -20,6 +20,8 @@
>  #define BPF_MAX_VAR_SIZ        (1 << 29)
>  /* size of type_str_buf in bpf_verifier. */
>  #define TYPE_STR_BUF_LEN 64
> +/* Maximum number of loops for bpf_loop */
> +#define BPF_MAX_LOOPS  BIT(23)

Should this be moved to include/linux/bpf.h since
kernel/bpf/bpf_iter.c also uses this? Then we don't need the "#include
<linux/bpf_verifier.h>" in bpf_iter.c

>
>  /* Liveness marks, used for registers and spilled-regs (in stack slots).
>   * Read marks propagate upwards until they find a write mark; they record that
> @@ -344,6 +346,16 @@ struct bpf_verifier_state_list {
>         int miss_cnt, hit_cnt;
>  };
>
> +struct bpf_loop_inline_state {
> +       u32 initialized:1; /* set to true upon first entry */
> +       u32 callback_is_constant:1; /* true if callback function
> +                                    * is the same at each call
> +                                    */
> +       u32 flags_is_zero:1; /* true if flags register is zero at each call */

I think the more straightforward

bool initialized;
bool callback_is_constant;
bool flags_is_zero;

ends up being the same size as using bit fields.

If your intention was to use bit fields to try to minimize the size of
the struct, I think we could do something like:

bool initialized:1;
bool callback_is_constant:1;
bool flags_is_zero:1;
 /* u16 since bpf_subprog_info.stack_depth is u16; we take the
negative of it whenever we use it since the stack grows downwards */
u16 stack_depth;
u32 callback_subprogno;

to make the struct more compact.

Also, as a side-note, I think if we have an "is_valid" field, then we
don't need both the "callback_is_constant" and "flags_is_zero" fields.
If the flags are not zero or the callback subprogno is not the same on
each invocation of the instruction, then we could represent that by
just setting is_valid to false.
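
A rough sketch of what update_loop_inline_state could then look like
(assuming a single "is_valid" bit; the field name is just a suggestion):

        if (!state->initialized) {
                state->initialized = 1;
                state->is_valid = flags_is_zero;
                state->callback_subprogno = subprogno;
        } else if (!flags_is_zero ||
                   state->callback_subprogno != subprogno) {
                state->is_valid = 0;
        }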

> +       u32 callback_subprogno; /* valid when callback_is_constant == 1 */
> +       s32 stack_base; /* stack offset for loop vars */
> +};
> +
>  /* Possible states for alu_state member. */
>  #define BPF_ALU_SANITIZE_SRC           (1U << 0)
>  #define BPF_ALU_SANITIZE_DST           (1U << 1)
> @@ -380,6 +392,10 @@ struct bpf_insn_aux_data {
>         bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
>         bool zext_dst; /* this insn zero extends dst reg */
>         u8 alu_state; /* used in combination with alu_limit */
> +       /* if instruction is a call to bpf_loop this field tracks
> +        * the state of the relevant registers to take decision about inlining
> +        */
> +       struct bpf_loop_inline_state loop_inline_state;

Is placing this inside the union in "struct bpf_insn_aux_data" an option?

>
>         /* below fields are initialized once */
>         unsigned int orig_idx; /* original instruction index */
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index d5d96ceca105..cdb898fce118 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -6,6 +6,7 @@
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>  #include <linux/rcupdate_trace.h>
> +#include <linux/bpf_verifier.h>
>
>  struct bpf_iter_target_info {
>         struct list_head list;
> @@ -723,9 +724,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
>         .arg4_type      = ARG_ANYTHING,
>  };
>
> -/* maximum number of loops */
> -#define MAX_LOOPS      BIT(23)
> -
>  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>            u64, flags)
>  {
> @@ -733,9 +731,12 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>         u64 ret;
>         u32 i;
>
> +       /* note: these safety checks are also verified when bpf_loop is inlined,
> +        * be careful to modify this code in sync
> +        */
>         if (flags)
>                 return -EINVAL;
> -       if (nr_loops > MAX_LOOPS)
> +       if (nr_loops > BPF_MAX_LOOPS)
>                 return -E2BIG;
>
>         for (i = 0; i < nr_loops; i++) {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index aedac2ac02b9..27d78fe6c3f9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7103,6 +7103,78 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
>         return -ENOTSUPP;
>  }
>
> +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
> +{
> +       return &env->insn_aux_data[env->insn_idx];
> +}
> +
> +static bool fit_for_bpf_loop_inline(struct bpf_insn_aux_data *insn_aux)
> +{
> +       return insn_aux->loop_inline_state.initialized &&
> +               insn_aux->loop_inline_state.flags_is_zero &&
> +               insn_aux->loop_inline_state.callback_is_constant;
> +}
> +
> +/* For all sub-programs in the program (including main) checks

nit: "check" without the extra s :)

> + * insn_aux_data to see if there are bpf_loop calls that require
> + * inlining. If such calls are found subprog stack_depth is increased
> + * by the size of 3 registers. Reserved space would be used in the
> + * do_misc_fixups to spill values of the R6, R7, R8 to use these
> + * registers for loop iteration.
> + */
> +static void adjust_stack_depth_for_loop_inlining(struct bpf_verifier_env *env)
> +{
> +       int i, subprog_end, cur_subprog = 0;
> +       struct bpf_subprog_info *subprogs = env->subprog_info;

nit: I think this needs to be above the "int i, subprog_end..." line
since it's longer

> +       int insn_cnt = env->prog->len;
> +       bool subprog_updated = false;
> +       s32 stack_base;
> +
> +       subprog_end = (env->subprog_cnt > 1
> +                      ? subprogs[cur_subprog + 1].start
> +                      : insn_cnt);
> +       for (i = 0; i < insn_cnt; i++) {
> +               struct bpf_insn_aux_data *aux = &env->insn_aux_data[i];

Maybe this should be "struct bpf_loop_inline_state *inline_state =
&env->insn_aux_data[i].loop_inline_state" since we only use aux for
aux->loop_inline_state

> +
> +               if (fit_for_bpf_loop_inline(aux)) {
> +                       if (!subprog_updated) {
> +                               subprog_updated = true;
> +                               subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
> +                               stack_base = -subprogs[cur_subprog].stack_depth;
> +                       }
> +                       aux->loop_inline_state.stack_base = stack_base;
> +               }
> +               if (i == subprog_end - 1) {
> +                       subprog_updated = false;
> +                       cur_subprog++;
> +                       if (cur_subprog < env->subprog_cnt)
> +                               subprog_end = subprogs[cur_subprog + 1].start;
> +               }
> +       }
> +
> +       env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;

In the case where a subprogram that is not subprogram 0 is a fit for
the bpf loop inline and thus increases its stack depth, won't
env->prog->aux->stack_depth need to also be updated?

> +}
> +
> +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> +{
> +       struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> +       struct bpf_reg_state *regs = cur_regs(env);
> +       struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
> +
> +       int flags_is_zero =
> +               register_is_const(flags_reg) && flags_reg->var_off.value == 0;
> +
> +       if (state->initialized) {
> +               state->flags_is_zero &= flags_is_zero;
> +               state->callback_is_constant &= state->callback_subprogno == subprogno;
> +       } else {
> +               state->initialized = 1;
> +               state->callback_is_constant = 1;
> +               state->flags_is_zero = flags_is_zero;
> +               state->callback_subprogno = subprogno;
> +       }
> +}
> +
>  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>                              int *insn_idx_p)
>  {
> @@ -7255,6 +7327,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>                 err = check_bpf_snprintf_call(env, regs);
>                 break;
>         case BPF_FUNC_loop:
> +               update_loop_inline_state(env, meta.subprogno);
>                 err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
>                                         set_loop_callback_state);
>                 break;
> @@ -7661,11 +7734,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
>         return true;
>  }
>
> -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
> -{
> -       return &env->insn_aux_data[env->insn_idx];
> -}
> -
>  enum {
>         REASON_BOUNDS   = -1,
>         REASON_TYPE     = -2,
> @@ -12920,6 +12988,22 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
>         return new_prog;
>  }
>
> +static void adjust_loop_inline_subprogno(struct bpf_verifier_env *env,
> +                                        u32 first_removed,
> +                                        u32 first_remaining)
> +{
> +       int delta = first_remaining - first_removed;
> +
> +       for (int i = 0; i < env->prog->len; ++i) {
> +               struct bpf_loop_inline_state *state =
> +                       &env->insn_aux_data[i].loop_inline_state;
> +
> +               if (state->initialized &&
> +                   state->callback_subprogno >= first_remaining)
> +                       state->callback_subprogno -= delta;
> +       }
> +}
> +
>  static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
>                                               u32 off, u32 cnt)
>  {
> @@ -12963,6 +13047,8 @@ static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
>                          * in adjust_btf_func() - no need to adjust
>                          */
>                 }
> +
> +               adjust_loop_inline_subprogno(env, i, j);
>         } else {
>                 /* convert i from "first prog to remove" to "first to adjust" */
>                 if (env->subprog_info[i].start == off)
> @@ -13773,6 +13859,94 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
>         return 0;
>  }
>
> +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
> +                                       int position, u32 *cnt)
> +{
> +       struct bpf_insn_aux_data *aux = &env->insn_aux_data[position];
> +       s32 stack_base = aux->loop_inline_state.stack_base;
> +       s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
> +       s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
> +       s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
> +       int reg_loop_max = BPF_REG_6;
> +       int reg_loop_cnt = BPF_REG_7;
> +       int reg_loop_ctx = BPF_REG_8;
> +
> +       struct bpf_prog *new_prog;
> +       u32 callback_subprogno = aux->loop_inline_state.callback_subprogno;
> +       u32 callback_start;
> +       u32 call_insn_offset;
> +       s32 callback_offset;
> +       struct bpf_insn insn_buf[19];
> +       struct bpf_insn *next = insn_buf;
> +       struct bpf_insn *call, *jump_to_end, *loop_header;
> +       struct bpf_insn *jump_to_header, *loop_exit;
> +
> +       /* Return error and jump to the end of the patch if
> +        * expected number of iterations is too big.  This
> +        * repeats the check done in bpf_loop helper function,
> +        * be careful to modify this code in sync.
> +        */
> +       (*next++) = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
> +       (*next++) = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
> +       jump_to_end = next;
> +       (*next++) = BPF_JMP_IMM(BPF_JA, 0, 0, 0 /* set below */);
> +       /* spill R6, R7, R8 to use these as loop vars */
> +       (*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
> +       (*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
> +       (*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
> +       /* initialize loop vars */
> +       (*next++) = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
> +       (*next++) = BPF_MOV32_IMM(reg_loop_cnt, 0);
> +       (*next++) = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
> +       /* loop header;
> +        * if reg_loop_cnt >= reg_loop_max skip the loop body
> +        */
> +       loop_header = next;
> +       (*next++) = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max,
> +                               0 /* set below */);
> +       /* callback call */
> +       (*next++) = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
> +       (*next++) = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> +       call = next;
> +       (*next++) = BPF_CALL_REL(0 /* set below after patching */);
> +       /* increment loop counter */
> +       (*next++) = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
> +       /* jump to loop header if callback returned 0 */
> +       jump_to_header = next;
> +       (*next++) = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0 /* set below */);
> +       /* return value of bpf_loop;
> +        * set R0 to the number of iterations
> +        */
> +       loop_exit = next;
> +       (*next++) = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
> +       /* restore original values of R6, R7, R8 */
> +       (*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
> +       (*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
> +       (*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
> +
> +       *cnt = next - insn_buf;
> +       if (*cnt > ARRAY_SIZE(insn_buf)) {
> +               WARN_ONCE(1, "BUG %s: 'next' exceeds bounds for 'insn_buf'\n",
> +                         __func__);
> +               return NULL;
> +       }
> +       jump_to_end->off = next - jump_to_end - 1;
> +       loop_header->off = loop_exit - loop_header - 1;
> +       jump_to_header->off = loop_header - jump_to_header - 1;
> +
> +       new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
> +       if (!new_prog)
> +               return new_prog;
> +
> +       /* callback start is known only after patching */
> +       callback_start = env->subprog_info[callback_subprogno].start;
> +       call_insn_offset = position + (call - insn_buf);
> +       callback_offset = callback_start - call_insn_offset - 1;
> +       env->prog->insnsi[call_insn_offset].imm = callback_offset;
> +
> +       return new_prog;
> +}
> +
>  /* Do various post-verification rewrites in a single program pass.
>   * These rewrites simplify JIT and interpreter implementations.
>   */
> @@ -14258,6 +14432,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         continue;
>                 }
>
> +               if (insn->imm == BPF_FUNC_loop &&
> +                   fit_for_bpf_loop_inline(&env->insn_aux_data[i + delta])) {
> +                       new_prog = inline_bpf_loop(env, i + delta, &cnt);
> +                       if (!new_prog)
> +                               return -ENOMEM;
> +
> +                       delta    += cnt - 1;
> +                       env->prog = prog = new_prog;
> +                       insn      = new_prog->insnsi + i + delta;
> +                       continue;
> +               }
> +
>  patch_call_imm:
>                 fn = env->ops->get_func_proto(insn->imm, env->prog);
>                 /* all functions that have prototype and verifier allowed
> @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
>         if (ret == 0)
>                 ret = check_max_stack_depth(env);
>
> +       if (ret == 0)
> +               adjust_stack_depth_for_loop_inlining(env);

Do we need to do this before the check_max_stack_depth() call above
since adjust_stack_depth_for_loop_inlining() adjusts the stack depth?

> +
>         /* instruction rewrites happen after this point */
>         if (is_priv) {
>                 if (ret == 0)
> --
> 2.25.1
>
Eduard Zingerman June 4, 2022, 12:51 p.m. UTC | #3
Hi Joanne, thanks for the review!

> On Fri, 2022-06-03 at 17:06 -0700, Joanne Koong wrote:
> Should this be moved to include/linux/bpf.h since
> kernel/bpf/bpf_iter.c also uses this? Then we don't need the "#include
> <linux/bpf_verifier.h>" in bpf_iter.c

Will do.

> Also, as a side-note, I think if we have an "is_valid" field, then we
> don't need both the "callback_is_constant" and "flags_is_zero" fields.
> If the flags is not zero or the callback subprogno is not the same on
> each invocation of the instruction, then we could represent that by
> just setting is_valid to false.

Agree, I'll update the declaration as follows:

struct bpf_loop_inline_state {
	bool initialized; /* set to true upon first entry */
	bool can_inline; /* true if callback function
			  * is the same at each call
			  * and flags are always zero */
	u32 callback_subprogno; /* valid when can_inline is true */
	/* stack offset for loop vars;
	 * u16 since bpf_subprog_info.stack_depth is u16;
	 * we take the negative of it whenever we use it
	 * since the stack grows downwards
	 */
	u16 stack_base;
};

> Is placing this inside the union in "struct bpf_insn_aux_data" an option?

This is an option indeed, will do.

> > +
> > +               if (fit_for_bpf_loop_inline(aux)) {
> > +                       if (!subprog_updated) {
> > +                               subprog_updated = true;
> > +                               subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
> > +                               stack_base = -subprogs[cur_subprog].stack_depth;
> > +                       }
> > +                       aux->loop_inline_state.stack_base = stack_base;
> > +               }
> > +               if (i == subprog_end - 1) {
> > +                       subprog_updated = false;
> > +                       cur_subprog++;
> > +                       if (cur_subprog < env->subprog_cnt)
> > +                               subprog_end = subprogs[cur_subprog + 1].start;
> > +               }
> > +       }
> > +
> > +       env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
> 
> In the case where a subprogram that is not subprogram 0 is a fit for
> the bpf loop inline and thus increases its stack depth, won't
> env->prog->aux->stack_depth need to also be updated?

As far as I understand the logic in `do_check_main` and `jit_subprogs`,
`env->prog->aux->stack_depth` always reflects the stack depth of the
first sub-program (aka `env->subprog_info[0].stack_depth`). So the
last line of `adjust_stack_depth_for_loop_inlining` merely ensures
this invariant. The stack depth for each relevant subprogram is
updated earlier in the function:

static void adjust_stack_depth_for_loop_inlining(struct bpf_verifier_env *env)
{
	...
	for (i = 0; i < insn_cnt; i++) {
		...
		if (fit_for_bpf_loop_inline(aux)) {
			if (!subprog_updated) {
				subprog_updated = true;
here  ---->			subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
				...
			}
			...
		}
		if (i == subprog_end - 1) {
			subprog_updated = false;
			cur_subprog++;
			...
		}
	}
	...
}

Also, patch v3 4/5 in this series has a test case "bpf_loop_inline
stack locations for loop vars" which checks that stack offsets for
spilled registers are assigned correctly for a subprogram that is not
the first subprogram.

> > @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
> >         if (ret == 0)
> >                 ret = check_max_stack_depth(env);
> > 
> > +       if (ret == 0)
> > +               adjust_stack_depth_for_loop_inlining(env);
> 
> Do we need to do this before the check_max_stack_depth() call above
> since adjust_stack_depth_for_loop_inlining() adjusts the stack depth?

This is an interesting question. I used the following reasoning when
placing `adjust_stack_depth_for_loop_inlining` after
`check_max_stack_depth`:

1. If `adjust_stack_depth_for_loop_inlining` is placed before
   `check_max_stack_depth`, some of the existing BPF programs might
   stop loading because of the increased stack usage.
2. To avoid (1) it is possible to do `check_max_stack_depth` first,
   remember the actual max depth and apply
   `adjust_stack_depth_for_loop_inlining` only when there is enough
   stack space. However, there are two downsides:
   - the optimization becomes flaky: similarly simple changes to the
     code of the BPF program might cause loop inlining to stop
     working;
   - the precise verification itself is non-trivial, as each possible
     stack trace has to be analyzed in terms of stack size with loop
     inlining vs. stack size without loop inlining. If necessary, I will
     probably use a simpler heuristic where the stack budget for register
     spilling would be computed as
     `MAX_BPF_STACK - actual_max_stack_depth`
     (see the sketch after this list);
3. Things are simpler if MAX_BPF_STACK is a soft limit that ensures
   that BPF programs consume a limited amount of stack. In that case
   the current implementation is sufficient.
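
A sketch of that heuristic (names are hypothetical, for illustration only):

   /* budget left after the deepest call chain measured by
    * check_max_stack_depth (actual_max_stack_depth and the flag
    * below do not exist yet)
    */
   u32 spill_budget = MAX_BPF_STACK - actual_max_stack_depth;
   u32 spill_needed = BPF_REG_SIZE * 3 * num_funcs_with_inlined_loops;

   if (spill_needed > spill_budget)
           allow_loop_inlining = false;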

So this boils down to the question of what `MAX_BPF_STACK` is:
- a hard limit for the BPF program stack size?
- a soft limit used to verify that programs consume a limited amount
  of stack while executing?

If the answer is "hard limit" I'll proceed with implementation for
option (2).

Thanks,
Eduard
Alexei Starovoitov June 4, 2022, 2:47 p.m. UTC | #4
On Fri, Jun 03, 2022 at 05:10:45PM +0300, Eduard Zingerman wrote:
> +
> +		if (fit_for_bpf_loop_inline(aux)) {
> +			if (!subprog_updated) {
> +				subprog_updated = true;
> +				subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;

Instead of doing += and keeping track that the stack was increased
(if multiple bpf_loop calls happen to be in a function),
maybe add a subprogs[..].stack_depth_extra variable?
And unconditionally do subprogs[cur_subprog].stack_depth_extra = BPF_REG_SIZE * 3;
every time bpf_loop is seen?
Then we can do that during inline_bpf_loop().
Also is there a test for multiple bpf_loop in a func?

> +				stack_base = -subprogs[cur_subprog].stack_depth;
> +			}
> +			aux->loop_inline_state.stack_base = stack_base;
> +		}
> +		if (i == subprog_end - 1) {
> +			subprog_updated = false;
> +			cur_subprog++;
> +			if (cur_subprog < env->subprog_cnt)
> +				subprog_end = subprogs[cur_subprog + 1].start;
> +		}
> +	}
> +
> +	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
> +}
> +
> +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> +{
> +	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> +	struct bpf_reg_state *regs = cur_regs(env);
> +	struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
> +
> +	int flags_is_zero =
> +		register_is_const(flags_reg) && flags_reg->var_off.value == 0;
> +
> +	if (state->initialized) {
> +		state->flags_is_zero &= flags_is_zero;
> +		state->callback_is_constant &= state->callback_subprogno == subprogno;
> +	} else {
> +		state->initialized = 1;
> +		state->callback_is_constant = 1;
> +		state->flags_is_zero = flags_is_zero;
> +		state->callback_subprogno = subprogno;
> +	}
> +}
> +
>  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  			     int *insn_idx_p)
>  {
> @@ -7255,6 +7327,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>  		err = check_bpf_snprintf_call(env, regs);
>  		break;
>  	case BPF_FUNC_loop:
> +		update_loop_inline_state(env, meta.subprogno);
>  		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
>  					set_loop_callback_state);
>  		break;
> @@ -7661,11 +7734,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
>  	return true;
>  }
>  
> -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
> -{
> -	return &env->insn_aux_data[env->insn_idx];
> -}
> -
>  enum {
>  	REASON_BOUNDS	= -1,
>  	REASON_TYPE	= -2,
> @@ -12920,6 +12988,22 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
>  	return new_prog;
>  }
>  
> +static void adjust_loop_inline_subprogno(struct bpf_verifier_env *env,
> +					 u32 first_removed,
> +					 u32 first_remaining)
> +{
> +	int delta = first_remaining - first_removed;
> +
> +	for (int i = 0; i < env->prog->len; ++i) {
> +		struct bpf_loop_inline_state *state =
> +			&env->insn_aux_data[i].loop_inline_state;
> +
> +		if (state->initialized &&
> +		    state->callback_subprogno >= first_remaining)
> +			state->callback_subprogno -= delta;
> +	}
> +}
> +
>  static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
>  					      u32 off, u32 cnt)
>  {
> @@ -12963,6 +13047,8 @@ static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
>  			 * in adjust_btf_func() - no need to adjust
>  			 */
>  		}
> +
> +		adjust_loop_inline_subprogno(env, i, j);

This special case isn't great.
Maybe let's do inline_bpf_loop() earlier?
Instead of doing it from do_misc_fixups(), how about doing it as a separate
pass right after check_max_stack_depth()?
Then opt_remove_dead_code() will adjust BPF_CALL_REL automatically
as a part of bpf_adj_branches().

>  	} else {
>  		/* convert i from "first prog to remove" to "first to adjust" */
>  		if (env->subprog_info[i].start == off)
> @@ -13773,6 +13859,94 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
>  	return 0;
>  }
>  
> +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
> +					int position, u32 *cnt)
> +{
> +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[position];
> +	s32 stack_base = aux->loop_inline_state.stack_base;
> +	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
> +	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
> +	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
> +	int reg_loop_max = BPF_REG_6;
> +	int reg_loop_cnt = BPF_REG_7;
> +	int reg_loop_ctx = BPF_REG_8;
> +
> +	struct bpf_prog *new_prog;
> +	u32 callback_subprogno = aux->loop_inline_state.callback_subprogno;
> +	u32 callback_start;
> +	u32 call_insn_offset;
> +	s32 callback_offset;
> +	struct bpf_insn insn_buf[19];
> +	struct bpf_insn *next = insn_buf;
> +	struct bpf_insn *call, *jump_to_end, *loop_header;
> +	struct bpf_insn *jump_to_header, *loop_exit;
> +
> +	/* Return error and jump to the end of the patch if
> +	 * expected number of iterations is too big.  This
> +	 * repeats the check done in bpf_loop helper function,
> +	 * be careful to modify this code in sync.
> +	 */
> +	(*next++) = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
> +	(*next++) = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
> +	jump_to_end = next;
> +	(*next++) = BPF_JMP_IMM(BPF_JA, 0, 0, 0 /* set below */);
> +	/* spill R6, R7, R8 to use these as loop vars */
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
> +	/* initialize loop vars */
> +	(*next++) = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
> +	(*next++) = BPF_MOV32_IMM(reg_loop_cnt, 0);
> +	(*next++) = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
> +	/* loop header;
> +	 * if reg_loop_cnt >= reg_loop_max skip the loop body
> +	 */
> +	loop_header = next;
> +	(*next++) = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max,
> +				0 /* set below */);
> +	/* callback call */
> +	(*next++) = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
> +	(*next++) = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> +	call = next;
> +	(*next++) = BPF_CALL_REL(0 /* set below after patching */);
> +	/* increment loop counter */
> +	(*next++) = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
> +	/* jump to loop header if callback returned 0 */
> +	jump_to_header = next;
> +	(*next++) = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0 /* set below */);
> +	/* return value of bpf_loop;
> +	 * set R0 to the number of iterations
> +	 */
> +	loop_exit = next;
> +	(*next++) = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
> +	/* restore original values of R6, R7, R8 */
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
> +
> +	*cnt = next - insn_buf;
> +	if (*cnt > ARRAY_SIZE(insn_buf)) {
> +		WARN_ONCE(1, "BUG %s: 'next' exceeds bounds for 'insn_buf'\n",
> +			  __func__);
> +		return NULL;
> +	}
> +	jump_to_end->off = next - jump_to_end - 1;
> +	loop_header->off = loop_exit - loop_header - 1;
> +	jump_to_header->off = loop_header - jump_to_header - 1;
> +
> +	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
> +	if (!new_prog)
> +		return new_prog;
> +
> +	/* callback start is known only after patching */
> +	callback_start = env->subprog_info[callback_subprogno].start;
> +	call_insn_offset = position + (call - insn_buf);
> +	callback_offset = callback_start - call_insn_offset - 1;
> +	env->prog->insnsi[call_insn_offset].imm = callback_offset;
> +
> +	return new_prog;
> +}
> +
>  /* Do various post-verification rewrites in a single program pass.
>   * These rewrites simplify JIT and interpreter implementations.
>   */
> @@ -14258,6 +14432,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>  			continue;
>  		}
>  
> +		if (insn->imm == BPF_FUNC_loop &&
> +		    fit_for_bpf_loop_inline(&env->insn_aux_data[i + delta])) {
> +			new_prog = inline_bpf_loop(env, i + delta, &cnt);
> +			if (!new_prog)
> +				return -ENOMEM;
> +
> +			delta    += cnt - 1;
> +			env->prog = prog = new_prog;
> +			insn      = new_prog->insnsi + i + delta;
> +			continue;
> +		}
> +
>  patch_call_imm:
>  		fn = env->ops->get_func_proto(insn->imm, env->prog);
>  		/* all functions that have prototype and verifier allowed
> @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
>  	if (ret == 0)
>  		ret = check_max_stack_depth(env);
>  
> +	if (ret == 0)
> +		adjust_stack_depth_for_loop_inlining(env);

With the above two suggestions this will become
if (ret == 0)
    optimize_bpf_loop(env);

where it will call inline_bpf_loop() and will assign max_depth_extra.
And then one extra loop for_each_subprog() max_depth += max_depth_extra.
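
Roughly (just a sketch; the stack_depth_extra / max_depth_extra field
mentioned above would be new in struct bpf_subprog_info, everything
else is from this patch):

    /* in optimize_bpf_loop(), for every inlinable bpf_loop call found
     * in subprog 'cur_subprog':
     */
    subprogs[cur_subprog].stack_depth_extra = BPF_REG_SIZE * 3;
    /* ... patch the call via inline_bpf_loop() ... */

    /* and once per program, after the pass: */
    for (i = 0; i < env->subprog_cnt; i++)
            subprogs[i].stack_depth += subprogs[i].stack_depth_extra;
    env->prog->aux->stack_depth = subprogs[0].stack_depth;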

wdyt?
Eduard Zingerman June 4, 2022, 3:36 p.m. UTC | #5
Hi Alexei,

> On Sat, 2022-06-04 at 16:47 +0200, Alexei Starovoitov wrote:
> > On Fri, Jun 03, 2022 at 05:10:45PM +0300, Eduard Zingerman wrote:
[...]
> > +		if (fit_for_bpf_loop_inline(aux)) {
> > +			if (!subprog_updated) {
> > +				subprog_updated = true;
> > +				subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
> 
> Instead of doing += and keeping track that the stack was increased
> (if multiple bpf_loop happened to be in a function)
> may be add subprogs[..].stack_depth_extra variable ?
> And unconditionally do subprogs[cur_subprog].stack_depth_extra = BPF_REG_SIZE * 3;
> every time bpf_loop is seen?
> Then we can do that during inline_bpf_loop().
[...]
> > @@ -12963,6 +13047,8 @@ static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
> >  			 * in adjust_btf_func() - no need to adjust
> >  			 */
> >  		}
> > +
> > +		adjust_loop_inline_subprogno(env, i, j);
> 
> This special case isn't great.
> May be let's do inline_bpf_loop() earlier?
> Instead of doing it from do_misc_fixups() how about doing it as a separate
> pass right after check_max_stack_depth() ?
> Then opt_remove_dead_code() will adjust BPF_CALL_REL automatically
> as a part of bpf_adj_branches().
[...]
> >  	if (ret == 0)
> >  		ret = check_max_stack_depth(env);
> >  
> > +	if (ret == 0)
> > +		adjust_stack_depth_for_loop_inlining(env);
> 
> With above two suggestions this will become
> if (ret == 0)
>     optimize_bpf_loop(env);
> 
> where it will call inline_bpf_loop() and will assign max_depth_extra.
> And then one extra loop for_each_subporg() max_depth += max_depth_extra.
> 
> wdyt?

I will proceed with the suggested rewrite, it simplifies a few things,
thank you!

> Also is there a test for multiple bpf_loop in a func?

Yes, please see the test case "bpf_loop_inline stack locations for
loop vars" in the patch "v3 4/5" from this series.

Also, please note Joanne's comment from a sibling thread:

> > +       if (ret == 0)
> > +               adjust_stack_depth_for_loop_inlining(env);
> 
> Do we need to do this before the check_max_stack_depth() call above
> since adjust_stack_depth_for_loop_inlining() adjusts the stack depth?

In short, what is the meaning of the `MAX_BPF_STACK` variable?

- Is it a hard limit on BPF program stack size?

  If so, `optimize_bpf_loop` should skip the optimization if not
  enough stack space is available. But this makes optimization a bit
  'flaky'. This could be achieved by passing maximal stack depth
  computed by `check_max_stack_depth` to `optimize_bpf_loop`.

- Or is it a soft limit used by verifier to guarantee that stack space
  needed by the BPF program is limited?
  
  If so, `optimize_bpf_loop` might be executed after
  `check_max_stack_depth` without any data passed from one to the other.
  But this would mean that for some programs the actual stack usage would
  be `MAX_BPF_STACK + 24*N` (24 bytes = 3 spilled registers * BPF_REG_SIZE),
  where N is the number of functions with inlined loops (which is easier
  to compute than the maximum number of functions with inlined loops that
  can occur on the same stack trace).
  
  This might affect the following portion of the `do_misc_fixups`:
  
 static int do_misc_fixups(struct bpf_verifier_env *env) {
	...
		if (insn->imm == BPF_FUNC_tail_call) {
			/* If we tail call into other programs, we
			 * cannot make any assumptions since they can
			 * be replaced dynamically during runtime in
			 * the program array.
			 */
			prog->cb_access = 1;
			if (!allow_tail_call_in_subprogs(env))
here ---->     			prog->aux->stack_depth = MAX_BPF_STACK;
			...
		}
	...
 }

Thanks,
Eduard
Joanne Koong June 6, 2022, 6:08 p.m. UTC | #6
On Sat, Jun 4, 2022 at 5:51 AM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> Hi Joanne, thanks for the review!
>
> > On Fri, 2022-06-03 at 17:06 -0700, Joanne Koong wrote:
[...]
>
> > > +
> > > +               if (fit_for_bpf_loop_inline(aux)) {
> > > +                       if (!subprog_updated) {
> > > +                               subprog_updated = true;
> > > +                               subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
> > > +                               stack_base = -subprogs[cur_subprog].stack_depth;
> > > +                       }
> > > +                       aux->loop_inline_state.stack_base = stack_base;
> > > +               }
> > > +               if (i == subprog_end - 1) {
> > > +                       subprog_updated = false;
> > > +                       cur_subprog++;
> > > +                       if (cur_subprog < env->subprog_cnt)
> > > +                               subprog_end = subprogs[cur_subprog + 1].start;
> > > +               }
> > > +       }
> > > +
> > > +       env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
> >
> > In the case where a subprogram that is not subprogram 0 is a fit for
> > the bpf loop inline and thus increases its stack depth, won't
> > env->prog->aux->stack_depth need to also be updated?
>
> As far as I understand the logic in `do_check_main` and `jit_subprogs`
> `env->prog->aux->stack_depth` always reflects the stack depth of the
> first sub-program (aka `env->subprog_info[0].stack_depth`). So the
> last line of `adjust_stack_depth_for_loop_inlining` merely ensures
> this invariant. The stack depth for each relevant subprogram is
> updated earlier in the function:
>
> static void adjust_stack_depth_for_loop_inlining(struct bpf_verifier_env *env)
> {
>         ...
>         for (i = 0; i < insn_cnt; i++) {
>                 ...
>                 if (fit_for_bpf_loop_inline(aux)) {
>                         if (!subprog_updated) {
>                                 subprog_updated = true;
> here  ---->                     subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
>                                 ...
>                         }
>                         ...
>                 }
>                 if (i == subprog_end - 1) {
>                         subprog_updated = false;
>                         cur_subprog++;
>                         ...
>                 }
>         }
>         ...
> }
>
> Also, the patch v3 4/5 in a series has a test case "bpf_loop_inline
> stack locations for loop vars" which checks that stack offsets for
> spilled registers are assigned correctly for subprogram that is not a
> first subprogram.
>
Thanks for the explanation :) I misunderstood what
prog->aux->stack_depth is used for

Looking at arch/arm64/net/bpf_jit_comp.c, I see this diagram

        /*
         * BPF prog stack layout
         *
         *                         high
         * original A64_SP =>   0:+-----+ BPF prologue
         *                        |FP/LR|
         * current A64_FP =>  -16:+-----+
         *                        | ... | callee saved registers
         * BPF fp register => -64:+-----+ <= (BPF_FP)
         *                        |     |
         *                        | ... | BPF prog stack
         *                        |     |
         *                        +-----+ <= (BPF_FP - prog->aux->stack_depth)
         *                        |RSVD | padding
         * current A64_SP =>      +-----+ <= (BPF_FP - ctx->stack_size)
         *                        |     |
         *                        | ... | Function call stack
         *                        |     |
         *                        +-----+
         *                          low
         *
         */
It looks like prog->aux->stack_depth is used for the "BPF prog stack",
which is the stack for the main bpf program (subprog 0)

> > > @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
> > >         if (ret == 0)
> > >                 ret = check_max_stack_depth(env);
> > >
> > > +       if (ret == 0)
> > > +               adjust_stack_depth_for_loop_inlining(env);
> >
> > Do we need to do this before the check_max_stack_depth() call above
> > since adjust_stack_depth_for_loop_inlining() adjusts the stack depth?
>
> This is an interesting question. I used the following reasoning
> placing `adjust_stack_depth_for_loop_inlining` after the
> `check_max_stack_depth`:
>
> 1. If `adjust_stack_depth_for_loop_inlining` is placed before
>    `check_max_stack_depth` some of the existing BPF programs might
>    stop loading, because of increased stack usage.
> 2. To avoid (1) it is possible to do `check_max_stack_depth` first,
>    remember actual max depth and apply
>    `adjust_stack_depth_for_loop_inlining` only when there is enough
>    stack space. However there are two downsides:
>    - the optimization becomes flaky, similarly simple changes to the
>      code of the BPF program might cause loop inlining to stop
>      working;
>    - the precise verification itself is non-trivial as each possible
>      stack trace has to be analyzed in terms of stack size with loop
>      inlining / stack size without loop inlining. If necessary, I will
>      probably use a simpler heuristic where stack budget for register
>      spilling would be computed as
>      `MAX_BPF_STACK - actual_max_stack_depth`
> 3. Things are simpler if MAX_BPF_STACK is a soft limit that ensures
>    that BPF programs consume limited amount of stack. In such case
>    current implementation is sufficient.
>
> So this boils down to the question what is `MAX_BPF_STACK`:
> - a hard limit for the BPF program stack size?
> - a soft limit used to verify that programs consume a limited amount
>   of stack while executing?
>
> If the answer is "hard limit" I'll proceed with implementation for
> option (2).
I'm not sure either whether MAX_BPF_STACK is a hard limit or a soft
limit. I'm curious to know as well.
>
> Thanks,
> Eduard
>
Eduard Zingerman June 8, 2022, 9:06 a.m. UTC | #7
Hi Joanne,

> Looking at arch/arm64/net/bpf_jit_comp.c, I see this diagram
> 
>         /*
>          * BPF prog stack layout
>          *
>          *                         high
>          * original A64_SP =>   0:+-----+ BPF prologue
>          *                        |FP/LR|
>          * current A64_FP =>  -16:+-----+
>          *                        | ... | callee saved registers
>          * BPF fp register => -64:+-----+ <= (BPF_FP)
>          *                        |     |
>          *                        | ... | BPF prog stack
>          *                        |     |
>          *                        +-----+ <= (BPF_FP - prog->aux->stack_depth)
>          *                        |RSVD | padding
>          * current A64_SP =>      +-----+ <= (BPF_FP - ctx->stack_size)
>          *                        |     |
>          *                        | ... | Function call stack
>          *                        |     |
>          *                        +-----+
>          *                          low
>          *
>          */
> It looks like prog->aux->stack_depth is used for the "BPF prog stack",
> which is the stack for the main bpf program (subprog 0)

Thanks for looking into this. Also note the verifier.c:jit_subprogs
function which essentially "promotes" every sub-program to sub-program #0
before calling bpf_jit_comp.c:bpf_int_jit_compile for that sub-program.

> I'm not sure either whether MAX_BPF_STACK is a hard limit or a soft
> limit. I'm curious to know as well.

Alexei had commented in a sibling thread that MAX_BPF_STACK should be
considered a soft limit.

Thanks,
Eduard

Patch

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e8439f6cbe57..80279616a76b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -20,6 +20,8 @@ 
 #define BPF_MAX_VAR_SIZ	(1 << 29)
 /* size of type_str_buf in bpf_verifier. */
 #define TYPE_STR_BUF_LEN 64
+/* Maximum number of loops for bpf_loop */
+#define BPF_MAX_LOOPS	BIT(23)
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
@@ -344,6 +346,16 @@  struct bpf_verifier_state_list {
 	int miss_cnt, hit_cnt;
 };
 
+struct bpf_loop_inline_state {
+	u32 initialized:1; /* set to true upon first entry */
+	u32 callback_is_constant:1; /* true if callback function
+				     * is the same at each call
+				     */
+	u32 flags_is_zero:1; /* true if flags register is zero at each call */
+	u32 callback_subprogno; /* valid when callback_is_constant == 1 */
+	s32 stack_base; /* stack offset for loop vars */
+};
+
 /* Possible states for alu_state member. */
 #define BPF_ALU_SANITIZE_SRC		(1U << 0)
 #define BPF_ALU_SANITIZE_DST		(1U << 1)
@@ -380,6 +392,10 @@  struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	u8 alu_state; /* used in combination with alu_limit */
+	/* if instruction is a call to bpf_loop this field tracks
+	 * the state of the relevant registers to take decision about inlining
+	 */
+	struct bpf_loop_inline_state loop_inline_state;
 
 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index d5d96ceca105..cdb898fce118 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -6,6 +6,7 @@ 
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/bpf_verifier.h>
 
 struct bpf_iter_target_info {
 	struct list_head list;
@@ -723,9 +724,6 @@  const struct bpf_func_proto bpf_for_each_map_elem_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-/* maximum number of loops */
-#define MAX_LOOPS	BIT(23)
-
 BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	   u64, flags)
 {
@@ -733,9 +731,12 @@  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	u64 ret;
 	u32 i;
 
+	/* note: these safety checks are also verified when bpf_loop is inlined,
+	 * be careful to modify this code in sync
+	 */
 	if (flags)
 		return -EINVAL;
-	if (nr_loops > MAX_LOOPS)
+	if (nr_loops > BPF_MAX_LOOPS)
 		return -E2BIG;
 
 	for (i = 0; i < nr_loops; i++) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aedac2ac02b9..27d78fe6c3f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7103,6 +7103,78 @@  static int check_get_func_ip(struct bpf_verifier_env *env)
 	return -ENOTSUPP;
 }
 
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+	return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool fit_for_bpf_loop_inline(struct bpf_insn_aux_data *insn_aux)
+{
+	return insn_aux->loop_inline_state.initialized &&
+		insn_aux->loop_inline_state.flags_is_zero &&
+		insn_aux->loop_inline_state.callback_is_constant;
+}
+
+/* For each sub-program in the program (including main), check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found, the subprog stack_depth is
+ * increased by the size of 3 registers. The reserved space is used
+ * by do_misc_fixups to spill R6, R7 and R8, so that these registers
+ * can be used for loop iteration.
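+ *
+ * For example, a sub-program whose stack_depth was 32 before this
+ * pass ends up with stack_depth 56 and stack_base -56, so that
+ * inline_bpf_loop() spills R6, R7 and R8 to fp[-56], fp[-48] and
+ * fp[-40] respectively.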
+ */
+static void adjust_stack_depth_for_loop_inlining(struct bpf_verifier_env *env)
+{
+	int i, subprog_end, cur_subprog = 0;
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	int insn_cnt = env->prog->len;
+	bool subprog_updated = false;
+	s32 stack_base;
+
+	subprog_end = (env->subprog_cnt > 1
+		       ? subprogs[cur_subprog + 1].start
+		       : insn_cnt);
+	for (i = 0; i < insn_cnt; i++) {
+		struct bpf_insn_aux_data *aux = &env->insn_aux_data[i];
+
+		if (fit_for_bpf_loop_inline(aux)) {
+			if (!subprog_updated) {
+				subprog_updated = true;
+				subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
+				stack_base = -subprogs[cur_subprog].stack_depth;
+			}
+			aux->loop_inline_state.stack_base = stack_base;
+		}
+		if (i == subprog_end - 1) {
+			subprog_updated = false;
+			cur_subprog++;
+			if (cur_subprog < env->subprog_cnt)
+				subprog_end = subprogs[cur_subprog + 1].start;
+		}
+	}
+
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+}
+
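+/* Called each time a bpf_loop call is verified. On the first visit the
+ * callback subprogno and flags state are recorded; on subsequent visits
+ * (e.g. when the call is reached via a different path) the "constant
+ * callback" and "zero flags" properties are cleared if the current
+ * state disagrees with the recorded one.
+ */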
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
+	bool flags_is_zero = register_is_const(flags_reg) &&
+		flags_reg->var_off.value == 0;
+
+	if (state->initialized) {
+		state->flags_is_zero &= flags_is_zero;
+		state->callback_is_constant &= state->callback_subprogno == subprogno;
+	} else {
+		state->initialized = 1;
+		state->callback_is_constant = 1;
+		state->flags_is_zero = flags_is_zero;
+		state->callback_subprogno = subprogno;
+	}
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -7255,6 +7327,7 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		err = check_bpf_snprintf_call(env, regs);
 		break;
 	case BPF_FUNC_loop:
+		update_loop_inline_state(env, meta.subprogno);
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_loop_callback_state);
 		break;
@@ -7661,11 +7734,6 @@  static bool check_reg_sane_offset(struct bpf_verifier_env *env,
 	return true;
 }
 
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
-	return &env->insn_aux_data[env->insn_idx];
-}
-
 enum {
 	REASON_BOUNDS	= -1,
 	REASON_TYPE	= -2,
@@ -12920,6 +12988,22 @@  static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	return new_prog;
 }
 
+static void adjust_loop_inline_subprogno(struct bpf_verifier_env *env,
+					 u32 first_removed,
+					 u32 first_remaining)
+{
+	int delta = first_remaining - first_removed;
+
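+	/* callback_subprogno values recorded in insn_aux_data refer to
+	 * sub-program numbers before the removal; renumber them so that
+	 * they keep pointing at the same callbacks once sub-programs
+	 * [first_removed, first_remaining) are gone.
+	 */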
+	for (int i = 0; i < env->prog->len; ++i) {
+		struct bpf_loop_inline_state *state =
+			&env->insn_aux_data[i].loop_inline_state;
+
+		if (state->initialized &&
+		    state->callback_subprogno >= first_remaining)
+			state->callback_subprogno -= delta;
+	}
+}
+
 static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
 					      u32 off, u32 cnt)
 {
@@ -12963,6 +13047,8 @@  static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
 			 * in adjust_btf_func() - no need to adjust
 			 */
 		}
+
+		adjust_loop_inline_subprogno(env, i, j);
 	} else {
 		/* convert i from "first prog to remove" to "first to adjust" */
 		if (env->subprog_info[i].start == off)
@@ -13773,6 +13859,94 @@  static int fixup_kfunc_call(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+					int position, u32 *cnt)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[position];
+	s32 stack_base = aux->loop_inline_state.stack_base;
+	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+	int reg_loop_max = BPF_REG_6;
+	int reg_loop_cnt = BPF_REG_7;
+	int reg_loop_ctx = BPF_REG_8;
+
+	struct bpf_prog *new_prog;
+	u32 callback_subprogno = aux->loop_inline_state.callback_subprogno;
+	u32 callback_start;
+	u32 call_insn_offset;
+	s32 callback_offset;
+	struct bpf_insn insn_buf[19];
+	struct bpf_insn *next = insn_buf;
+	struct bpf_insn *call, *jump_to_end, *loop_header;
+	struct bpf_insn *jump_to_header, *loop_exit;
+
+	/* Set R0 to -E2BIG and jump to the end of the patch if the
+	 * expected number of iterations is too big. This repeats the
+	 * check done in the bpf_loop helper function; keep this code
+	 * in sync with it.
+	 */
+	(*next++) = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+	(*next++) = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+	jump_to_end = next;
+	(*next++) = BPF_JMP_IMM(BPF_JA, 0, 0, 0 /* set below */);
+	/* spill R6, R7, R8 to use these as loop vars */
+	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+	/* initialize loop vars */
+	(*next++) = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+	(*next++) = BPF_MOV32_IMM(reg_loop_cnt, 0);
+	(*next++) = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+	/* loop header;
+	 * if reg_loop_cnt >= reg_loop_max skip the loop body
+	 */
+	loop_header = next;
+	(*next++) = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max,
+				0 /* set below */);
+	/* callback call */
+	(*next++) = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+	(*next++) = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+	call = next;
+	(*next++) = BPF_CALL_REL(0 /* set below after patching */);
+	/* increment loop counter */
+	(*next++) = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+	/* jump to loop header if callback returned 0 */
+	jump_to_header = next;
+	(*next++) = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0 /* set below */);
+	/* return value of bpf_loop;
+	 * set R0 to the number of iterations
+	 */
+	loop_exit = next;
+	(*next++) = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+	/* restore original values of R6, R7, R8 */
+	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+	*cnt = next - insn_buf;
+	if (*cnt > ARRAY_SIZE(insn_buf)) {
+		WARN_ONCE(1, "BUG %s: 'next' exceeds bounds for 'insn_buf'\n",
+			  __func__);
+		return NULL;
+	}
+	jump_to_end->off = next - jump_to_end - 1;
+	loop_header->off = loop_exit - loop_header - 1;
+	jump_to_header->off = loop_header - jump_to_header - 1;
+
+	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+	if (!new_prog)
+		return new_prog;
+
+	/* callback start is known only after patching */
+	callback_start = env->subprog_info[callback_subprogno].start;
+	call_insn_offset = position + (call - insn_buf);
+	callback_offset = callback_start - call_insn_offset - 1;
+	env->prog->insnsi[call_insn_offset].imm = callback_offset;
+
+	return new_prog;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
@@ -14258,6 +14432,18 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (insn->imm == BPF_FUNC_loop &&
+		    fit_for_bpf_loop_inline(&env->insn_aux_data[i + delta])) {
+			new_prog = inline_bpf_loop(env, i + delta, &cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
 		/* all functions that have prototype and verifier allowed
@@ -15030,6 +15216,9 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 	if (ret == 0)
 		ret = check_max_stack_depth(env);
 
+	if (ret == 0)
+		adjust_stack_depth_for_loop_inlining(env);
+
 	/* instruction rewrites happen after this point */
 	if (is_priv) {
 		if (ret == 0)