[bpf-next,1/2] bpf: Introduce bpf_can_loop() kfunc

Message ID 20240222063324.46468-1-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next,1/2] bpf: Introduce bpf_can_loop() kfunc

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1157 this patch: 1158
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 13 maintainers not CCed: jolsa@kernel.org john.fastabend@gmail.com nathan@kernel.org yonghong.song@linux.dev song@kernel.org martin.lau@linux.dev sdf@google.com morbo@google.com justinstitt@google.com kpsingh@kernel.org ndesaulniers@google.com llvm@lists.linux.dev haoluo@google.com
netdev/build_clang success Errors and warnings before: 1066 this patch: 1066
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1174 this patch: 1175
netdev/checkpatch fail CHECK: multiple assignments should be avoided ERROR: do not use assignment in if condition WARNING: line length of 101 exceeds 80 columns WARNING: line length of 105 exceeds 80 columns WARNING: line length of 106 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Alexei Starovoitov Feb. 22, 2024, 6:33 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

While working on bpf_arena the following monster macro had to be
used to iterate a linked list:
        for (struct bpf_iter_num ___it __attribute__((aligned(8),                       \
                                                      cleanup(bpf_iter_num_destroy))),  \
                        * ___tmp = (                                                    \
                                bpf_iter_num_new(&___it, 0, (1000000)),                 \
                                pos = list_entry_safe((head)->first,                    \
                                                      typeof(*(pos)), member),          \
                                (void)bpf_iter_num_destroy, (void *)0);                 \
             bpf_iter_num_next(&___it) && pos &&                                        \
                ({ ___tmp = (void *)pos->member.next; 1; });                            \
             pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))

It's similar to the bpf_for() and bpf_repeat() macros.
Unfortunately, every "for" loop in normal C code needs an equivalent monster macro.

Instead, let's introduce a bpf_can_loop() kfunc that acts on a hidden bpf_iter_num,
so that bpf_iter_num_new() and bpf_iter_num_destroy() don't need to be called explicitly.
It simplifies the macro to:
        for (void * ___tmp = (pos = list_entry_safe((head)->first,                    \
                                                    typeof(*(pos)), member),          \
                                (void *)0);                                           \
             bpf_can_loop(0) && pos &&                                                \
                ({ ___tmp = (void *)pos->member.next; 1; });                          \
             pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))

and can be used in any normal "for" or "while" loop, like

  for (i = 0; i < cnt && bpf_can_loop(0); i++) {
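
For example, a kernel-style walk of a linked list in arena memory only needs
the extra call in the loop condition. A minimal sketch (the 'node' struct, the
'head' pointer and its 'first'/'next' fields are hypothetical, not part of this
patch):

  struct node __arena *n;

  for (n = head->first; n && bpf_can_loop(0); n = n->next) {
          /* loop body copy-pasted from the original kernel code */
  }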

The verifier recognizes that bpf_can_loop() is used in the program,
reserves an additional 8 bytes of stack, zero-initializes them in the subprog prologue,
and passes that address to the bpf_can_loop() kfunc, which simply increments
the counter until it reaches BPF_MAX_LOOPS.
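
Concretely, each bpf_can_loop() call site is patched to pass the address of
that hidden slot, roughly (this is the fixup_kfunc_call() hunk further down;
stack_base is the negated stack depth of the subprog, so the slot sits just
below the subprog's existing stack):

  insn_buf[0] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP);
  insn_buf[1] = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, stack_base - 8);
  insn_buf[2] = *insn; /* the original call to bpf_can_loop() */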

In the future bpf_can_loop() can be inlined to improve performance.
A new instruction with the same semantics can be added, so that LLVM can generate it.

WARNING:
bpf_can_loop() is not a substitute for bpf_for() when iterating
normal arrays or map values.
bpf_can_loop() works well only with arena pointers that don't need
to be bounds-checked on every iteration.
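
For normal arrays the existing macro remains the right tool, because the
verifier needs a bounded induction variable to prove that the accesses are in
range. A sketch (assuming 'arr' is a global array and 'cnt' is a verifier-known
bound; both are hypothetical):

  int i, sum = 0;

  /* preferred for arrays / map values: i stays provably within [0, cnt) */
  bpf_for(i, 0, cnt)
          sum += arr[i];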

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |   3 +
 kernel/bpf/helpers.c         |  12 +++
 kernel/bpf/verifier.c        | 141 ++++++++++++++++++++++++++++++-----
 3 files changed, 136 insertions(+), 20 deletions(-)

Comments

Eduard Zingerman Feb. 24, 2024, midnight UTC | #1
On Wed, 2024-02-21 at 22:33 -0800, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> While working on bpf_arena the following monster macro had to be
> used to iterate a link list:
>         for (struct bpf_iter_num ___it __attribute__((aligned(8),                       \
>                                                       cleanup(bpf_iter_num_destroy))),  \
>                         * ___tmp = (                                                    \
>                                 bpf_iter_num_new(&___it, 0, (1000000)),                 \
>                                 pos = list_entry_safe((head)->first,                    \
>                                                       typeof(*(pos)), member),          \
>                                 (void)bpf_iter_num_destroy, (void *)0);                 \
>              bpf_iter_num_next(&___it) && pos &&                                        \
>                 ({ ___tmp = (void *)pos->member.next; 1; });                            \
>              pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))
> 
> It's similar to bpf_for(), bpf_repeat() macros.
> Unfortunately every "for" in normal C code needs an equivalent monster macro.

Tbh, I really don't like the idea of adding more hacks for loop handling.
This would be the fourth: regular bounded loops, bpf_loop, iterators
and now bpf_can_loop. Imo, it would be better to:
- either create a reusable "semi-monster" macro e.g. "__with_bound(iter_name)"
  that would hide the "struct bpf_iter_num iter_name ... cleanup ..." declaration
  to simplify definitions of other iterating macros;
- or add kfuncs for a list iterator.

Having said that, I don't see any obvious technical issues with this particular patch.
A few nits below.

[...]

> @@ -7954,10 +7956,14 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
>  	struct bpf_reg_state *cur_iter, *queued_iter;
>  	int iter_frameno = meta->iter.frameno;
>  	int iter_spi = meta->iter.spi;
> +	bool is_can_loop = is_can_loop_kfunc(meta);
>  
>  	BTF_TYPE_EMIT(struct bpf_iter);
>  
> -	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> +	if (is_can_loop)
> +		cur_iter = &cur_st->can_loop_reg;
> +	else
> +		cur_iter = &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;

I think that adding a utility function hiding this choice, e.g.:

    get_iter_reg(struct bpf_verifier_state *st, int insn_idx)

would simplify the code a bit, here and in is_state_visited().

[...]

> @@ -19549,6 +19608,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>  			delta	 += cnt - 1;
>  			env->prog = prog = new_prog;
>  			insn	  = new_prog->insnsi + i + delta;
> +			if (stack_extra)
> +				stack_depth_extra = max(stack_depth_extra, stack_extra);

I don't think that using "max" here makes much sense.
If several sorts of kfuncs would need extra stack space,
most likely this space won't be shared, e.g. bpf_can_loop() would need
its 8 bytes + bpf_some_new_feature() would need 16 bytes on top of that, etc.
So some kind of flags indicating which features the space is required for is needed here.

[...]
Alexei Starovoitov Feb. 24, 2024, 12:22 a.m. UTC | #2
On Fri, Feb 23, 2024 at 4:00 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Wed, 2024-02-21 at 22:33 -0800, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > While working on bpf_arena the following monster macro had to be
> > used to iterate a link list:
> >         for (struct bpf_iter_num ___it __attribute__((aligned(8),                       \
> >                                                       cleanup(bpf_iter_num_destroy))),  \
> >                         * ___tmp = (                                                    \
> >                                 bpf_iter_num_new(&___it, 0, (1000000)),                 \
> >                                 pos = list_entry_safe((head)->first,                    \
> >                                                       typeof(*(pos)), member),          \
> >                                 (void)bpf_iter_num_destroy, (void *)0);                 \
> >              bpf_iter_num_next(&___it) && pos &&                                        \
> >                 ({ ___tmp = (void *)pos->member.next; 1; });                            \
> >              pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))
> >
> > It's similar to bpf_for(), bpf_repeat() macros.
> > Unfortunately every "for" in normal C code needs an equivalent monster macro.
>
> Tbh, I really don't like the idea of adding more hacks for loop handling.
> This would be the fourth: regular bounded loops, bpf_loop, iterators
> and now bpf_can_loop. Imo, it would be better to:
> - either create a reusable "semi-monster" macro e.g. "__with_bound(iter_name)"
>   that would hide "struct bpf_iter_num iter_name ... cleanup ..." declaration
>   to simplify definitions of other iterating macro;
> - or add kfuncs for list iterator.

I think you're missing the point.
It's not about this particular list iterator.
It's about _all_ for(), while() loops.
I've started converting lib/radix-tree.c to bpf and arena.
There are hundreds of various loops that need to be converted.
The best approach is to copy-paste them as-is and add bpf_can_loop() to the loop
condition. That's it.
Otherwise explicit iterators change the code significantly
and distract from the logic of the algorithm.

Another key point is the last sentence of the commit log:
"New instruction with the same semantics can be added, so that LLVM
can generate it."

This is the way to implement __builtin_memcpy, __builtin_strcmp
and friends in llvm and gcc.

We can go a step further and add such an insn to any for/while loop
that uses arena pointers.
Then running kernel code as a bpf program will be a matter of copy-paste
plus the addition of an __arena tag. Easy.

And if we can make:
#pragma clang attribute push (__attribute__((address_space(1))),
apply_to = pointers)

work, then we won't even need to copy paste!
We will be able to compile lib/radix-tree.c with --target=bpf as-is.
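
Roughly, a hypothetical wrapper (assuming the pragma can be extended to apply
to pointer types when targeting bpf):

#pragma clang attribute push (__attribute__((address_space(1))), apply_to = pointers)
#include "lib/radix-tree.c"
#pragma clang attribute pop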


> Having said that, I don't see any obvious technical issues with this particular patch.
> A few nits below.
>
> [...]
>
> > @@ -7954,10 +7956,14 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
> >       struct bpf_reg_state *cur_iter, *queued_iter;
> >       int iter_frameno = meta->iter.frameno;
> >       int iter_spi = meta->iter.spi;
> > +     bool is_can_loop = is_can_loop_kfunc(meta);
> >
> >       BTF_TYPE_EMIT(struct bpf_iter);
> >
> > -     cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > +     if (is_can_loop)
> > +             cur_iter = &cur_st->can_loop_reg;
> > +     else
> > +             cur_iter = &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
>
> I think that adding of a utility function hiding this choice, e.g.:
>
>     get_iter_reg(struct bpf_verifier_state *st, int insn_idx)
>
> would simplify the code a bit, here and in is_state_visited().

Hmm. That sounds like obfuscation, since 'meta' would need to be passed in,
but is_state_visited() doesn't have meta.
Create fake meta there?!

I'm missing how such a get_iter_reg() helper would look.
meta->iter.frameno was populated by process_iter_arg().
Far away from process_iter_next_call().

>
> [...]
>
> > @@ -19549,6 +19608,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> >                       delta    += cnt - 1;
> >                       env->prog = prog = new_prog;
> >                       insn      = new_prog->insnsi + i + delta;
> > +                     if (stack_extra)
> > +                             stack_depth_extra = max(stack_depth_extra, stack_extra);
>
> I don't think that using "max" here makes much sense.
> If several sorts of kfuncs would need extra stack space,
> most likely this space won't be shared,

I see. I was assuming that the space would be shared.
Like in the follow-up I was planning to roll optimize_bpf_loop()
into this. And optimize_bpf_loop() would be sharing the scratch area.
But now, I'm not sure what I was thinking, since bpf_can_loop()
cannot share that space with optimize_bpf_loop().

> e.g. bpf_can_loop() would need
> it's 8 bytes + bpf_some_new_feature() would need 16 bytes on top of that etc.
> So some kind of flags indicating space for which features is reacquired is needed here.

Yeah. It needs some way to indicate shared stack_extra vs exclusive.
Will fix in v2.
Eduard Zingerman Feb. 24, 2024, 12:50 a.m. UTC | #3
On Fri, 2024-02-23 at 16:22 -0800, Alexei Starovoitov wrote:
[...]

> I think you're missing the point.
> It's not about this particular list iterator.
> It's about _all_ for(), while() loops.
> I've started converting lib/radix-tree.c to bpf and arena.
> There are hundreds of various loops that need to be converted.
> The best is to copy-paste them as-is and add bpf_can_loop() to loop
> condition. That's it.
> Otherwise explicit iterators are changing the code significantly
> and distract from the logic of the algorithm.
> 
> Another key point is the last sentence of the commit log:
> "New instruction with the same semantics can be added, so that LLVM
> can generate it."
> 
> This is the way to implement __builtin_memcpy, __builtin_strcmp
> and friends in llvm and gcc.

There are two things that usage of bpf_can_loop() provides:
1. A proof that the BPF program will terminate at runtime.
2. A way for the verifier to terminate the verification process
   (by stopping processing of a path when two verifier states are exactly equal).

Point (1) is iffy, because there are simple ways to work around it in practical terms.
E.g. for the program below it would be possible to make 64 * 10^12 iterations
at runtime:

    void bar(...) {
      while (... && bpf_can_loop())
        ... do something ...;
    }

    void foo(...) {
      while (... && bpf_can_loop())
        bar();
    }
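
For scale: BPF_MAX_LOOPS is currently 8 * 1024 * 1024, so a single
bpf_can_loop() counter allows ~8 * 10^6 iterations, and nesting two such loops
multiplies the bounds, giving the (8 * 10^6)^2 ~= 64 * 10^12 figure above.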

If we decide that for some programs it is not necessary to enforce
proof of runtime termination, then it would be possible to untie (2)
from iterators and just check whether the looping state is states_equal(... exact=true)
to some previous one.

[...]

> > > @@ -7954,10 +7956,14 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
> > >       struct bpf_reg_state *cur_iter, *queued_iter;
> > >       int iter_frameno = meta->iter.frameno;
> > >       int iter_spi = meta->iter.spi;
> > > +     bool is_can_loop = is_can_loop_kfunc(meta);
> > > 
> > >       BTF_TYPE_EMIT(struct bpf_iter);
> > > 
> > > -     cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > > +     if (is_can_loop)
> > > +             cur_iter = &cur_st->can_loop_reg;
> > > +     else
> > > +             cur_iter = &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > 
> > I think that adding of a utility function hiding this choice, e.g.:
> > 
> >     get_iter_reg(struct bpf_verifier_state *st, int insn_idx)
> > 
> > would simplify the code a bit, here and in is_state_visited().
> 
> Hmm. That sounds like obfuscation, since 'meta' would need to be passed in,
> but is_state_visited() doesn't have meta.
> Create fake meta there?!
> 
> I'm missing how such get_iter_reg() helper will look.
> meta->iter.frameno was populated by process_iter_arg().
> Far away from process_iter_next_call().

I meant that this helper can peek spi from R1 just like code in
is_state_visited() does currently. Forgoing the 'meta' completely.

[...]
Alexei Starovoitov Feb. 24, 2024, 1:55 a.m. UTC | #4
On Fri, Feb 23, 2024 at 4:50 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Fri, 2024-02-23 at 16:22 -0800, Alexei Starovoitov wrote:
> [...]
>
> > I think you're missing the point.
> > It's not about this particular list iterator.
> > It's about _all_ for(), while() loops.
> > I've started converting lib/radix-tree.c to bpf and arena.
> > There are hundreds of various loops that need to be converted.
> > The best is to copy-paste them as-is and add bpf_can_loop() to loop
> > condition. That's it.
> > Otherwise explicit iterators are changing the code significantly
> > and distract from the logic of the algorithm.
> >
> > Another key point is the last sentence of the commit log:
> > "New instruction with the same semantics can be added, so that LLVM
> > can generate it."
> >
> > This is the way to implement __builtin_memcpy, __builtin_strcmp
> > and friends in llvm and gcc.
>
> There are two things that usage of bpf_can_loop() provides:
> 1. A proof that BPF program would terminate at runtime.
> 2. A way for verifier to terminate verification process
>    (by stopping processing some path when two verifier states are exactly equal).
>
> The (1) is iffy, because there are simple ways to forgo it in practical terms.
> E.g. for the program below it would be possible to make 64 * 10^12 iterations
> at runtime:
>
>     void bar(...) {
>       while (... && bpf_can_loop())
>         ... do something ...;
>     }
>
>     void foo(...) {
>       while (... && bpf_can_loop())
>         bar();
>     }

So?
The bpf_loop() helper and open-coded iterators can do the same already.
It's something we need to fix regardless.

(1) is not iffy. The program will terminate. That's a 100% guarantee.

> If we decide that for some programs it is not necessary to enforce
> proof of runtime termination, then it would be possible to untie (2)
> from iterators and just check if looping state is states_equal(... exact=true)
> to some previous one.

No. That's not at all the same.
Looping and eventually exiting is a strong guarantee by
the verifier and the users know that all paths to exit are explored.
Just "looping is ok" without exit guarantee
means that a bunch of code may not be visited by the verifier.
Arguably dead code elimination should kick in,
but I don't think it's a territory we can go to.

>
> [...]
>
> > > > @@ -7954,10 +7956,14 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
> > > >       struct bpf_reg_state *cur_iter, *queued_iter;
> > > >       int iter_frameno = meta->iter.frameno;
> > > >       int iter_spi = meta->iter.spi;
> > > > +     bool is_can_loop = is_can_loop_kfunc(meta);
> > > >
> > > >       BTF_TYPE_EMIT(struct bpf_iter);
> > > >
> > > > -     cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > > > +     if (is_can_loop)
> > > > +             cur_iter = &cur_st->can_loop_reg;
> > > > +     else
> > > > +             cur_iter = &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > >
> > > I think that adding of a utility function hiding this choice, e.g.:
> > >
> > >     get_iter_reg(struct bpf_verifier_state *st, int insn_idx)
> > >
> > > would simplify the code a bit, here and in is_state_visited().
> >
> > Hmm. That sounds like obfuscation, since 'meta' would need to be passed in,
> > but is_state_visited() doesn't have meta.
> > Create fake meta there?!
> >
> > I'm missing how such get_iter_reg() helper will look.
> > meta->iter.frameno was populated by process_iter_arg().
> > Far away from process_iter_next_call().
>
> I meant that this helper can peek spi from R1 just like code in
> is_state_visited() does currently. Forgoing the 'meta' completely.

I see.
You mean removing:
                meta->iter.spi = spi;
                meta->iter.frameno = reg->frameno;
from process_iter_arg() and
the 'meta' arg from process_iter_next_call() as well, then?
Alexei Starovoitov Feb. 28, 2024, 6:30 a.m. UTC | #5
On Fri, Feb 23, 2024 at 5:55 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Feb 23, 2024 at 4:50 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >
> > On Fri, 2024-02-23 at 16:22 -0800, Alexei Starovoitov wrote:
> > [...]
> >
> > > I think you're missing the point.
> > > It's not about this particular list iterator.
> > > It's about _all_ for(), while() loops.
> > > I've started converting lib/radix-tree.c to bpf and arena.
> > > There are hundreds of various loops that need to be converted.
> > > The best is to copy-paste them as-is and add bpf_can_loop() to loop
> > > condition. That's it.
> > > Otherwise explicit iterators are changing the code significantly
> > > and distract from the logic of the algorithm.
> > >
> > > Another key point is the last sentence of the commit log:
> > > "New instruction with the same semantics can be added, so that LLVM
> > > can generate it."
> > >
> > > This is the way to implement __builtin_memcpy, __builtin_strcmp
> > > and friends in llvm and gcc.
> >
> > There are two things that usage of bpf_can_loop() provides:
> > 1. A proof that BPF program would terminate at runtime.
> > 2. A way for verifier to terminate verification process
> >    (by stopping processing some path when two verifier states are exactly equal).
> >
> > The (1) is iffy, because there are simple ways to forgo it in practical terms.
> > E.g. for the program below it would be possible to make 64 * 10^12 iterations
> > at runtime:
> >
> >     void bar(...) {
> >       while (... && bpf_can_loop())
> >         ... do something ...;
> >     }
> >
> >     void foo(...) {
> >       while (... && bpf_can_loop())
> >         bar();
> >     }
>
> so ?
> bpf_loop() helper and open coded iterators can do the same already.
> It's something we need to fix regardless.
>
> (1) is not iffy. The program will terminate. That's a 100% guarantee.
>
> > If we decide that for some programs it is not necessary to enforce
> > proof of runtime termination, then it would be possible to untie (2)
> > from iterators and just check if looping state is states_equal(... exact=true)
> > to some previous one.
>
> No. That's not at all the same.
> Looping and eventually exiting is a strong guarantee by
> the verifier and the users know that all paths to exit are explored.
> Just "looping is ok" without exit guarantee
> means that a bunch of code may not be visited by the verifier.
> Arguably dead code elimination should kick in,
> but I don't think it's a territory we can go to.
>
> >
> > [...]
> >
> > > > > @@ -7954,10 +7956,14 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
> > > > >       struct bpf_reg_state *cur_iter, *queued_iter;
> > > > >       int iter_frameno = meta->iter.frameno;
> > > > >       int iter_spi = meta->iter.spi;
> > > > > +     bool is_can_loop = is_can_loop_kfunc(meta);
> > > > >
> > > > >       BTF_TYPE_EMIT(struct bpf_iter);
> > > > >
> > > > > -     cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > > > > +     if (is_can_loop)
> > > > > +             cur_iter = &cur_st->can_loop_reg;
> > > > > +     else
> > > > > +             cur_iter = &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
> > > >
> > > > I think that adding of a utility function hiding this choice, e.g.:
> > > >
> > > >     get_iter_reg(struct bpf_verifier_state *st, int insn_idx)
> > > >
> > > > would simplify the code a bit, here and in is_state_visited().
> > >
> > > Hmm. That sounds like obfuscation, since 'meta' would need to be passed in,
> > > but is_state_visited() doesn't have meta.
> > > Create fake meta there?!
> > >
> > > I'm missing how such get_iter_reg() helper will look.
> > > meta->iter.frameno was populated by process_iter_arg().
> > > Far away from process_iter_next_call().
> >
> > I meant that this helper can peek spi from R1 just like code in
> > is_state_visited() does currently. Forgoing the 'meta' completely.
>
> I see.
> You mean removing:
>                 meta->iter.spi = spi;
>                 meta->iter.frameno = reg->frameno;
> from process_iter_arg() and
> 'meta' arg from process_iter_next_call() as well then ?

Ed,

That was a bad idea.
I tried what you suggested with
static struct bpf_reg_state *get_iter_reg(struct bpf_verifier_env *env,
                                          struct bpf_verifier_state *st, int insn_idx)

and implemented in v2.
It's buggy, as can be seen in CI (I sloppily tested it before sending it
yesterday).
I'm going to go back to the v1 approach.
process_iter_next_call() _has_ to use meta,
since caller-saved regs are already cleared by the time it's called.
And doing fake 'meta' in is_state_visited() is not a good idea.
It took me a few hours to debug this :(
Eduard Zingerman Feb. 28, 2024, 1:45 p.m. UTC | #6
On Tue, 2024-02-27 at 22:30 -0800, Alexei Starovoitov wrote:
[...]

> > > I meant that this helper can peek spi from R1 just like code in
> > > is_state_visited() does currently. Forgoing the 'meta' completely.
> > 
> > I see.
> > You mean removing:
> >                 meta->iter.spi = spi;
> >                 meta->iter.frameno = reg->frameno;
> > from process_iter_arg() and
> > 'meta' arg from process_iter_next_call() as well then ?
> 
> Ed,
> 
> That was a bad idea.
> I tried what you suggested with
> static struct bpf_reg_state *get_iter_reg(struct bpf_verifier_env *env,
>                                           struct bpf_verifier_state
> *st, int insn_idx)
> 
> and implemented in v2.
> It's buggy as can be seen in CI (I sloppy tested it before sending it
> yesterday).
> I'm going to go back to v1 approach.
> process_iter_next_call() _has_ to use meta,
> since caller saved regs already cleared by the time it's called.
> And doing fake 'meta' in is_state_visited() is not a good idea.
> It took me a few hours to debug this :(

Well, that is unfortunate, sorry about that.

Patch

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 84365e6dd85d..69bc7f2d20f1 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -449,6 +449,7 @@  struct bpf_verifier_state {
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
+	struct bpf_reg_state can_loop_reg;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
@@ -549,6 +550,7 @@  struct bpf_insn_aux_data {
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
+	bool is_can_loop; /* bpf_can_loop() kfunc call */
 	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
@@ -619,6 +621,7 @@  struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
+	u16 stack_extra;
 	bool has_tail_call: 1;
 	bool tail_call_reachable: 1;
 	bool has_ld_abs: 1;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 93edf730d288..d1d93ad8a010 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2542,6 +2542,17 @@  __bpf_kfunc void bpf_throw(u64 cookie)
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
+__bpf_kfunc long bpf_can_loop(void *ptr__ign)
+{
+	u64 *pcnt = ptr__ign, cnt = *pcnt;
+
+	if (cnt < BPF_MAX_LOOPS) {
+		*pcnt = cnt + 1;
+		return cnt + 1;
+	}
+	return 0;
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -2618,6 +2629,7 @@  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_can_loop)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 011d54a1dc53..89667734abf5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -502,6 +502,7 @@  static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 
 static bool is_sync_callback_calling_kfunc(u32 btf_id);
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+static bool is_can_loop_kfunc(struct bpf_kfunc_call_arg_meta *meta);
 
 static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
 {
@@ -1436,6 +1437,7 @@  static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		if (err)
 			return err;
 	}
+	dst_state->can_loop_reg = src->can_loop_reg;
 	return 0;
 }
 
@@ -7954,10 +7956,14 @@  static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 	struct bpf_reg_state *cur_iter, *queued_iter;
 	int iter_frameno = meta->iter.frameno;
 	int iter_spi = meta->iter.spi;
+	bool is_can_loop = is_can_loop_kfunc(meta);
 
 	BTF_TYPE_EMIT(struct bpf_iter);
 
-	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+	if (is_can_loop)
+		cur_iter = &cur_st->can_loop_reg;
+	else
+		cur_iter = &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
 
 	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
 	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
@@ -7985,7 +7991,10 @@  static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 		if (!queued_st)
 			return -ENOMEM;
 
-		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+		if (is_can_loop)
+			queued_iter = &queued_st->can_loop_reg;
+		else
+			queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
 		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
 		queued_iter->iter.depth++;
 		if (prev_st)
@@ -10925,6 +10934,7 @@  enum special_kfunc_type {
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
+	KF_bpf_can_loop,
 	KF_bpf_iter_css_task_new,
 };
 
@@ -10949,6 +10959,7 @@  BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_can_loop)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #endif
@@ -10977,6 +10988,7 @@  BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_can_loop)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #else
@@ -11003,6 +11015,11 @@  static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
 }
 
+static bool is_can_loop_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->func_id == special_kfunc_list[KF_bpf_can_loop];
+}
+
 static enum kfunc_ptr_arg_type
 get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 		       struct bpf_kfunc_call_arg_meta *meta,
@@ -12049,6 +12066,7 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	insn_aux = &env->insn_aux_data[insn_idx];
 
 	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+	insn_aux->is_can_loop = is_can_loop_kfunc(&meta);
 
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
@@ -12424,7 +12442,7 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			mark_btf_func_reg_size(env, regno, t->size);
 	}
 
-	if (is_iter_next_kfunc(&meta)) {
+	if (is_iter_next_kfunc(&meta) || is_can_loop_kfunc(&meta)) {
 		err = process_iter_next_call(env, insn_idx, &meta);
 		if (err)
 			return err;
@@ -15609,7 +15627,7 @@  static int visit_insn(int t, struct bpf_verifier_env *env)
 			struct bpf_kfunc_call_arg_meta meta;
 
 			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
-			if (ret == 0 && is_iter_next_kfunc(&meta)) {
+			if (ret == 0 && (is_iter_next_kfunc(&meta) || is_can_loop_kfunc(&meta))) {
 				mark_prune_point(env, t);
 				/* Checking and saving state checkpoints at iter_next() call
 				 * is crucial for fast convergence of open-coded iterator loop
@@ -16759,6 +16777,9 @@  static bool states_equal(struct bpf_verifier_env *env,
 	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
+	if (old->can_loop_reg.iter.state != cur->can_loop_reg.iter.state)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -16933,6 +16954,11 @@  static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
 	return env->insn_aux_data[insn_idx].is_iter_next;
 }
 
+static bool is_can_loop_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].is_can_loop;
+}
+
 /* is_state_visited() handles iter_next() (see process_iter_next_call() for
  * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
  * states to match, which otherwise would look like an infinite loop. So while
@@ -16997,6 +17023,9 @@  static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf
 	struct bpf_func_state *state;
 	int i, fr;
 
+	if (old->can_loop_reg.iter.depth != cur->can_loop_reg.iter.depth)
+		return true;
+
 	for (fr = old->curframe; fr >= 0; fr--) {
 		state = old->frame[fr];
 		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
@@ -17101,23 +17130,27 @@  static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * comparison would discard current state with r7=-32
 			 * => unsafe memory access at 11 would not be caught.
 			 */
-			if (is_iter_next_insn(env, insn_idx)) {
+			if (is_iter_next_insn(env, insn_idx) || is_can_loop_insn(env, insn_idx)) {
 				if (states_equal(env, &sl->state, cur, true)) {
 					struct bpf_func_state *cur_frame;
 					struct bpf_reg_state *iter_state, *iter_reg;
 					int spi;
 
-					cur_frame = cur->frame[cur->curframe];
-					/* btf_check_iter_kfuncs() enforces that
-					 * iter state pointer is always the first arg
-					 */
-					iter_reg = &cur_frame->regs[BPF_REG_1];
-					/* current state is valid due to states_equal(),
-					 * so we can assume valid iter and reg state,
-					 * no need for extra (re-)validations
-					 */
-					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
-					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+					if (is_can_loop_insn(env, insn_idx)) {
+						iter_state = &cur->can_loop_reg;
+					} else {
+						cur_frame = cur->frame[cur->curframe];
+						/* btf_check_iter_kfuncs() enforces that
+						 * iter state pointer is always the first arg
+						 */
+						iter_reg = &cur_frame->regs[BPF_REG_1];
+						/* current state is valid due to states_equal(),
+						 * so we can assume valid iter and reg state,
+						 * no need for extra (re-)validations
+						 */
+						spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+						iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+					}
 					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
 						update_loop_entry(cur, &sl->state);
 						goto hit;
@@ -19258,7 +19291,8 @@  static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
 }
 
 static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
+			    struct bpf_insn *insn_buf, int insn_idx, int stack_base,
+			    int *cnt, int *stack_extra)
 {
 	const struct bpf_kfunc_desc *desc;
 
@@ -19349,6 +19383,12 @@  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_can_loop]) {
+		insn_buf[0] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP);
+		insn_buf[1] = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, stack_base - 8);
+		insn_buf[2] = *insn;
+		*cnt = 3;
+		*stack_extra = 8;
 	}
 	return 0;
 }
@@ -19396,7 +19436,10 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
-	int i, ret, cnt, delta = 0;
+	int i, ret, cnt, delta = 0, cur_subprog = 0;
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	u16 stack_depth = subprogs[cur_subprog].stack_depth;
+	u16 stack_depth_extra = 0;
 
 	if (env->seen_exception && !env->exception_callback_subprog) {
 		struct bpf_insn patch[] = {
@@ -19416,7 +19459,16 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 		mark_subprog_exc_cb(env, env->exception_callback_subprog);
 	}
 
-	for (i = 0; i < insn_cnt; i++, insn++) {
+	for (i = 0; i < insn_cnt;
+	     ({
+		if (stack_depth_extra && subprogs[cur_subprog + 1].start == i + delta + 1) {
+			subprogs[cur_subprog].stack_depth += stack_depth_extra;
+			subprogs[cur_subprog].stack_extra = stack_depth_extra;
+			cur_subprog++;
+			stack_depth = subprogs[cur_subprog].stack_depth;
+			stack_depth_extra = 0;
+		}
+	      }), i++, insn++) {
 		/* Make divide-by-zero exceptions impossible. */
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
@@ -19536,11 +19588,18 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 		if (insn->src_reg == BPF_PSEUDO_CALL)
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
-			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+			int stack_extra = 0;
+
+			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta,
+					       -stack_depth, &cnt, &stack_extra);
 			if (ret)
 				return ret;
 			if (cnt == 0)
 				continue;
+			if (stack_extra & (BPF_REG_SIZE - 1)) {
+				verbose(env, "verifier bug: kfunc stack extra must be power of 8\n");
+				return -EFAULT;
+			}
 
 			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
 			if (!new_prog)
@@ -19549,6 +19608,8 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta	 += cnt - 1;
 			env->prog = prog = new_prog;
 			insn	  = new_prog->insnsi + i + delta;
+			if (stack_extra)
+				stack_depth_extra = max(stack_depth_extra, stack_extra);
 			continue;
 		}
 
@@ -19942,6 +20003,30 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 		insn->imm = fn->func - __bpf_call_base;
 	}
 
+	env->prog->aux->stack_depth = subprogs[0].stack_depth;
+	for (i = 0; i < env->subprog_cnt; i++) {
+		int subprog_start = subprogs[i].start, j;
+		int stack_slots = subprogs[i].stack_extra / 8;
+
+		if (stack_slots >= ARRAY_SIZE(insn_buf)) {
+			verbose(env, "verifier bug: stack_extra is too large\n");
+			return -EFAULT;
+		}
+
+		/* Add insns to subprog prologue to zero init extra stack */
+		for (j = 0; j < stack_slots; j++)
+			insn_buf[j] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+						 -subprogs[i].stack_depth + j * 8, 0);
+		if (j) {
+			insn_buf[j] = env->prog->insnsi[subprog_start];
+
+			new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, j + 1);
+			if (!new_prog)
+				return -ENOMEM;
+			env->prog = prog = new_prog;
+		}
+	}
+
 	/* Since poke tab is now finalized, publish aux to tracker. */
 	for (i = 0; i < prog->aux->size_poke_tab; i++) {
 		map_ptr = prog->aux->poke_tab[i].tail_call.map;
@@ -20130,6 +20215,21 @@  static void free_states(struct bpf_verifier_env *env)
 	}
 }
 
+static void init_can_loop_reg(struct bpf_reg_state *st)
+{
+	__mark_reg_known_zero(st);
+	st->type = PTR_TO_STACK;
+	st->live |= REG_LIVE_WRITTEN;
+	st->ref_obj_id = 0;
+	st->iter.btf = NULL;
+	st->iter.btf_id = 0;
+	/* Init register state to sane values.
+	 * Only iter.state and iter.depth are used during verification.
+	 */
+	st->iter.state = BPF_ITER_STATE_ACTIVE;
+	st->iter.depth = 0;
+}
+
 static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -20147,6 +20247,7 @@  static int do_check_common(struct bpf_verifier_env *env, int subprog)
 	state->curframe = 0;
 	state->speculative = false;
 	state->branches = 1;
+	init_can_loop_reg(&state->can_loop_reg);
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);