[v2,bpf-next,4/6] bpf: stop setting precise in current state

Message ID 20221104163649.121784-5-andrii@kernel.org (mailing list archive)
State Accepted
Commit f63181b6ae79fd3b034cde641db774268c2c3acf
Delegated to: BPF
Series BPF verifier precision tracking improvements

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 pending Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain }}
bpf/vmtest-bpf-next-VM_Test-2 pending Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-8 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-10 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers warning 12 maintainers not CCed: sdf@google.com hawk@kernel.org kpsingh@kernel.org haoluo@google.com davem@davemloft.net yhs@fb.com kuba@kernel.org netdev@vger.kernel.org jolsa@kernel.org martin.lau@linux.dev song@kernel.org john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 10 this patch: 10
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 142 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Andrii Nakryiko Nov. 4, 2022, 4:36 p.m. UTC
Setting reg->precise to true in the current state is not necessary from
a correctness standpoint, but it does pessimise the whole precision (or
rather "imprecision", because that's what we want to keep as much as
possible) tracking. Why this is the case is somewhat subtle, and my best
attempt at explaining it is recorded in an extensive comment for the
__mark_chain_precision() function. Some more careful thinking and code
reading is probably still required to grok this completely,
unfortunately. Whiteboarding and a bunch of extra handwaving in person
would be even more helpful, but that is impractical in a Git commit.

The next patch pushes this imprecision property even further, building
on top of the insights described in this patch.

The end results are pretty nice: we get a reduction in the total number
of instructions and states verified thanks to better state reuse, as
some of the states are now more generic and permissive due to fewer
unnecessary precise=true requirements.
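
As a rough, hypothetical illustration of what such a requirement looks
like (the program, map, and attach point below are made up, not taken
from the patch or selftests), consider a scalar whose exact bounds are
needed for safety because it is passed as a helper's size argument;
that is the kind of register that ends up marked precise, while scalars
never used this way can stay imprecise and allow more pruning:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} vals SEC(".maps");

SEC("kprobe/do_sys_openat2")
int needs_precision(void *ctx)
{
	char dst[64] = {};
	__u32 key = 0;
	__u64 *val = bpf_map_lookup_elem(&vals, &key);
	__u32 len;

	if (!val)
		return 0;

	/* len is a SCALAR the verifier bounds to [0, 63]. Its exact range
	 * is needed to prove the bpf_probe_read_kernel() call safe, so it
	 * (and anything it was derived from) ends up marked precise.
	 * A scalar that is never used like this can stay imprecise, which
	 * lets regsafe() treat more states as equivalent and prune them.
	 */
	len = *val & 63;

	bpf_probe_read_kernel(dst, len, ctx);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

Before this patch, a register like len would also get precise=true set
in the ephemeral current state; after it, the mark is only applied
retroactively to parent (checkpointed) states when a child state
actually needs it.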

SELFTESTS RESULTS
=================

$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results.csv ~/imprecise-early-results.csv | grep -v '+0'
File                                     Program                 Total insns (A)  Total insns (B)  Total insns (DIFF)  Total states (A)  Total states (B)  Total states (DIFF)
---------------------------------------  ----------------------  ---------------  ---------------  ------------------  ----------------  ----------------  -------------------
bpf_iter_ksym.bpf.linked1.o              dump_ksym                           347              285       -62 (-17.87%)                20                19          -1 (-5.00%)
pyperf600_bpf_loop.bpf.linked1.o         on_event                           3678             3736        +58 (+1.58%)               276               285          +9 (+3.26%)
setget_sockopt.bpf.linked1.o             skops_sockopt                      4038             3947        -91 (-2.25%)               347               343          -4 (-1.15%)
test_l4lb.bpf.linked1.o                  balancer_ingress                   4559             2611     -1948 (-42.73%)               118               105        -13 (-11.02%)
test_l4lb_noinline.bpf.linked1.o         balancer_ingress                   6279             6268        -11 (-0.18%)               237               236          -1 (-0.42%)
test_misc_tcp_hdr_options.bpf.linked1.o  misc_estab                         1307             1303         -4 (-0.31%)               100                99          -1 (-1.00%)
test_sk_lookup.bpf.linked1.o             ctx_narrow_access                   456              447         -9 (-1.97%)                39                38          -1 (-2.56%)
test_sysctl_loop1.bpf.linked1.o          sysctl_tcp_mem                     1389             1384         -5 (-0.36%)                26                25          -1 (-3.85%)
test_tc_dtime.bpf.linked1.o              egress_fwdns_prio101                518              485        -33 (-6.37%)                51                46          -5 (-9.80%)
test_tc_dtime.bpf.linked1.o              egress_host                         519              468        -51 (-9.83%)                50                44         -6 (-12.00%)
test_tc_dtime.bpf.linked1.o              ingress_fwdns_prio101               842             1000      +158 (+18.76%)                73                88        +15 (+20.55%)
xdp_synproxy_kern.bpf.linked1.o          syncookie_tc                     405757           373173     -32584 (-8.03%)             25735             22882      -2853 (-11.09%)
xdp_synproxy_kern.bpf.linked1.o          syncookie_xdp                    479055           371590   -107465 (-22.43%)             29145             22207      -6938 (-23.81%)
---------------------------------------  ----------------------  ---------------  ---------------  ------------------  ----------------  ----------------  -------------------

The slight regression in test_tc_dtime.bpf.linked1.o/ingress_fwdns_prio101
is left for a follow-up; there might be some more precision-related bugs
in the existing BPF verifier logic.

CILIUM RESULTS
==============

$ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results-cilium.csv ~/imprecise-early-results-cilium.csv | grep -v '+0'
File           Program                         Total insns (A)  Total insns (B)  Total insns (DIFF)  Total states (A)  Total states (B)  Total states (DIFF)
-------------  ------------------------------  ---------------  ---------------  ------------------  ----------------  ----------------  -------------------
bpf_host.o     cil_from_host                               762              556      -206 (-27.03%)                43                37         -6 (-13.95%)
bpf_host.o     tail_handle_nat_fwd_ipv4                  23541            23426       -115 (-0.49%)              1538              1537          -1 (-0.07%)
bpf_host.o     tail_nodeport_nat_egress_ipv4             33592            33566        -26 (-0.08%)              2163              2161          -2 (-0.09%)
bpf_lxc.o      tail_handle_nat_fwd_ipv4                  23541            23426       -115 (-0.49%)              1538              1537          -1 (-0.07%)
bpf_overlay.o  tail_nodeport_nat_egress_ipv4             33581            33543        -38 (-0.11%)              2160              2157          -3 (-0.14%)
bpf_xdp.o      tail_handle_nat_fwd_ipv4                  21659            20920       -739 (-3.41%)              1440              1376         -64 (-4.44%)
bpf_xdp.o      tail_handle_nat_fwd_ipv6                  17084            17039        -45 (-0.26%)               907               905          -2 (-0.22%)
bpf_xdp.o      tail_lb_ipv4                              73442            73430        -12 (-0.02%)              4370              4369          -1 (-0.02%)
bpf_xdp.o      tail_lb_ipv6                             152114           151895       -219 (-0.14%)              6493              6479         -14 (-0.22%)
bpf_xdp.o      tail_nodeport_nat_egress_ipv4             17377            17200       -177 (-1.02%)              1125              1111         -14 (-1.24%)
bpf_xdp.o      tail_nodeport_nat_ingress_ipv6             6405             6397         -8 (-0.12%)               309               308          -1 (-0.32%)
bpf_xdp.o      tail_rev_nodeport_lb4                      7126             6934       -192 (-2.69%)               414               402         -12 (-2.90%)
bpf_xdp.o      tail_rev_nodeport_lb6                     18059            17905       -154 (-0.85%)              1105              1096          -9 (-0.81%)
-------------  ------------------------------  ---------------  ---------------  ------------------  ----------------  ----------------  -------------------

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/bpf/verifier.c | 103 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 12 deletions(-)

Comments

Stefan Fleischmann Jan. 24, 2024, 10:06 a.m. UTC | #1
On Fri, 4 Nov 2022 09:36:47 -0700
Andrii Nakryiko <andrii@kernel.org> wrote:

> Setting reg->precise to true in current state is not necessary from
> correctness standpoint, but it does pessimise the whole precision (or
> rather "imprecision", because that's what we want to keep as much as
> possible) tracking. Why is somewhat subtle and my best attempt to
> explain this is recorded in an extensive comment for
> __mark_chain_precise() function. Some more careful thinking and code
> reading is probably required still to grok this completely,
> unfortunately. Whiteboarding and a bunch of extra handwaiving in
> person would be even more helpful, but is deemed impractical in Git
> commit.

Not sure if this is the preferred way to bring this up; if not, please
direct me elsewhere. I've noticed problems with this patch in the 5.15
kernel line, originally on the Ubuntu kernel. A bug report with more
information can be found here:
 https://bugs.launchpad.net/ubuntu/+source/linux-signed/+bug/2050098

> Next patch pushes this imprecision property even further, building on
> top of the insights described in this patch.

I tracked this down to the changes between upstream versions 5.15.126
and 5.15.127, and have confirmed that reverting this patch and the next
one mentioned here solves our problem.

> End results are pretty nice, we get reduction in number of total
> instructions and states verified due to a better states reuse, as
> some of the states are now more generic and permissive due to less
> unnecessary precise=true requirements.

I'll describe the problem here briefly in case you don't have time to
read through the Ubuntu bug report. We use Slurm on a cluster and
cgroup2 for resource confinement. With this change in 5.15 we get this
error from the Slurm daemon on the node:

 slurmstepd: error: load_ebpf_prog: BPF load error (No space left on
 device). Please check your system limits (MEMLOCK).
 (debug log available here:
 https://launchpadlibrarian.net/710598602/slurmd_cgroup_log.txt)

More importantly, the cgroup confinement is not working anymore.
As I said, reverting this patch brings back functionality. It would
be easy to blame Slurm here, but I've tested newer kernels (6.5,
6.6.13, 6.7.1), which all work fine. This makes me believe some crucial
parts might not have been backported to 5.15.

Best regards,
Stefan

Stefan Fleischmann Jan. 24, 2024, 12:38 p.m. UTC | #2
On Wed, 24 Jan 2024 11:06:50 +0100 Stefan Fleischmann <sfle@kth.se>
wrote:
> On Fri, 4 Nov 2022 09:36:47 -0700
> Andrii Nakryiko <andrii@kernel.org> wrote:
> 
> > Setting reg->precise to true in current state is not necessary from
> > correctness standpoint, but it does pessimise the whole precision
> > (or rather "imprecision", because that's what we want to keep as
> > much as possible) tracking. Why is somewhat subtle and my best
> > attempt to explain this is recorded in an extensive comment for
> > __mark_chain_precise() function. Some more careful thinking and code
> > reading is probably required still to grok this completely,
> > unfortunately. Whiteboarding and a bunch of extra handwaiving in
> > person would be even more helpful, but is deemed impractical in Git
> > commit.  
> 
> Not sure if this is the preferred way to bring this up, if not please
> direct me elsewhere. I've noticed problems with this patch in the 5.15
> kernel line, originally the Ubuntu kernel. Bug report with more
> information can be found here:
>  https://bugs.launchpad.net/ubuntu/+source/linux-signed/+bug/2050098
> 
> > Next patch pushes this imprecision property even further, building
> > on top of the insights described in this patch.  
> 
> I tracked this down to the changes in upstream version from 5.15.126
> to 127, and have confirmed that reverting this patch and the next
> mentioned here solves our problem.
> 
> > End results are pretty nice, we get reduction in number of total
> > instructions and states verified due to a better states reuse, as
> > some of the states are now more generic and permissive due to less
> > unnecessary precise=true requirements.  
> 
> I'll describe the problem here briefly in case you don't have time to
> read through the Ubuntu bug report. We use Slurm on a cluster and use
> cgroup2 for resource confinement. Now with this change in 5.15 we get
> this error from the slurm daemon on the node:
> 
>  slurmstepd: error: load_ebpf_prog: BPF load error (No space left on
>  device). Please check your system limits (MEMLOCK).
>  (debug log available here:
>  https://launchpadlibrarian.net/710598602/slurmd_cgroup_log.txt)
> 
> And more importantly the cgroup confinement is not working anymore.
> As I said reverting this patch brings back functionality. Now it would
> be easy to blame Slurm here, but I've tested newer kernels, 6.5,
> 6.6.13, 6.7.1 which all work fine. Which makes me believe some crucial
> parts might not have been backported to 5.15.

Sorry for the noise; it turns out this is a bug in Slurm that is only
triggered by long BPF verifier logs. So I suppose the newer kernels
produce shorter logs, hence the issue is not triggered there.

Best regards,
Stefan


Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c1169ee1bc7c..ff3fc21ce99b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2749,8 +2749,11 @@  static void mark_all_scalars_precise(struct bpf_verifier_env *env,
 
 	/* big hammer: mark all scalars precise in this path.
 	 * pop_stack may still get !precise scalars.
+	 * We also skip current state and go straight to first parent state,
+	 * because precision markings in current non-checkpointed state are
+	 * not needed. See why in the comment in __mark_chain_precision below.
 	 */
-	for (; st; st = st->parent)
+	for (st = st->parent; st; st = st->parent) {
 		for (i = 0; i <= st->curframe; i++) {
 			func = st->frame[i];
 			for (j = 0; j < BPF_REG_FP; j++) {
@@ -2768,8 +2771,88 @@  static void mark_all_scalars_precise(struct bpf_verifier_env *env,
 				reg->precise = true;
 			}
 		}
+	}
 }
 
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is a crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., its precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ */
 static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
 				  int spi)
 {
@@ -2787,6 +2870,10 @@  static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 	if (!env->bpf_capable)
 		return 0;
 
+	/* Do sanity checks against current state of register and/or stack
+	 * slot, but don't set precise flag in current state, as precision
+	 * tracking in the current state is unnecessary.
+	 */
 	func = st->frame[frame];
 	if (regno >= 0) {
 		reg = &func->regs[regno];
@@ -2794,11 +2881,7 @@  static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 			WARN_ONCE(1, "backtracing misuse");
 			return -EFAULT;
 		}
-		if (!reg->precise)
-			new_marks = true;
-		else
-			reg_mask = 0;
-		reg->precise = true;
+		new_marks = true;
 	}
 
 	while (spi >= 0) {
@@ -2811,11 +2894,7 @@  static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 			stack_mask = 0;
 			break;
 		}
-		if (!reg->precise)
-			new_marks = true;
-		else
-			stack_mask = 0;
-		reg->precise = true;
+		new_marks = true;
 		break;
 	}
 
@@ -11534,7 +11613,7 @@  static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		if (env->explore_alu_limits)
 			return false;
 		if (rcur->type == SCALAR_VALUE) {
-			if (!rold->precise && !rcur->precise)
+			if (!rold->precise)
 				return true;
 			/* new val must satisfy old val knowledge */
 			return range_within(rold, rcur) &&
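
To make the regsafe() hunk above a bit more concrete, here is a toy,
self-contained model (invented struct and function names; not the kernel
code, which tracks far more state per register) of how scalar
compatibility is decided once only the checkpointed ("old") state's
precise flag matters:

#include <stdbool.h>
#include <stdint.h>

/* Toy stand-in for struct bpf_reg_state; the real one also tracks
 * tnum, signed/32-bit bounds, id, liveness, etc.
 */
struct scalar_state {
	bool precise;         /* set when some child state needs exact bounds */
	uint64_t umin, umax;  /* tracked unsigned value range */
};

/* Pruning rule for scalars after this patch: only the old state's
 * precise flag is consulted; the current state's flag is ignored,
 * which is what the last hunk changes in regsafe().
 */
static bool scalar_safe(const struct scalar_state *old,
			const struct scalar_state *cur)
{
	if (!old->precise)
		return true;   /* old state never relied on exact bounds */

	/* old state was precise: current range must fit within the old one */
	return cur->umin >= old->umin && cur->umax <= old->umax;
}

If the old state left the scalar imprecise, any current scalar is
accepted, which is why keeping registers imprecise for as long as
possible directly improves state pruning.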