
[bpf] bpf: fix tracking of stack size for var-off access

Message ID 20231113235008.127238-1-andreimatei1@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1146 this patch: 1146
netdev/cc_maintainers warning 10 maintainers not CCed: jolsa@kernel.org john.fastabend@gmail.com sdf@google.com yonghong.song@linux.dev daniel@iogearbox.net martin.lau@linux.dev song@kernel.org haoluo@google.com andrii@kernel.org kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 1162 this patch: 1162
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1173 this patch: 1173
netdev/checkpatch warning WARNING: Missing a blank line after declarations
    WARNING: Please use correct Fixes: style 'Fixes: <12 chars of sha1> ("<title line>")' - ie: 'Fixes: 01f810ace9ed ("bpf: Allow variable-offset stack access")'
    WARNING: braces {} are not necessary for single statement blocks
    WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-PR fail PR summary
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-2 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-3 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-17 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-18 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-llvm-16 / build / build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-llvm-16 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-llvm-16 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-llvm-16 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-16 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-16 / veristat
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Andrei Matei Nov. 13, 2023, 11:50 p.m. UTC
Before this patch, writes to the stack using registers containing a
variable offset (as opposed to registers with fixed, known values) were
not properly contributing to the function's needed stack size. As a
result, it was possible for a program to verify, but then to attempt to
read out-of-bounds data at runtime because a too small stack had been
allocated for it.

Each function tracks the size of the stack it needs in
bpf_subprog_info.stack_depth, which is maintained by
update_stack_depth(). For regular memory accesses, check_mem_access()
was calling update_stack_depth(), but it was passing in only the fixed
part of the offset register, ignoring the variable offset. This was
incorrect; the minimum possible value of that register should be used
instead.

This patch fixes it by pushing down the update_stack_depth() call into
grow_stack_state(), which then correctly uses the register's lower bound.
grow_stack_state() is responsible for tracking the maximum stack size
for the current verifier state, so it seems like a good idea to have it
also update the per-function high-water mark. As a result of this
re-arrangement, update_stack_depth() is no longer needlessly called
for reads; it is now called only for writes (plus other cases like
helper memory access). I think this is a good thing, as reads cannot
possibly grow the needed stack.
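
To illustrate, here is a hypothetical sketch of the affected pattern (in
pseudo BPF assembly, not the reporter's original program). Before this
patch, only the -8 from the fixed part of the offset was accounted in
stack_depth, even though the write may reach as low as fp-72:

    r2 = <unknown scalar>          /* e.g. from bpf_get_prandom_u32() */
    if r2 s> 0 goto out
    if r2 s< -64 goto out          /* r2 is now in [-64, 0] */
    r3 = r10                       /* fp */
    r3 += r2                       /* variable-offset stack pointer */
    *(u64 *)(r3 - 8) = 0           /* may touch anything in [fp-72, fp-8] */
out:
    exit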

Reported-by: Hao Sun <sunhao.th@gmail.com>
Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access")
Closes: https://lore.kernel.org/bpf/CABWLsev9g8UP_c3a=1qbuZUi20tGoUXoU07FPf-5FLvhOKOY+Q@mail.gmail.com/
Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
---
 kernel/bpf/verifier.c | 47 ++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

Comments

Andrii Nakryiko Nov. 21, 2023, 12:46 a.m. UTC | #1
On Mon, Nov 13, 2023 at 3:51 PM Andrei Matei <andreimatei1@gmail.com> wrote:
>
> Before this patch, writes to the stack using registers containing a
> variable offset (as opposed to registers with fixed, known values) were
> not properly contributing to the function's needed stack size. As a
> result, it was possible for a program to verify, but then to attempt to
> read out-of-bounds data at runtime because a too small stack had been
> allocated for it.
>
> Each function tracks the size of the stack it needs in
> bpf_subprog_info.stack_depth, which is maintained by
> update_stack_depth(). For regular memory accesses, check_mem_access()
> was calling update_stack_depth(), but it was passing in only the fixed
> part of the offset register, ignoring the variable offset. This was
> incorrect; the minimum possible value of that register should be used
> instead.
>
> This patch fixes it by pushing down the update_stack_depth() call into
> grow_stack_state(), which then correctly uses the register's lower bound.
> grow_stack_state() is responsible for tracking the maximum stack size
> for the current verifier state, so it seems like a good idea to have it
> also update the per-function high-water mark. As a result of this
> re-arrangement, update_stack_depth() is no longer needlessly called
> for reads; it is now called only for writes (plus other cases like
> helper memory access). I think this is a good thing, as reads cannot
> possibly grow the needed stack.

I'm going to disagree. I think we should calculate max stack size both
on reads and writes. I'm not sure why it's ok for a BPF program to
access a stack with some big offset, but the BPF verifier not
rejecting this. What do I miss?

>
> Reported-by: Hao Sun <sunhao.th@gmail.com>
> Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access")
> Closes: https://lore.kernel.org/bpf/CABWLsev9g8UP_c3a=1qbuZUi20tGoUXoU07FPf-5FLvhOKOY+Q@mail.gmail.com/
> Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
> ---
>  kernel/bpf/verifier.c | 47 ++++++++++++++++++++++---------------------
>  1 file changed, 24 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index a2267d5ed14e..303a3572b169 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1669,8 +1669,29 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n)
>         return 0;
>  }
>
> -static int grow_stack_state(struct bpf_func_state *state, int size)
> +static int update_stack_depth(struct bpf_verifier_env *env,
> +                             const struct bpf_func_state *func,
> +                             int off)
> +{
> +       u16 stack = env->subprog_info[func->subprogno].stack_depth;
> +
> +       if (stack >= -off)
> +               return 0;
> +
> +       /* update known max for given subprogram */
> +       env->subprog_info[func->subprogno].stack_depth = -off;
> +       return 0;
> +}

given this is targeting bpf tree and will probably be backported to
stable kernels, let's minimize code movement. Can you just add
update_stack_depth forward declaration here instead?
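
I.e. something like this (just a sketch of the minimal-churn variant, with
update_stack_depth() left where it is defined today, near
check_ptr_alignment()):

    /* forward declaration; the definition stays further down in the file */
    static int update_stack_depth(struct bpf_verifier_env *env,
                                  const struct bpf_func_state *func,
                                  int off);

    static int grow_stack_state(struct bpf_verifier_env *env,
                                struct bpf_func_state *state, int size)
    {
            int err;

            err = update_stack_depth(env, state, -size);
            if (err)
                    return err;
            /* ... existing allocated_stack growth logic ... */
    }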

> +
> +/* Possibly update state->allocated_stack to be at least size bytes. Also
> + * possibly update the function's high-water mark in its bpf_subprog_info.
> + */
> +static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
>  {
> +       int err = update_stack_depth(env, state, -size);
> +       if (err) {
> +               return err;
> +       }
>         size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
>
>         if (old_n >= n)
> @@ -4638,7 +4659,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
>         struct bpf_reg_state *reg = NULL;
>         u32 dst_reg = insn->dst_reg;
>
> -       err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
> +       err = grow_stack_state(env, state, round_up(slot + 1, BPF_REG_SIZE));
>         if (err)
>                 return err;
>         /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
> @@ -4796,7 +4817,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
>             (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
>                 writing_zero = true;
>
> -       err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
> +       err = grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE));
>         if (err)
>                 return err;
>
> @@ -5928,20 +5949,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>                                            strict);
>  }
>
> -static int update_stack_depth(struct bpf_verifier_env *env,
> -                             const struct bpf_func_state *func,
> -                             int off)
> -{
> -       u16 stack = env->subprog_info[func->subprogno].stack_depth;
> -
> -       if (stack >= -off)
> -               return 0;
> -
> -       /* update known max for given subprogram */
> -       env->subprog_info[func->subprogno].stack_depth = -off;
> -       return 0;
> -}
> -
>  /* starting from main bpf function walk all instructions of the function
>   * and recursively walk all callees that given function can call.
>   * Ignore jump and exit insns.
> @@ -6822,7 +6829,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  {
>         struct bpf_reg_state *regs = cur_regs(env);
>         struct bpf_reg_state *reg = regs + regno;
> -       struct bpf_func_state *state;
>         int size, err = 0;
>
>         size = bpf_size_to_bytes(bpf_size);
> @@ -6965,11 +6971,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>                 if (err)
>                         return err;
>
> -               state = func(env, reg);
> -               err = update_stack_depth(env, state, off);
> -               if (err)
> -                       return err;
> -

It *feels* like this stack depth update *and* growing allocated stack
slots should happen somewhere in check_stack_access_within_bounds() or
right after it. It shouldn't matter whether we read or write to the
stack slot: either way that slot becomes part of the verifier state
that we should take into account during state comparison. Eduard not
so long ago added a change that allows reading STACK_INVALID slots, so
it's completely valid to read something that was never written to (and
so grow_stack_state() wasn't called for that slot, as it is
implemented right now). So I think we should fix that.
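
Roughly something like this, I imagine (an untested sketch; min_off below is
the lowest offset the access may reach, fixed or variable, as
check_stack_access_within_bounds() already computes it):

    /* at the end of check_stack_access_within_bounds(), once the access
     * has been validated, account for it regardless of read vs. write
     */
    if (!err)
            err = grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE));
    return err;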

Let's also add a test that will trigger this situation with both
direct stack slot read into register and through helper?

cc'ing Eduard just in case I'm missing some subtle detail here

>                 if (t == BPF_READ)
>                         err = check_stack_read(env, regno, off, size,
>                                                value_regno);
> --
> 2.39.2
>
>
Hao Sun Nov. 21, 2023, 5:16 p.m. UTC | #2
On Tue, Nov 21, 2023 at 1:46 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
>
> It *feels* like this stack depth update *and* growing allocated stack
> slots should happen somewhere in check_stack_access_within_bounds() or
> right after it. It shouldn't matter whether we read or write to the
> stack slot: either way that slot becomes part of the verifier state
> that we should take into account during state comparison. Eduard not
> so long ago added a change that allows reading STACK_INVALID slots, so
> it's completely valid to read something that was never written to (and
> so grow_stack_state() wasn't called for that slot, as it is
> implemented right now). So I think we should fix that.
>

Agree. The following cases are currently confusing to me.

The verifier accepts the following program, which goes from #4 to #8
and directly reads the stack at runtime without any previous write:
func#0 @0
0: R1=ctx() R10=fp0
0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
1: (85) call bpf_get_prandom_u32#7    ; R0_w=scalar()
2: (bf) r3 = r0                       ; R0_w=scalar(id=1) R3_w=scalar(id=1)
3: (bf) r8 = r0                       ; R0_w=scalar(id=1) R8_w=scalar(id=1)
4: (4e) if w0 & w3 goto pc+3          ; R0_w=scalar(id=1) R3_w=scalar(id=1)
5: (63) *(u32 *)(r6 -196) = r3        ; R3_w=scalar(id=1) R6_w=fp0
fp-200=mmmm????
6: (18) r7 = 0x19                     ; R7=25
8: (61) r7 = *(u32 *)(r6 -200)        ; R6=fp0
R7_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
fp-200=mmmm????
9: (95) exit

from 4 to 8: safe
verification time 358 usec
stack depth 200
processed 10 insns (limit 1000000) max_states_per_insn 0 total_states
1 peak_states 1 mark_read 1

The state is pruned, because of this:
static bool stacksafe(...)
         ....
         if (env->allow_uninit_stack &&
             old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
             continue;

Yet, the sample direct read would be rejected:

func#0 @0
0: R1=ctx() R10=fp0
0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
1: (61) r7 = *(u32 *)(r6 -200)
invalid read from stack R6 off=-200 size=4

Eduard, you added support for reading uninit slots; should we also add something
like the following:

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c2d31aa3d31..aa861d2da240 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6446,7 +6446,7 @@ static int check_stack_slot_within_bounds(int off,
 {
        int min_valid_off;

-       if (t == BPF_WRITE)
+       if (t == BPF_WRITE || env->allow_uninit_stack)
                min_valid_off = -MAX_BPF_STACK;
        else
                min_valid_off = -state->allocated_stack;
Andrei Matei Nov. 21, 2023, 8:29 p.m. UTC | #3
On Tue, Nov 21, 2023 at 12:16 PM Hao Sun <sunhao.th@gmail.com> wrote:
>
> On Tue, Nov 21, 2023 at 1:46 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> >
> > It *feels* like this stack depth update *and* growing allocated stack
> > slots should happen somewhere in check_stack_access_within_bounds() or
> > right after it. It shouldn't matter whether we read or write to the
> > stack slot: either way that slot becomes part of the verifier state
> > that we should take into account during state comparison. Eduard not
> > so long ago added a change that allows reading STACK_INVALID slots, so
> > it's completely valid to read something that was never written to (and
> > so grow_stack_state() wasn't called for that slot, as it is
> > implemented right now). So I think we should fix that.
> >
>
> Agree. The following cases are currently confusing to me.

Like Hao, I'm also confused about when reads from uninitialized stack slots are
supposed to be allowed and when they're not. For example, the "uninitialized
stack1" test ([0]) seems to check that at least some type of uninitialized read
is not permitted. The reason why the respective access is rejected and this
test currently passes is the following check in check_stack_range_initialized:
[1]. We summarily reject the access to slots beyond state->allocated_stack. If
we were to not reject them there, and instead treat the slots as STACK_INVALID,
then the test's access would be allowed just below, through the combination of
env->allow_uninit_stack and `clobber`.

So, assuming that this test (and a few others) is sane, Andrii's suggestion
of calling grow_stack_state()/update_stack_depth() in
check_stack_access_within_bounds() does not immediately work: doing so
would change the behavior in check_stack_range_initialized() and allow
the access.

On the other hand, perhaps the test is not sane and the access should be
permitted, in the spirit of allowing reads of uninitialized stack? Perhaps the
different treatment of slots beyond state->allocated_stack and STACK_INVALID
slots is not intentional. Though, I am fairly confused about the idea of
allowing such reads in general - don't they allow leaking of kernel memory from
the uninit stack to user space? Even if restricted to root, is the leak ok?

To also state the obvious - I'm happy to continue working on this patch and fix
a bug I've caused. However, if someone who actually knows the deal wants to
take over, I'm certainly not attached to anything :).

[0] https://github.com/torvalds/linux/blob/98b1cc82c4affc16f5598d4fa14b1858671b2263/tools/testing/selftests/bpf/progs/verifier_basic_stack.c#L28-L44
[1] https://github.com/torvalds/linux/blob/98b1cc82c4affc16f5598d4fa14b1858671b2263/kernel/bpf/verifier.c#L7280


>
> The verifier accepts the following program, which goes from #4 to #8
> and directly read the stack at runtime without any previous write:
> func#0 @0
> 0: R1=ctx() R10=fp0
> 0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
> 1: (85) call bpf_get_prandom_u32#7    ; R0_w=scalar()
> 2: (bf) r3 = r0                       ; R0_w=scalar(id=1) R3_w=scalar(id=1)
> 3: (bf) r8 = r0                       ; R0_w=scalar(id=1) R8_w=scalar(id=1)
> 4: (4e) if w0 & w3 goto pc+3          ; R0_w=scalar(id=1) R3_w=scalar(id=1)
> 5: (63) *(u32 *)(r6 -196) = r3        ; R3_w=scalar(id=1) R6_w=fp0
> fp-200=mmmm????
> 6: (18) r7 = 0x19                     ; R7=25
> 8: (61) r7 = *(u32 *)(r6 -200)        ; R6=fp0
> R7_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
> fp-200=mmmm????
> 9: (95) exit
>
> from 4 to 8: safe
> verification time 358 usec
> stack depth 200
> processed 10 insns (limit 1000000) max_states_per_insn 0 total_states
> 1 peak_states 1 mark_read 1
>
> The state is pruned, because of this:
> static bool stacksafe(...)
>          ....
>          if (env->allow_uninit_stack &&
>              old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
>              continue;
>
> Yet, the sample direct read would be rejected:
>
> func#0 @0
> 0: R1=ctx() R10=fp0
> 0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
> 1: (61) r7 = *(u32 *)(r6 -200)
> invalid read from stack R6 off=-200 size=4
>
> Eduard, you added support for reading uninit slots, should we also add something
> like the following:
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 8c2d31aa3d31..aa861d2da240 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6446,7 +6446,7 @@ static int check_stack_slot_within_bounds(int off,
>  {
>         int min_valid_off;
>
> -       if (t == BPF_WRITE)
> +       if (t == BPF_WRITE || env->allow_uninit_stack)
>                 min_valid_off = -MAX_BPF_STACK;
>         else
>                 min_valid_off = -state->allocated_stack;
Andrei Matei Nov. 21, 2023, 8:34 p.m. UTC | #4
On Mon, Nov 20, 2023 at 7:46 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Nov 13, 2023 at 3:51 PM Andrei Matei <andreimatei1@gmail.com> wrote:
> >
> > Before this patch, writes to the stack using registers containing a
> > variable offset (as opposed to registers with fixed, known values) were
> > not properly contributing to the function's needed stack size. As a
> > result, it was possible for a program to verify, but then to attempt to
> > read out-of-bounds data at runtime because a too small stack had been
> > allocated for it.
> >
> > Each function tracks the size of the stack it needs in
> > bpf_subprog_info.stack_depth, which is maintained by
> > update_stack_depth(). For regular memory accesses, check_mem_access()
> > was calling update_stack_depth(), but it was passing in only the fixed
> > part of the offset register, ignoring the variable offset. This was
> > incorrect; the minimum possible value of that register should be used
> > instead.
> >
> > This patch fixes it by pushing down the update_stack_depth() call into
> > grow_stack_state(), which then correctly uses the register's lower bound.
> > grow_stack_state() is responsible for tracking the maximum stack size
> > for the current verifier state, so it seems like a good idea to have it
> > also update the per-function high-water mark. As a result of this
> > re-arrangement, update_stack_depth() is no longer needlessly called
> > for reads; it is now called only for writes (plus other cases like
> > helper memory access). I think this is a good thing, as reads cannot
> > possibly grow the needed stack.
>
> I'm going to disagree. I think we should calculate max stack size both
> on reads and writes. I'm not sure why it's ok for a BPF program to
> access a stack with some big offset, but the BPF verifier not
> rejecting this. What do I miss?

My intention was certainly not to change any verification behavior. I was
relying on reads from uninit stack not being allowed regardless of the tracked
stack bounds; I now see that things don't work that way.
Andrii Nakryiko Nov. 21, 2023, 9:55 p.m. UTC | #5
On Tue, Nov 21, 2023 at 12:30 PM Andrei Matei <andreimatei1@gmail.com> wrote:
>
> On Tue, Nov 21, 2023 at 12:16 PM Hao Sun <sunhao.th@gmail.com> wrote:
> >
> > On Tue, Nov 21, 2023 at 1:46 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > >
> > > It *feels* like this stack depth update *and* growing allocated stack
> > > slots should happen somewhere in check_stack_access_within_bounds() or
> > > right after it. It shouldn't matter whether we read or write to the
> > > stack slot: either way that slot becomes part of the verifier state
> > > that we should take into account during state comparison. Eduard not
> > > so long ago added a change that allows reading STACK_INVALID slots, so
> > > it's completely valid to read something that was never written to (and
> > > so grow_stack_state() wasn't called for that slot, as it is
> > > implemented right now). So I think we should fix that.
> > >
> >
> > Agree. The following cases are currently confusing to me.
>
> Like Hao, I'm also confused about when reads from unitialized stack slots are
> supposed to be allowed and when they're not. For example, the "uninitialized
> stack1" test ([0]) seems to check that at least some type of uninitialized read
> is not permitted. The reason why the respective access is rejected and this
> test currently passes is the following check in check_stack_range_initialized:
> [1]. We summarily reject the access to slots beyond state->allocated_stack. If
> we were to not reject them there, and instead treat the slots as STACK_INVALID,
> then the test's access would be allowed just below, through the combination of
> env->allow_uninit_stack and `clobber`.

And that seems like a consistent and sane behavior if
bpf_allow_uninit_stack() is true, so I vote for fixing the test and
making the logic consistent.

>
> So, assuming that this tests (and a few others) are sane, Andrii's suggestion
> of calling grow_stack_state()/update_stack_depth() in
> check_stack_access_within_bounds() does not immediately work: doing so
> would change
> the behavior in check_stack_range_initialized() and allow the access.
>
> On the other hand, perhaps the test is not sane and the access should be
> permitted, in the spirit of allowing reads of uninitialized stack? Perhaps the
> different treatment of slots beyond state->allocated_stack and STACK_INVALID

yes, I think this divergence is not intentional, but maybe Eduard
remembers some other quirks and why it is what it is, let's see.

> slots is not intentional. Though, I am fairly confused about the idea of
> allowing such reads in general - don't they allow leaking of kernel memory from
> the uninit stack to user space? Even if restricted to root, is the leak ok?

This is guarded by bpf_allow_uninit_stack(), which checks CAP_PERFMON.
Once a BPF program has CAP_PERFMON, it can do bpf_probe_read_kernel()
and can generally leak whatever kernel memory it wants. So that's why
STACK_INVALID is allowed to be read in such a case. There is also
bpf_allow_ptr_leaks(), which also checks CAP_PERFMON.
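
For reference, both knobs boil down to roughly the following (paraphrasing
the helpers in kernel/bpf/verifier.c, not a verbatim excerpt):

    static bool bpf_allow_ptr_leaks(void)
    {
            return perfmon_capable();
    }

    static bool bpf_allow_uninit_stack(void)
    {
            return perfmon_capable();
    }

    /* cached on the verifier env at the start of verification:
     *   env->allow_ptr_leaks = bpf_allow_ptr_leaks();
     *   env->allow_uninit_stack = bpf_allow_uninit_stack();
     */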

>
> To also state the obvious - I'm happy to continue working on this patch and fix
> a bug I've caused. However, if someone who actually knows the deal wants to
> take over, I'm certainly not attached to anything :).

It would be great if you can finish this, absolutely! Thanks!

>
> [0] https://github.com/torvalds/linux/blob/98b1cc82c4affc16f5598d4fa14b1858671b2263/tools/testing/selftests/bpf/progs/verifier_basic_stack.c#L28-L44
> [1] https://github.com/torvalds/linux/blob/98b1cc82c4affc16f5598d4fa14b1858671b2263/kernel/bpf/verifier.c#L7280
>
>
> >
> > The verifier accepts the following program, which goes from #4 to #8
> > and directly read the stack at runtime without any previous write:
> > func#0 @0
> > 0: R1=ctx() R10=fp0
> > 0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
> > 1: (85) call bpf_get_prandom_u32#7    ; R0_w=scalar()
> > 2: (bf) r3 = r0                       ; R0_w=scalar(id=1) R3_w=scalar(id=1)
> > 3: (bf) r8 = r0                       ; R0_w=scalar(id=1) R8_w=scalar(id=1)
> > 4: (4e) if w0 & w3 goto pc+3          ; R0_w=scalar(id=1) R3_w=scalar(id=1)
> > 5: (63) *(u32 *)(r6 -196) = r3        ; R3_w=scalar(id=1) R6_w=fp0
> > fp-200=mmmm????
> > 6: (18) r7 = 0x19                     ; R7=25
> > 8: (61) r7 = *(u32 *)(r6 -200)        ; R6=fp0
> > R7_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
> > fp-200=mmmm????
> > 9: (95) exit
> >
> > from 4 to 8: safe
> > verification time 358 usec
> > stack depth 200
> > processed 10 insns (limit 1000000) max_states_per_insn 0 total_states
> > 1 peak_states 1 mark_read 1
> >
> > The state is pruned, because of this:
> > static bool stacksafe(...)
> >          ....
> >          if (env->allow_uninit_stack &&
> >              old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
> >              continue;
> >
> > Yet, the sample direct read would be rejected:
> >
> > func#0 @0
> > 0: R1=ctx() R10=fp0
> > 0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
> > 1: (61) r7 = *(u32 *)(r6 -200)
> > invalid read from stack R6 off=-200 size=4
> >
> > Eduard, you added support for reading uninit slots, should we also add something
> > like the following:
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 8c2d31aa3d31..aa861d2da240 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -6446,7 +6446,7 @@ static int check_stack_slot_within_bounds(int off,
> >  {
> >         int min_valid_off;
> >
> > -       if (t == BPF_WRITE)
> > +       if (t == BPF_WRITE || env->allow_uninit_stack)
> >                 min_valid_off = -MAX_BPF_STACK;
> >         else
> >                 min_valid_off = -state->allocated_stack;

as I mentioned above, yes, I think the behavior should be consistent
and such accesses should be allowed, just as if it was STACK_INVALID
Eduard Zingerman Nov. 22, 2023, 12:38 p.m. UTC | #6
On Tue, 2023-11-21 at 13:55 -0800, Andrii Nakryiko wrote:
[...]
> > So, assuming that this tests (and a few others) are sane, Andrii's suggestion
> > of calling grow_stack_state()/update_stack_depth() in
> > check_stack_access_within_bounds() does not immediately work: doing so
> > would change
> > the behavior in check_stack_range_initialized() and allow the access.
> > 
> > On the other hand, perhaps the test is not sane and the access should be
> > permitted, in the spirit of allowing reads of uninitialized stack? Perhaps the
> > different treatment of slots beyond state->allocated_stack and STACK_INVALID
> 
> yes, I think this divergence is not intentional, but maybe Eduard
> remembers some other quirks and why it is what it is, let's see.

Yes, this is probably an oversight on my side.
I should have allowed reads beyond the allocated stack in this case when
doing the changes for STACK_INVALID.
Sorry for the delayed response.

[...]
Eduard Zingerman Nov. 22, 2023, 12:46 p.m. UTC | #7
On Tue, 2023-11-21 at 18:16 +0100, Hao Sun wrote:
[...]
> Yet, the sample direct read would be rejected:
> 
> func#0 @0
> 0: R1=ctx() R10=fp0
> 0: (bf) r6 = r10                      ; R6_w=fp0 R10=fp0
> 1: (61) r7 = *(u32 *)(r6 -200)
> invalid read from stack R6 off=-200 size=4
> 
> Eduard, you added support for reading uninit slots, should we also add something
> like the following:
> 
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 8c2d31aa3d31..aa861d2da240 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6446,7 +6446,7 @@ static int check_stack_slot_within_bounds(int off,
>  {
>         int min_valid_off;
> 
> -       if (t == BPF_WRITE)
> +       if (t == BPF_WRITE || env->allow_uninit_stack)
>                 min_valid_off = -MAX_BPF_STACK;
>         else
>                 min_valid_off = -state->allocated_stack;

I agree with your logic, and this change seems reasonable.
Sorry for the delayed response.

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2267d5ed14e..303a3572b169 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1669,8 +1669,29 @@  static int resize_reference_state(struct bpf_func_state *state, size_t n)
 	return 0;
 }
 
-static int grow_stack_state(struct bpf_func_state *state, int size)
+static int update_stack_depth(struct bpf_verifier_env *env,
+			      const struct bpf_func_state *func,
+			      int off)
+{
+	u16 stack = env->subprog_info[func->subprogno].stack_depth;
+
+	if (stack >= -off)
+		return 0;
+
+	/* update known max for given subprogram */
+	env->subprog_info[func->subprogno].stack_depth = -off;
+	return 0;
+}
+
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
 {
+	int err = update_stack_depth(env, state, -size);
+	if (err) {
+		return err;
+	}
 	size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
 
 	if (old_n >= n)
@@ -4638,7 +4659,7 @@  static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = NULL;
 	u32 dst_reg = insn->dst_reg;
 
-	err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
+	err = grow_stack_state(env, state, round_up(slot + 1, BPF_REG_SIZE));
 	if (err)
 		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -4796,7 +4817,7 @@  static int check_stack_write_var_off(struct bpf_verifier_env *env,
 	    (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
 		writing_zero = true;
 
-	err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
+	err = grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE));
 	if (err)
 		return err;
 
@@ -5928,20 +5949,6 @@  static int check_ptr_alignment(struct bpf_verifier_env *env,
 					   strict);
 }
 
-static int update_stack_depth(struct bpf_verifier_env *env,
-			      const struct bpf_func_state *func,
-			      int off)
-{
-	u16 stack = env->subprog_info[func->subprogno].stack_depth;
-
-	if (stack >= -off)
-		return 0;
-
-	/* update known max for given subprogram */
-	env->subprog_info[func->subprogno].stack_depth = -off;
-	return 0;
-}
-
 /* starting from main bpf function walk all instructions of the function
  * and recursively walk all callees that given function can call.
  * Ignore jump and exit insns.
@@ -6822,7 +6829,6 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = regs + regno;
-	struct bpf_func_state *state;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -6965,11 +6971,6 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (err)
 			return err;
 
-		state = func(env, reg);
-		err = update_stack_depth(env, state, off);
-		if (err)
-			return err;
-
 		if (t == BPF_READ)
 			err = check_stack_read(env, regno, off, size,
 					       value_regno);