diff mbox series

[bpf-next,v5,4/5] bpf: verifier: Support eliding map lookup nullness

Message ID 92065ca054beccd6d0f35efe9715ef965e8d379f.1734045451.git.dxu@dxuuu.xyz (mailing list archive)
State New
Headers show
Series Support eliding map lookup nullness | expand

Commit Message

Daniel Xu Dec. 12, 2024, 11:22 p.m. UTC
This commit allows progs to elide a null check on statically known map
lookup keys. In other words, if the verifier can statically prove that
the lookup will be in-bounds, allow the prog to drop the null check.

This is useful for two reasons:

1. Large numbers of nullness checks (especially when they cannot fail)
   unnecessarily pushes prog towards BPF_COMPLEXITY_LIMIT_JMP_SEQ.
2. It forms a tighter contract between programmer and verifier.

For (1), bpftrace is starting to make heavier use of percpu scratch
maps. As a result, for user scripts with large number of unrolled loops,
we are starting to hit jump complexity verification errors.  These
percpu lookups cannot fail anyways, as we only use static key values.
Eliding nullness probably results in less work for verifier as well.

For (2), percpu scratch maps are often used as a larger stack, as the
current stack is limited to 512 bytes. In these situations, it is
desirable for the programmer to express: "this lookup should never fail,
and if it does, it means I messed up the code". By omitting the null
check, the programmer can "ask" the verifier to double check the logic.

Tests also have to be updated in sync with these changes, as the
verifier is more efficient with this change. Notably, iters.c tests had
to be changed to use a map type that still requires null checks, as it's
exercising verifier tracking logic w.r.t iterators.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 kernel/bpf/verifier.c                         | 80 ++++++++++++++++++-
 tools/testing/selftests/bpf/progs/iters.c     | 14 ++--
 .../selftests/bpf/progs/map_kptr_fail.c       |  2 +-
 .../selftests/bpf/progs/verifier_map_in_map.c |  2 +-
 .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
 5 files changed, 87 insertions(+), 13 deletions(-)

Comments

Eduard Zingerman Dec. 13, 2024, 4:04 a.m. UTC | #1
On Thu, 2024-12-12 at 16:22 -0700, Daniel Xu wrote:

I think these changes are fine in general, but see below.

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58b36cc96bd5..4947ef884a18 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -287,6 +287,7 @@ struct bpf_call_arg_meta {
>  	u32 ret_btf_id;
>  	u32 subprogno;
>  	struct btf_field *kptr_field;
> +	s64 const_map_key;
>  };
>  
>  struct bpf_kfunc_call_arg_meta {
> @@ -9163,6 +9164,53 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
>  	return 0;
>  }
>  
> +/* Returns constant key value if possible, else -1 */
> +static s64 get_constant_map_key(struct bpf_verifier_env *env,
> +				struct bpf_reg_state *key,
> +				u32 key_size)

I understand that this is not your use case, but maybe generalize this
a bit by checking maximal register value instead of a constant?

> +{
> +	struct bpf_func_state *state = func(env, key);
> +	struct bpf_reg_state *reg;
> +	int zero_size = 0;
> +	int stack_off;
> +	u8 *stype;
> +	int slot;
> +	int spi;
> +	int i;
> +
> +	if (!env->bpf_capable)
> +		return -1;
> +	if (key->type != PTR_TO_STACK)
> +		return -1;
> +	if (!tnum_is_const(key->var_off))
> +		return -1;
> +
> +	stack_off = key->off + key->var_off.value;
> +	slot = -stack_off - 1;
> +	spi = slot / BPF_REG_SIZE;
> +
> +	/* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> +	stype = state->stack[spi].slot_type;
> +	for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> +		zero_size++;
> +	if (zero_size == key_size)
> +		return 0;
> +
> +	if (!is_spilled_reg(&state->stack[spi]))
> +		/* Not pointer to stack */
> +		return -1;

Nit: there is a 'is_spilled_scalar_reg' utility function.

> +
> +	reg = &state->stack[spi].spilled_ptr;
> +	if (reg->type != SCALAR_VALUE)
> +		/* Only scalars are valid array map keys */
> +		return -1;
> +	else if (!tnum_is_const(reg->var_off))
> +		/* Stack value not statically known */
> +		return -1;

I think you need to check if size of the spill matches the size of the key.
The mismatch would be unsafe when spill size is smaller than key size.
E.g. consider 1-byte spill with mask 'mmmmmmrr' and a 4-byte key,
at runtime the 'mmmmmm' part might be non-zero, rendering key to be
out of range.

> +
> +	return reg->var_off.value;
> +}
> +
>  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>  			  struct bpf_call_arg_meta *meta,
>  			  const struct bpf_func_proto *fn,

[...]
Daniel Xu Dec. 13, 2024, 8:57 p.m. UTC | #2
On Thu, Dec 12, 2024 at 08:04:45PM GMT, Eduard Zingerman wrote:
> On Thu, 2024-12-12 at 16:22 -0700, Daniel Xu wrote:
> 
> I think these changes are fine in general, but see below.
> 
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 58b36cc96bd5..4947ef884a18 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -287,6 +287,7 @@ struct bpf_call_arg_meta {
> >  	u32 ret_btf_id;
> >  	u32 subprogno;
> >  	struct btf_field *kptr_field;
> > +	s64 const_map_key;
> >  };
> >  
> >  struct bpf_kfunc_call_arg_meta {
> > @@ -9163,6 +9164,53 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
> >  	return 0;
> >  }
> >  
> > +/* Returns constant key value if possible, else -1 */
> > +static s64 get_constant_map_key(struct bpf_verifier_env *env,
> > +				struct bpf_reg_state *key,
> > +				u32 key_size)
> 
> I understand that this is not your use case, but maybe generalize this
> a bit by checking maximal register value instead of a constant?

I'll check on this. If it works I think you're right - it allows more
flexibility while retaining safety. User could define max_entries to be
a power of two and then mask key with 0xFFFF.. to guarantee null
free codepaths.

> 
> > +{
> > +	struct bpf_func_state *state = func(env, key);
> > +	struct bpf_reg_state *reg;
> > +	int zero_size = 0;
> > +	int stack_off;
> > +	u8 *stype;
> > +	int slot;
> > +	int spi;
> > +	int i;
> > +
> > +	if (!env->bpf_capable)
> > +		return -1;
> > +	if (key->type != PTR_TO_STACK)
> > +		return -1;
> > +	if (!tnum_is_const(key->var_off))
> > +		return -1;
> > +
> > +	stack_off = key->off + key->var_off.value;
> > +	slot = -stack_off - 1;
> > +	spi = slot / BPF_REG_SIZE;
> > +
> > +	/* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> > +	stype = state->stack[spi].slot_type;
> > +	for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> > +		zero_size++;
> > +	if (zero_size == key_size)
> > +		return 0;
> > +
> > +	if (!is_spilled_reg(&state->stack[spi]))
> > +		/* Not pointer to stack */
> > +		return -1;
> 
> Nit: there is a 'is_spilled_scalar_reg' utility function.

Ack.

> 
> > +
> > +	reg = &state->stack[spi].spilled_ptr;
> > +	if (reg->type != SCALAR_VALUE)
> > +		/* Only scalars are valid array map keys */
> > +		return -1;
> > +	else if (!tnum_is_const(reg->var_off))
> > +		/* Stack value not statically known */
> > +		return -1;
> 
> I think you need to check if size of the spill matches the size of the key.
> The mismatch would be unsafe when spill size is smaller than key size.
> E.g. consider 1-byte spill with mask 'mmmmmmrr' and a 4-byte key,
> at runtime the 'mmmmmm' part might be non-zero, rendering key to be
> out of range.

Ah great catch. I think you're right.
Andrii Nakryiko Dec. 13, 2024, 11:02 p.m. UTC | #3
On Thu, Dec 12, 2024 at 3:23 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> This commit allows progs to elide a null check on statically known map
> lookup keys. In other words, if the verifier can statically prove that
> the lookup will be in-bounds, allow the prog to drop the null check.
>
> This is useful for two reasons:
>
> 1. Large numbers of nullness checks (especially when they cannot fail)
>    unnecessarily pushes prog towards BPF_COMPLEXITY_LIMIT_JMP_SEQ.
> 2. It forms a tighter contract between programmer and verifier.
>
> For (1), bpftrace is starting to make heavier use of percpu scratch
> maps. As a result, for user scripts with large number of unrolled loops,
> we are starting to hit jump complexity verification errors.  These
> percpu lookups cannot fail anyways, as we only use static key values.
> Eliding nullness probably results in less work for verifier as well.
>
> For (2), percpu scratch maps are often used as a larger stack, as the
> currrent stack is limited to 512 bytes. In these situations, it is
> desirable for the programmer to express: "this lookup should never fail,
> and if it does, it means I messed up the code". By omitting the null
> check, the programmer can "ask" the verifier to double check the logic.
>
> Tests also have to be updated in sync with these changes, as the
> verifier is more efficient with this change. Notable, iters.c tests had
> to be changed to use a map type that still requires null checks, as it's
> exercising verifier tracking logic w.r.t iterators.
>
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  kernel/bpf/verifier.c                         | 80 ++++++++++++++++++-
>  tools/testing/selftests/bpf/progs/iters.c     | 14 ++--
>  .../selftests/bpf/progs/map_kptr_fail.c       |  2 +-
>  .../selftests/bpf/progs/verifier_map_in_map.c |  2 +-
>  .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
>  5 files changed, 87 insertions(+), 13 deletions(-)
>

Eduard has great points. I've added a few more comments below.

pw-bot: cr

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58b36cc96bd5..4947ef884a18 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -287,6 +287,7 @@ struct bpf_call_arg_meta {
>         u32 ret_btf_id;
>         u32 subprogno;
>         struct btf_field *kptr_field;
> +       s64 const_map_key;
>  };
>
>  struct bpf_kfunc_call_arg_meta {
> @@ -9163,6 +9164,53 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
>         return 0;
>  }
>
> +/* Returns constant key value if possible, else -1 */
> +static s64 get_constant_map_key(struct bpf_verifier_env *env,
> +                               struct bpf_reg_state *key,
> +                               u32 key_size)
> +{
> +       struct bpf_func_state *state = func(env, key);
> +       struct bpf_reg_state *reg;
> +       int zero_size = 0;
> +       int stack_off;
> +       u8 *stype;
> +       int slot;
> +       int spi;
> +       int i;
> +
> +       if (!env->bpf_capable)
> +               return -1;
> +       if (key->type != PTR_TO_STACK)
> +               return -1;
> +       if (!tnum_is_const(key->var_off))
> +               return -1;
> +
> +       stack_off = key->off + key->var_off.value;
> +       slot = -stack_off - 1;
> +       spi = slot / BPF_REG_SIZE;
> +
> +       /* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> +       stype = state->stack[spi].slot_type;
> +       for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)

it's Friday and I'm lazy, but please double-check that this works for
both big-endian and little-endian :)

with Eduard's suggestion this also becomes interesting when you have
000mmm mix (as one example), because that gives you a small range, and
all values might be valid keys for arrays

> +               zero_size++;
> +       if (zero_size == key_size)
> +               return 0;
> +
> +       if (!is_spilled_reg(&state->stack[spi]))
> +               /* Not pointer to stack */

!is_spilled_reg and "Not pointer to stack" seem to be not exactly the
same things?

btw, we also have is_spilled_scalar_reg() which you can use here
instead of two separate checks?

> +               return -1;
> +
> +       reg = &state->stack[spi].spilled_ptr;
> +       if (reg->type != SCALAR_VALUE)
> +               /* Only scalars are valid array map keys */
> +               return -1;
> +       else if (!tnum_is_const(reg->var_off))
> +               /* Stack value not statically known */
> +               return -1;
> +
> +       return reg->var_off.value;
> +}
> +

[...]
Kumar Kartikeya Dwivedi Dec. 13, 2024, 11:10 p.m. UTC | #4
On Fri, 13 Dec 2024 at 00:24, Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> This commit allows progs to elide a null check on statically known map
> lookup keys. In other words, if the verifier can statically prove that
> the lookup will be in-bounds, allow the prog to drop the null check.
>
> This is useful for two reasons:
>
> 1. Large numbers of nullness checks (especially when they cannot fail)
>    unnecessarily pushes prog towards BPF_COMPLEXITY_LIMIT_JMP_SEQ.
> 2. It forms a tighter contract between programmer and verifier.
>
> For (1), bpftrace is starting to make heavier use of percpu scratch
> maps. As a result, for user scripts with large number of unrolled loops,
> we are starting to hit jump complexity verification errors.  These
> percpu lookups cannot fail anyways, as we only use static key values.
> Eliding nullness probably results in less work for verifier as well.
>
> For (2), percpu scratch maps are often used as a larger stack, as the
> currrent stack is limited to 512 bytes. In these situations, it is
> desirable for the programmer to express: "this lookup should never fail,
> and if it does, it means I messed up the code". By omitting the null
> check, the programmer can "ask" the verifier to double check the logic.
>
> Tests also have to be updated in sync with these changes, as the
> verifier is more efficient with this change. Notable, iters.c tests had
> to be changed to use a map type that still requires null checks, as it's
> exercising verifier tracking logic w.r.t iterators.
>
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  kernel/bpf/verifier.c                         | 80 ++++++++++++++++++-
>  tools/testing/selftests/bpf/progs/iters.c     | 14 ++--
>  .../selftests/bpf/progs/map_kptr_fail.c       |  2 +-
>  .../selftests/bpf/progs/verifier_map_in_map.c |  2 +-
>  .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
>  5 files changed, 87 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58b36cc96bd5..4947ef884a18 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -287,6 +287,7 @@ struct bpf_call_arg_meta {
>         u32 ret_btf_id;
>         u32 subprogno;
>         struct btf_field *kptr_field;
> +       s64 const_map_key;
>  };
>
>  struct bpf_kfunc_call_arg_meta {
> @@ -9163,6 +9164,53 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
>         return 0;
>  }
>
> +/* Returns constant key value if possible, else -1 */
> +static s64 get_constant_map_key(struct bpf_verifier_env *env,
> +                               struct bpf_reg_state *key,
> +                               u32 key_size)
> +{
> +       struct bpf_func_state *state = func(env, key);
> +       struct bpf_reg_state *reg;
> +       int zero_size = 0;
> +       int stack_off;
> +       u8 *stype;
> +       int slot;
> +       int spi;
> +       int i;
> +
> +       if (!env->bpf_capable)
> +               return -1;
> +       if (key->type != PTR_TO_STACK)
> +               return -1;
> +       if (!tnum_is_const(key->var_off))
> +               return -1;
> +
> +       stack_off = key->off + key->var_off.value;
> +       slot = -stack_off - 1;
> +       spi = slot / BPF_REG_SIZE;
> +
> +       /* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> +       stype = state->stack[spi].slot_type;
> +       for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> +               zero_size++;
> +       if (zero_size == key_size)
> +               return 0;
> +
> +       if (!is_spilled_reg(&state->stack[spi]))
> +               /* Not pointer to stack */
> +               return -1;
> +
> +       reg = &state->stack[spi].spilled_ptr;
> +       if (reg->type != SCALAR_VALUE)
> +               /* Only scalars are valid array map keys */
> +               return -1;
> +       else if (!tnum_is_const(reg->var_off))
> +               /* Stack value not statically known */
> +               return -1;
> +
> +       return reg->var_off.value;
> +}
> +
>  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>                           struct bpf_call_arg_meta *meta,
>                           const struct bpf_func_proto *fn,
> @@ -9173,6 +9221,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>         enum bpf_arg_type arg_type = fn->arg_type[arg];
>         enum bpf_reg_type type = reg->type;
>         u32 *arg_btf_id = NULL;
> +       u32 key_size;
>         int err = 0;
>         bool mask;
>
> @@ -9307,8 +9356,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>                         verbose(env, "invalid map_ptr to access map->key\n");
>                         return -EACCES;
>                 }
> -               err = check_helper_mem_access(env, regno, meta->map_ptr->key_size,
> -                                             BPF_READ, false, NULL);
> +               key_size = meta->map_ptr->key_size;
> +               err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
> +               if (err)
> +                       return err;
> +               meta->const_map_key = get_constant_map_key(env, reg, key_size);
>                 break;
>         case ARG_PTR_TO_MAP_VALUE:
>                 if (type_may_be_null(arg_type) && register_is_null(reg))
> @@ -10833,6 +10885,21 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
>                                  state->callback_subprogno == subprogno);
>  }
>
> +/* Returns whether or not the given map type can potentially elide
> + * lookup return value nullness check. This is possible if the key
> + * is statically known.
> + */
> +static bool can_elide_value_nullness(enum bpf_map_type type)
> +{
> +       switch (type) {
> +       case BPF_MAP_TYPE_ARRAY:
> +       case BPF_MAP_TYPE_PERCPU_ARRAY:
> +               return true;
> +       default:
> +               return false;
> +       }
> +}
> +
>  static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
>                             const struct bpf_func_proto **ptr)
>  {
> @@ -11199,10 +11266,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>                                 "kernel subsystem misconfigured verifier\n");
>                         return -EINVAL;
>                 }
> +
> +               if (func_id == BPF_FUNC_map_lookup_elem &&
> +                   can_elide_value_nullness(meta.map_ptr->map_type) &&
> +                   meta.const_map_key >= 0 &&
> +                   meta.const_map_key < meta.map_ptr->max_entries)
> +                       ret_flag &= ~PTR_MAYBE_NULL;

I think we probably need mark_chain_precision applied on the constant
key since its concrete value is made use of here to prevent pruning on
it. If it's already happening and I missed it, I think we should
atleast add a comment.

For context of a similar case with tail calls, see commit
cc52d9140aa9 ("bpf: Fix record_func_key to perform backtracking on r3")
for what happens when it is missed.

> +
>                 regs[BPF_REG_0].map_ptr = meta.map_ptr;
>                 regs[BPF_REG_0].map_uid = meta.map_uid;
>                 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
> -               if (!type_may_be_null(ret_type) &&
> +               if (!type_may_be_null(ret_flag) &&
>                     btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
>                         regs[BPF_REG_0].id = ++env->id_gen;
>                 }
> diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
> index 7c969c127573..190822b2f08b 100644
> --- a/tools/testing/selftests/bpf/progs/iters.c
> +++ b/tools/testing/selftests/bpf/progs/iters.c
> @@ -524,11 +524,11 @@ int iter_subprog_iters(const void *ctx)
>  }
>
>  struct {
> -       __uint(type, BPF_MAP_TYPE_ARRAY);
> +       __uint(type, BPF_MAP_TYPE_HASH);
>         __type(key, int);
>         __type(value, int);
>         __uint(max_entries, 1000);
> -} arr_map SEC(".maps");
> +} hash_map SEC(".maps");
>
>  SEC("?raw_tp")
>  __failure __msg("invalid mem access 'scalar'")
> @@ -539,7 +539,7 @@ int iter_err_too_permissive1(const void *ctx)
>
>         MY_PID_GUARD();
>
> -       map_val = bpf_map_lookup_elem(&arr_map, &key);
> +       map_val = bpf_map_lookup_elem(&hash_map, &key);
>         if (!map_val)
>                 return 0;
>
> @@ -561,12 +561,12 @@ int iter_err_too_permissive2(const void *ctx)
>
>         MY_PID_GUARD();
>
> -       map_val = bpf_map_lookup_elem(&arr_map, &key);
> +       map_val = bpf_map_lookup_elem(&hash_map, &key);
>         if (!map_val)
>                 return 0;
>
>         bpf_repeat(1000000) {
> -               map_val = bpf_map_lookup_elem(&arr_map, &key);
> +               map_val = bpf_map_lookup_elem(&hash_map, &key);
>         }
>
>         *map_val = 123;
> @@ -585,7 +585,7 @@ int iter_err_too_permissive3(const void *ctx)
>         MY_PID_GUARD();
>
>         bpf_repeat(1000000) {
> -               map_val = bpf_map_lookup_elem(&arr_map, &key);
> +               map_val = bpf_map_lookup_elem(&hash_map, &key);
>                 found = true;
>         }
>
> @@ -606,7 +606,7 @@ int iter_tricky_but_fine(const void *ctx)
>         MY_PID_GUARD();
>
>         bpf_repeat(1000000) {
> -               map_val = bpf_map_lookup_elem(&arr_map, &key);
> +               map_val = bpf_map_lookup_elem(&hash_map, &key);
>                 if (map_val) {
>                         found = true;
>                         break;
> diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
> index c2a6bd392e48..4c0ff01f1a96 100644
> --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
> +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
> @@ -345,7 +345,7 @@ int reject_indirect_global_func_access(struct __sk_buff *ctx)
>  }
>
>  SEC("?tc")
> -__failure __msg("Unreleased reference id=5 alloc_insn=")
> +__failure __msg("Unreleased reference id=4 alloc_insn=")
>  int kptr_xchg_ref_state(struct __sk_buff *ctx)
>  {
>         struct prog_test_ref_kfunc *p;
> diff --git a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
> index 4eaab1468eb7..7d088ba99ea5 100644
> --- a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
> +++ b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
> @@ -47,7 +47,7 @@ l0_%=:        r0 = 0;                                         \
>
>  SEC("xdp")
>  __description("map in map state pruning")
> -__success __msg("processed 26 insns")
> +__success __msg("processed 15 insns")
>  __log_level(2) __retval(0) __flag(BPF_F_TEST_STATE_FREQ)
>  __naked void map_in_map_state_pruning(void)
>  {
> diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c
> index f420c0312aa0..4b39f8472f9b 100644
> --- a/tools/testing/selftests/bpf/verifier/map_kptr.c
> +++ b/tools/testing/selftests/bpf/verifier/map_kptr.c
> @@ -373,7 +373,7 @@
>         .prog_type = BPF_PROG_TYPE_SCHED_CLS,
>         .fixup_map_kptr = { 1 },
>         .result = REJECT,
> -       .errstr = "Unreleased reference id=5 alloc_insn=20",
> +       .errstr = "Unreleased reference id=4 alloc_insn=20",
>         .fixup_kfunc_btf_id = {
>                 { "bpf_kfunc_call_test_acquire", 15 },
>         }
> --
> 2.46.0
>
>
Eduard Zingerman Dec. 13, 2024, 11:14 p.m. UTC | #5
On Sat, 2024-12-14 at 00:10 +0100, Kumar Kartikeya Dwivedi wrote:

[...]

> > @@ -11199,10 +11266,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
> >                                 "kernel subsystem misconfigured verifier\n");
> >                         return -EINVAL;
> >                 }
> > +
> > +               if (func_id == BPF_FUNC_map_lookup_elem &&
> > +                   can_elide_value_nullness(meta.map_ptr->map_type) &&
> > +                   meta.const_map_key >= 0 &&
> > +                   meta.const_map_key < meta.map_ptr->max_entries)
> > +                       ret_flag &= ~PTR_MAYBE_NULL;
> 
> I think we probably need mark_chain_precision applied on the constant
> key since its concrete value is made use of here to prevent pruning on
> it. If it's already happening and I missed it, I think we should
> atleast add a comment.
> 
> For context of a similar case with tail calls, see commit
> cc52d9140aa9 ("bpf: Fix record_func_key to perform backtracking on r3")
> for what happens when it is missed.

Great point, I'm sure this does not happen.

[...]
Eduard Zingerman Dec. 13, 2024, 11:18 p.m. UTC | #6
On Fri, 2024-12-13 at 15:14 -0800, Eduard Zingerman wrote:

[...]

> Great point, I'm sure this does not happen.

I mean, mark_chain_precision() does not happen at the moment.
Daniel Xu Dec. 14, 2024, 2:44 a.m. UTC | #7
On Fri, Dec 13, 2024 at 03:02:11PM GMT, Andrii Nakryiko wrote:
> On Thu, Dec 12, 2024 at 3:23 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> > This commit allows progs to elide a null check on statically known map
> > lookup keys. In other words, if the verifier can statically prove that
> > the lookup will be in-bounds, allow the prog to drop the null check.
> >
> > This is useful for two reasons:
> >
> > 1. Large numbers of nullness checks (especially when they cannot fail)
> >    unnecessarily pushes prog towards BPF_COMPLEXITY_LIMIT_JMP_SEQ.
> > 2. It forms a tighter contract between programmer and verifier.
> >
> > For (1), bpftrace is starting to make heavier use of percpu scratch
> > maps. As a result, for user scripts with large number of unrolled loops,
> > we are starting to hit jump complexity verification errors.  These
> > percpu lookups cannot fail anyways, as we only use static key values.
> > Eliding nullness probably results in less work for verifier as well.
> >
> > For (2), percpu scratch maps are often used as a larger stack, as the
> > currrent stack is limited to 512 bytes. In these situations, it is
> > desirable for the programmer to express: "this lookup should never fail,
> > and if it does, it means I messed up the code". By omitting the null
> > check, the programmer can "ask" the verifier to double check the logic.
> >
> > Tests also have to be updated in sync with these changes, as the
> > verifier is more efficient with this change. Notable, iters.c tests had
> > to be changed to use a map type that still requires null checks, as it's
> > exercising verifier tracking logic w.r.t iterators.
> >
> > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > ---
> >  kernel/bpf/verifier.c                         | 80 ++++++++++++++++++-
> >  tools/testing/selftests/bpf/progs/iters.c     | 14 ++--
> >  .../selftests/bpf/progs/map_kptr_fail.c       |  2 +-
> >  .../selftests/bpf/progs/verifier_map_in_map.c |  2 +-
> >  .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
> >  5 files changed, 87 insertions(+), 13 deletions(-)
> >
> 
> Eduard has great points. I've added a few more comments below.
> 
> pw-bot: cr
> 
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 58b36cc96bd5..4947ef884a18 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -287,6 +287,7 @@ struct bpf_call_arg_meta {
> >         u32 ret_btf_id;
> >         u32 subprogno;
> >         struct btf_field *kptr_field;
> > +       s64 const_map_key;
> >  };
> >
> >  struct bpf_kfunc_call_arg_meta {
> > @@ -9163,6 +9164,53 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
> >         return 0;
> >  }
> >
> > +/* Returns constant key value if possible, else -1 */
> > +static s64 get_constant_map_key(struct bpf_verifier_env *env,
> > +                               struct bpf_reg_state *key,
> > +                               u32 key_size)
> > +{
> > +       struct bpf_func_state *state = func(env, key);
> > +       struct bpf_reg_state *reg;
> > +       int zero_size = 0;
> > +       int stack_off;
> > +       u8 *stype;
> > +       int slot;
> > +       int spi;
> > +       int i;
> > +
> > +       if (!env->bpf_capable)
> > +               return -1;
> > +       if (key->type != PTR_TO_STACK)
> > +               return -1;
> > +       if (!tnum_is_const(key->var_off))
> > +               return -1;
> > +
> > +       stack_off = key->off + key->var_off.value;
> > +       slot = -stack_off - 1;
> > +       spi = slot / BPF_REG_SIZE;
> > +
> > +       /* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> > +       stype = state->stack[spi].slot_type;
> > +       for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> 
> it's Friday and I'm lazy, but please double-check that this works for
> both big-endian and little-endian :)

Any tips? Are the existing tests running thru s390x hosts in CI
sufficient or should I add some tests writen in C (and not BPF
assembler)? I can never think about endianness correctly...

> 
> with Eduard's suggestion this also becomes interesting when you have
> 000mmm mix (as one example), because that gives you a small range, and
> all values might be valid keys for arrays

Can you define what "small range" means? What range is there with 0's?
Any pointers would be helpful.


> 
> > +               zero_size++;
> > +       if (zero_size == key_size)
> > +               return 0;
> > +
> > +       if (!is_spilled_reg(&state->stack[spi]))
> > +               /* Not pointer to stack */
> 
> !is_spilled_reg and "Not pointer to stack" seem to be not exactly the
> same things?

You're right - comment is not helpful. I'll make the change to
use is_spilled_scalar_reg() which is probably as clear as it gets.

[..]
Eduard Zingerman Dec. 14, 2024, 3:13 a.m. UTC | #8
On Fri, 2024-12-13 at 19:44 -0700, Daniel Xu wrote:

[...]

> > > +       /* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> > > +       stype = state->stack[spi].slot_type;
> > > +       for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> > 
> > it's Friday and I'm lazy, but please double-check that this works for
> > both big-endian and little-endian :)
> 
> Any tips? Are the existing tests running thru s390x hosts in CI
> sufficient or should I add some tests written in C (and not BPF
> assembler)? I can never think about endianness correctly...

I think that if test operates on a key like:

      valid key 15
             v
      0000000f   <-- written to stack as a single u64 value
      ^^^^^^^
    stack zero marks

and is executed (e.g. using __retval annotation),
then CI passing for s390 should be enough.

There is a guide on how to gen a s390 environment locally:
https://docs.kernel.org/bpf/s390.html
I used it recently to build a vmlinux for s390 with no or minimal
issues. Used it to boot long time ago, but don't remember if there
were any surprises.

> > with Eduard's suggestion this also becomes interesting when you have
> > 000mmm mix (as one example), because that gives you a small range, and
> > all values might be valid keys for arrays
> 
> Can you define what "small range" means? What range is there with 0's?
> Any pointers would be helpful.

I think Andrii means that each 'm' adds 8 bits of range.
E.g. range for 0000_000m is 0-255, range for 0000_00mm is 0-65535, etc.

[...]
Andrii Nakryiko Dec. 16, 2024, 11:24 p.m. UTC | #9
On Fri, Dec 13, 2024 at 7:13 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Fri, 2024-12-13 at 19:44 -0700, Daniel Xu wrote:
>
> [...]
>
> > > > +       /* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> > > > +       stype = state->stack[spi].slot_type;
> > > > +       for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> > >
> > > it's Friday and I'm lazy, but please double-check that this works for
> > > both big-endian and little-endian :)
> >
> > Any tips? Are the existing tests running thru s390x hosts in CI
> > sufficient or should I add some tests written in C (and not BPF
> > assembler)? I can never think about endianness correctly...
>
> I think that if test operates on a key like:
>
>       valid key 15
>              v
>       0000000f   <-- written to stack as a single u64 value
>       ^^^^^^^
>     stack zero marks
>
> and is executed (e.g. using __retval annotation),
> then CI passing for s390 should be enough.

+1, something like that where for big-endian it will be all zero while
for little endian it would be 0xf (and then make sure that the test
should *fail* by making sure that 0xf is not a valid index, so NULL
check is necessary)

>
> There is a guide on how to gen a s390 environment locally:
> https://docs.kernel.org/bpf/s390.html
> I used it recently to build a vmlinux for s390 with no or minimal
> issues. Used it to boot long time ago, but don't remember if there
> were any surprises.
>
> > > with Eduard's suggestion this also becomes interesting when you have
> > > 000mmm mix (as one example), because that gives you a small range, and
> > > all values might be valid keys for arrays
> >
> > Can you define what "small range" means? What range is there with 0's?
> > Any pointers would be helpful.
>
> I think Andrii means that each 'm' adds 8 bits of range.
> E.g. range for 0000_000m is 0-255, range for 0000_00mm is 0-65535, etc.

yes, exactly, thank you, Eduard!

>
> [...]
>
Daniel Xu Dec. 19, 2024, 12:09 a.m. UTC | #10
On Mon, Dec 16, 2024 at 03:24:01PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 13, 2024 at 7:13 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >
> > On Fri, 2024-12-13 at 19:44 -0700, Daniel Xu wrote:
> >

[...]

> >
> > > > with Eduard's suggestion this also becomes interesting when you have
> > > > 000mmm mix (as one example), because that gives you a small range, and
> > > > all values might be valid keys for arrays
> > >
> > > Can you define what "small range" means? What range is there with 0's?
> > > Any pointers would be helpful.
> >
> > I think Andrii means that each 'm' adds 8 bits of range.
> > E.g. range for 0000_000m is 0-255, range for 0000_00mm is 0-65535, etc.
> 
> yes, exactly, thank you, Eduard!

Gave it some thought. Still seems like a good idea, but I'd prefer to
leave this extension for a separate patchset. Mostly b/c I'm running out
of space in my head to grok everything :P. Probably higher likelihood of
me getting the existing stuff correct if I don't add more scope.

Thanks,
Daniel
Daniel Xu Dec. 19, 2024, 9:41 p.m. UTC | #11
On Mon, Dec 16, 2024 at 03:24:01PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 13, 2024 at 7:13 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >
> > On Fri, 2024-12-13 at 19:44 -0700, Daniel Xu wrote:
> >
> > [...]
> >
> > > > > +       /* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
> > > > > +       stype = state->stack[spi].slot_type;
> > > > > +       for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
> > > >
> > > > it's Friday and I'm lazy, but please double-check that this works for
> > > > both big-endian and little-endian :)
> > >
> > > Any tips? Are the existing tests running thru s390x hosts in CI
> > > sufficient or should I add some tests written in C (and not BPF
> > > assembler)? I can never think about endianness correctly...
> >
> > I think that if test operates on a key like:
> >
> >       valid key 15
> >              v
> >       0000000f   <-- written to stack as a single u64 value
> >       ^^^^^^^
> >     stack zero marks
> >
> > and is executed (e.g. using __retval annotation),
> > then CI passing for s390 should be enough.
> 
> +1, something like that where for big-endian it will be all zero while
> for little endian it would be 0xf (and then make sure that the test
> should *fail* by making sure that 0xf is not a valid index, so NULL
> check is necessary)

How would it work for LE to be 0xF but BE to be 0x0?

The prog passes a pointer to the beginning of the u32 to
bpf_map_lookup_elem(). The kernel does a 4 byte read starting from that
address. On both BE and LE all 4 bytes will be interpreted. So set bits
cannot just go away.

Am I missing something?

Thanks,
Daniel
Eduard Zingerman Dec. 20, 2024, 12:04 a.m. UTC | #12
On Thu, 2024-12-19 at 14:41 -0700, Daniel Xu wrote:

[...]

> > > I think that if test operates on a key like:
> > > 
> > >       valid key 15
> > >              v
> > >       0000000f   <-- written to stack as a single u64 value
> > >       ^^^^^^^
> > >     stack zero marks
> > > 
> > > and is executed (e.g. using __retval annotation),
> > > then CI passing for s390 should be enough.
> > 
> > +1, something like that where for big-endian it will be all zero while
> > for little endian it would be 0xf (and then make sure that the test
> > should *fail* by making sure that 0xf is not a valid index, so NULL
> > check is necessary)
> 
> How would it work for LE to be 0xF but BE to be 0x0?
> 
> The prog passes a pointer to the beginning of the u32 to
> bpf_map_lookup_elem(). The kernel does a 4 byte read starting from that
> address. On both BE and LE all 4 bytes will be interpreted. So set bits
> cannot just go away.
> 
> Am I missing something?

Ok, thinking a bit more, the best test I can come up with is:

  u8 vals[8];
  vals[0] = 0;
  ...
  vals[6] = 0;
  vals[7] = 0xf;
  p = bpf_map_lookup_elem(... vals ...);
  *p = 42;

For LE vals as u32 should be 0x0f;
For BE vals as u32 should be 0xf000_0000.
Hence, it is not safe to remove null check for this program.
What would verifier think about the value of such key?
As far as I understand, there would be stack zero for vals[0-6]
and u8 stack spill for vals[7].
You were going to add a check for the spill size, which should help here.
So, a negative test like above that checks that verifier complains
that 'p' should be checked for nullness first?

If anyone has better test in mind, please speak-up.

[...]
Daniel Xu Dec. 20, 2024, 12:40 a.m. UTC | #13
On Thu, Dec 19, 2024 at 04:04:43PM -0800, Eduard Zingerman wrote:
> On Thu, 2024-12-19 at 14:41 -0700, Daniel Xu wrote:
> 
> [...]
> 
> > > > I think that if test operates on a key like:
> > > > 
> > > >       valid key 15
> > > >              v
> > > >       0000000f   <-- written to stack as a single u64 value
> > > >       ^^^^^^^
> > > >     stack zero marks
> > > > 
> > > > and is executed (e.g. using __retval annotation),
> > > > then CI passing for s390 should be enough.
> > > 
> > > +1, something like that where for big-endian it will be all zero while
> > > for little endian it would be 0xf (and then make sure that the test
> > > should *fail* by making sure that 0xf is not a valid index, so NULL
> > > check is necessary)
> > 
> > How would it work for LE to be 0xF but BE to be 0x0?
> > 
> > The prog passes a pointer to the beginning of the u32 to
> > bpf_map_lookup_elem(). The kernel does a 4 byte read starting from that
> > address. On both BE and LE all 4 bytes will be interpreted. So set bits
> > cannot just go away.
> > 
> > Am I missing something?
> 
> Ok, thinking a bit more, the best test I can come up with is:
> 
>   u8 vals[8];
>   vals[0] = 0;
>   ...
>   vals[6] = 0;
>   vals[7] = 0xf;
>   p = bpf_map_lookup_elem(... vals ...);
>   *p = 42;
> 
> For LE vals as u32 should be 0x0f;
> For BE vals as u32 should be 0xf000_0000.
> Hence, it is not safe to remove null check for this program.
> What would verifier think about the value of such key?
> As far as I understand, there would be stack zero for vals[0-6]
> and u8 stack spill for vals[7].

Right. By checking that spill size is same as key size, we stay endian
neutral, as constant values are tracked in native endianness.

However, if we were to start interpreting combinations of STACK_ZERO,
STACK_MISC, and STACK_SPILL, the verifier would have to be endian aware
(IIUC). Which makes it a somewhat interesting problem but also requires
some thought to correctly handle the state space.

> You were going to add a check for the spill size, which should help here.
> So, a negative test like above that checks that verifier complains
> that 'p' should be checked for nullness first?
> 
> If anyone has better test in mind, please speak-up.

I think this case reduces down to a spill_size != key_size test. As long
as the sizes match, we don't have to worry about endianness.

Thanks,
Daniel
Eduard Zingerman Dec. 20, 2024, 12:43 a.m. UTC | #14
On Thu, 2024-12-19 at 17:40 -0700, Daniel Xu wrote:

[...]

> > Ok, thinking a bit more, the best test I can come up with is:
> > 
> >   u8 vals[8];
> >   vals[0] = 0;
> >   ...
> >   vals[6] = 0;
> >   vals[7] = 0xf;
> >   p = bpf_map_lookup_elem(... vals ...);
> >   *p = 42;
> > 
> > For LE vals as u32 should be 0x0f;
> > For BE vals as u32 should be 0xf000_0000.
> > Hence, it is not safe to remove null check for this program.
> > What would verifier think about the value of such key?
> > As far as I understand, there would be stack zero for vals[0-6]
> > and u8 stack spill for vals[7].
> 
> Right. By checking that spill size is same as key size, we stay endian
> neutral, as constant values are tracked in native endianness.
> 
> However, if we were to start interpreting combinations of STACK_ZERO,
> STACK_MISC, and STACK_SPILL, the verifier would have to be endian aware
> (IIUC). Which makes it a somewhat interesting problem but also requires
> some thought to correctly handle the state space.

Right.

> > You were going to add a check for the spill size, which should help here.
> > So, a negative test like above that checks that verifier complains
> > that 'p' should be checked for nullness first?
> > 
> > If anyone has better test in mind, please speak-up.
> 
> I think this case reduces down to a spill_size != key_size test. As long
> as the sizes match, we don't have to worry about endianness.

Agree.
Alexei Starovoitov Dec. 20, 2024, 12:49 a.m. UTC | #15
On Thu, Dec 19, 2024 at 4:43 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Thu, 2024-12-19 at 17:40 -0700, Daniel Xu wrote:
>
> [...]
>
> > > Ok, thinking a bit more, the best test I can come up with is:
> > >
> > >   u8 vals[8];
> > >   vals[0] = 0;
> > >   ...
> > >   vals[6] = 0;
> > >   vals[7] = 0xf;
> > >   p = bpf_map_lookup_elem(... vals ...);
> > >   *p = 42;
> > >
> > > For LE vals as u32 should be 0x0f;
> > > For BE vals as u32 should be 0xf000_0000.
> > > Hence, it is not safe to remove null check for this program.
> > > What would verifier think about the value of such key?
> > > As far as I understand, there would be stack zero for vals[0-6]
> > > and u8 stack spill for vals[7].
> >
> > Right. By checking that spill size is same as key size, we stay endian
> > neutral, as constant values are tracked in native endianness.
> >
> > However, if we were to start interpreting combinations of STACK_ZERO,
> > STACK_MISC, and STACK_SPILL, the verifier would have to be endian aware
> > (IIUC). Which makes it a somewhat interesting problem but also requires
> > some thought to correctly handle the state space.
>
> Right.
>
> > > You were going to add a check for the spill size, which should help here.
> > > So, a negative test like above that checks that verifier complains
> > > that 'p' should be checked for nullness first?
> > >
> > > If anyone has better test in mind, please speak-up.
> >
> > I think this case reduces down to a spill_size != key_size test. As long
> > as the sizes match, we don't have to worry about endianness.
>
> Agree.

Earlier I suggested to generalize this zero/misc/spill counting
into a helper and reuse here and in check_stack_read_fixed_off().

We do very similar checks there with a similar purpose.

It sounds there are ideas to make this particular feature smarter
than what we have in check_stack_read_fixed_off().
Let's not overdo it.
Even if a common helper is not possible, keep things consistent.
The simpler the better.
Daniel Xu Dec. 20, 2024, 4 a.m. UTC | #16
On Thu, Dec 19, 2024 at 04:49:13PM -0800, Alexei Starovoitov wrote:
> On Thu, Dec 19, 2024 at 4:43 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >
> > On Thu, 2024-12-19 at 17:40 -0700, Daniel Xu wrote:
> >
> > [...]
> >
> > > > Ok, thinking a bit more, the best test I can come up with is:
> > > >
> > > >   u8 vals[8];
> > > >   vals[0] = 0;
> > > >   ...
> > > >   vals[6] = 0;
> > > >   vals[7] = 0xf;
> > > >   p = bpf_map_lookup_elem(... vals ...);
> > > >   *p = 42;
> > > >
> > > > For LE vals as u32 should be 0x0f;
> > > > For BE vals as u32 should be 0xf000_0000.
> > > > Hence, it is not safe to remove null check for this program.
> > > > What would verifier think about the value of such key?
> > > > As far as I understand, there would be stack zero for vals[0-6]
> > > > and u8 stack spill for vals[7].
> > >
> > > Right. By checking that spill size is same as key size, we stay endian
> > > neutral, as constant values are tracked in native endianness.
> > >
> > > However, if we were to start interpreting combinations of STACK_ZERO,
> > > STACK_MISC, and STACK_SPILL, the verifier would have to be endian aware
> > > (IIUC). Which makes it a somewhat interesting problem but also requires
> > > some thought to correctly handle the state space.
> >
> > Right.
> >
> > > > You were going to add a check for the spill size, which should help here.
> > > > So, a negative test like above that checks that verifier complains
> > > > that 'p' should be checked for nullness first?
> > > >
> > > > If anyone has better test in mind, please speak-up.
> > >
> > > I think this case reduces down to a spill_size != key_size test. As long
> > > as the sizes match, we don't have to worry about endianness.
> >
> > Agree.
> 
> Earlier I suggested to generalize this zero/misc/spill counting
> into a helper and reuse here and in check_stack_read_fixed_off().
> 
> We do very similar checks there with a similar purpose.

Looked again, didn't see any obvious way to share code that doesn't make
it more confusing. Let me post v6 without this particular refactor. If I
missed something I'll fix it up in v7.

> 
> It sounds there are ideas to make this particular feature smarter
> than what we have in check_stack_read_fixed_off().
> Let's not overdo it.
> Even if a common helper is not possible, keep things consistent.
> The simpler the better.

Fair enough. We can keep it simple. 

Thanks,
Daniel
diff mbox series

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58b36cc96bd5..4947ef884a18 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -287,6 +287,7 @@  struct bpf_call_arg_meta {
 	u32 ret_btf_id;
 	u32 subprogno;
 	struct btf_field *kptr_field;
+	s64 const_map_key;
 };
 
 struct bpf_kfunc_call_arg_meta {
@@ -9163,6 +9164,53 @@  static int check_reg_const_str(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* Returns constant key value if possible, else -1 */
+static s64 get_constant_map_key(struct bpf_verifier_env *env,
+				struct bpf_reg_state *key,
+				u32 key_size)
+{
+	struct bpf_func_state *state = func(env, key);
+	struct bpf_reg_state *reg;
+	int zero_size = 0;
+	int stack_off;
+	u8 *stype;
+	int slot;
+	int spi;
+	int i;
+
+	if (!env->bpf_capable)
+		return -1;
+	if (key->type != PTR_TO_STACK)
+		return -1;
+	if (!tnum_is_const(key->var_off))
+		return -1;
+
+	stack_off = key->off + key->var_off.value;
+	slot = -stack_off - 1;
+	spi = slot / BPF_REG_SIZE;
+
+	/* First handle precisely tracked STACK_ZERO, up to BPF_REG_SIZE */
+	stype = state->stack[spi].slot_type;
+	for (i = 0; i < BPF_REG_SIZE && stype[i] == STACK_ZERO; i++)
+		zero_size++;
+	if (zero_size == key_size)
+		return 0;
+
+	if (!is_spilled_reg(&state->stack[spi]))
+		/* Not pointer to stack */
+		return -1;
+
+	reg = &state->stack[spi].spilled_ptr;
+	if (reg->type != SCALAR_VALUE)
+		/* Only scalars are valid array map keys */
+		return -1;
+	else if (!tnum_is_const(reg->var_off))
+		/* Stack value not statically known */
+		return -1;
+
+	return reg->var_off.value;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
 			  const struct bpf_func_proto *fn,
@@ -9173,6 +9221,7 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	enum bpf_arg_type arg_type = fn->arg_type[arg];
 	enum bpf_reg_type type = reg->type;
 	u32 *arg_btf_id = NULL;
+	u32 key_size;
 	int err = 0;
 	bool mask;
 
@@ -9307,8 +9356,11 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			verbose(env, "invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		err = check_helper_mem_access(env, regno, meta->map_ptr->key_size,
-					      BPF_READ, false, NULL);
+		key_size = meta->map_ptr->key_size;
+		err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+		if (err)
+			return err;
+		meta->const_map_key = get_constant_map_key(env, reg, key_size);
 		break;
 	case ARG_PTR_TO_MAP_VALUE:
 		if (type_may_be_null(arg_type) && register_is_null(reg))
@@ -10833,6 +10885,21 @@  static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
 				 state->callback_subprogno == subprogno);
 }
 
+/* Returns whether or not the given map type can potentially elide
+ * lookup return value nullness check. This is possible if the key
+ * is statically known.
+ */
+static bool can_elide_value_nullness(enum bpf_map_type type)
+{
+	switch (type) {
+	case BPF_MAP_TYPE_ARRAY:
+	case BPF_MAP_TYPE_PERCPU_ARRAY:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
 			    const struct bpf_func_proto **ptr)
 {
@@ -11199,10 +11266,17 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				"kernel subsystem misconfigured verifier\n");
 			return -EINVAL;
 		}
+
+		if (func_id == BPF_FUNC_map_lookup_elem &&
+		    can_elide_value_nullness(meta.map_ptr->map_type) &&
+		    meta.const_map_key >= 0 &&
+		    meta.const_map_key < meta.map_ptr->max_entries)
+			ret_flag &= ~PTR_MAYBE_NULL;
+
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].map_uid = meta.map_uid;
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
-		if (!type_may_be_null(ret_type) &&
+		if (!type_may_be_null(ret_flag) &&
 		    btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
 			regs[BPF_REG_0].id = ++env->id_gen;
 		}
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 7c969c127573..190822b2f08b 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -524,11 +524,11 @@  int iter_subprog_iters(const void *ctx)
 }
 
 struct {
-	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(type, BPF_MAP_TYPE_HASH);
 	__type(key, int);
 	__type(value, int);
 	__uint(max_entries, 1000);
-} arr_map SEC(".maps");
+} hash_map SEC(".maps");
 
 SEC("?raw_tp")
 __failure __msg("invalid mem access 'scalar'")
@@ -539,7 +539,7 @@  int iter_err_too_permissive1(const void *ctx)
 
 	MY_PID_GUARD();
 
-	map_val = bpf_map_lookup_elem(&arr_map, &key);
+	map_val = bpf_map_lookup_elem(&hash_map, &key);
 	if (!map_val)
 		return 0;
 
@@ -561,12 +561,12 @@  int iter_err_too_permissive2(const void *ctx)
 
 	MY_PID_GUARD();
 
-	map_val = bpf_map_lookup_elem(&arr_map, &key);
+	map_val = bpf_map_lookup_elem(&hash_map, &key);
 	if (!map_val)
 		return 0;
 
 	bpf_repeat(1000000) {
-		map_val = bpf_map_lookup_elem(&arr_map, &key);
+		map_val = bpf_map_lookup_elem(&hash_map, &key);
 	}
 
 	*map_val = 123;
@@ -585,7 +585,7 @@  int iter_err_too_permissive3(const void *ctx)
 	MY_PID_GUARD();
 
 	bpf_repeat(1000000) {
-		map_val = bpf_map_lookup_elem(&arr_map, &key);
+		map_val = bpf_map_lookup_elem(&hash_map, &key);
 		found = true;
 	}
 
@@ -606,7 +606,7 @@  int iter_tricky_but_fine(const void *ctx)
 	MY_PID_GUARD();
 
 	bpf_repeat(1000000) {
-		map_val = bpf_map_lookup_elem(&arr_map, &key);
+		map_val = bpf_map_lookup_elem(&hash_map, &key);
 		if (map_val) {
 			found = true;
 			break;
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
index c2a6bd392e48..4c0ff01f1a96 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
@@ -345,7 +345,7 @@  int reject_indirect_global_func_access(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("Unreleased reference id=5 alloc_insn=")
+__failure __msg("Unreleased reference id=4 alloc_insn=")
 int kptr_xchg_ref_state(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *p;
diff --git a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
index 4eaab1468eb7..7d088ba99ea5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
+++ b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
@@ -47,7 +47,7 @@  l0_%=:	r0 = 0;						\
 
 SEC("xdp")
 __description("map in map state pruning")
-__success __msg("processed 26 insns")
+__success __msg("processed 15 insns")
 __log_level(2) __retval(0) __flag(BPF_F_TEST_STATE_FREQ)
 __naked void map_in_map_state_pruning(void)
 {
diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c
index f420c0312aa0..4b39f8472f9b 100644
--- a/tools/testing/selftests/bpf/verifier/map_kptr.c
+++ b/tools/testing/selftests/bpf/verifier/map_kptr.c
@@ -373,7 +373,7 @@ 
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.fixup_map_kptr = { 1 },
 	.result = REJECT,
-	.errstr = "Unreleased reference id=5 alloc_insn=20",
+	.errstr = "Unreleased reference id=4 alloc_insn=20",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_acquire", 15 },
 	}