[bpf-next,v1] bpf: Enable non-atomic allocations in local storage

Message ID 20220312011643.2136370-1-joannekoong@fb.com (mailing list archive)
State Superseded
Delegated to: BPF

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next success VM_Test
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 94 this patch: 96
netdev/cc_maintainers warning 8 maintainers not CCed: netdev@vger.kernel.org songliubraving@fb.com davem@davemloft.net yhs@fb.com john.fastabend@gmail.com kuba@kernel.org haoluo@google.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 30 this patch: 30
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 99 this patch: 101
netdev/checkpatch warning CHECK: multiple assignments should be avoided WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Joanne Koong March 12, 2022, 1:16 a.m. UTC
From: Joanne Koong <joannelkoong@gmail.com>

Currently, local storage memory can only be allocated atomically
(GFP_ATOMIC). This restriction is too strict for sleepable bpf
programs.

In this patch, the verifier detects whether the program is sleepable,
and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a
5th argument to bpf_task/sk/inode_storage_get. This flag will propagate
down to the local storage functions that allocate memory.
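
For example, a sleepable program can now trigger a GFP_KERNEL allocation
through these helpers. A minimal sketch (illustrative only; the map and
program names below are made up and not part of this patch):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} task_map SEC(".maps");

SEC("lsm.s/file_open")	/* ".s" marks the program sleepable */
int BPF_PROG(file_open, struct file *file)
{
	__u64 *val;

	/* With this patch, the verifier rewrites this call to pass
	 * GFP_KERNEL as the hidden 5th argument, so the allocation
	 * may sleep.
	 */
	val = bpf_task_storage_get(&task_map, bpf_get_current_task_btf(), 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (val)
		__sync_fetch_and_add(val, 1);
	return 0;
}

char _license[] SEC("license") = "GPL";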

A few things to note:
* bpf_task/sk/inode_storage_update_elem functions are invoked by
userspace applications through syscalls. Preemption is disabled before
bpf_task/sk/inode_storage_update_elem is called, which means they will
always have to allocate memory atomically.

* In bpf_local_storage_update, we have to do the memory charging and
allocation outside the spinlock. There are some cases where it is
permissible for the memory charging and/or allocation to fail, so in
these failure cases, we continue on to determine at a later time whether
the failure is relevant.
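
In outline, the reworked bpf_local_storage_update() path looks roughly like
this (a condensed sketch of the hunks in this patch; the recheck under the
lock and the unlock_err details are elided):

	charge_err = mem_charge(smap, owner, smap->elem_size);	/* may fail */
	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
				mem_flags | __GFP_NOWARN);	/* may fail, may sleep */

	raw_spin_lock_irqsave(&local_storage->lock, flags);
	/* ... recheck old_sdata under the lock ... */
	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		/* in-place swap: neither the charge nor the allocation
		 * was needed
		 */
		copy_map_value_locked(&smap->map, old_sdata->data, value, false);
		raw_spin_unlock_irqrestore(&local_storage->lock, flags);
		if (!charge_err)
			mem_uncharge(smap, owner, smap->elem_size);
		kfree(selem);
		return old_sdata;
	}
	if (!old_sdata && charge_err) {
		/* no existing element whose charge can be reused, so the
		 * charge failure matters
		 */
		err = charge_err;
		goto unlock_err;
	}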

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 include/linux/bpf_local_storage.h |  7 +--
 kernel/bpf/bpf_inode_storage.c    |  9 ++--
 kernel/bpf/bpf_local_storage.c    | 77 +++++++++++++++++++++----------
 kernel/bpf/bpf_task_storage.c     | 10 ++--
 kernel/bpf/verifier.c             | 20 ++++++++
 net/core/bpf_sk_storage.c         | 21 +++++----
 6 files changed, 99 insertions(+), 45 deletions(-)

Comments

KP Singh March 15, 2022, 2:26 a.m. UTC | #1
On Sat, Mar 12, 2022 at 2:17 AM Joanne Koong <joannekoong@fb.com> wrote:
>
> From: Joanne Koong <joannelkoong@gmail.com>
>
> Currently, local storage memory can only be allocated atomically
> (GFP_ATOMIC). This restriction is too strict for sleepable bpf
> programs.
>
> In this patch, the verifier detects whether the program is sleepable,
> and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a
> 5th argument to bpf_task/sk/inode_storage_get. This flag will propagate
> down to the local storage functions that allocate memory.
>
> A few things to note:
> * bpf_task/sk/inode_storage_update_elem functions are invoked by
> userspace applications through syscalls. Preemption is disabled before
> bpf_task/sk/inode_storage_update_elem is called, which means they will
> always have to allocate memory atomically.
>
> * In bpf_local_storage_update, we have to do the memory charging and
> allocation outside the spinlock. There are some cases where it is
> permissible for the memory charging and/or allocation to fail, so in
> these failure cases, we continue on to determine at a later time whether
> the failure is relevant.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  include/linux/bpf_local_storage.h |  7 +--
>  kernel/bpf/bpf_inode_storage.c    |  9 ++--
>  kernel/bpf/bpf_local_storage.c    | 77 +++++++++++++++++++++----------
>  kernel/bpf/bpf_task_storage.c     | 10 ++--
>  kernel/bpf/verifier.c             | 20 ++++++++
>  net/core/bpf_sk_storage.c         | 21 +++++----
>  6 files changed, 99 insertions(+), 45 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 37b3906af8b1..d6905e85399d 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -154,16 +154,17 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
>
>  struct bpf_local_storage_elem *
>  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
> -               bool charge_mem);
> +               gfp_t mem_flags);
>
>  int
>  bpf_local_storage_alloc(void *owner,
>                         struct bpf_local_storage_map *smap,
> -                       struct bpf_local_storage_elem *first_selem);
> +                       struct bpf_local_storage_elem *first_selem,
> +                       gfp_t mem_flags);
>
>  struct bpf_local_storage_data *
>  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> -                        void *value, u64 map_flags);
> +                        void *value, u64 map_flags, gfp_t mem_flags);
>
>  void bpf_local_storage_free_rcu(struct rcu_head *rcu);
>
> diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
> index e29d9e3d853e..41b0bec1e7e9 100644
> --- a/kernel/bpf/bpf_inode_storage.c
> +++ b/kernel/bpf/bpf_inode_storage.c
> @@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
>
>         sdata = bpf_local_storage_update(f->f_inode,
>                                          (struct bpf_local_storage_map *)map,
> -                                        value, map_flags);
> +                                        value, map_flags, GFP_ATOMIC);
>         fput(f);
>         return PTR_ERR_OR_ZERO(sdata);
>  }
> @@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
>         return err;
>  }
>
> -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
> -          void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */
> +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
> +          void *, value, u64, flags, gfp_t, mem_flags)
>  {
>         struct bpf_local_storage_data *sdata;
>
> @@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
>         if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>                 sdata = bpf_local_storage_update(
>                         inode, (struct bpf_local_storage_map *)map, value,
> -                       BPF_NOEXIST);
> +                       BPF_NOEXIST, mem_flags);
>                 return IS_ERR(sdata) ? (unsigned long)NULL :
>                                              (unsigned long)sdata->data;
>         }
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 092a1ac772d7..ca402f9cf1a5 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -63,23 +63,22 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
>
>  struct bpf_local_storage_elem *
>  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> -               void *value, bool charge_mem)
> +               void *value, gfp_t mem_flags)
>  {
>         struct bpf_local_storage_elem *selem;
>
> -       if (charge_mem && mem_charge(smap, owner, smap->elem_size))
> +       if (mem_charge(smap, owner, smap->elem_size))
>                 return NULL;
>
>         selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> -                               GFP_ATOMIC | __GFP_NOWARN);
> +                               mem_flags | __GFP_NOWARN);
>         if (selem) {
>                 if (value)
>                         memcpy(SDATA(selem)->data, value, smap->map.value_size);
>                 return selem;
>         }
>
> -       if (charge_mem)
> -               mem_uncharge(smap, owner, smap->elem_size);
> +       mem_uncharge(smap, owner, smap->elem_size);
>
>         return NULL;
>  }
> @@ -282,7 +281,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
>
>  int bpf_local_storage_alloc(void *owner,
>                             struct bpf_local_storage_map *smap,
> -                           struct bpf_local_storage_elem *first_selem)
> +                           struct bpf_local_storage_elem *first_selem,
> +                           gfp_t mem_flags)
>  {
>         struct bpf_local_storage *prev_storage, *storage;
>         struct bpf_local_storage **owner_storage_ptr;
> @@ -293,7 +293,7 @@ int bpf_local_storage_alloc(void *owner,
>                 return err;
>
>         storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
> -                                 GFP_ATOMIC | __GFP_NOWARN);
> +                                 mem_flags | __GFP_NOWARN);
>         if (!storage) {
>                 err = -ENOMEM;
>                 goto uncharge;
> @@ -350,13 +350,13 @@ int bpf_local_storage_alloc(void *owner,
>   */
>  struct bpf_local_storage_data *
>  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> -                        void *value, u64 map_flags)
> +                        void *value, u64 map_flags, gfp_t mem_flags)
>  {
>         struct bpf_local_storage_data *old_sdata = NULL;
>         struct bpf_local_storage_elem *selem;
>         struct bpf_local_storage *local_storage;
>         unsigned long flags;
> -       int err;
> +       int err, charge_err;
>
>         /* BPF_EXIST and BPF_NOEXIST cannot be both set */
>         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
> @@ -373,11 +373,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>                 if (err)
>                         return ERR_PTR(err);
>
> -               selem = bpf_selem_alloc(smap, owner, value, true);
> +               selem = bpf_selem_alloc(smap, owner, value, mem_flags);
>                 if (!selem)
>                         return ERR_PTR(-ENOMEM);
>
> -               err = bpf_local_storage_alloc(owner, smap, selem);
> +               err = bpf_local_storage_alloc(owner, smap, selem, mem_flags);
>                 if (err) {
>                         kfree(selem);
>                         mem_uncharge(smap, owner, smap->elem_size);
> @@ -404,6 +404,19 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>                 }
>         }
>
> +       /* Since mem_flags can be non-atomic, we need to do the memory
> +        * allocation outside the spinlock.
> +        *
> +        * There are a few cases where it is permissible for the memory charge
> +        * and allocation to fail (eg if BPF_F_LOCK is set and a local storage
> +        * value already exists, we can swap values without needing an
> +        * allocation), so in the case of a failure here, continue on and see
> +        * if the failure is relevant.
> +        */
> +       charge_err = mem_charge(smap, owner, smap->elem_size);
> +       selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> +                               mem_flags | __GFP_NOWARN);
> +
>         raw_spin_lock_irqsave(&local_storage->lock, flags);
>
>         /* Recheck local_storage->list under local_storage->lock */
> @@ -425,25 +438,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>         if (old_sdata && (map_flags & BPF_F_LOCK)) {
>                 copy_map_value_locked(&smap->map, old_sdata->data, value,
>                                       false);
> -               selem = SELEM(old_sdata);
> -               goto unlock;
> +
> +               raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> +
> +               if (!charge_err)
> +                       mem_uncharge(smap, owner, smap->elem_size);
> +               kfree(selem);
> +
> +               return old_sdata;
> +       }
> +
> +       if (!old_sdata && charge_err) {
> +               /* If there is no existing local storage value, then this means
> +                * we needed the charge to succeed. We must make sure this did not
> +                * return an error.
> +                *
> +                * Please note that if an existing local storage value exists, then
> +                * it doesn't matter if the charge failed because we can just
> +                * "reuse" the charge from the existing local storage element.
> +                */

But we did allocate a new element that was unaccounted for, even if only
temporarily [for the short period until we freed the old element].

Martin, is this something we are okay with?

> +               err = charge_err;
> +               goto unlock_err;
>         }
>
> -       /* local_storage->lock is held.  Hence, we are sure
> -        * we can unlink and uncharge the old_sdata successfully
> -        * later.  Hence, instead of charging the new selem now
> -        * and then uncharge the old selem later (which may cause
> -        * a potential but unnecessary charge failure),  avoid taking
> -        * a charge at all here (the "!old_sdata" check) and the
> -        * old_sdata will not be uncharged later during
> -        * bpf_selem_unlink_storage_nolock().
> -        */
> -       selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
>         if (!selem) {
>                 err = -ENOMEM;
>                 goto unlock_err;
>         }
>
> +       if (value)
> +               memcpy(SDATA(selem)->data, value, smap->map.value_size);
> +
>         /* First, link the new selem to the map */
>         bpf_selem_link_map(smap, selem);
>
> @@ -454,15 +479,17 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>         if (old_sdata) {
>                 bpf_selem_unlink_map(SELEM(old_sdata));
>                 bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
> -                                               false);
> +                                               !charge_err);
>         }
>
> -unlock:
>         raw_spin_unlock_irqrestore(&local_storage->lock, flags);
>         return SDATA(selem);
>
>  unlock_err:
>         raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> +       if (!charge_err)
> +               mem_uncharge(smap, owner, smap->elem_size);
> +       kfree(selem);
>         return ERR_PTR(err);
>  }
>
> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index 5da7bed0f5f6..bb9e22bad42b 100644
> --- a/kernel/bpf/bpf_task_storage.c
> +++ b/kernel/bpf/bpf_task_storage.c
> @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
>
>         bpf_task_storage_lock();
>         sdata = bpf_local_storage_update(
> -               task, (struct bpf_local_storage_map *)map, value, map_flags);
> +               task, (struct bpf_local_storage_map *)map, value, map_flags,
> +               GFP_ATOMIC);
>         bpf_task_storage_unlock();
>
>         err = PTR_ERR_OR_ZERO(sdata);
> @@ -226,8 +227,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
>         return err;
>  }
>
> -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> -          task, void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */

Is there a precedent for this happening with any other helpers?

You may want to add here that "any value, even if set by uapi, will be ignored"

Can we go even beyond this and ensure that the verifier understands
that this is an "internal only arg", e.g. with something in check_helper_call?

> +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> +          task, void *, value, u64, flags, gfp_t, mem_flags)
>  {
>         struct bpf_local_storage_data *sdata;
>
> @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>             (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
>                 sdata = bpf_local_storage_update(
>                         task, (struct bpf_local_storage_map *)map, value,
> -                       BPF_NOEXIST);
> +                       BPF_NOEXIST, mem_flags);
>
>  unlock:
>         bpf_task_storage_unlock();
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 0db6cd8dcb35..392fdaabedbd 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -13491,6 +13491,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         goto patch_call_imm;
>                 }
>
> +               if (insn->imm == BPF_FUNC_task_storage_get ||
> +                   insn->imm == BPF_FUNC_sk_storage_get ||
> +                   insn->imm == BPF_FUNC_inode_storage_get) {
> +                       if (env->prog->aux->sleepable)
> +                               insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_KERNEL);
> +                       else
> +                               insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_ATOMIC);
> +                       insn_buf[1] = *insn;
> +                       cnt = 2;
> +
> +                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +                       if (!new_prog)
> +                               return -ENOMEM;
> +
> +                       delta += cnt - 1;
> +                       env->prog = prog = new_prog;
> +                       insn = new_prog->insnsi + i + delta;
> +                       goto patch_call_imm;
> +               }
> +
>                 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
>                  * and other inlining handlers are currently limited to 64 bit
>                  * only.
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index d9c37fd10809..8790a3791d39 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -141,7 +141,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
>         if (sock) {
>                 sdata = bpf_local_storage_update(
>                         sock->sk, (struct bpf_local_storage_map *)map, value,
> -                       map_flags);
> +                       map_flags, GFP_ATOMIC);
>                 sockfd_put(sock);
>                 return PTR_ERR_OR_ZERO(sdata);
>         }
> @@ -172,7 +172,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
>  {
>         struct bpf_local_storage_elem *copy_selem;
>
> -       copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
> +       copy_selem = bpf_selem_alloc(smap, newsk, NULL, GFP_ATOMIC);
>         if (!copy_selem)
>                 return NULL;
>
> @@ -230,7 +230,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>                         bpf_selem_link_map(smap, copy_selem);
>                         bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
>                 } else {
> -                       ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
> +                       ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
>                         if (ret) {
>                                 kfree(copy_selem);
>                                 atomic_sub(smap->elem_size,
> @@ -255,8 +255,9 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>         return ret;
>  }
>
> -BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> -          void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */
> +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> +          void *, value, u64, flags, gfp_t, mem_flags)
>  {
>         struct bpf_local_storage_data *sdata;
>
> @@ -277,7 +278,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>             refcount_inc_not_zero(&sk->sk_refcnt)) {
>                 sdata = bpf_local_storage_update(
>                         sk, (struct bpf_local_storage_map *)map, value,
> -                       BPF_NOEXIST);
> +                       BPF_NOEXIST, mem_flags);
>                 /* sk must be a fullsock (guaranteed by verifier),
>                  * so sock_gen_put() is unnecessary.
>                  */
> @@ -417,14 +418,16 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
>         return false;
>  }
>
> -BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
> -          void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */
> +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
> +          void *, value, u64, flags, gfp_t, mem_flags)
>  {
>         WARN_ON_ONCE(!bpf_rcu_lock_held());
>         if (in_hardirq() || in_nmi())
>                 return (unsigned long)NULL;
>
> -       return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
> +       return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
> +                                                    mem_flags);
>  }
>
>  BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
> --
> 2.30.2
>
Kumar Kartikeya Dwivedi March 15, 2022, 4:49 p.m. UTC | #2
On Tue, Mar 15, 2022 at 07:56:46AM IST, KP Singh wrote:
> On Sat, Mar 12, 2022 at 2:17 AM Joanne Koong <joannekoong@fb.com> wrote:
> > [ ... ]
> > -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> > -          task, void *, value, u64, flags)
> > +/* *mem_flags* is set by the bpf verifier */
>
> Is there a precedent for this happening with any other helpers?
>

Yes, e.g. bpf_timer_set_callback receives a hidden prog->aux argument.

> You may want to add here that "any value, even if set by uapi, will be ignored"
>
> Can we go even beyond this and ensure that the verifier understands
> that this is an "internal only arg", e.g. with something in check_helper_call?
>

Since bpf_func_proto was not changed, arg5_type is still ARG_DONTCARE, so the
verifier should already ignore the register type for R5. In any case, the user
cannot assume it remains the same across the call (since it is caller-saved),
so I think this is fine.
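
For reference, a sketch of the unchanged func proto (as in
kernel/bpf/bpf_task_storage.c around this series; quoted from memory, so
treat the exact fields as approximate). The unset .arg5_type defaults to 0,
i.e. ARG_DONTCARE, which is why the verifier ignores R5:

	const struct bpf_func_proto bpf_task_storage_get_proto = {
		.func		= bpf_task_storage_get,
		.gpl_only	= false,
		.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
		.arg1_type	= ARG_CONST_MAP_PTR,
		.arg2_type	= ARG_PTR_TO_BTF_ID,
		.arg2_btf_id	= &btf_task_struct_ids[0],
		.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
		.arg4_type	= ARG_ANYTHING,
		/* no .arg5_type: zero-initialized, i.e. ARG_DONTCARE */
	};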


--
Kartikeya
Martin KaFai Lau March 15, 2022, 7:04 p.m. UTC | #3
On Tue, Mar 15, 2022 at 03:26:46AM +0100, KP Singh wrote:
[ ... ]

> >  struct bpf_local_storage_data *
> >  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > -                        void *value, u64 map_flags)
> > +                        void *value, u64 map_flags, gfp_t mem_flags)
> >  {
> >         struct bpf_local_storage_data *old_sdata = NULL;
> >         struct bpf_local_storage_elem *selem;
> >         struct bpf_local_storage *local_storage;
> >         unsigned long flags;
> > -       int err;
> > +       int err, charge_err;
> >
> >         /* BPF_EXIST and BPF_NOEXIST cannot be both set */
> >         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
> > @@ -373,11 +373,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> >                 if (err)
> >                         return ERR_PTR(err);
> >
> > -               selem = bpf_selem_alloc(smap, owner, value, true);
> > +               selem = bpf_selem_alloc(smap, owner, value, mem_flags);
> >                 if (!selem)
> >                         return ERR_PTR(-ENOMEM);
> >
> > -               err = bpf_local_storage_alloc(owner, smap, selem);
> > +               err = bpf_local_storage_alloc(owner, smap, selem, mem_flags);
> >                 if (err) {
> >                         kfree(selem);
> >                         mem_uncharge(smap, owner, smap->elem_size);
> > @@ -404,6 +404,19 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> >                 }
> >         }
> >
> > +       /* Since mem_flags can be non-atomic, we need to do the memory
> > +        * allocation outside the spinlock.
> > +        *
> > +        * There are a few cases where it is permissible for the memory charge
> > +        * and allocation to fail (eg if BPF_F_LOCK is set and a local storage
> > +        * value already exists, we can swap values without needing an
> > +        * allocation), so in the case of a failure here, continue on and see
> > +        * if the failure is relevant.
> > +        */
> > +       charge_err = mem_charge(smap, owner, smap->elem_size);
> > +       selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> > +                               mem_flags | __GFP_NOWARN);
> > +
> >         raw_spin_lock_irqsave(&local_storage->lock, flags);
> >
> >         /* Recheck local_storage->list under local_storage->lock */
> > @@ -425,25 +438,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> >         if (old_sdata && (map_flags & BPF_F_LOCK)) {
> >                 copy_map_value_locked(&smap->map, old_sdata->data, value,
> >                                       false);
> > -               selem = SELEM(old_sdata);
> > -               goto unlock;
> > +
> > +               raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > +
> > +               if (!charge_err)
> > +                       mem_uncharge(smap, owner, smap->elem_size);
> > +               kfree(selem);
> > +
> > +               return old_sdata;
> > +       }
> > +
> > +       if (!old_sdata && charge_err) {
> > +               /* If there is no existing local storage value, then this means
> > +                * we needed the charge to succeed. We must make sure this did not
> > +                * return an error.
> > +                *
> > +                * Please note that if an existing local storage value exists, then
> > +                * it doesn't matter if the charge failed because we can just
> > +                * "reuse" the charge from the existing local storage element.
> > +                */
> 
> But we did allocate a new element that was unaccounted for, even if only
> temporarily [for the short period until we freed the old element].
> 
> Martin, is this something we are okay with?
It is the behavior today already.  Take a look at the bpf_selem_alloc(...., !old_sdata)
and the later "if (old_sdata) { /* ... */ bpf_selem_unlink_storage_nolock(..., false); }"
Most of this happens under a raw_spin_lock, so it should be a very brief moment.
Not perfect but should be fine.

If it always errors out on charge failure, it risks unnecessarily failing the
userspace syscall on mem charging when it is only trying to replace the
old_sdata.

If the concern is the increased chance of a brief moment of unaccounted memory
on the helper side now that GFP_KERNEL comes from the helper only, another
option that came to my mind is to decide whether to do the alloc before or
after raw_spin_lock_irqsave() based on mem_flags.  GFP_KERNEL is only passed
from the bpf helper side, and it is always done with BPF_NOEXIST because the
bpf helper has already done a lookup, so it should always charge first and
then alloc.

Something like this, drafted on top of this patch.  Untested code:

diff --git c/kernel/bpf/bpf_local_storage.c w/kernel/bpf/bpf_local_storage.c
index 092a1ac772d7..b48beb57fe6e 100644
--- c/kernel/bpf/bpf_local_storage.c
+++ w/kernel/bpf/bpf_local_storage.c
@@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem)
+		void *value, bool charge_mem, gfp_t mem_flags)
 {
 	struct bpf_local_storage_elem *selem;
 
@@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 		return NULL;
 
 	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				GFP_ATOMIC | __GFP_NOWARN);
+				mem_flags | __GFP_NOWARN);
 	if (selem) {
 		if (value)
 			memcpy(SDATA(selem)->data, value, smap->map.value_size);

@@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
+	if (mem_flags == GFP_KERNEL) {
+		selem = bpf_selem_alloc(smap, owner, value, true, mem_flags);
+		if (!selem)
+			return ERR_PTR(-ENOMEM);
+	}
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
 	/* Recheck local_storage->list under local_storage->lock */
@@ -438,10 +448,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	 * old_sdata will not be uncharged later during
 	 * bpf_selem_unlink_storage_nolock().
 	 */
-	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
-	if (!selem) {
-		err = -ENOMEM;
-		goto unlock_err;
+	if (mem_flags != GFP_KERNEL) {
+		selem = bpf_selem_alloc(smap, owner, value, !old_sdata, mem_flags);
+		if (!selem) {
+			err = -ENOMEM;
+			goto unlock_err;
+		}
 	}
 
 	/* First, link the new selem to the map */
@@ -463,6 +475,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 
 unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	if (selem) {
+		mem_uncharge(smap, owner, smap->elem_size);
+		kfree(selem);
+	}
 	return ERR_PTR(err);
 }
 

> 
> > [ ... ]
> > -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> > -          task, void *, value, u64, flags)
> > +/* *mem_flags* is set by the bpf verifier */
> 
> Is there a precedent for this happening with any other helpers?
Kumar has also mentioned the timer helper.

> 
> You may want to add here that "any value, even if set by uapi, will be ignored"
> 
> Can we go even beyond this and ensure that the verifier understands
> that this is an "internal only arg", e.g. with something in check_helper_call?
The compiler is free to store anything in R5 before calling the helper, so
the verifier cannot enforce that it is unused, and it does not have to.

KP Singh March 15, 2022, 7:51 p.m. UTC | #4
On Tue, Mar 15, 2022 at 8:05 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Mar 15, 2022 at 03:26:46AM +0100, KP Singh wrote:
> [ ... ]
>
> > [ ... ]
> >
> > But we did allocate a new element that was unaccounted for, even if only
> > temporarily [for the short period until we freed the old element].
> >
> > Martin, is this something we are okay with?
> It is the behavior today already.  Take a look at the bpf_selem_alloc(...., !old_sdata)
> and the later "if (old_sdata) { /* ... */ bpf_selem_unlink_storage_nolock(..., false); }"
> Most of this happens under a raw_spin_lock, so it should be a very brief moment.
> Not perfect but should be fine.
>
> If it always errors out on charge failure, it risks unnecessarily failing the
> userspace syscall on mem charging when it is only trying to replace the
> old_sdata.
>
> If the concern is the increased chance of a brief moment of unaccounted memory
> on the helper side now that GFP_KERNEL comes from the helper only, another
> option that came to my mind is to decide whether to do the alloc before or
> after raw_spin_lock_irqsave() based on mem_flags.  GFP_KERNEL is only passed
> from the bpf helper side, and it is always done with BPF_NOEXIST because the
> bpf helper has already done a lookup, so it should always charge first and
> then alloc.
>
> Something like this, drafted on top of this patch.  Untested code:

I think this looks okay. One minor comment below:

>
> diff --git c/kernel/bpf/bpf_local_storage.c w/kernel/bpf/bpf_local_storage.c
> index 092a1ac772d7..b48beb57fe6e 100644
> --- c/kernel/bpf/bpf_local_storage.c
> +++ w/kernel/bpf/bpf_local_storage.c
> @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
>
>  struct bpf_local_storage_elem *
>  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> -               void *value, bool charge_mem)
> +               void *value, bool charge_mem, gfp_t mem_flags)
>  {
>         struct bpf_local_storage_elem *selem;
>
> @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>                 return NULL;
>
>         selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> -                               GFP_ATOMIC | __GFP_NOWARN);
> +                               mem_flags | __GFP_NOWARN);
>         if (selem) {
>                 if (value)
>                         memcpy(SDATA(selem)->data, value, smap->map.value_size);
>
> @@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>                 }
>         }
>
> +       if (mem_flags == GFP_KERNEL) {

It seems like what this check really asks (and similarly for the other
mem_flags-based check you have below) is:

"am I called from a sleepable helper"

and I wonder if, instead of setting mem_flags, the verifier could set
a boolean "sleepable_helper_call", which might be more useful and
readable, and is more relevant to the check that the verifier is
performing: "if (env->prog->aux->sleepable)"


> +               selem = bpf_selem_alloc(smap, owner, value, true, mem_flags);
> +               if (!selem)
> +                       return ERR_PTR(-ENOMEM);
> +       }
> +
>         raw_spin_lock_irqsave(&local_storage->lock, flags);
>
>         /* Recheck local_storage->list under local_storage->lock */
> @@ -438,10 +448,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>          * old_sdata will not be uncharged later during
>          * bpf_selem_unlink_storage_nolock().
>          */
> -       selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
> -       if (!selem) {
> -               err = -ENOMEM;
> -               goto unlock_err;
> +       if (mem_flags != GFP_KERNEL) {
> +               selem = bpf_selem_alloc(smap, owner, value, !old_sdata, mem_flags);
> +               if (!selem) {
> +                       err = -ENOMEM;
> +                       goto unlock_err;
> +               }
>         }
>
>         /* First, link the new selem to the map */
> @@ -463,6 +475,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>
>  unlock_err:
>         raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> +       if (selem) {
> +               mem_uncharge(smap, owner, smap->elem_size);
> +               kfree(selem);
> +       }
>         return ERR_PTR(err);
>  }
>
>
> >
> > > +               err = charge_err;
> > > +               goto unlock_err;
> > >         }
> > >
> > > -       /* local_storage->lock is held.  Hence, we are sure
> > > -        * we can unlink and uncharge the old_sdata successfully
> > > -        * later.  Hence, instead of charging the new selem now
> > > -        * and then uncharge the old selem later (which may cause
> > > -        * a potential but unnecessary charge failure),  avoid taking
> > > -        * a charge at all here (the "!old_sdata" check) and the
> > > -        * old_sdata will not be uncharged later during
> > > -        * bpf_selem_unlink_storage_nolock().
> > > -        */
> > > -       selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
> > >         if (!selem) {
> > >                 err = -ENOMEM;
> > >                 goto unlock_err;
> > >         }
> > >
> > > +       if (value)
> > > +               memcpy(SDATA(selem)->data, value, smap->map.value_size);
> > > +
> > >         /* First, link the new selem to the map */
> > >         bpf_selem_link_map(smap, selem);
> > >
> > > @@ -454,15 +479,17 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > >         if (old_sdata) {
> > >                 bpf_selem_unlink_map(SELEM(old_sdata));
> > >                 bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
> > > -                                               false);
> > > +                                               !charge_err);
> > >         }
> > >
> > > -unlock:
> > >         raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > >         return SDATA(selem);
> > >
> > >  unlock_err:
> > >         raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > +       if (!charge_err)
> > > +               mem_uncharge(smap, owner, smap->elem_size);
> > > +       kfree(selem);
> > >         return ERR_PTR(err);
> > >  }
> > >
> > > diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> > > index 5da7bed0f5f6..bb9e22bad42b 100644
> > > --- a/kernel/bpf/bpf_task_storage.c
> > > +++ b/kernel/bpf/bpf_task_storage.c
> > > @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
> > >
> > >         bpf_task_storage_lock();
> > >         sdata = bpf_local_storage_update(
> > > -               task, (struct bpf_local_storage_map *)map, value, map_flags);
> > > +               task, (struct bpf_local_storage_map *)map, value, map_flags,
> > > +               GFP_ATOMIC);
> > >         bpf_task_storage_unlock();
> > >
> > >         err = PTR_ERR_OR_ZERO(sdata);
> > > @@ -226,8 +227,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
> > >         return err;
> > >  }
> > >
> > > -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> > > -          task, void *, value, u64, flags)
> > > +/* *mem_flags* is set by the bpf verifier */
> >
> > Is there a precedent of this happening for any other helpers?
> Kumar has also mentioned the timer helper.
>
> >
> > You may want to add here that "any value, even if set by uapi, will be ignored"

I guess this comment is still helpful if a user is not using the
bpf_helpers header generated from the uapi doc strings?

> >
> > Can we go even beyond this and ensure that the verifier understands
> > that this is an "internal only arg", with something in check_helper_call?
> The compiler is free to store anything in R5 before calling the helper, so the
> verifier cannot enforce whether it is used or not, and it does not have to.

Thanks, Kumar and Martin.

>
> >
> > > +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> > > +          task, void *, value, u64, flags, gfp_t, mem_flags)
> > >  {
> > >         struct bpf_local_storage_data *sdata;
> > >
> > > @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> > >             (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
> > >                 sdata = bpf_local_storage_update(
> > >                         task, (struct bpf_local_storage_map *)map, value,
> > > -                       BPF_NOEXIST);
> > > +                       BPF_NOEXIST, mem_flags);
> > >
> > >  unlock:
> > >         bpf_task_storage_unlock();
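
For concreteness, the R5 mechanism above works by the verifier patching the
call site so that the hidden fifth argument is loaded right before the call.
The patch's verifier.c hunk is not quoted in this thread, so the following is
only a sketch of what that insn patching could look like, following the usual
do_misc_fixups() insn-rewriting pattern:

	if (insn->imm == BPF_FUNC_task_storage_get ||
	    insn->imm == BPF_FUNC_sk_storage_get ||
	    insn->imm == BPF_FUNC_inode_storage_get) {
		/* Load the hidden 5th arg (R5) with a gfp value that the
		 * program itself never supplies.
		 */
		if (env->prog->aux->sleepable)
			insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5,
						    (__force __s32)GFP_KERNEL);
		else
			insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5,
						    (__force __s32)GFP_ATOMIC);
		insn_buf[1] = *insn;	/* the original helper call */

		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 2);
		if (!new_prog)
			return -ENOMEM;

		delta++;
		env->prog = prog = new_prog;
		insn = new_prog->insnsi + i + delta;
	}

Since the BPF_MOV64_IMM into R5 is emitted immediately before the call
instruction, whatever the compiled program left in R5 is clobbered, which is
why the hidden argument cannot be influenced from the prog side.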
Alexei Starovoitov March 15, 2022, 8:02 p.m. UTC | #5
On Tue, Mar 15, 2022 at 12:51 PM KP Singh <kpsingh@kernel.org> wrote:
>
> On Tue, Mar 15, 2022 at 8:05 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Tue, Mar 15, 2022 at 03:26:46AM +0100, KP Singh wrote:
> > [ ... ]
> >
> > > >  struct bpf_local_storage_data *
> > > >  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > > > -                        void *value, u64 map_flags)
> > > > +                        void *value, u64 map_flags, gfp_t mem_flags)
> > > >  {
> > > >         struct bpf_local_storage_data *old_sdata = NULL;
> > > >         struct bpf_local_storage_elem *selem;
> > > >         struct bpf_local_storage *local_storage;
> > > >         unsigned long flags;
> > > > -       int err;
> > > > +       int err, charge_err;
> > > >
> > > >         /* BPF_EXIST and BPF_NOEXIST cannot be both set */
> > > >         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
> > > > @@ -373,11 +373,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > > >                 if (err)
> > > >                         return ERR_PTR(err);
> > > >
> > > > -               selem = bpf_selem_alloc(smap, owner, value, true);
> > > > +               selem = bpf_selem_alloc(smap, owner, value, mem_flags);
> > > >                 if (!selem)
> > > >                         return ERR_PTR(-ENOMEM);
> > > >
> > > > -               err = bpf_local_storage_alloc(owner, smap, selem);
> > > > +               err = bpf_local_storage_alloc(owner, smap, selem, mem_flags);
> > > >                 if (err) {
> > > >                         kfree(selem);
> > > >                         mem_uncharge(smap, owner, smap->elem_size);
> > > > @@ -404,6 +404,19 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > > >                 }
> > > >         }
> > > >
> > > > +       /* Since mem_flags can be non-atomic, we need to do the memory
> > > > +        * allocation outside the spinlock.
> > > > +        *
> > > > +        * There are a few cases where it is permissible for the memory charge
> > > > +        * and allocation to fail (eg if BPF_F_LOCK is set and a local storage
> > > > +        * value already exists, we can swap values without needing an
> > > > +        * allocation), so in the case of a failure here, continue on and see
> > > > +        * if the failure is relevant.
> > > > +        */
> > > > +       charge_err = mem_charge(smap, owner, smap->elem_size);
> > > > +       selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> > > > +                               mem_flags | __GFP_NOWARN);
> > > > +
> > > >         raw_spin_lock_irqsave(&local_storage->lock, flags);
> > > >
> > > >         /* Recheck local_storage->list under local_storage->lock */
> > > > @@ -425,25 +438,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > > >         if (old_sdata && (map_flags & BPF_F_LOCK)) {
> > > >                 copy_map_value_locked(&smap->map, old_sdata->data, value,
> > > >                                       false);
> > > > -               selem = SELEM(old_sdata);
> > > > -               goto unlock;
> > > > +
> > > > +               raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > > +
> > > > +               if (!charge_err)
> > > > +                       mem_uncharge(smap, owner, smap->elem_size);
> > > > +               kfree(selem);
> > > > +
> > > > +               return old_sdata;
> > > > +       }
> > > > +
> > > > +       if (!old_sdata && charge_err) {
> > > > +               /* If there is no existing local storage value, then this means
> > > > +                * we needed the charge to succeed. We must make sure this did not
> > > > +                * return an error.
> > > > +                *
> > > > +                * Please note that if an existing local storage value exists, then
> > > > +                * it doesn't matter if the charge failed because we can just
> > > > +                * "reuse" the charge from the existing local storage element.
> > > > +                */
> > >
> > > But we did allocate a new element which was unaccounted for, even if
> > > it was temporarily.
> > > [for the short period of time till we freed the old element]
> > >
> > > Martin, is this something we are okay with?
> > It is the behavior today already.  Take a look at the bpf_selem_alloc(...., !sold_data)
> > and the later "if (old_sdata) { /* ... */ bpf_selem_unlink_storage_nolock(..., false); }"
> > Most things happen in a raw_spin_lock, so this should be very brief moment.
> > Not perfect but should be fine.
> >
> > If it always error out on charge failure, it will risk the case that the
> > userspace's syscall will unnecessary be failed on mem charging while it only
> > tries to replace the old_sdata.
> >
> > If the concern is the increased chance of brief moment of unaccounted memory
> > from the helper side now because GFP_KERNEL is from the helper only,
> > another option that came up to my mind is to decide to do the alloc before or
> > after raw_spin_lock_irqsave() based on mem_flags.  The GFP_KERNEL here is only
> > calling from the bpf helper side and it is always done with BPF_NOEXIST
> > because the bpf helper has already done a lookup, so it should
> > always charge success first and then alloc.
> >
> > Something like this that drafted on top of this patch.  Untested code:
>
> I think this looks okay. One minor comment below:
>
> >
> > diff --git c/kernel/bpf/bpf_local_storage.c w/kernel/bpf/bpf_local_storage.c
> > index 092a1ac772d7..b48beb57fe6e 100644
> > --- c/kernel/bpf/bpf_local_storage.c
> > +++ w/kernel/bpf/bpf_local_storage.c
> > @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
> >
> >  struct bpf_local_storage_elem *
> >  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > -               void *value, bool charge_mem)
> > +               void *value, bool charge_mem, gfp_t mem_flags)
> >  {
> >         struct bpf_local_storage_elem *selem;
> >
> > @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> >                 return NULL;
> >
> >         selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> > -                               GFP_ATOMIC | __GFP_NOWARN);
> > +                               mem_flags | __GFP_NOWARN);
> >         if (selem) {
> >                 if (value)
> >                         memcpy(SDATA(selem)->data, value, smap->map.value_size);
> >
> > @@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> >                 }
> >         }
> >
> > +       if (mem_flags == GFP_KERNEL) {
>
> It seems like what this check really asks (and similarly for the other
> mem_flags-based check you have below) is:
>
> "am I called from a sleepable helper"
>
> and I wonder if, instead of setting mem_flags, the verifier could set
> a boolean "sleepable_helper_call", which might be more useful and
> readable, and is more relevant to the check that the verifier is
> performing: "if (env->prog->aux->sleepable)"

I think you're proposing to pass a boolean flag
into the helper instead of gfp_t?
I think gfp_t is cleaner.
For example we might allow local storage access under bpf_spin_lock
or other critical sections.
Passing a boolean flag of the prog state is not equivalent
to the availability of gfp at the callsite inside bpf prog code.
KP Singh March 15, 2022, 8:08 p.m. UTC | #6
On Tue, Mar 15, 2022 at 9:02 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> [ ... ]
>
> I think you're proposing to pass a boolean flag
> into the helper instead of gfp_t?
> I think gfp_t is cleaner.
> For example we might allow local storage access under bpf_spin_lock
> or other critical sections.
> Passing a boolean flag of the prog state is not equivalent
> to the availability of gfp at the callsite inside bpf prog code.

Ah yes, then using gfp_t makes sense as we may have other use cases.

I think we can follow up with the changes Martin suggested separately as
the current behaviour is essentially the same.

In any case, you can add my:

Acked-by: KP Singh <kpsingh@kernel.org>
Joanne Koong March 15, 2022, 8:33 p.m. UTC | #7
On Tue, Mar 15, 2022 at 1:08 PM KP Singh <kpsingh@kernel.org> wrote:
>
> [ ... ]
>
> Ah yes, then using gfp_t makes sense as we may have other use cases.
>
> I think we can follow up with the changes Martin suggested separately as
> the current behaviour is essentially the same.
>
> In any case, you can add my:
>
> Acked-by: KP Singh <kpsingh@kernel.org>

Thanks for the discussion, KP, Kumar, Martin, and Alexei!

For v2, I will make the following changes:
1) Allocate the memory before/after the raw_spin_lock_irqsave,
depending on mem_flags
2) Change the comment "*mem_flags* is set by the bpf verifier" to
"*mem_flags* is set by the bpf verifier. Any value set through uapi
will be ignored"
Alexei Starovoitov March 15, 2022, 8:41 p.m. UTC | #8
On Tue, Mar 15, 2022 at 1:33 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> [ ... ]
>
> Thanks for the discussion, KP, Kumar, Martin, and Alexei!
>
> For v2, I will make the following changes:
> 1) Allocate the memory before/after the raw_spin_lock_irqsave,
> depending on mem_flags
> 2) Change the comment "*mem_flags* is set by the bpf verifier" to
> "*mem_flags* is set by the bpf verifier. Any value set through uapi
> will be ignored"

"set through uapi" is not accurate, since this argument is not part of uapi.
We'd need a paragraph to describe everything :)
I would just leave it as "mem_flags is a hidden argument provided by
the verifier".

Since respin is coming, maybe we can rename 'mem_flags' too?
Typically the gfp_t argument is called:
gfp_flags, gfp_mask, gfp_extra_flags, gfp, or flags.
I'd pick gfp_flags, but fine with others too.
KP Singh March 15, 2022, 8:47 p.m. UTC | #9
On Tue, Mar 15, 2022 at 9:41 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 15, 2022 at 1:33 PM Joanne Koong <joannelkoong@gmail.com> wrote:
> > [ ... ]
> >
> > Thanks for the discussion, KP, Kumar, Martin, and Alexei!
> >
> > For v2, I will make the following changes:
> > 1) Allocate the memory before/after the raw_spin_lock_irqsave,
> > depending on mem_flags
> > 2) Change the comment "*mem_flags* is set by the bpf verifier" to
> > "*mem_flags* is set by the bpf verifier. Any value set through uapi
> > will be ignored"
>
> "set through uapi" is not accurate, since this argument is not part of uapi.
> We'd need a paragraph to describe everything :)
> I would just leave it as "mem_flags is a hidden argument provided by
> the verifier".

SGTM :)

Martin KaFai Lau March 15, 2022, 9:01 p.m. UTC | #10
On Tue, Mar 15, 2022 at 01:33:05PM -0700, Joanne Koong wrote:
> > > > > > > @@ -425,25 +438,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > > > > > >         if (old_sdata && (map_flags & BPF_F_LOCK)) {
> > > > > > >                 copy_map_value_locked(&smap->map, old_sdata->data, value,
> > > > > > >                                       false);
> > > > > > > -               selem = SELEM(old_sdata);
> > > > > > > -               goto unlock;
> > > > > > > +
> > > > > > > +               raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > > > > > +
> > > > > > > +               if (!charge_err)
> > > > > > > +                       mem_uncharge(smap, owner, smap->elem_size);
> > > > > > > +               kfree(selem);
> > > > > > > +
> > > > > > > +               return old_sdata;
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       if (!old_sdata && charge_err) {
> > > > > > > +               /* If there is no existing local storage value, then this means
> > > > > > > +                * we needed the charge to succeed. We must make sure this did not
> > > > > > > +                * return an error.
> > > > > > > +                *
> > > > > > > +                * Please note that if an existing local storage value exists, then
> > > > > > > +                * it doesn't matter if the charge failed because we can just
> > > > > > > +                * "reuse" the charge from the existing local storage element.
> > > > > > > +                */
> > > > > >
> > > > > > But we did allocate a new element which was unaccounted for, even if
> > > > > > it was temporarily.
> > > > > > [for the short period of time till we freed the old element]
> > > > > >
> > > > > > Martin, is this something we are okay with?
> > > > > It is the behavior today already.  Take a look at the bpf_selem_alloc(...., !sold_data)
> > > > > and the later "if (old_sdata) { /* ... */ bpf_selem_unlink_storage_nolock(..., false); }"
> > > > > Most things happen in a raw_spin_lock, so this should be very brief moment.
> > > > > Not perfect but should be fine.
> > > > >
> > > > > If it always error out on charge failure, it will risk the case that the
> > > > > userspace's syscall will unnecessary be failed on mem charging while it only
> > > > > tries to replace the old_sdata.
> > > > >
> > > > > If the concern is the increased chance of brief moment of unaccounted memory
> > > > > from the helper side now because GFP_KERNEL is from the helper only,
> > > > > another option that came up to my mind is to decide to do the alloc before or
> > > > > after raw_spin_lock_irqsave() based on mem_flags.  The GFP_KERNEL here is only
> > > > > calling from the bpf helper side and it is always done with BPF_NOEXIST
> > > > > because the bpf helper has already done a lookup, so it should
> > > > > always charge success first and then alloc.
> > > > >
> > > > > Something like this that drafted on top of this patch.  Untested code:
> > > >
> > > > I think this looks okay. One minor comment below:
> > > >
> > > > >
> > > > > diff --git c/kernel/bpf/bpf_local_storage.c w/kernel/bpf/bpf_local_storage.c
> > > > > index 092a1ac772d7..b48beb57fe6e 100644
> > > > > --- c/kernel/bpf/bpf_local_storage.c
> > > > > +++ w/kernel/bpf/bpf_local_storage.c
> > > > > @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
> > > > >
> > > > >  struct bpf_local_storage_elem *
> > > > >  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > > > > -               void *value, bool charge_mem)
> > > > > +               void *value, bool charge_mem, gfp_t mem_flags)
> > > > >  {
> > > > >         struct bpf_local_storage_elem *selem;
> > > > >
> > > > > @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > > > >                 return NULL;
> > > > >
> > > > >         selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> > > > > -                               GFP_ATOMIC | __GFP_NOWARN);
> > > > > +                               mem_flags | __GFP_NOWARN);
> > > > >         if (selem) {
> > > > >                 if (value)
> > > > >                         memcpy(SDATA(selem)->data, value, smap->map.value_size);
> > > > >
> > > > > @@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> > > > >                 }
> > > > >         }
> > > > >
> > > > > +       if (mem_flags == GFP_KERNEL) {
> > > >
> > > > It seems like what this check (and similarly the other mem_flags-based
> > > > check you have below) really asks is:
> > > >
> > > > "am I called from a sleepable helper"
> > > >
> > > > and I wonder if, instead of setting mem_flags, the verifier could set
> > > > a boolean "sleepable_helper_call", which might be more useful and
> > > > readable, and more relevant to the check the verifier is performing:
> > > > "if (env->prog->aux->sleepable)".
> > >
> > > I think you're proposing to pass a boolean flag
> > > into the helper instead of gfp_t?
> > > I think gfp_t is cleaner.
> > > For example, we might allow local storage access under bpf_spin_lock
> > > or other critical sections.
> > > Passing a boolean flag of the prog state is not equivalent
> > > to the availability of gfp at the callsite inside the bpf prog code.
> >
> > Ah yes, then using gfp_t makes sense as we may have other use cases.
> >
> > I think we can follow up separately with the changes Martin suggested, as
> > the current behaviour is essentially the same.
> >
> > In any case, you can add my:
> >
> > Acked-by: KP Singh <kpsingh@kernel.org>
> 
> Thanks for the discussion, KP, Kumar, Martin, and Alexei!
> 
> For v2, I will make the following changes:
> 1) Allocate the memory before/after the raw_spin_lock_irqsave,
> depending on mem_flags
> 2) Change the comment "*mem_flags* is set by the bpf verifier" to
> "*mem_flags* is set by the bpf verifier. Any value set through uapi
> will be ignored"
It will also be useful to double-check whether the existing sleepable
lsm selftests can cover this change.
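
(As a rough illustration of the kind of coverage this asks for, a sleepable
LSM program that forces the create path of bpf_task_storage_get() would
exercise the new GFP_KERNEL allocation.  The hook, map, and program names
below are hypothetical, written for illustration rather than taken from the
tree.  Note that the program still passes four arguments; the gfp_t value is
a hidden fifth argument patched in by the verifier.)

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} open_count SEC(".maps");

/* "lsm.s/" attaches this as a sleepable LSM program, so after this patch
 * the helper's hidden gfp_t argument is GFP_KERNEL and the element
 * allocation below is allowed to sleep.
 */
SEC("lsm.s/file_open")
int BPF_PROG(count_file_open, struct file *file)
{
	__u64 *value;

	/* BPF_LOCAL_STORAGE_GET_F_CREATE forces an allocation the first
	 * time a task reaches this hook, covering the new code path.
	 */
	value = bpf_task_storage_get(&open_count,
				     bpf_get_current_task_btf(), 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (value)
		__sync_fetch_and_add(value, 1);
	return 0;
}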
Joanne Koong March 16, 2022, 2:11 a.m. UTC | #11
On Tue, Mar 15, 2022 at 2:01 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Mar 15, 2022 at 01:33:05PM -0700, Joanne Koong wrote:
[...]
> >
> > Thanks for the discussion, KP, Kumar, Martin, and Alexei!
> >
> > For v2, I will make the following changes:
> > 1) Allocate the memory before/after the raw_spin_lock_irqsave,
> > depending on mem_flags
> > 2) Change the comment "*mem_flags* is set by the bpf verifier" to
> > "*mem_flags* is set by the bpf verifier. Any value set through uapi
> > will be ignored"
> It will also be useful to double-check whether the existing sleepable
> lsm selftests can cover this change.

Sounds great!! For v2, I will do the following:
1) Allocate the memory before/after the raw_spin_lock_irqsave,
depending on the flags (see the sketch below)
2) Rename mem_flags to gfp_flags
3) Reword the comment "*mem_flags* is set by the bpf verifier" to
"*gfp_flags* is a hidden argument provided by the verifier"
4) See if any of the sleepable lsm selftests can provide coverage on
the spinlock codepath in bpf_local_storage_update
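
For reference, here is a condensed, untested sketch of what (1) might look
like, based on Martin's draft above rather than on the eventual v2; the
elided sections and some names are placeholders:

struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags, gfp_t gfp_flags)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *selem = NULL;
	struct bpf_local_storage *local_storage;
	unsigned long flags;
	int err;

	/* ... flag checks and the no-existing-storage path as before ... */

	if (gfp_flags == GFP_KERNEL) {
		/* Only bpf helpers pass GFP_KERNEL, and they always use
		 * BPF_NOEXIST, so the charge is always needed; do both the
		 * charge and the allocation here, before taking the lock,
		 * while sleeping is still allowed.
		 */
		selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
		if (!selem)
			return ERR_PTR(-ENOMEM);
	}

	raw_spin_lock_irqsave(&local_storage->lock, flags);

	/* ... recheck the list, check_flags(), BPF_F_LOCK fast path ... */

	if (gfp_flags != GFP_KERNEL) {
		/* Atomic callers keep today's behavior: allocate under the
		 * lock, charging only when there is no old_sdata whose
		 * charge can be reused.
		 */
		selem = bpf_selem_alloc(smap, owner, value, !old_sdata,
					gfp_flags);
		if (!selem) {
			err = -ENOMEM;
			goto unlock_err;
		}
	}

	/* ... link the new selem, unlink old_sdata, and unlock as before.
	 * On the unlock_err and BPF_F_LOCK replace paths, a GFP_KERNEL
	 * caller must uncharge and kfree() its preallocated selem. ...
	 */
}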
diff mbox series

Patch

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 37b3906af8b1..d6905e85399d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -154,16 +154,17 @@  void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
-		bool charge_mem);
+		gfp_t mem_flags);
 
 int
 bpf_local_storage_alloc(void *owner,
 			struct bpf_local_storage_map *smap,
-			struct bpf_local_storage_elem *first_selem);
+			struct bpf_local_storage_elem *first_selem,
+			gfp_t mem_flags);
 
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags);
+			 void *value, u64 map_flags, gfp_t mem_flags);
 
 void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e29d9e3d853e..41b0bec1e7e9 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -136,7 +136,7 @@  static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
 
 	sdata = bpf_local_storage_update(f->f_inode,
 					 (struct bpf_local_storage_map *)map,
-					 value, map_flags);
+					 value, map_flags, GFP_ATOMIC);
 	fput(f);
 	return PTR_ERR_OR_ZERO(sdata);
 }
@@ -169,8 +169,9 @@  static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
-	   void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+	   void *, value, u64, flags, gfp_t, mem_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -196,7 +197,7 @@  BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
 		sdata = bpf_local_storage_update(
 			inode, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, mem_flags);
 		return IS_ERR(sdata) ? (unsigned long)NULL :
 					     (unsigned long)sdata->data;
 	}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 092a1ac772d7..ca402f9cf1a5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -63,23 +63,22 @@  static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem)
+		void *value, gfp_t mem_flags)
 {
 	struct bpf_local_storage_elem *selem;
 
-	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+	if (mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
 	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				GFP_ATOMIC | __GFP_NOWARN);
+				mem_flags | __GFP_NOWARN);
 	if (selem) {
 		if (value)
 			memcpy(SDATA(selem)->data, value, smap->map.value_size);
 		return selem;
 	}
 
-	if (charge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
+	mem_uncharge(smap, owner, smap->elem_size);
 
 	return NULL;
 }
@@ -282,7 +281,8 @@  static int check_flags(const struct bpf_local_storage_data *old_sdata,
 
 int bpf_local_storage_alloc(void *owner,
 			    struct bpf_local_storage_map *smap,
-			    struct bpf_local_storage_elem *first_selem)
+			    struct bpf_local_storage_elem *first_selem,
+			    gfp_t mem_flags)
 {
 	struct bpf_local_storage *prev_storage, *storage;
 	struct bpf_local_storage **owner_storage_ptr;
@@ -293,7 +293,7 @@  int bpf_local_storage_alloc(void *owner,
 		return err;
 
 	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  GFP_ATOMIC | __GFP_NOWARN);
+				  mem_flags | __GFP_NOWARN);
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -350,13 +350,13 @@  int bpf_local_storage_alloc(void *owner,
  */
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags)
+			 void *value, u64 map_flags, gfp_t mem_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *selem;
 	struct bpf_local_storage *local_storage;
 	unsigned long flags;
-	int err;
+	int err, charge_err;
 
 	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
@@ -373,11 +373,11 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, owner, value, true);
+		selem = bpf_selem_alloc(smap, owner, value, mem_flags);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
-		err = bpf_local_storage_alloc(owner, smap, selem);
+		err = bpf_local_storage_alloc(owner, smap, selem, mem_flags);
 		if (err) {
 			kfree(selem);
 			mem_uncharge(smap, owner, smap->elem_size);
@@ -404,6 +404,19 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
+	/* Since mem_flags can be non-atomic, we need to do the memory
+	 * allocation outside the spinlock.
+	 *
+	 * There are a few cases where it is permissible for the memory charge
+	 * and allocation to fail (eg if BPF_F_LOCK is set and a local storage
+	 * value already exists, we can swap values without needing an
+	 * allocation), so in the case of a failure here, continue on and see
+	 * if the failure is relevant.
+	 */
+	charge_err = mem_charge(smap, owner, smap->elem_size);
+	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+				mem_flags | __GFP_NOWARN);
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
 	/* Recheck local_storage->list under local_storage->lock */
@@ -425,25 +438,37 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata && (map_flags & BPF_F_LOCK)) {
 		copy_map_value_locked(&smap->map, old_sdata->data, value,
 				      false);
-		selem = SELEM(old_sdata);
-		goto unlock;
+
+		raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+		if (!charge_err)
+			mem_uncharge(smap, owner, smap->elem_size);
+		kfree(selem);
+
+		return old_sdata;
+	}
+
+	if (!old_sdata && charge_err) {
+		/* If there is no existing local storage value, then this means
+		 * we needed the charge to succeed. We must make sure this did not
+		 * return an error.
+		 *
+		 * Please note that if an existing local storage value exists, then
+		 * it doesn't matter if the charge failed because we can just
+		 * "reuse" the charge from the existing local storage element.
+		 */
+		err = charge_err;
+		goto unlock_err;
 	}
 
-	/* local_storage->lock is held.  Hence, we are sure
-	 * we can unlink and uncharge the old_sdata successfully
-	 * later.  Hence, instead of charging the new selem now
-	 * and then uncharge the old selem later (which may cause
-	 * a potential but unnecessary charge failure),  avoid taking
-	 * a charge at all here (the "!old_sdata" check) and the
-	 * old_sdata will not be uncharged later during
-	 * bpf_selem_unlink_storage_nolock().
-	 */
-	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
 	if (!selem) {
 		err = -ENOMEM;
 		goto unlock_err;
 	}
 
+	if (value)
+		memcpy(SDATA(selem)->data, value, smap->map.value_size);
+
 	/* First, link the new selem to the map */
 	bpf_selem_link_map(smap, selem);
 
@@ -454,15 +479,17 @@  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false);
+						!charge_err);
 	}
 
-unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	return SDATA(selem);
 
 unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	if (!charge_err)
+		mem_uncharge(smap, owner, smap->elem_size);
+	kfree(selem);
 	return ERR_PTR(err);
 }
 
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 5da7bed0f5f6..bb9e22bad42b 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -174,7 +174,8 @@  static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 
 	bpf_task_storage_lock();
 	sdata = bpf_local_storage_update(
-		task, (struct bpf_local_storage_map *)map, value, map_flags);
+		task, (struct bpf_local_storage_map *)map, value, map_flags,
+		GFP_ATOMIC);
 	bpf_task_storage_unlock();
 
 	err = PTR_ERR_OR_ZERO(sdata);
@@ -226,8 +227,9 @@  static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
-	   task, void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+	   task, void *, value, u64, flags, gfp_t, mem_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -250,7 +252,7 @@  BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
 		sdata = bpf_local_storage_update(
 			task, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, mem_flags);
 
 unlock:
 	bpf_task_storage_unlock();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0db6cd8dcb35..392fdaabedbd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13491,6 +13491,26 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		if (insn->imm == BPF_FUNC_task_storage_get ||
+		    insn->imm == BPF_FUNC_sk_storage_get ||
+		    insn->imm == BPF_FUNC_inode_storage_get) {
+			if (env->prog->aux->sleepable)
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_KERNEL);
+			else
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_ATOMIC);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d9c37fd10809..8790a3791d39 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -141,7 +141,7 @@  static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 	if (sock) {
 		sdata = bpf_local_storage_update(
 			sock->sk, (struct bpf_local_storage_map *)map, value,
-			map_flags);
+			map_flags, GFP_ATOMIC);
 		sockfd_put(sock);
 		return PTR_ERR_OR_ZERO(sdata);
 	}
@@ -172,7 +172,7 @@  bpf_sk_storage_clone_elem(struct sock *newsk,
 {
 	struct bpf_local_storage_elem *copy_selem;
 
-	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
+	copy_selem = bpf_selem_alloc(smap, newsk, NULL, GFP_ATOMIC);
 	if (!copy_selem)
 		return NULL;
 
@@ -230,7 +230,7 @@  int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 			bpf_selem_link_map(smap, copy_selem);
 			bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
 		} else {
-			ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
+			ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
 			if (ret) {
 				kfree(copy_selem);
 				atomic_sub(smap->elem_size,
@@ -255,8 +255,9 @@  int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 	return ret;
 }
 
-BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
-	   void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+	   void *, value, u64, flags, gfp_t, mem_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -277,7 +278,7 @@  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 	    refcount_inc_not_zero(&sk->sk_refcnt)) {
 		sdata = bpf_local_storage_update(
 			sk, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, mem_flags);
 		/* sk must be a fullsock (guaranteed by verifier),
 		 * so sock_gen_put() is unnecessary.
 		 */
@@ -417,14 +418,16 @@  static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 	return false;
 }
 
-BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
-	   void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+	   void *, value, u64, flags, gfp_t, mem_flags)
 {
 	WARN_ON_ONCE(!bpf_rcu_lock_held());
 	if (in_hardirq() || in_nmi())
 		return (unsigned long)NULL;
 
-	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
+	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
+						     mem_flags);
 }
 
 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,