
[bpf-next,1/2] bpf: Allow bpf_local_storage to be used by sleepable programs

Message ID 20210826235127.303505-2-kpsingh@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Series: Sleepable local storage

Checks

Context Check Description
bpf/vmtest success Kernel LATEST + selftests
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 8 maintainers not CCed: lmb@cloudflare.com netdev@vger.kernel.org yhs@fb.com davem@davemloft.net gustavoars@kernel.org kuba@kernel.org songliubraving@fb.com john.fastabend@gmail.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 106 this patch: 106
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 106 this patch: 106
netdev/header_inline success Link
bpf/vmtest-bpf-next success Kernel LATEST + selftests

Commit Message

KP Singh Aug. 26, 2021, 11:51 p.m. UTC
Other maps like hashmaps are already available to sleepable programs.
Sleepable BPF programs run under trace RCU. Allow task, sk and inode
storage to be used from sleepable programs.

The local storage code mostly runs under the program's RCU read-side
critical section (taken in __bpf_prog_enter{_sleepable} and released in
__bpf_prog_exit{_sleepable} via rcu_read_lock or rcu_read_lock_trace),
with the exception of the logic where the map is freed.

After some discussions and help from Jann Horn, the following changes
were made:

bpf_local_storage{_elem} are now freed via a call_rcu_tasks_trace
callback that wraps kfree_rcu, instead of a direct kfree_rcu, which does
not respect the trace RCU grace periods. The callback frees the
storage/selem with kfree_rcu, which handles the normal RCU grace period,
similar to BPF trampolines.
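
Condensed, the freeing path becomes the sketch below (the actual hunks
are in the diff that follows; the call site shown is the one in
bpf_selem_unlink_storage_nolock()):

	/* Second stage: runs after a trace RCU grace period and defers the
	 * actual kfree by one more normal RCU grace period.
	 */
	static void bpf_selem_free_rcu(struct rcu_head *rcu)
	{
		struct bpf_local_storage_elem *selem;

		selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
		kfree_rcu(selem, rcu);
	}

	/* First stage, replacing the direct kfree_rcu(selem, rcu): */
	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);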

Update the synchronize_rcu to synchronize_rcu_mult in the map freeing
code to wait for both the trace RCU and normal RCU grace periods.
While this is an expensive operation, bpf_local_storage_map_free
is not called from within a BPF program; it is only called when the
owning object is being freed.
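
The map-freeing side amounts to roughly the sketch below (argument list
elided, since the exact signature is not shown here; synchronize_rcu_mult()
waits for the grace period of every call_rcu-style function passed to it):

	void bpf_local_storage_map_free(struct bpf_local_storage_map *smap /* , ... */)
	{
		/* ... unlink the selems still owned by this map ... */

		/* Previously a plain synchronize_rcu(); now wait for both the
		 * normal RCU and the trace RCU readers to finish before the
		 * map itself is freed.
		 */
		synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);

		/* ... free the map ... */
	}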

Update the dereferencing of the pointers to use rcu_dereference_protected
(with either the trace or normal RCU locks held) and add warnings at the
beginning of the get and delete helpers.

Signed-off-by: KP Singh <kpsingh@kernel.org>
---
 include/linux/bpf_local_storage.h |  5 ++++
 kernel/bpf/bpf_inode_storage.c    |  9 +++++--
 kernel/bpf/bpf_local_storage.c    | 43 ++++++++++++++++++++++++-------
 kernel/bpf/bpf_task_storage.c     |  6 ++++-
 kernel/bpf/verifier.c             |  3 +++
 net/core/bpf_sk_storage.c         |  8 +++++-
 6 files changed, 61 insertions(+), 13 deletions(-)

Comments

Martin KaFai Lau Aug. 27, 2021, 8:55 p.m. UTC | #1
On Thu, Aug 26, 2021 at 11:51:26PM +0000, KP Singh wrote:
> Other maps like hashmaps are already available to sleepable programs.
> Sleepable BPF programs run under trace RCU. Allow task, local and inode
> storage to be used from sleepable programs.
> 
> The local storage code mostly runs under the programs RCU read section
> (in __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable})
> (rcu_read_lock or rcu_read_lock_trace) with the exception the logic
> where the map is freed.
> 
> After some discussions and help from Jann Horn, the following changes
> were made:
> 
> bpf_local_storage{_elem} are freed with a kfree_rcu
> wrapped with a call_rcu_tasks_trace callback instead of a direct
> kfree_rcu which does not respect the trace RCU grace periods. The
> callback frees the storage/selem with kfree_rcu which handles the normal
> RCU grace period similar to BPF trampolines.
> 
> Update the synchronise_rcu to synchronize_rcu_mult in the map freeing
> code to wait for trace RCU and normal RCU grace periods.
> While this is an expensive operation, bpf_local_storage_map_free
> is not called from within a BPF program, rather only called when the
> owning object is being freed.
> 
> Update the dereferencing of the pointers to use rcu_derference_protected
> (with either the trace or normal RCU locks held) and add warnings in the
> beginning of the get and delete helpers.
> 
> Signed-off-by: KP Singh <kpsingh@kernel.org>
> ---
>  include/linux/bpf_local_storage.h |  5 ++++
>  kernel/bpf/bpf_inode_storage.c    |  9 +++++--
>  kernel/bpf/bpf_local_storage.c    | 43 ++++++++++++++++++++++++-------
>  kernel/bpf/bpf_task_storage.c     |  6 ++++-
>  kernel/bpf/verifier.c             |  3 +++
>  net/core/bpf_sk_storage.c         |  8 +++++-
>  6 files changed, 61 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 24496bc28e7b..8453488b334d 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -16,6 +16,9 @@
>  
>  #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
>  
> +#define bpf_local_storage_rcu_lock_held()			\
> +	(rcu_read_lock_held() || rcu_read_lock_trace_held() ||	\
> +		rcu_read_lock_bh_held())
There is a similar test in hashtab.  Maybe rename it to a more
generic name so that it can be reused there?
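Something like the following, say (the generic name is purely
illustrative; hashtab currently open-codes a similar check in a
WARN_ON_ONCE):

	/* Illustrative only -- a shared helper both hashtab and local storage
	 * could use; the name bpf_rcu_lock_held() is made up here.
	 */
	#define bpf_rcu_lock_held()					\
		(rcu_read_lock_held() || rcu_read_lock_trace_held() ||	\
		 rcu_read_lock_bh_held())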

[ ... ]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index b305270b7a4b..7760bc4e9565 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -11,6 +11,8 @@
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
> +#include <linux/rcupdate_trace.h>
> +#include <linux/rcupdate_wait.h>
>  
>  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
>  
> @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>  	return NULL;
>  }
>  
> +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_local_storage *local_storage;
> +
> +	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> +	kfree_rcu(local_storage, rcu);
> +}
> +
> +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_local_storage_elem *selem;
> +
> +	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> +	kfree_rcu(selem, rcu);
> +}
> +
>  /* local_storage->lock must be held and selem->local_storage == local_storage.
>   * The caller must ensure selem->smap is still valid to be
>   * dereferenced for its smap->elem_size and smap->cache_idx.
> @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
>  		 *
>  		 * Although the unlock will be done under
>  		 * rcu_read_lock(),  it is more intutivie to
> -		 * read if kfree_rcu(local_storage, rcu) is done
> +		 * read if the freeing of the storage is done
>  		 * after the raw_spin_unlock_bh(&local_storage->lock).
>  		 *
>  		 * Hence, a "bool free_local_storage" is returned
> -		 * to the caller which then calls the kfree_rcu()
> -		 * after unlock.
> +		 * to the caller which then calls then frees the storage after
> +		 * all the RCU grace periods have expired.
>  		 */
>  	}
>  	hlist_del_init_rcu(&selem->snode);
> @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
>  	    SDATA(selem))
>  		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
>  
> -	kfree_rcu(selem, rcu);
> +	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
Although the common use case is usually storage_get() much more often
than storage_delete(), do you aware any performance impact for
the bpf prog that does a lot of storage_delete()?

>  
>  	return free_local_storage;
>  }
> @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>  	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
>  
>  	if (free_local_storage)
> -		kfree_rcu(local_storage, rcu);
> +		call_rcu_tasks_trace(&local_storage->rcu,
> +				     bpf_local_storage_free_rcu);
>  }
>  
>  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
>  	struct bpf_local_storage_elem *selem;
>  
>  	/* Fast path (cache hit) */
> -	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> +	sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> +					  bpf_local_storage_rcu_lock_held());
There are other places using rcu_dereference() also.
e.g. in bpf_local_storage_update().
Should they be changed also?

[ ... ]

> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -13,6 +13,7 @@
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
> +#include <linux/rcupdate_trace.h>
>  
>  DEFINE_BPF_STORAGE_CACHE(sk_cache);
>  
> @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
>  	struct bpf_local_storage *sk_storage;
>  	struct bpf_local_storage_map *smap;
>  
> -	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> +	sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> +					       bpf_local_storage_rcu_lock_held());
>  	if (!sk_storage)
>  		return NULL;
>  
> @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  {
>  	struct bpf_local_storage_data *sdata;
>  
> +	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
>  	if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
sk is protected by rcu_read_lock here.
Is it always safe to access it with the rcu_read_lock_trace alone ?

>  		return (unsigned long)NULL;
>
KP Singh Aug. 29, 2021, 9:52 p.m. UTC | #2
On Fri, Aug 27, 2021 at 10:55 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Thu, Aug 26, 2021 at 11:51:26PM +0000, KP Singh wrote:
> > Other maps like hashmaps are already available to sleepable programs.
> > Sleepable BPF programs run under trace RCU. Allow task, local and inode
> > storage to be used from sleepable programs.
> >
> > The local storage code mostly runs under the programs RCU read section
> > (in __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable})
> > (rcu_read_lock or rcu_read_lock_trace) with the exception the logic
> > where the map is freed.
> >
> > After some discussions and help from Jann Horn, the following changes
> > were made:
> >
> > bpf_local_storage{_elem} are freed with a kfree_rcu
> > wrapped with a call_rcu_tasks_trace callback instead of a direct
> > kfree_rcu which does not respect the trace RCU grace periods. The
> > callback frees the storage/selem with kfree_rcu which handles the normal
> > RCU grace period similar to BPF trampolines.
> >
> > Update the synchronise_rcu to synchronize_rcu_mult in the map freeing
> > code to wait for trace RCU and normal RCU grace periods.
> > While this is an expensive operation, bpf_local_storage_map_free
> > is not called from within a BPF program, rather only called when the
> > owning object is being freed.
> >
> > Update the dereferencing of the pointers to use rcu_derference_protected
> > (with either the trace or normal RCU locks held) and add warnings in the
> > beginning of the get and delete helpers.
> >
> > Signed-off-by: KP Singh <kpsingh@kernel.org>
> > ---

[...]

> > @@ -16,6 +16,9 @@
> >
> >  #define BPF_LOCAL_STORAGE_CACHE_SIZE 16
> >
> > +#define bpf_local_storage_rcu_lock_held()                    \
> > +     (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> > +             rcu_read_lock_bh_held())
> There is a similar test in hashtab.  May be renaming it to a more
> generic name such that it can be reused there?

Sure.

>
> [ ... ]
>
> > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > index b305270b7a4b..7760bc4e9565 100644
> > --- a/kernel/bpf/bpf_local_storage.c
> > +++ b/kernel/bpf/bpf_local_storage.c
> > @@ -11,6 +11,8 @@
> >  #include <net/sock.h>
> >  #include <uapi/linux/sock_diag.h>
> >  #include <uapi/linux/btf.h>
> > +#include <linux/rcupdate_trace.h>
> > +#include <linux/rcupdate_wait.h>
> >
> >  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> >
> > @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> >       return NULL;
> >  }
> >
> > +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> > +{
> > +     struct bpf_local_storage *local_storage;
> > +
> > +     local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> > +     kfree_rcu(local_storage, rcu);
> > +}
> > +
> > +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> > +{
> > +     struct bpf_local_storage_elem *selem;
> > +
> > +     selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> > +     kfree_rcu(selem, rcu);
> > +}
> > +
> >  /* local_storage->lock must be held and selem->local_storage == local_storage.
> >   * The caller must ensure selem->smap is still valid to be
> >   * dereferenced for its smap->elem_size and smap->cache_idx.
> > @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> >                *
> >                * Although the unlock will be done under
> >                * rcu_read_lock(),  it is more intutivie to
> > -              * read if kfree_rcu(local_storage, rcu) is done
> > +              * read if the freeing of the storage is done
> >                * after the raw_spin_unlock_bh(&local_storage->lock).
> >                *
> >                * Hence, a "bool free_local_storage" is returned
> > -              * to the caller which then calls the kfree_rcu()
> > -              * after unlock.
> > +              * to the caller which then calls then frees the storage after
> > +              * all the RCU grace periods have expired.
> >                */
> >       }
> >       hlist_del_init_rcu(&selem->snode);
> > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> >           SDATA(selem))
> >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> >
> > -     kfree_rcu(selem, rcu);
> > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> Although the common use case is usually storage_get() much more often
> than storage_delete(), do you aware any performance impact for
> the bpf prog that does a lot of storage_delete()?

I have not really measured the impact on deletes. My understanding is
that it should not impact the BPF program, but yes, if some critical
sections are prolonged because a sleepable program "sleeps" too long,
then the callbacks would pile up.

But this is not something new, as we have a similar thing in BPF
trampolines. If this really
becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
with this flag would be allowed in sleepable progs.

We could then wait for both critical sections only when this flag is
set on the map.
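
Purely as an illustration (BPF_F_SLEEPABLE_STORAGE does not exist, this
is just the idea):

	/* Hypothetical: only pay for the trace RCU grace period when the map
	 * was created with a new BPF_F_SLEEPABLE_STORAGE flag.
	 */
	if (smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
	else
		kfree_rcu(selem, rcu);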

>
> >
> >       return free_local_storage;
> >  }
> > @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> >       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> >
> >       if (free_local_storage)
> > -             kfree_rcu(local_storage, rcu);
> > +             call_rcu_tasks_trace(&local_storage->rcu,
> > +                                  bpf_local_storage_free_rcu);
> >  }
> >
> >  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> >       struct bpf_local_storage_elem *selem;
> >
> >       /* Fast path (cache hit) */
> > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > +                                       bpf_local_storage_rcu_lock_held());
> There are other places using rcu_dereference() also.
> e.g. in bpf_local_storage_update().
> Should they be changed also?

From what I saw, the other usages of rcu_dereference are in a nested
(w.r.t. the RCU read section taken in bpf_prog_enter/exit) RCU read-side
critical section (rcu_read_{lock, unlock}), so the change should not be
required there.

If there are some that are not, then they need to be updated. Did I miss any?

>
> [ ... ]
>
> > --- a/net/core/bpf_sk_storage.c
> > +++ b/net/core/bpf_sk_storage.c
> > @@ -13,6 +13,7 @@
> >  #include <net/sock.h>
> >  #include <uapi/linux/sock_diag.h>
> >  #include <uapi/linux/btf.h>
> > +#include <linux/rcupdate_trace.h>
> >
> >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> >
> > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> >       struct bpf_local_storage *sk_storage;
> >       struct bpf_local_storage_map *smap;
> >
> > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > +                                            bpf_local_storage_rcu_lock_held());
> >       if (!sk_storage)
> >               return NULL;
> >
> > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >  {
> >       struct bpf_local_storage_data *sdata;
> >
> > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> sk is protected by rcu_read_lock here.
> Is it always safe to access it with the rcu_read_lock_trace alone ?

We don't dereference sk with an rcu_dereference though; is that still the
case for tracing and LSM programs? Or is it somehow implicitly protected
even though we don't use rcu_dereference, since that's just a READ_ONCE +
some checks?

Would grabbing a refcount earlier in the code be helpful?

>
> >               return (unsigned long)NULL;
> >
Martin KaFai Lau Aug. 31, 2021, 2:11 a.m. UTC | #3
On Sun, Aug 29, 2021 at 11:52:03PM +0200, KP Singh wrote:
[ ... ]

> > > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > > index b305270b7a4b..7760bc4e9565 100644
> > > --- a/kernel/bpf/bpf_local_storage.c
> > > +++ b/kernel/bpf/bpf_local_storage.c
> > > @@ -11,6 +11,8 @@
> > >  #include <net/sock.h>
> > >  #include <uapi/linux/sock_diag.h>
> > >  #include <uapi/linux/btf.h>
> > > +#include <linux/rcupdate_trace.h>
> > > +#include <linux/rcupdate_wait.h>
> > >
> > >  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> > >
> > > @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > >       return NULL;
> > >  }
> > >
> > > +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> > > +{
> > > +     struct bpf_local_storage *local_storage;
> > > +
> > > +     local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> > > +     kfree_rcu(local_storage, rcu);
> > > +}
> > > +
> > > +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> > > +{
> > > +     struct bpf_local_storage_elem *selem;
> > > +
> > > +     selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> > > +     kfree_rcu(selem, rcu);
> > > +}
> > > +
> > >  /* local_storage->lock must be held and selem->local_storage == local_storage.
> > >   * The caller must ensure selem->smap is still valid to be
> > >   * dereferenced for its smap->elem_size and smap->cache_idx.
> > > @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > >                *
> > >                * Although the unlock will be done under
> > >                * rcu_read_lock(),  it is more intutivie to
> > > -              * read if kfree_rcu(local_storage, rcu) is done
> > > +              * read if the freeing of the storage is done
> > >                * after the raw_spin_unlock_bh(&local_storage->lock).
> > >                *
> > >                * Hence, a "bool free_local_storage" is returned
> > > -              * to the caller which then calls the kfree_rcu()
> > > -              * after unlock.
> > > +              * to the caller which then calls then frees the storage after
> > > +              * all the RCU grace periods have expired.
> > >                */
> > >       }
> > >       hlist_del_init_rcu(&selem->snode);
> > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > >           SDATA(selem))
> > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > >
> > > -     kfree_rcu(selem, rcu);
> > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > Although the common use case is usually storage_get() much more often
> > than storage_delete(), do you aware any performance impact for
> > the bpf prog that does a lot of storage_delete()?
> 
> I have not really measured the impact on deletes, My understanding is
> that it should
> not impact the BPF program, but yes, if there are some critical
> sections that are prolonged
> due to a sleepable program "sleeping" too long, then it would pile up
> the callbacks.
> 
> But this is not something new, as we have a similar thing in BPF
> trampolines. If this really
> becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> with this flag would be allowed in sleepable progs.
Agree that it is similar to trampoline updates but not sure it is
comparable in terms of the frequency of elems being deleted here,
e.g. many short-lived tcp connections created by external traffic.

Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
existing sleepable bpf prog.

I don't know enough about call_rcu_tasks_trace() here, hence the
earlier question on the perf/callback-pile-up implications, in order to
decide whether extra logic or a knob is needed here or not.

> 
> We could then wait for both critical sections only when this flag is
> set on the map.
> 
> >
> > >
> > >       return free_local_storage;
> > >  }
> > > @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> > >       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > >
> > >       if (free_local_storage)
> > > -             kfree_rcu(local_storage, rcu);
> > > +             call_rcu_tasks_trace(&local_storage->rcu,
> > > +                                  bpf_local_storage_free_rcu);
> > >  }
> > >
> > >  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> > > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> > >       struct bpf_local_storage_elem *selem;
> > >
> > >       /* Fast path (cache hit) */
> > > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > > +                                       bpf_local_storage_rcu_lock_held());
> > There are other places using rcu_dereference() also.
> > e.g. in bpf_local_storage_update().
> > Should they be changed also?
> 
> From what I saw, the other usage of rcu_derference is in a nested
> (w.r.t to the RCU section that in bpf_prog_enter/exit) RCU
> read side critical section/rcu_read_{lock, unlock} so it should not be required.
hmm... not sure what being nested or not has to do with it here.
It is likely we are talking about different things.

Did you turn on the lockdep rcu debugging in kconfig?

afaik, lockdep uses the second "check" argument to WARN on incorrect usage.
Even if the right check is passed here, a later rcu_dereference() will still
complain because it only checks for rcu_read_lock_held().

Also, after another look, why is rcu_dereference_protected() used
here instead of rcu_dereference_check()?  The spinlock is not acquired
here.  The same comment goes for the similar rcu_dereference_protected() usages.
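i.e. rcu_dereference_protected() is for the update side where the pointer
cannot change because the writer lock is held, so it only lockdep-checks
the condition; with a reader-side condition like this one, the usage
should look more like the sketch below:

	sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
				      bpf_local_storage_rcu_lock_held());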

> 
> If there are some that are not, then they need to be updated. Did I miss any?
> 
> >
> > [ ... ]
> >
> > > --- a/net/core/bpf_sk_storage.c
> > > +++ b/net/core/bpf_sk_storage.c
> > > @@ -13,6 +13,7 @@
> > >  #include <net/sock.h>
> > >  #include <uapi/linux/sock_diag.h>
> > >  #include <uapi/linux/btf.h>
> > > +#include <linux/rcupdate_trace.h>
> > >
> > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > >
> > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > >       struct bpf_local_storage *sk_storage;
> > >       struct bpf_local_storage_map *smap;
> > >
> > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > +                                            bpf_local_storage_rcu_lock_held());
> > >       if (!sk_storage)
> > >               return NULL;
> > >
> > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > >  {
> > >       struct bpf_local_storage_data *sdata;
> > >
> > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > sk is protected by rcu_read_lock here.
> > Is it always safe to access it with the rcu_read_lock_trace alone ?
> 
> We don't dereference sk with an rcu_dereference though, is it still the case for
> tracing and LSM programs? Or is it somehow implicity protected even
> though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
req_sk->sk, which I don't think the verifier will optimize out, so it is as
good as READ_ONCE(), iiuc.

The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
a refcnt, right?  If that is the case, it should be good enough for now.

> 
> Would grabbing a refcount earlier in the code be helpful?
Grab the refcount earlier in the bpf prog itself?
Right, that may be what eventually needs to be done
if somehow there is a use case where the sleepable bpf prog can get
hold of an rcu-protected sk.
KP Singh Aug. 31, 2021, 9:50 a.m. UTC | #4
On Tue, Aug 31, 2021 at 4:11 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Sun, Aug 29, 2021 at 11:52:03PM +0200, KP Singh wrote:
> [ ... ]
>
> > > > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > > > index b305270b7a4b..7760bc4e9565 100644
> > > > --- a/kernel/bpf/bpf_local_storage.c
> > > > +++ b/kernel/bpf/bpf_local_storage.c
> > > > @@ -11,6 +11,8 @@
> > > >  #include <net/sock.h>
> > > >  #include <uapi/linux/sock_diag.h>
> > > >  #include <uapi/linux/btf.h>
> > > > +#include <linux/rcupdate_trace.h>
> > > > +#include <linux/rcupdate_wait.h>
> > > >
> > > >  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> > > >
> > > > @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > > >       return NULL;
> > > >  }
> > > >
> > > > +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> > > > +{
> > > > +     struct bpf_local_storage *local_storage;
> > > > +
> > > > +     local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> > > > +     kfree_rcu(local_storage, rcu);
> > > > +}
> > > > +
> > > > +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> > > > +{
> > > > +     struct bpf_local_storage_elem *selem;
> > > > +
> > > > +     selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> > > > +     kfree_rcu(selem, rcu);
> > > > +}
> > > > +
> > > >  /* local_storage->lock must be held and selem->local_storage == local_storage.
> > > >   * The caller must ensure selem->smap is still valid to be
> > > >   * dereferenced for its smap->elem_size and smap->cache_idx.
> > > > @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > >                *
> > > >                * Although the unlock will be done under
> > > >                * rcu_read_lock(),  it is more intutivie to
> > > > -              * read if kfree_rcu(local_storage, rcu) is done
> > > > +              * read if the freeing of the storage is done
> > > >                * after the raw_spin_unlock_bh(&local_storage->lock).
> > > >                *
> > > >                * Hence, a "bool free_local_storage" is returned
> > > > -              * to the caller which then calls the kfree_rcu()
> > > > -              * after unlock.
> > > > +              * to the caller which then calls then frees the storage after
> > > > +              * all the RCU grace periods have expired.
> > > >                */
> > > >       }
> > > >       hlist_del_init_rcu(&selem->snode);
> > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > >           SDATA(selem))
> > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > >
> > > > -     kfree_rcu(selem, rcu);
> > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > Although the common use case is usually storage_get() much more often
> > > than storage_delete(), do you aware any performance impact for
> > > the bpf prog that does a lot of storage_delete()?
> >
> > I have not really measured the impact on deletes, My understanding is
> > that it should
> > not impact the BPF program, but yes, if there are some critical
> > sections that are prolonged
> > due to a sleepable program "sleeping" too long, then it would pile up
> > the callbacks.
> >
> > But this is not something new, as we have a similar thing in BPF
> > trampolines. If this really
> > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > with this flag would be allowed in sleepable progs.
> Agree that is similar to trampoline updates but not sure it is comparable
> in terms of the frequency of elems being deleted here.  e.g. many
> short lived tcp connections created by external traffic.
>
> Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> existing sleepable bpf prog.
>
> I don't know enough on call_rcu_tasks_trace() here, so the
> earlier question on perf/callback-pile-up implications in order to
> decide if extra logic or knob is needed here or not.

I will defer to the others, maybe Alexei and Paul; we could also just
add the flag so as not to affect the existing performance characteristics?

>
> >
> > We could then wait for both critical sections only when this flag is
> > set on the map.
> >
> > >
> > > >
> > > >       return free_local_storage;
> > > >  }
> > > > @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> > > >       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > >
> > > >       if (free_local_storage)
> > > > -             kfree_rcu(local_storage, rcu);
> > > > +             call_rcu_tasks_trace(&local_storage->rcu,
> > > > +                                  bpf_local_storage_free_rcu);
> > > >  }
> > > >
> > > >  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> > > > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> > > >       struct bpf_local_storage_elem *selem;
> > > >
> > > >       /* Fast path (cache hit) */
> > > > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > > > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > > > +                                       bpf_local_storage_rcu_lock_held());
> > > There are other places using rcu_dereference() also.
> > > e.g. in bpf_local_storage_update().
> > > Should they be changed also?
> >
> > From what I saw, the other usage of rcu_derference is in a nested
> > (w.r.t to the RCU section that in bpf_prog_enter/exit) RCU
> > read side critical section/rcu_read_{lock, unlock} so it should not be required.
> hmm... not sure what nested or not has to do here.
> It is likely we are talking different things.
>
Yeah, we were looking at different things.

e.g. bpf_selem_unlink does not need to be changed as it is under
rcu_read_lock.

But you are right, there is another one in bpf_local_storage_update which I will fix.

> Did you turn on the lockdep rcu debugging in kconfig?

Yes I have PROVE_RCU and LOCKDEP

>
> afaik, lockdep uses the second "check" argument to WARN on incorrect usage.
> Even the right check is passed here, the later rcu_dereference() will still
> complain because it only checks for rcu_read_lock_held().


>
> Also, after another look, why rcu_dereference_protected() is used
> here instead of rcu_dereference_check()?  The spinlock is not acquired
> here.  The same comment goes for similar rcu_dereference_protected() usages.


Good catch, it should be rcu_dereference_check.

>
> >
> > If there are some that are not, then they need to be updated. Did I miss any?
> >
> > >
> > > [ ... ]
> > >
> > > > --- a/net/core/bpf_sk_storage.c
> > > > +++ b/net/core/bpf_sk_storage.c
> > > > @@ -13,6 +13,7 @@
> > > >  #include <net/sock.h>
> > > >  #include <uapi/linux/sock_diag.h>
> > > >  #include <uapi/linux/btf.h>
> > > > +#include <linux/rcupdate_trace.h>
> > > >
> > > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > > >
> > > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > > >       struct bpf_local_storage *sk_storage;
> > > >       struct bpf_local_storage_map *smap;
> > > >
> > > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > > +                                            bpf_local_storage_rcu_lock_held());
> > > >       if (!sk_storage)
> > > >               return NULL;
> > > >
> > > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > > >  {
> > > >       struct bpf_local_storage_data *sdata;
> > > >
> > > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > sk is protected by rcu_read_lock here.
> > > Is it always safe to access it with the rcu_read_lock_trace alone ?
> >
> > We don't dereference sk with an rcu_dereference though, is it still the case for
> > tracing and LSM programs? Or is it somehow implicity protected even
> > though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
> e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
> req_sk->sk which I don't think the verifier will optimize it out, so as good
> as READ_ONCE(), iiuc.
>
> The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
> a refcnt, right?  If that is the case, it should be good enough for now.

The one passed in the arguments yes, but if you notice the discussion in

https://lore.kernel.org/bpf/20210826133913.627361-1-memxor@gmail.com/T/#me254212a125516a6c5d2fbf349b97c199e66dce0

one may also get an sk in LSM and tracing progs by pointer walking.

>
> >
> > Would grabbing a refcount earlier in the code be helpful?
> Grab the refcount earlier in the bpf prog itself?
> right, it may be what eventually needs to be done
> if somehow there is a usecase that the sleepable bpf prog can get a
> hold on a rcu protected sk.
Martin KaFai Lau Aug. 31, 2021, 6:22 p.m. UTC | #5
On Tue, Aug 31, 2021 at 11:50:48AM +0200, KP Singh wrote:
> On Tue, Aug 31, 2021 at 4:11 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Sun, Aug 29, 2021 at 11:52:03PM +0200, KP Singh wrote:
> > [ ... ]
> >
> > > > > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > > > > index b305270b7a4b..7760bc4e9565 100644
> > > > > --- a/kernel/bpf/bpf_local_storage.c
> > > > > +++ b/kernel/bpf/bpf_local_storage.c
> > > > > @@ -11,6 +11,8 @@
> > > > >  #include <net/sock.h>
> > > > >  #include <uapi/linux/sock_diag.h>
> > > > >  #include <uapi/linux/btf.h>
> > > > > +#include <linux/rcupdate_trace.h>
> > > > > +#include <linux/rcupdate_wait.h>
> > > > >
> > > > >  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> > > > >
> > > > > @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > > > >       return NULL;
> > > > >  }
> > > > >
> > > > > +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> > > > > +{
> > > > > +     struct bpf_local_storage *local_storage;
> > > > > +
> > > > > +     local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> > > > > +     kfree_rcu(local_storage, rcu);
> > > > > +}
> > > > > +
> > > > > +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> > > > > +{
> > > > > +     struct bpf_local_storage_elem *selem;
> > > > > +
> > > > > +     selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> > > > > +     kfree_rcu(selem, rcu);
> > > > > +}
> > > > > +
> > > > >  /* local_storage->lock must be held and selem->local_storage == local_storage.
> > > > >   * The caller must ensure selem->smap is still valid to be
> > > > >   * dereferenced for its smap->elem_size and smap->cache_idx.
> > > > > @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > >                *
> > > > >                * Although the unlock will be done under
> > > > >                * rcu_read_lock(),  it is more intutivie to
> > > > > -              * read if kfree_rcu(local_storage, rcu) is done
> > > > > +              * read if the freeing of the storage is done
> > > > >                * after the raw_spin_unlock_bh(&local_storage->lock).
> > > > >                *
> > > > >                * Hence, a "bool free_local_storage" is returned
> > > > > -              * to the caller which then calls the kfree_rcu()
> > > > > -              * after unlock.
> > > > > +              * to the caller which then calls then frees the storage after
> > > > > +              * all the RCU grace periods have expired.
> > > > >                */
> > > > >       }
> > > > >       hlist_del_init_rcu(&selem->snode);
> > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > >           SDATA(selem))
> > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > >
> > > > > -     kfree_rcu(selem, rcu);
> > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > Although the common use case is usually storage_get() much more often
> > > > than storage_delete(), do you aware any performance impact for
> > > > the bpf prog that does a lot of storage_delete()?
> > >
> > > I have not really measured the impact on deletes, My understanding is
> > > that it should
> > > not impact the BPF program, but yes, if there are some critical
> > > sections that are prolonged
> > > due to a sleepable program "sleeping" too long, then it would pile up
> > > the callbacks.
> > >
> > > But this is not something new, as we have a similar thing in BPF
> > > trampolines. If this really
> > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > with this flag would be allowed in sleepable progs.
> > Agree that is similar to trampoline updates but not sure it is comparable
> > in terms of the frequency of elems being deleted here.  e.g. many
> > short lived tcp connections created by external traffic.
> >
> > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > existing sleepable bpf prog.
> >
> > I don't know enough on call_rcu_tasks_trace() here, so the
> > earlier question on perf/callback-pile-up implications in order to
> > decide if extra logic or knob is needed here or not.
> 
> I will defer to the others, maybe Alexei and Paul,

> we could also just
> add the flag to not affect existing performance characteristics?
I would see if it is really necessary first.  Other maps supported in
sleepable programs do not need a flag.  Adding one here for local
storage will be confusing, especially if it turns out to be unnecessary.

Could you run some tests first which can guide the decision?

> 
> >
> > >
> > > We could then wait for both critical sections only when this flag is
> > > set on the map.
> > >
> > > >
> > > > >
> > > > >       return free_local_storage;
> > > > >  }
> > > > > @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> > > > >       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > > >
> > > > >       if (free_local_storage)
> > > > > -             kfree_rcu(local_storage, rcu);
> > > > > +             call_rcu_tasks_trace(&local_storage->rcu,
> > > > > +                                  bpf_local_storage_free_rcu);
> > > > >  }
> > > > >
> > > > >  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> > > > >       struct bpf_local_storage_elem *selem;
> > > > >
> > > > >       /* Fast path (cache hit) */
> > > > > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > > > > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > > > > +                                       bpf_local_storage_rcu_lock_held());
> > > > There are other places using rcu_dereference() also.
> > > > e.g. in bpf_local_storage_update().
> > > > Should they be changed also?
> > >
> > > From what I saw, the other usage of rcu_derference is in a nested
> > > (w.r.t to the RCU section that in bpf_prog_enter/exit) RCU
> > > read side critical section/rcu_read_{lock, unlock} so it should not be required.
> > hmm... not sure what nested or not has to do here.
> > It is likely we are talking different things.
> >
> Yeah, we were looking at different things.
> 
> e.g. bpf_selem_unlink does not need to be changed as it is in
> a rcu_read_lock.
No.  It is not always under rcu_read_lock().  From the patch 2 test,
it should have a splat either from bpf_inode_storage_delete()
or bpf_sk_storage_delete(), depending on which one runs first.

> But you are right there is another in bpf_local_storage_update which I will fix.
> 
> > Did you turn on the lockdep rcu debugging in kconfig?
> 
> Yes I have PROVE_RCU and LOCKDEP
> 
> >
> > afaik, lockdep uses the second "check" argument to WARN on incorrect usage.
> > Even the right check is passed here, the later rcu_dereference() will still
> > complain because it only checks for rcu_read_lock_held().
> 
> 
> >
> > Also, after another look, why rcu_dereference_protected() is used
> > here instead of rcu_dereference_check()?  The spinlock is not acquired
> > here.  The same comment goes for similar rcu_dereference_protected() usages.
> 
> 
> Good catch, it should be rcu_dereference_check.
> 
> >
> > >
> > > If there are some that are not, then they need to be updated. Did I miss any?
> > >
> > > >
> > > > [ ... ]
> > > >
> > > > > --- a/net/core/bpf_sk_storage.c
> > > > > +++ b/net/core/bpf_sk_storage.c
> > > > > @@ -13,6 +13,7 @@
> > > > >  #include <net/sock.h>
> > > > >  #include <uapi/linux/sock_diag.h>
> > > > >  #include <uapi/linux/btf.h>
> > > > > +#include <linux/rcupdate_trace.h>
> > > > >
> > > > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > > > >
> > > > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > > > >       struct bpf_local_storage *sk_storage;
> > > > >       struct bpf_local_storage_map *smap;
> > > > >
> > > > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > > > +                                            bpf_local_storage_rcu_lock_held());
> > > > >       if (!sk_storage)
> > > > >               return NULL;
> > > > >
> > > > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > > > >  {
> > > > >       struct bpf_local_storage_data *sdata;
> > > > >
> > > > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > > > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > > sk is protected by rcu_read_lock here.
> > > > Is it always safe to access it with the rcu_read_lock_trace alone ?
> > >
> > > We don't dereference sk with an rcu_dereference though, is it still the case for
> > > tracing and LSM programs? Or is it somehow implicity protected even
> > > though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
> > e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
> > req_sk->sk which I don't think the verifier will optimize it out, so as good
> > as READ_ONCE(), iiuc.
> >
> > The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
> > a refcnt, right?  If that is the case, it should be good enough for now.
> 
> The one passed in the arguments yes, but if you notice the discussion in
> 
> https://lore.kernel.org/bpf/20210826133913.627361-1-memxor@gmail.com/T/#me254212a125516a6c5d2fbf349b97c199e66dce0
> 
> one may also get an sk in LSM and tracing progs by pointer walking.
Right.  There is the pointer-walking case.
e.g. "struct request_sock __rcu *fastopen_rsk" in tcp_sock.
I don't think it is possible for lsm to get a hold of a tcp_sock
but agree that other similar cases could happen.

Maybe for now, in sleepable programs, only allow safe sk ptrs
to be used in helpers that take an sk PTR_TO_BTF_ID argument.
e.g. sock->sk is safe in the test in patch 2.  The same should go for other
storages like inode.  This needs a verifier change.

In the very near future, it can move to Yonghong's (cc) btf tagging solution
to tag a particular member of a struct to make this verifier checking more
generic.
KP Singh Aug. 31, 2021, 7:38 p.m. UTC | #6
On Tue, Aug 31, 2021 at 8:22 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Aug 31, 2021 at 11:50:48AM +0200, KP Singh wrote:
> > On Tue, Aug 31, 2021 at 4:11 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Sun, Aug 29, 2021 at 11:52:03PM +0200, KP Singh wrote:
> > > [ ... ]
> > >
> > > > > > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > > > > > index b305270b7a4b..7760bc4e9565 100644
> > > > > > --- a/kernel/bpf/bpf_local_storage.c
> > > > > > +++ b/kernel/bpf/bpf_local_storage.c
> > > > > > @@ -11,6 +11,8 @@
> > > > > >  #include <net/sock.h>
> > > > > >  #include <uapi/linux/sock_diag.h>
> > > > > >  #include <uapi/linux/btf.h>
> > > > > > +#include <linux/rcupdate_trace.h>
> > > > > > +#include <linux/rcupdate_wait.h>
> > > > > >
> > > > > >  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> > > > > >
> > > > > > @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > > > > >       return NULL;
> > > > > >  }
> > > > > >
> > > > > > +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> > > > > > +{
> > > > > > +     struct bpf_local_storage *local_storage;
> > > > > > +
> > > > > > +     local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> > > > > > +     kfree_rcu(local_storage, rcu);
> > > > > > +}
> > > > > > +
> > > > > > +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> > > > > > +{
> > > > > > +     struct bpf_local_storage_elem *selem;
> > > > > > +
> > > > > > +     selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> > > > > > +     kfree_rcu(selem, rcu);
> > > > > > +}
> > > > > > +
> > > > > >  /* local_storage->lock must be held and selem->local_storage == local_storage.
> > > > > >   * The caller must ensure selem->smap is still valid to be
> > > > > >   * dereferenced for its smap->elem_size and smap->cache_idx.
> > > > > > @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > >                *
> > > > > >                * Although the unlock will be done under
> > > > > >                * rcu_read_lock(),  it is more intutivie to
> > > > > > -              * read if kfree_rcu(local_storage, rcu) is done
> > > > > > +              * read if the freeing of the storage is done
> > > > > >                * after the raw_spin_unlock_bh(&local_storage->lock).
> > > > > >                *
> > > > > >                * Hence, a "bool free_local_storage" is returned
> > > > > > -              * to the caller which then calls the kfree_rcu()
> > > > > > -              * after unlock.
> > > > > > +              * to the caller which then calls then frees the storage after
> > > > > > +              * all the RCU grace periods have expired.
> > > > > >                */
> > > > > >       }
> > > > > >       hlist_del_init_rcu(&selem->snode);
> > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > >           SDATA(selem))
> > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > >
> > > > > > -     kfree_rcu(selem, rcu);
> > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > Although the common use case is usually storage_get() much more often
> > > > > than storage_delete(), do you aware any performance impact for
> > > > > the bpf prog that does a lot of storage_delete()?
> > > >
> > > > I have not really measured the impact on deletes, My understanding is
> > > > that it should
> > > > not impact the BPF program, but yes, if there are some critical
> > > > sections that are prolonged
> > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > the callbacks.
> > > >
> > > > But this is not something new, as we have a similar thing in BPF
> > > > trampolines. If this really
> > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > with this flag would be allowed in sleepable progs.
> > > Agree that is similar to trampoline updates but not sure it is comparable
> > > in terms of the frequency of elems being deleted here.  e.g. many
> > > short lived tcp connections created by external traffic.
> > >
> > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > existing sleepable bpf prog.
> > >
> > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > earlier question on perf/callback-pile-up implications in order to
> > > decide if extra logic or knob is needed here or not.
> >
> > I will defer to the others, maybe Alexei and Paul,
>
> > we could also just
> > add the flag to not affect existing performance characteristics?
> I would see if it is really necessary first.  Other sleepable
> supported maps do not need a flag.  Adding one here for local
> storage will be confusing especially if it turns out to be
> unnecessary.
>
> Could you run some tests first which can guide the decision?

I think the performance impact would show up only in the worst case, which
needs some work to simulate. What do you think about:

A bprm_committed_creds program that processes a large argv
and also gets a storage on the inode.

A file_open program that tries to delete the local storage on the inode.

Trigger this in parallel, i.e. lots of processes that execute with a very
large argv and, in parallel, the executable being opened to trigger the
delete.

Do you have any other ideas? Is there something we could re-use from
the selftests?
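
A rough sketch of the two programs (hypothetical names, loosely modeled
on the existing local storage selftest; not an actual selftest):

	/* Hypothetical benchmark progs -- sketch only. */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	struct {
		__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} inode_map SEC(".maps");

	SEC("lsm.s/bprm_committed_creds")
	int BPF_PROG(bench_get, struct linux_binprm *bprm)
	{
		/* process the (large) argv here, then create a storage on the inode */
		bpf_inode_storage_get(&inode_map, bprm->file->f_inode, 0,
				      BPF_LOCAL_STORAGE_GET_F_CREATE);
		return 0;
	}

	SEC("lsm/file_open")
	int BPF_PROG(bench_del, struct file *file)
	{
		/* each open of the executable queues a call_rcu_tasks_trace() */
		bpf_inode_storage_delete(&inode_map, file->f_inode);
		return 0;
	}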

>
> >
> > >
> > > >
> > > > We could then wait for both critical sections only when this flag is
> > > > set on the map.
> > > >
> > > > >
> > > > > >
> > > > > >       return free_local_storage;
> > > > > >  }
> > > > > > @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> > > > > >       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > > > >
> > > > > >       if (free_local_storage)
> > > > > > -             kfree_rcu(local_storage, rcu);
> > > > > > +             call_rcu_tasks_trace(&local_storage->rcu,
> > > > > > +                                  bpf_local_storage_free_rcu);
> > > > > >  }
> > > > > >
> > > > > >  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> > > > > >       struct bpf_local_storage_elem *selem;
> > > > > >
> > > > > >       /* Fast path (cache hit) */
> > > > > > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > > > > > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > > > > > +                                       bpf_local_storage_rcu_lock_held());
> > > > > There are other places using rcu_dereference() also.
> > > > > e.g. in bpf_local_storage_update().
> > > > > Should they be changed also?
> > > >
> > > > From what I saw, the other usage of rcu_derference is in a nested
> > > > (w.r.t to the RCU section that in bpf_prog_enter/exit) RCU
> > > > read side critical section/rcu_read_{lock, unlock} so it should not be required.
> > > hmm... not sure what nested or not has to do here.
> > > It is likely we are talking different things.
> > >
> > Yeah, we were looking at different things.
> >
> > e.g. bpf_selem_unlink does not need to be changed as it is in
> > a rcu_read_lock.
> No.  It is not always under rcu_read_lock().  From the patch 2 test,
> it should have a splat either from bpf_inode_storage_delete()
> or bpf_sk_storage_delete(), depending on which one runs first.

I missed this one, but I wonder why it did not trigger a warning. The test does
exercise the delete and rcu_dereference should have warned me that I am not
holding an rcu_read_lock().


>
> > But you are right there is another in bpf_local_storage_update which I will fix.
> >
> > > Did you turn on the lockdep rcu debugging in kconfig?
> >
> > Yes I have PROVE_RCU and LOCKDEP
> >
> > >
> > > afaik, lockdep uses the second "check" argument to WARN on incorrect usage.
> > > Even the right check is passed here, the later rcu_dereference() will still
> > > complain because it only checks for rcu_read_lock_held().
> >
> >
> > >
> > > Also, after another look, why rcu_dereference_protected() is used
> > > here instead of rcu_dereference_check()?  The spinlock is not acquired
> > > here.  The same comment goes for similar rcu_dereference_protected() usages.
> >
> >
> > Good catch, it should be rcu_dereference_check.
> >
> > >
> > > >
> > > > If there are some that are not, then they need to be updated. Did I miss any?
> > > >
> > > > >
> > > > > [ ... ]
> > > > >
> > > > > > --- a/net/core/bpf_sk_storage.c
> > > > > > +++ b/net/core/bpf_sk_storage.c
> > > > > > @@ -13,6 +13,7 @@
> > > > > >  #include <net/sock.h>
> > > > > >  #include <uapi/linux/sock_diag.h>
> > > > > >  #include <uapi/linux/btf.h>
> > > > > > +#include <linux/rcupdate_trace.h>
> > > > > >
> > > > > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > > > > >
> > > > > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > > > > >       struct bpf_local_storage *sk_storage;
> > > > > >       struct bpf_local_storage_map *smap;
> > > > > >
> > > > > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > > > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > > > > +                                            bpf_local_storage_rcu_lock_held());
> > > > > >       if (!sk_storage)
> > > > > >               return NULL;
> > > > > >
> > > > > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > > > > >  {
> > > > > >       struct bpf_local_storage_data *sdata;
> > > > > >
> > > > > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > > > > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > > > sk is protected by rcu_read_lock here.
> > > > > Is it always safe to access it with the rcu_read_lock_trace alone ?
> > > >
> > > > We don't dereference sk with an rcu_dereference though, is it still the case for
> > > > tracing and LSM programs? Or is it somehow implicity protected even
> > > > though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
> > > e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
> > > req_sk->sk which I don't think the verifier will optimize it out, so as good
> > > as READ_ONCE(), iiuc.
> > >
> > > The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
> > > a refcnt, right?  If that is the case, it should be good enough for now.
> >
> > The one passed in the arguments yes, but if you notice the discussion in
> >
> > https://lore.kernel.org/bpf/20210826133913.627361-1-memxor@gmail.com/T/#me254212a125516a6c5d2fbf349b97c199e66dce0
> >
> > one may also get an sk in LSM and tracing progs by pointer walking.
> Right.  There is pointer walking case.
> e.g. "struct request_sock __rcu *fastopen_rsk" in tcp_sock.
> I don't think it is possible for lsm to get a hold on tcp_sock
> but agree that other similar cases could happen.
>
> May be for now, in sleepable program, only allow safe sk ptr
> to be used in helpers that take sk PTR_TO_BTF_ID argument.
> e.g. sock->sk is safe in the test in patch 2.  The same should go for other
> storages like inode.  This needs verifier change.
>

Sorry, I may be missing some context. Do you mean wait for Yonghong's work?

Or is there another way to update the verifier to recognize safe sk and inode
pointers?

> In the very near future, it can move to Yonghong's (cc) btf tagging solution
> to tag a particular member of a struct to make this verifier checking more
> generic.
Martin KaFai Lau Sept. 1, 2021, 6:32 a.m. UTC | #7
On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
[ ... ]

> > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > >           SDATA(selem))
> > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > >
> > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > Although the common use case is usually storage_get() much more often
> > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > the bpf prog that does a lot of storage_delete()?
> > > > >
> > > > > I have not really measured the impact on deletes, My understanding is
> > > > > that it should
> > > > > not impact the BPF program, but yes, if there are some critical
> > > > > sections that are prolonged
> > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > the callbacks.
> > > > >
> > > > > But this is not something new, as we have a similar thing in BPF
> > > > > trampolines. If this really
> > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > with this flag would be allowed in sleepable progs.
> > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > short lived tcp connections created by external traffic.
> > > >
> > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > existing sleepable bpf prog.
> > > >
> > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > earlier question on perf/callback-pile-up implications in order to
> > > > decide if extra logic or knob is needed here or not.
> > >
> > > I will defer to the others, maybe Alexei and Paul,
> >
> > > we could also just
> > > add the flag to not affect existing performance characteristics?
> > I would see if it is really necessary first.  Other sleepable
> > supported maps do not need a flag.  Adding one here for local
> > storage will be confusing especially if it turns out to be
> > unnecessary.
> >
> > Could you run some tests first which can guide the decision?
> 
> I think the performance impact would happen only in the worst case which
> needs some work to simulate. What do you think about:
> 
> A bprm_committed_creds program that processes a large argv
> and also gets a storage on the inode.
> 
> A file_open program that tries to delete the local storage on the inode.
> 
> Trigger this code in parallel. i.e. lots of programs that execute with a very
> large argv and then in parallel the executable being opened to trigger the
> delete.
> 
> Do you have any other ideas? Is there something we could re-use from
> the selftests?
There is a bench framework in tools/testing/selftests/bpf/benchs/
that has a parallel thread setup which could be useful.

Don't know how to simulate the "sleeping" too long case which
then piles up callbacks.  This is not bpf specific.
Paul, I wonder if you have similar test to trigger this to
compare between call_rcu_tasks_trace() and call_rcu()?

[ ... ]

> > > > > > > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> > > > > > >       struct bpf_local_storage_elem *selem;
> > > > > > >
> > > > > > >       /* Fast path (cache hit) */
> > > > > > > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > > > > > > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > > > > > > +                                       bpf_local_storage_rcu_lock_held());
> > > > > > There are other places using rcu_dereference() also.
> > > > > > e.g. in bpf_local_storage_update().
> > > > > > Should they be changed also?
> > > > >
> > > > > From what I saw, the other usage of rcu_derference is in a nested
> > > > > (w.r.t to the RCU section that in bpf_prog_enter/exit) RCU
> > > > > read side critical section/rcu_read_{lock, unlock} so it should not be required.
> > > > hmm... not sure what nested or not has to do here.
> > > > It is likely we are talking different things.
> > > >
> > > Yeah, we were looking at different things.
> > >
> > > e.g. bpf_selem_unlink does not need to be changed as it is in
> > > a rcu_read_lock.
> > No.  It is not always under rcu_read_lock().  From the patch 2 test,
> > it should have a splat either from bpf_inode_storage_delete()
> > or bpf_sk_storage_delete(), depending on which one runs first.
> 
> I missed this one, but I wonder why it did not trigger a warning. The test does
> exercise the delete and rcu_dereference should have warned me that I am not
> holding an rcu_read_lock();
hmm... not sure either.  maybe some kconfigs that disabled rcu_read_lock_held()?
I would also take a look at RCU_LOCKDEP_WARN().
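
(for reference, a rough sketch of how such a check could be spelled -- the
condition and message below are illustrative, not taken from the patch:)

    RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
                     !rcu_read_lock_trace_held(),
                     "bpf local storage access outside of RCU read section");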

I just quickly tried the patches to check:

[  143.376587] =============================
[  143.377068] WARNING: suspicious RCU usage
[  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
[  143.378378] -----------------------------
[  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
[  143.379914]
[  143.379914] other info that might help us debug this:
[  143.379914]
[  143.380838]
[  143.380838] rcu_scheduler_active = 2, debug_locks = 1
[  143.381602] 4 locks held by mv/1781:
[  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
[  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
[  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
[  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
[  143.386459]
[  143.386459] stack backtrace:
[  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
[  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
[  143.389146] Call Trace:
[  143.389446]  dump_stack_lvl+0x5b/0x82
[  143.389901]  dump_stack+0x10/0x12
[  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
[  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
[  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
[  143.392085]  bpf_selem_unlink+0x1b/0x30
[  143.392554]  bpf_inode_storage_delete+0x57/0xa0
[  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
[  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
[  143.394413]  bpf_lsm_inode_rename+0x5/0x10

> > > > > > > --- a/net/core/bpf_sk_storage.c
> > > > > > > +++ b/net/core/bpf_sk_storage.c
> > > > > > > @@ -13,6 +13,7 @@
> > > > > > >  #include <net/sock.h>
> > > > > > >  #include <uapi/linux/sock_diag.h>
> > > > > > >  #include <uapi/linux/btf.h>
> > > > > > > +#include <linux/rcupdate_trace.h>
> > > > > > >
> > > > > > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > > > > > >
> > > > > > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > > > > > >       struct bpf_local_storage *sk_storage;
> > > > > > >       struct bpf_local_storage_map *smap;
> > > > > > >
> > > > > > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > > > > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > > > > > +                                            bpf_local_storage_rcu_lock_held());
> > > > > > >       if (!sk_storage)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > > > > > >  {
> > > > > > >       struct bpf_local_storage_data *sdata;
> > > > > > >
> > > > > > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > > > > > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > > > > sk is protected by rcu_read_lock here.
> > > > > > Is it always safe to access it with the rcu_read_lock_trace alone ?
> > > > >
> > > > > We don't dereference sk with an rcu_dereference though, is it still the case for
> > > > > tracing and LSM programs? Or is it somehow implicity protected even
> > > > > though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
> > > > e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
> > > > req_sk->sk which I don't think the verifier will optimize it out, so as good
> > > > as READ_ONCE(), iiuc.
> > > >
> > > > The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
> > > > a refcnt, right?  If that is the case, it should be good enough for now.
> > >
> > > The one passed in the arguments yes, but if you notice the discussion in
> > >
> > > https://lore.kernel.org/bpf/20210826133913.627361-1-memxor@gmail.com/T/#me254212a125516a6c5d2fbf349b97c199e66dce0
> > >
> > > one may also get an sk in LSM and tracing progs by pointer walking.
> > Right.  There is pointer walking case.
> > e.g. "struct request_sock __rcu *fastopen_rsk" in tcp_sock.
> > I don't think it is possible for lsm to get a hold on tcp_sock
> > but agree that other similar cases could happen.
> >
> > May be for now, in sleepable program, only allow safe sk ptr
> > to be used in helpers that take sk PTR_TO_BTF_ID argument.
> > e.g. sock->sk is safe in the test in patch 2.  The same should go for other
> > storages like inode.  This needs verifier change.
> >
> 
> Sorry, I may be missing some context. Do you mean wait for Yonghong's work?
I don't think we have to wait.  Just saying Yonghong's work could fit
well in this use case in the future.

> Or is there another way to update the verifier to recognize safe sk and inode
> pointers?
I was thinking specifically for this pointer walking case.
Take a look at btf_struct_access().  It walks the struct
in the verifier and figures out reading sock->sk will get
a "struct sock *".  It marks the reg to PTR_TO_BTF_ID.
This will allow the bpf prog to directly read from sk (e.g. sk->sk_num)
or pass the sk to helper that takes a "struct sock *" pointer.
Reading from any sk pointer is fine since it is protected by BPF_PROBE_MEM
read.  However, we only allow the sk from sock->sk to be passed to the
helper here because we only know this one is refcnt-ed.

Take a look at check_ptr_to_btf_access().  An individual verifier_ops 
can also have its own btf_struct_access.  One possibility is
to introduce a (new) PTR_TO_RDONLY_BTF_ID to mean it can only
do BPF_PROBE_MEM read but cannot be used in helper.
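
For concreteness, the pointer-walking pattern being discussed looks roughly
like this on the bpf prog side (a sketch along the lines of the patch 2 test;
the map/prog names are made up here, it is not the actual selftest):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char _license[] SEC("license") = "GPL";

    struct {
            __uint(type, BPF_MAP_TYPE_SK_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, __u32);
    } sk_storage_map SEC(".maps");

    SEC("lsm.s/socket_post_create")
    int BPF_PROG(test_socket_post_create, struct socket *sock, int family,
                 int type, int protocol, int kern)
    {
            __u32 *value;

            /* sock->sk is walked via BTF; per the discussion above, only
             * this refcnt-ed sk would be allowed into the storage helper.
             */
            value = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
                                       BPF_SK_STORAGE_GET_F_CREATE);
            if (value)
                    *value = 0xdeadbeef;
            return 0;
    }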
Paul E. McKenney Sept. 1, 2021, 8:26 p.m. UTC | #8
On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
> [ ... ]
> 
> > > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > > >           SDATA(selem))
> > > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > > >
> > > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > > Although the common use case is usually storage_get() much more often
> > > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > > the bpf prog that does a lot of storage_delete()?
> > > > > >
> > > > > > I have not really measured the impact on deletes, My understanding is
> > > > > > that it should
> > > > > > not impact the BPF program, but yes, if there are some critical
> > > > > > sections that are prolonged
> > > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > > the callbacks.
> > > > > >
> > > > > > But this is not something new, as we have a similar thing in BPF
> > > > > > trampolines. If this really
> > > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > > with this flag would be allowed in sleepable progs.
> > > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > > short lived tcp connections created by external traffic.
> > > > >
> > > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > > existing sleepable bpf prog.
> > > > >
> > > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > > earlier question on perf/callback-pile-up implications in order to
> > > > > decide if extra logic or knob is needed here or not.
> > > >
> > > > I will defer to the others, maybe Alexei and Paul,
> > >
> > > > we could also just
> > > > add the flag to not affect existing performance characteristics?
> > > I would see if it is really necessary first.  Other sleepable
> > > supported maps do not need a flag.  Adding one here for local
> > > storage will be confusing especially if it turns out to be
> > > unnecessary.
> > >
> > > Could you run some tests first which can guide the decision?
> > 
> > I think the performance impact would happen only in the worst case which
> > needs some work to simulate. What do you think about:
> > 
> > A bprm_committed_creds program that processes a large argv
> > and also gets a storage on the inode.
> > 
> > A file_open program that tries to delete the local storage on the inode.
> > 
> > Trigger this code in parallel. i.e. lots of programs that execute with a very
> > large argv and then in parallel the executable being opened to trigger the
> > delete.
> > 
> > Do you have any other ideas? Is there something we could re-use from
> > the selftests?
> 
> There is a bench framework in tools/testing/selftests/bpf/benchs/
> that has a parallel thread setup which could be useful.
> 
> Don't know how to simulate the "sleeping" too long which
> then pile-up callbacks.  This is not bpf specific.
> Paul, I wonder if you have similar test to trigger this to
> compare between call_rcu_tasks_trace() and call_rcu()?

It is definitely the case that call_rcu() is way more scalable than
is call_rcu_tasks_trace().  Something about call_rcu_tasks_trace()
acquiring a global lock. ;-)

So actually testing it makes a lot of sense.

I do have an rcuscale module, but it is set up more for synchronous grace
periods such as synchronize_rcu() and synchronize_rcu_tasks_trace().  It
has the beginnings of support for call_rcu() and call_rcu_tasks_trace(),
but I would not yet trust them.

But I also have a test for global locking:

$ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make

This gives a median lock overhead of 960ns.  Running a single CPU rather
than 16 of them:

$ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make

This gives a median lock overhead of 4.1ns, which is way faster.
And the greater the number of CPUs, the greater the lock overhead.

On the other hand, if each call to call_rcu_tasks_trace() involves a
fork()/exec() pair, I would be rather surprised if that global lock was
your bottleneck.

Of course, if call_rcu_tasks_trace() does prove to be a bottleneck,
there are of course things that can be done.

> [ ... ]
> 
> > > > > > > > @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> > > > > > > >       struct bpf_local_storage_elem *selem;
> > > > > > > >
> > > > > > > >       /* Fast path (cache hit) */
> > > > > > > > -     sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> > > > > > > > +     sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> > > > > > > > +                                       bpf_local_storage_rcu_lock_held());
> > > > > > > There are other places using rcu_dereference() also.
> > > > > > > e.g. in bpf_local_storage_update().
> > > > > > > Should they be changed also?
> > > > > >
> > > > > > From what I saw, the other usage of rcu_derference is in a nested
> > > > > > (w.r.t to the RCU section that in bpf_prog_enter/exit) RCU
> > > > > > read side critical section/rcu_read_{lock, unlock} so it should not be required.
> > > > > hmm... not sure what nested or not has to do here.
> > > > > It is likely we are talking different things.
> > > > >
> > > > Yeah, we were looking at different things.
> > > >
> > > > e.g. bpf_selem_unlink does not need to be changed as it is in
> > > > a rcu_read_lock.
> > > No.  It is not always under rcu_read_lock().  From the patch 2 test,
> > > it should have a splat either from bpf_inode_storage_delete()
> > > or bpf_sk_storage_delete(), depending on which one runs first.
> > 
> > I missed this one, but I wonder why it did not trigger a warning. The test does
> > exercise the delete and rcu_dereference should have warned me that I am not
> > holding an rcu_read_lock();
> hmm... not sure either.  maybe some kconfigs that disabled rcu_read_lock_held()?
> I would also take a look at RCU_LOCKDEP_WARN().
> 
> I just quickly tried the patches to check:
> 
> [  143.376587] =============================
> [  143.377068] WARNING: suspicious RCU usage
> [  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
> [  143.378378] -----------------------------
> [  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
> [  143.379914]
> [  143.379914] other info that might help us debug this:
> [  143.379914]
> [  143.380838]
> [  143.380838] rcu_scheduler_active = 2, debug_locks = 1
> [  143.381602] 4 locks held by mv/1781:
> [  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
> [  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
> [  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
> [  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
> [  143.386459]
> [  143.386459] stack backtrace:
> [  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
> [  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> [  143.389146] Call Trace:
> [  143.389446]  dump_stack_lvl+0x5b/0x82
> [  143.389901]  dump_stack+0x10/0x12
> [  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
> [  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
> [  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
> [  143.392085]  bpf_selem_unlink+0x1b/0x30
> [  143.392554]  bpf_inode_storage_delete+0x57/0xa0
> [  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
> [  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
> [  143.394413]  bpf_lsm_inode_rename+0x5/0x10

I am not sure what line 114 is (it is a blank line in bpf-next), but
you might be missing a rcu_read_lock_trace_held() in the second argument
of rcu_dereference_check().
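
(A rough sketch of what that could look like, assuming the series' helper is
taught about both flavors -- illustrative only, not a final patch:)

    /* true in either a normal, _bh, or trace RCU read-side section */
    static bool bpf_local_storage_rcu_lock_held(void)
    {
            return rcu_read_lock_held() || rcu_read_lock_trace_held() ||
                   rcu_read_lock_bh_held();
    }

    ...
    sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
                                  bpf_local_storage_rcu_lock_held());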

							Thanx, Paul

> > > > > > > > --- a/net/core/bpf_sk_storage.c
> > > > > > > > +++ b/net/core/bpf_sk_storage.c
> > > > > > > > @@ -13,6 +13,7 @@
> > > > > > > >  #include <net/sock.h>
> > > > > > > >  #include <uapi/linux/sock_diag.h>
> > > > > > > >  #include <uapi/linux/btf.h>
> > > > > > > > +#include <linux/rcupdate_trace.h>
> > > > > > > >
> > > > > > > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > > > > > > >
> > > > > > > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > > > > > > >       struct bpf_local_storage *sk_storage;
> > > > > > > >       struct bpf_local_storage_map *smap;
> > > > > > > >
> > > > > > > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > > > > > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > > > > > > +                                            bpf_local_storage_rcu_lock_held());
> > > > > > > >       if (!sk_storage)
> > > > > > > >               return NULL;
> > > > > > > >
> > > > > > > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > > > > > > >  {
> > > > > > > >       struct bpf_local_storage_data *sdata;
> > > > > > > >
> > > > > > > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > > > > > > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > > > > > sk is protected by rcu_read_lock here.
> > > > > > > Is it always safe to access it with the rcu_read_lock_trace alone ?
> > > > > >
> > > > > > We don't dereference sk with an rcu_dereference though, is it still the case for
> > > > > > tracing and LSM programs? Or is it somehow implicity protected even
> > > > > > though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
> > > > > e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
> > > > > req_sk->sk which I don't think the verifier will optimize it out, so as good
> > > > > as READ_ONCE(), iiuc.
> > > > >
> > > > > The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
> > > > > a refcnt, right?  If that is the case, it should be good enough for now.
> > > >
> > > > The one passed in the arguments yes, but if you notice the discussion in
> > > >
> > > > https://lore.kernel.org/bpf/20210826133913.627361-1-memxor@gmail.com/T/#me254212a125516a6c5d2fbf349b97c199e66dce0
> > > >
> > > > one may also get an sk in LSM and tracing progs by pointer walking.
> > > Right.  There is pointer walking case.
> > > e.g. "struct request_sock __rcu *fastopen_rsk" in tcp_sock.
> > > I don't think it is possible for lsm to get a hold on tcp_sock
> > > but agree that other similar cases could happen.
> > >
> > > May be for now, in sleepable program, only allow safe sk ptr
> > > to be used in helpers that take sk PTR_TO_BTF_ID argument.
> > > e.g. sock->sk is safe in the test in patch 2.  The same should go for other
> > > storages like inode.  This needs verifier change.
> > >
> > 
> > Sorry, I may be missing some context. Do you mean wait for Yonghong's work?
> I don't think we have to wait.  Just saying Yonghong's work could fit
> well in this use case in the future.
> 
> > Or is there another way to update the verifier to recognize safe sk and inode
> > pointers?
> I was thinking specifically for this pointer walking case.
> Take a look at btf_struct_access().  It walks the struct
> in the verifier and figures out reading sock->sk will get
> a "struct sock *".  It marks the reg to PTR_TO_BTF_ID.
> This will allow the bpf prog to directly read from sk (e.g. sk->sk_num)
> or pass the sk to helper that takes a "struct sock *" pointer.
> Reading from any sk pointer is fine since it is protected by BPF_PROBE_MEM
> read.  However, we only allow the sk from sock->sk to be passed to the
> helper here because we only know this one is refcnt-ed.
> 
> Take a look at check_ptr_to_btf_access().  An individual verifier_ops 
> can also have its own btf_struct_access.  One possibility is
> to introduce a (new) PTR_TO_RDONLY_BTF_ID to mean it can only
> do BPF_PROBE_MEM read but cannot be used in helper.
Martin KaFai Lau Sept. 2, 2021, 4:44 a.m. UTC | #9
On Wed, Sep 01, 2021 at 01:26:05PM -0700, Paul E. McKenney wrote:
> On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
> > [ ... ]
> > 
> > > > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > > > >           SDATA(selem))
> > > > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > > > >
> > > > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > > > Although the common use case is usually storage_get() much more often
> > > > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > > > the bpf prog that does a lot of storage_delete()?
> > > > > > >
> > > > > > > I have not really measured the impact on deletes, My understanding is
> > > > > > > that it should
> > > > > > > not impact the BPF program, but yes, if there are some critical
> > > > > > > sections that are prolonged
> > > > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > > > the callbacks.
> > > > > > >
> > > > > > > But this is not something new, as we have a similar thing in BPF
> > > > > > > trampolines. If this really
> > > > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > > > with this flag would be allowed in sleepable progs.
> > > > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > > > short lived tcp connections created by external traffic.
> > > > > >
> > > > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > > > existing sleepable bpf prog.
> > > > > >
> > > > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > > > earlier question on perf/callback-pile-up implications in order to
> > > > > > decide if extra logic or knob is needed here or not.
> > > > >
> > > > > I will defer to the others, maybe Alexei and Paul,
> > > >
> > > > > we could also just
> > > > > add the flag to not affect existing performance characteristics?
> > > > I would see if it is really necessary first.  Other sleepable
> > > > supported maps do not need a flag.  Adding one here for local
> > > > storage will be confusing especially if it turns out to be
> > > > unnecessary.
> > > >
> > > > Could you run some tests first which can guide the decision?
> > > 
> > > I think the performance impact would happen only in the worst case which
> > > needs some work to simulate. What do you think about:
> > > 
> > > A bprm_committed_creds program that processes a large argv
> > > and also gets a storage on the inode.
> > > 
> > > A file_open program that tries to delete the local storage on the inode.
> > > 
> > > Trigger this code in parallel. i.e. lots of programs that execute with a very
> > > large argv and then in parallel the executable being opened to trigger the
> > > delete.
> > > 
> > > Do you have any other ideas? Is there something we could re-use from
> > > the selftests?
> > 
> > There is a bench framework in tools/testing/selftests/bpf/benchs/
> > that has a parallel thread setup which could be useful.
> > 
> > Don't know how to simulate the "sleeping" too long which
> > then pile-up callbacks.  This is not bpf specific.
> > Paul, I wonder if you have similar test to trigger this to
> > compare between call_rcu_tasks_trace() and call_rcu()?
> 
> It is definitely the case that call_rcu() is way more scalable than
> is call_rcu_tasks_trace().  Something about call_rcu_tasks_trace()
> acquiring a global lock. ;-)
> 
> So actually testing it makes a lot of sense.
> 
> I do have an rcuscale module, but it is set up more for synchronous grace
> periods such as synchronize_rcu() and synchronize_rcu_tasks_trace().  It
> has the beginnings of support for call_rcu() and call_rcu_tasks_trace(),
> but I would not yet trust them.
> 
> But I also have a test for global locking:
> 
> $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> 
> This gives a median lock overhead of 960ns.  Running a single CPU rather
> than 16 of them:
> 
> $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> 
> This gives a median lock overhead of 4.1ns, which is way faster.
> And the greater the number of CPUs, the greater the lock overhead.
Thanks for the explanation and numbers!

I think the global lock will be an issue for the current non-sleepable
netdev bpf-prog which could be triggered by external traffic,  so a flag
is needed here to provide a fast path.  I suspect other non-prealloc maps
may need it in the future, so probably
s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.

[ ... ]

> > [  143.376587] =============================
> > [  143.377068] WARNING: suspicious RCU usage
> > [  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
> > [  143.378378] -----------------------------
> > [  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
> > [  143.379914]
> > [  143.379914] other info that might help us debug this:
> > [  143.379914]
> > [  143.380838]
> > [  143.380838] rcu_scheduler_active = 2, debug_locks = 1
> > [  143.381602] 4 locks held by mv/1781:
> > [  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
> > [  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
> > [  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
> > [  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
> > [  143.386459]
> > [  143.386459] stack backtrace:
> > [  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
> > [  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> > [  143.389146] Call Trace:
> > [  143.389446]  dump_stack_lvl+0x5b/0x82
> > [  143.389901]  dump_stack+0x10/0x12
> > [  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
> > [  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
> > [  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
> > [  143.392085]  bpf_selem_unlink+0x1b/0x30
> > [  143.392554]  bpf_inode_storage_delete+0x57/0xa0
> > [  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
> > [  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
> > [  143.394413]  bpf_lsm_inode_rename+0x5/0x10
> 
> I am not sure what line 114 is (it is a blank line in bpf-next), but
> you might be missing a rcu_read_lock_trace_held() in the second argument
> of rcu_dereference_check().
Right, this path is only under rcu_read_lock_trace().
Martin KaFai Lau Sept. 30, 2021, 6:46 p.m. UTC | #10
On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > > > > > > > --- a/net/core/bpf_sk_storage.c
> > > > > > > > +++ b/net/core/bpf_sk_storage.c
> > > > > > > > @@ -13,6 +13,7 @@
> > > > > > > >  #include <net/sock.h>
> > > > > > > >  #include <uapi/linux/sock_diag.h>
> > > > > > > >  #include <uapi/linux/btf.h>
> > > > > > > > +#include <linux/rcupdate_trace.h>
> > > > > > > >
> > > > > > > >  DEFINE_BPF_STORAGE_CACHE(sk_cache);
> > > > > > > >
> > > > > > > > @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
> > > > > > > >       struct bpf_local_storage *sk_storage;
> > > > > > > >       struct bpf_local_storage_map *smap;
> > > > > > > >
> > > > > > > > -     sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > > > > > > > +     sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> > > > > > > > +                                            bpf_local_storage_rcu_lock_held());
> > > > > > > >       if (!sk_storage)
> > > > > > > >               return NULL;
> > > > > > > >
> > > > > > > > @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > > > > > > >  {
> > > > > > > >       struct bpf_local_storage_data *sdata;
> > > > > > > >
> > > > > > > > +     WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
> > > > > > > >       if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
> > > > > > > sk is protected by rcu_read_lock here.
> > > > > > > Is it always safe to access it with the rcu_read_lock_trace alone ?
> > > > > >
> > > > > > We don't dereference sk with an rcu_dereference though, is it still the case for
> > > > > > tracing and LSM programs? Or is it somehow implicity protected even
> > > > > > though we don't use rcu_dereference since that's just a READ_ONCE + some checks?
> > > > > e.g. the bpf_prog (currently run under rcu_read_lock()) may read the sk from
> > > > > req_sk->sk which I don't think the verifier will optimize it out, so as good
> > > > > as READ_ONCE(), iiuc.
> > > > >
> > > > > The sk here is obtained from the bpf_lsm_socket_* hooks?  Those sk should have
> > > > > a refcnt, right?  If that is the case, it should be good enough for now.
> > > >
> > > > The one passed in the arguments yes, but if you notice the discussion in
> > > >
> > > > https://lore.kernel.org/bpf/20210826133913.627361-1-memxor@gmail.com/T/#me254212a125516a6c5d2fbf349b97c199e66dce0
> > > >
> > > > one may also get an sk in LSM and tracing progs by pointer walking.
> > > Right.  There is pointer walking case.
> > > e.g. "struct request_sock __rcu *fastopen_rsk" in tcp_sock.
> > > I don't think it is possible for lsm to get a hold on tcp_sock
> > > but agree that other similar cases could happen.
> > >
> > > May be for now, in sleepable program, only allow safe sk ptr
> > > to be used in helpers that take sk PTR_TO_BTF_ID argument.
> > > e.g. sock->sk is safe in the test in patch 2.  The same should go for other
> > > storages like inode.  This needs verifier change.
> > >
> > 
> > Sorry, I may be missing some context. Do you mean wait for Yonghong's work?
> I don't think we have to wait.  Just saying Yonghong's work could fit
> well in this use case in the future.
> 
> > Or is there another way to update the verifier to recognize safe sk and inode
> > pointers?
> I was thinking specifically for this pointer walking case.
> Take a look at btf_struct_access().  It walks the struct
> in the verifier and figures out reading sock->sk will get
> a "struct sock *".  It marks the reg to PTR_TO_BTF_ID.
> This will allow the bpf prog to directly read from sk (e.g. sk->sk_num)
> or pass the sk to helper that takes a "struct sock *" pointer.
> Reading from any sk pointer is fine since it is protected by BPF_PROBE_MEM
> read.  However, we only allow the sk from sock->sk to be passed to the
> helper here because we only know this one is refcnt-ed.
> 
> Take a look at check_ptr_to_btf_access().  An individual verifier_ops 
> can also have its own btf_struct_access.  One possibility is
> to introduce a (new) PTR_TO_RDONLY_BTF_ID to mean it can only
> do BPF_PROBE_MEM read but cannot be used in helper.
KP,  not sure how far you are with the verifier change, if it is
moving well, please ignore.  Otherwise,
I looked at the sock a bit and I currently don't see a
potential concern with the pointer-walking case, even without the
rcu_read_lock, for those sock-related sleepable lsm hooks in bpf_lsm.c.
If cases did come up later (e.g. other lsm hooks are added or something got
overlooked), we could bring in a bigger hammer to make the above verifier
change.  I think it should be fine to stop some exotic usage
later that is proven to be not safe.  For the lsm sleepable hook case,
another option is to lock the sock first before calling the bpf prog.

If you agree a similar situation is also true for inode and task,
do you want to respin the patches addressing other points
discussed in this thread.
KP Singh Nov. 2, 2021, 4 p.m. UTC | #11
On Thu, Sep 30, 2021 at 8:47 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > > > > > > > > --- a/net/core/bpf_sk_storage.c
> > > > > > > > > +++ b/net/core/bpf_sk_storage.c
> > > > > > > > > @@ -13,6 +13,7 @@
> > > > > > > > >  #include <net/sock.h>
> > > > > > > > >  #include <uapi/linux/sock_diag.h>
> > > > > > > > >  #include <uapi/linux/btf.h>
> > > > > > > > > +#include <linux/rcupdate_trace.h>
> > > > > > > > >

[...]

> > > Or is there another way to update the verifier to recognize safe sk and inode
> > > pointers?
> > I was thinking specifically for this pointer walking case.
> > Take a look at btf_struct_access().  It walks the struct
> > in the verifier and figures out reading sock->sk will get
> > a "struct sock *".  It marks the reg to PTR_TO_BTF_ID.
> > This will allow the bpf prog to directly read from sk (e.g. sk->sk_num)
> > or pass the sk to helper that takes a "struct sock *" pointer.
> > Reading from any sk pointer is fine since it is protected by BPF_PROBE_MEM
> > read.  However, we only allow the sk from sock->sk to be passed to the
> > helper here because we only know this one is refcnt-ed.
> >
> > Take a look at check_ptr_to_btf_access().  An individual verifier_ops
> > can also have its own btf_struct_access.  One possibility is
> > to introduce a (new) PTR_TO_RDONLY_BTF_ID to mean it can only
> > do BPF_PROBE_MEM read but cannot be used in helper.
> KP,  not sure how far you are with the verifier change, if it is
> moving well, please ignore.  Otherwise,
> I looked at the sock a bit and I currently don't see
> potential concern on the following pointer case without the
> rcu_read_lock for those sock-related sleepable lsm hooks in bpf_lsm.c.
> If cases did come up later (e.g. other lsm hooks are added or something got
> overlooked), we could bring in a bigger hammer to make the above verifier
> change.  I think it should be fine to stop some exotic usage

+1 Makes sense.

> later that is proven to be not safe.  For the lsm sleepable hook case,
> another option is to lock the sock first before calling the bpf prog.
>
> If you agree a similar situation is also true for inode and task,
> do you want to respin the patches addressing other points
> discussed in this thread.

I will respin the series with the other changes discussed.
KP Singh Nov. 23, 2021, 5:11 p.m. UTC | #12
On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Wed, Sep 01, 2021 at 01:26:05PM -0700, Paul E. McKenney wrote:
> > On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > > On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
> > > [ ... ]
> > >
> > > > > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > > > > >           SDATA(selem))
> > > > > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > > > > >
> > > > > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > > > > Although the common use case is usually storage_get() much more often
> > > > > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > > > > the bpf prog that does a lot of storage_delete()?
> > > > > > > >
> > > > > > > > I have not really measured the impact on deletes, My understanding is
> > > > > > > > that it should
> > > > > > > > not impact the BPF program, but yes, if there are some critical
> > > > > > > > sections that are prolonged
> > > > > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > > > > the callbacks.
> > > > > > > >
> > > > > > > > But this is not something new, as we have a similar thing in BPF
> > > > > > > > trampolines. If this really
> > > > > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > > > > with this flag would be allowed in sleepable progs.
> > > > > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > > > > short lived tcp connections created by external traffic.
> > > > > > >
> > > > > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > > > > existing sleepable bpf prog.
> > > > > > >
> > > > > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > > > > earlier question on perf/callback-pile-up implications in order to
> > > > > > > decide if extra logic or knob is needed here or not.
> > > > > >
> > > > > > I will defer to the others, maybe Alexei and Paul,
> > > > >
> > > > > > we could also just
> > > > > > add the flag to not affect existing performance characteristics?
> > > > > I would see if it is really necessary first.  Other sleepable
> > > > > supported maps do not need a flag.  Adding one here for local
> > > > > storage will be confusing especially if it turns out to be
> > > > > unnecessary.
> > > > >
> > > > > Could you run some tests first which can guide the decision?
> > > >
> > > > I think the performance impact would happen only in the worst case which
> > > > needs some work to simulate. What do you think about:
> > > >
> > > > A bprm_committed_creds program that processes a large argv
> > > > and also gets a storage on the inode.
> > > >
> > > > A file_open program that tries to delete the local storage on the inode.
> > > >
> > > > Trigger this code in parallel. i.e. lots of programs that execute with a very
> > > > large argv and then in parallel the executable being opened to trigger the
> > > > delete.
> > > >
> > > > Do you have any other ideas? Is there something we could re-use from
> > > > the selftests?
> > >
> > > There is a bench framework in tools/testing/selftests/bpf/benchs/
> > > that has a parallel thread setup which could be useful.
> > >
> > > Don't know how to simulate the "sleeping" too long which
> > > then pile-up callbacks.  This is not bpf specific.
> > > Paul, I wonder if you have similar test to trigger this to
> > > compare between call_rcu_tasks_trace() and call_rcu()?
> >
> > It is definitely the case that call_rcu() is way more scalable than
> > is call_rcu_tasks_trace().  Something about call_rcu_tasks_trace()
> > acquiring a global lock. ;-)
> >
> > So actually testing it makes a lot of sense.
> >
> > I do have an rcuscale module, but it is set up more for synchronous grace
> > periods such as synchronize_rcu() and synchronize_rcu_tasks_trace().  It
> > has the beginnings of support for call_rcu() and call_rcu_tasks_trace(),
> > but I would not yet trust them.
> >
> > But I also have a test for global locking:
> >
> > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> >
> > This gives a median lock overhead of 960ns.  Running a single CPU rather
> > than 16 of them:
> >
> > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> >
> > This gives a median lock overhead of 4.1ns, which is way faster.
> > And the greater the number of CPUs, the greater the lock overhead.
> Thanks for the explanation and numbers!
>
> I think the global lock will be an issue for the current non-sleepable
> netdev bpf-prog which could be triggered by external traffic,  so a flag
> is needed here to provide a fast path.  I suspect other non-prealloc map
> may need it in the future, so probably
> s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.

I was re-working the patches and had a couple of questions.

There are two data structures that get freed under RCU here:

struct bpf_local_storage
struct bpf_local_storage_selem

We can choose to free the bpf_local_storage_selem under call_rcu_tasks_trace
based on whether the map it belongs to is sleepable, with something like:

if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
    call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
else
    kfree_rcu(selem, rcu);

Questions:

* Can we free bpf_local_storage under kfree_rcu by ensuring it's always
  accessed in a classical RCU critical section? Or maybe I am missing
  something and this also needs to be freed under trace RCU if any of the
  selems are from a sleepable map.

* There is an issue with nested raw spinlocks, e.g. in
  bpf_inode_storage.c:bpf_inode_storage_free:

    hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
            /* Always unlink from map before unlinking from
             * local_storage.
             */
            bpf_selem_unlink_map(selem);
            free_inode_storage = bpf_selem_unlink_storage_nolock(
                    local_storage, selem, false);
    }
    raw_spin_unlock_bh(&local_storage->lock);

In bpf_selem_unlink_storage_nolock (if we add the above logic, with the
flag check in place of the kfree_rcu), call_rcu_tasks_trace grabs a
(non-raw) spinlock, and that cannot be nested inside a raw spinlock.

I am moving the freeing code out of the spinlock: save the selems on a
local list, and then queue the RCU (trace or normal) free callbacks once
the lock is dropped. WDYT?
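
(Roughly what I have in mind -- a sketch only; free_node would be a new,
hypothetical field in bpf_local_storage_elem, BPF_F_SLEEPABLE is the proposed
flag, and bpf_selem_unlink_storage_nolock() is assumed to no longer free the
selem itself:)

    HLIST_HEAD(free_list);

    raw_spin_lock_bh(&local_storage->lock);
    hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
            /* Always unlink from map before unlinking from
             * local_storage.
             */
            bpf_selem_unlink_map(selem);
            free_inode_storage = bpf_selem_unlink_storage_nolock(
                    local_storage, selem, false);
            /* defer the RCU free until the raw spinlock is dropped */
            hlist_add_head(&selem->free_node, &free_list);
    }
    raw_spin_unlock_bh(&local_storage->lock);

    hlist_for_each_entry_safe(selem, n, &free_list, free_node) {
            if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE)
                    call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
            else
                    kfree_rcu(selem, rcu);
    }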



- KP

>
> [ ... ]
>
> > > [  143.376587] =============================
> > > [  143.377068] WARNING: suspicious RCU usage
> > > [  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
> > > [  143.378378] -----------------------------
> > > [  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
> > > [  143.379914]
> > > [  143.379914] other info that might help us debug this:
> > > [  143.379914]
> > > [  143.380838]
> > > [  143.380838] rcu_scheduler_active = 2, debug_locks = 1
> > > [  143.381602] 4 locks held by mv/1781:
> > > [  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
> > > [  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
> > > [  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
> > > [  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
> > > [  143.386459]
> > > [  143.386459] stack backtrace:
> > > [  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
> > > [  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> > > [  143.389146] Call Trace:
> > > [  143.389446]  dump_stack_lvl+0x5b/0x82
> > > [  143.389901]  dump_stack+0x10/0x12
> > > [  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
> > > [  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
> > > [  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
> > > [  143.392085]  bpf_selem_unlink+0x1b/0x30
> > > [  143.392554]  bpf_inode_storage_delete+0x57/0xa0
> > > [  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
> > > [  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
> > > [  143.394413]  bpf_lsm_inode_rename+0x5/0x10
> >
> > I am not sure what line 114 is (it is a blank line in bpf-next), but
> > you might be missing a rcu_read_lock_trace_held() in the second argument
> > of rcu_dereference_check().
> Right, this path is only under rcu_read_lock_trace().
Paul E. McKenney Nov. 23, 2021, 6:22 p.m. UTC | #13
On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Wed, Sep 01, 2021 at 01:26:05PM -0700, Paul E. McKenney wrote:
> > > On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > > > On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
> > > > [ ... ]
> > > >
> > > > > > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > > > > > >           SDATA(selem))
> > > > > > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > > > > > >
> > > > > > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > > > > > Although the common use case is usually storage_get() much more often
> > > > > > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > > > > > the bpf prog that does a lot of storage_delete()?
> > > > > > > > >
> > > > > > > > > I have not really measured the impact on deletes, My understanding is
> > > > > > > > > that it should
> > > > > > > > > not impact the BPF program, but yes, if there are some critical
> > > > > > > > > sections that are prolonged
> > > > > > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > > > > > the callbacks.
> > > > > > > > >
> > > > > > > > > But this is not something new, as we have a similar thing in BPF
> > > > > > > > > trampolines. If this really
> > > > > > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > > > > > with this flag would be allowed in sleepable progs.
> > > > > > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > > > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > > > > > short lived tcp connections created by external traffic.
> > > > > > > >
> > > > > > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > > > > > existing sleepable bpf prog.
> > > > > > > >
> > > > > > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > > > > > earlier question on perf/callback-pile-up implications in order to
> > > > > > > > decide if extra logic or knob is needed here or not.
> > > > > > >
> > > > > > > I will defer to the others, maybe Alexei and Paul,
> > > > > >
> > > > > > > we could also just
> > > > > > > add the flag to not affect existing performance characteristics?
> > > > > > I would see if it is really necessary first.  Other sleepable
> > > > > > supported maps do not need a flag.  Adding one here for local
> > > > > > storage will be confusing especially if it turns out to be
> > > > > > unnecessary.
> > > > > >
> > > > > > Could you run some tests first which can guide the decision?
> > > > >
> > > > > I think the performance impact would happen only in the worst case which
> > > > > needs some work to simulate. What do you think about:
> > > > >
> > > > > A bprm_committed_creds program that processes a large argv
> > > > > and also gets a storage on the inode.
> > > > >
> > > > > A file_open program that tries to delete the local storage on the inode.
> > > > >
> > > > > Trigger this code in parallel. i.e. lots of programs that execute with a very
> > > > > large argv and then in parallel the executable being opened to trigger the
> > > > > delete.
> > > > >
> > > > > Do you have any other ideas? Is there something we could re-use from
> > > > > the selftests?
> > > >
> > > > There is a bench framework in tools/testing/selftests/bpf/benchs/
> > > > that has a parallel thread setup which could be useful.
> > > >
> > > > Don't know how to simulate the "sleeping" too long which
> > > > then pile-up callbacks.  This is not bpf specific.
> > > > Paul, I wonder if you have similar test to trigger this to
> > > > compare between call_rcu_tasks_trace() and call_rcu()?
> > >
> > > It is definitely the case that call_rcu() is way more scalable than
> > > is call_rcu_tasks_trace().  Something about call_rcu_tasks_trace()
> > > acquiring a global lock. ;-)
> > >
> > > So actually testing it makes a lot of sense.
> > >
> > > I do have an rcuscale module, but it is set up more for synchronous grace
> > > periods such as synchronize_rcu() and synchronize_rcu_tasks_trace().  It
> > > has the beginnings of support for call_rcu() and call_rcu_tasks_trace(),
> > > but I would not yet trust them.
> > >
> > > But I also have a test for global locking:
> > >
> > > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> > >
> > > This gives a median lock overhead of 960ns.  Running a single CPU rather
> > > than 16 of them:
> > >
> > > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> > >
> > > This gives a median lock overhead of 4.1ns, which is way faster.
> > > And the greater the number of CPUs, the greater the lock overhead.
> > Thanks for the explanation and numbers!
> >
> > I think the global lock will be an issue for the current non-sleepable
> > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > is needed here to provide a fast path.  I suspect other non-prealloc map
> > may need it in the future, so probably
> > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> 
> I was re-working the patches and had a couple of questions.
> 
> There are two data structures that get freed under RCU here:
> 
> struct bpf_local_storage
> struct bpf_local_storage_selem
> 
> We can choose to free the bpf_local_storage_selem under
> call_rcu_tasks_trace based on
> whether the map it belongs to is sleepable with something like:
> 
> if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
>     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> else
>     kfree_rcu(selem, rcu);
> 
> Questions:
> 
> * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> always accessed in a
>   classical RCU critical section? Or maybe I am missing something and
> this also needs to be freed
>   under trace RCU if any of the selems are from a sleepable map.
> 
> * There is an issue with nested raw spinlocks, e.g. in
> bpf_inode_storage.c:bpf_inode_storage_free
> 
>   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
>   /* Always unlink from map before unlinking from
>   * local_storage.
>   */
>   bpf_selem_unlink_map(selem);
>   free_inode_storage = bpf_selem_unlink_storage_nolock(
>                  local_storage, selem, false);
>   }
>   raw_spin_unlock_bh(&local_storage->lock);
> 
> in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> flag in place of kfree_rcu)
> call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> raw spin lock.
> 
> I am moving the freeing code out of the spinlock, saving the selems on
> a local list and then
> doing the free RCU (trace or normal) callbacks at the end. WDYT?

Depending on the urgency, another approach is to rely on my ongoing work
removing the call_rcu_tasks_trace() bottleneck.  This commit on branch
"dev" in the -rcu tree allows boot-time setting of per-CPU callback
queues for call_rcu_tasks_trace(), along with the other RCU-tasks flavors:

0b886cc4b10f ("rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing")

Preceding commits actually set up the queues.  With these commits, you
could boot with rcupdate.rcu_task_enqueue_lim=N, where N greater than
or equal to the number of CPUs on your system, to get per-CPU queuing.
These commits probably still have a bug or three, but on the other hand,
they have survived a couple of weeks worth of rcutorture runs.
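
For example, on a 16-CPU system, booting with something like the following
(illustrative only) gets you the per-CPU queueing for call_rcu_tasks_trace():

    rcupdate.rcu_task_enqueue_lim=16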

This week's work will allow automatic transition between single-queue
and per-CPU-queue operation based on lock contention and the number of
callbacks queued.

My current plan is to get this into the next merge window (v5.17).

Thoughts?

							Thanx, Paul

> - KP
> 
> >
> > [ ... ]
> >
> > > > [  143.376587] =============================
> > > > [  143.377068] WARNING: suspicious RCU usage
> > > > [  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
> > > > [  143.378378] -----------------------------
> > > > [  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
> > > > [  143.379914]
> > > > [  143.379914] other info that might help us debug this:
> > > > [  143.379914]
> > > > [  143.380838]
> > > > [  143.380838] rcu_scheduler_active = 2, debug_locks = 1
> > > > [  143.381602] 4 locks held by mv/1781:
> > > > [  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
> > > > [  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
> > > > [  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
> > > > [  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
> > > > [  143.386459]
> > > > [  143.386459] stack backtrace:
> > > > [  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
> > > > [  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> > > > [  143.389146] Call Trace:
> > > > [  143.389446]  dump_stack_lvl+0x5b/0x82
> > > > [  143.389901]  dump_stack+0x10/0x12
> > > > [  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
> > > > [  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
> > > > [  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
> > > > [  143.392085]  bpf_selem_unlink+0x1b/0x30
> > > > [  143.392554]  bpf_inode_storage_delete+0x57/0xa0
> > > > [  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
> > > > [  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
> > > > [  143.394413]  bpf_lsm_inode_rename+0x5/0x10
> > >
> > > I am not sure what line 114 is (it is a blank line in bpf-next), but
> > > you might be missing a rcu_read_lock_trace_held() in the second argument
> > > of rcu_dereference_check().
> > Right, this path is only under rcu_read_lock_trace().
Martin KaFai Lau Nov. 23, 2021, 10:29 p.m. UTC | #14
On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > I think the global lock will be an issue for the current non-sleepable
> > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > may need it in the future, so probably
> > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > 
> > I was re-working the patches and had a couple of questions.
> > 
> > There are two data structures that get freed under RCU here:
> > 
> > struct bpf_local_storage
> > struct bpf_local_storage_selem
> > 
> > We can choose to free the bpf_local_storage_selem under
> > call_rcu_tasks_trace based on
> > whether the map it belongs to is sleepable with something like:
> > 
> > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
Paul's current work (mentioned in his previous email) will improve the
performance of call_rcu_tasks_trace(), so it can probably avoid the
new BPF_F_SLEEPABLE flag and make it easier to use.

> >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > else
> >     kfree_rcu(selem, rcu);
> > 
> > Questions:
> > 
> > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> >   always accessed in a  classical RCU critical section?
>>    Or maybe I am missing something and this also needs to be freed
> >   under trace RCU if any of the selems are from a sleepable map.
In the inode_storage_lookup() of this patch:

+#define bpf_local_storage_rcu_lock_held()                      \
+       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
+        rcu_read_lock_bh_held())

@@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
	if (!bsb)
		return NULL;

-	inode_storage = rcu_dereference(bsb->storage);
+	inode_storage = rcu_dereference_protected(bsb->storage,
+						  bpf_local_storage_rcu_lock_held());

Thus, it is not always in a classical RCU critical section.

> > 
> > * There is an issue with nested raw spinlocks, e.g. in
> > bpf_inode_storage.c:bpf_inode_storage_free
> > 
> >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> >   /* Always unlink from map before unlinking from
> >   * local_storage.
> >   */
> >   bpf_selem_unlink_map(selem);
> >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> >                  local_storage, selem, false);
> >   }
> >   raw_spin_unlock_bh(&local_storage->lock);
> > 
> > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > flag in place of kfree_rcu)
> > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > raw spin lock.
> > 
> > I am moving the freeing code out of the spinlock, saving the selems on
> > a local list and then doing the free RCU (trace or normal) callbacks
> > at the end. WDYT?
There could be more than one selem to save.

I think the splat is from CONFIG_PROVE_RAW_LOCK_NESTING=y.

I just happened to bump into Paul briefly offline; his work can probably
also avoid the spin_lock in call_rcu_tasks_trace().

I would ignore this splat for now; it should go away once this is
merged with Paul's work in the 5.17 merge cycle.
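
With that in place, the selem free path in this patch should be able to
stay as simple as it already is, e.g. a rough sketch of the expected end
state:

	/* no BPF_F_SLEEPABLE flag and no local free list should be needed */
	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);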

> Depending on the urgency, another approach is to rely on my ongoing work
> removing the call_rcu_tasks_trace() bottleneck.  This commit on branch
> "dev" in the -rcu tree allows boot-time setting of per-CPU callback
> queues for call_rcu_tasks_trace(), along with the other RCU-tasks flavors:
> 
> 0b886cc4b10f ("rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing")
> 
> Preceding commits actually set up the queues.  With these commits, you
> could boot with rcupdate.rcu_task_enqueue_lim=N, where N greater than
> or equal to the number of CPUs on your system, to get per-CPU queuing.
> These commits probably still have a bug or three, but on the other hand,
> they have survived a couple of weeks worth of rcutorture runs.
> 
> This week's work will allow automatic transition between single-queue
> and per-CPU-queue operation based on lock contention and the number of
> callbacks queued.
> 
> My current plan is to get this into the next merge window (v5.17).
That would be great.
KP Singh Nov. 23, 2021, 11:11 p.m. UTC | #15
On Tue, Nov 23, 2021 at 7:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Wed, Sep 01, 2021 at 01:26:05PM -0700, Paul E. McKenney wrote:
> > > > On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > > > > On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
> > > > > [ ... ]
> > > > >
> > > > > > > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > > > > > > >           SDATA(selem))
> > > > > > > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > > > > > > >
> > > > > > > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > > > > > > Although the common use case is usually storage_get() much more often
> > > > > > > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > > > > > > the bpf prog that does a lot of storage_delete()?
> > > > > > > > > >
> > > > > > > > > > I have not really measured the impact on deletes, My understanding is
> > > > > > > > > > that it should
> > > > > > > > > > not impact the BPF program, but yes, if there are some critical
> > > > > > > > > > sections that are prolonged
> > > > > > > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > > > > > > the callbacks.
> > > > > > > > > >
> > > > > > > > > > But this is not something new, as we have a similar thing in BPF
> > > > > > > > > > trampolines. If this really
> > > > > > > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > > > > > > with this flag would be allowed in sleepable progs.
> > > > > > > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > > > > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > > > > > > short lived tcp connections created by external traffic.
> > > > > > > > >
> > > > > > > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > > > > > > existing sleepable bpf prog.
> > > > > > > > >
> > > > > > > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > > > > > > earlier question on perf/callback-pile-up implications in order to
> > > > > > > > > decide if extra logic or knob is needed here or not.
> > > > > > > >
> > > > > > > > I will defer to the others, maybe Alexei and Paul,
> > > > > > >
> > > > > > > > we could also just
> > > > > > > > add the flag to not affect existing performance characteristics?
> > > > > > > I would see if it is really necessary first.  Other sleepable
> > > > > > > supported maps do not need a flag.  Adding one here for local
> > > > > > > storage will be confusing especially if it turns out to be
> > > > > > > unnecessary.
> > > > > > >
> > > > > > > Could you run some tests first which can guide the decision?
> > > > > >
> > > > > > I think the performance impact would happen only in the worst case which
> > > > > > needs some work to simulate. What do you think about:
> > > > > >
> > > > > > A bprm_committed_creds program that processes a large argv
> > > > > > and also gets a storage on the inode.
> > > > > >
> > > > > > A file_open program that tries to delete the local storage on the inode.
> > > > > >
> > > > > > Trigger this code in parallel. i.e. lots of programs that execute with a very
> > > > > > large argv and then in parallel the executable being opened to trigger the
> > > > > > delete.
> > > > > >
> > > > > > Do you have any other ideas? Is there something we could re-use from
> > > > > > the selftests?
> > > > >
> > > > > There is a bench framework in tools/testing/selftests/bpf/benchs/
> > > > > that has a parallel thread setup which could be useful.
> > > > >
> > > > > Don't know how to simulate the "sleeping" too long which
> > > > > then pile-up callbacks.  This is not bpf specific.
> > > > > Paul, I wonder if you have similar test to trigger this to
> > > > > compare between call_rcu_tasks_trace() and call_rcu()?
> > > >
> > > > It is definitely the case that call_rcu() is way more scalable than
> > > > is call_rcu_tasks_trace().  Something about call_rcu_tasks_trace()
> > > > acquiring a global lock. ;-)
> > > >
> > > > So actually testing it makes a lot of sense.
> > > >
> > > > I do have an rcuscale module, but it is set up more for synchronous grace
> > > > periods such as synchronize_rcu() and synchronize_rcu_tasks_trace().  It
> > > > has the beginnings of support for call_rcu() and call_rcu_tasks_trace(),
> > > > but I would not yet trust them.
> > > >
> > > > But I also have a test for global locking:
> > > >
> > > > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> > > >
> > > > This gives a median lock overhead of 960ns.  Running a single CPU rather
> > > > than 16 of them:
> > > >
> > > > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> > > >
> > > > This gives a median lock overhead of 4.1ns, which is way faster.
> > > > And the greater the number of CPUs, the greater the lock overhead.
> > > Thanks for the explanation and numbers!
> > >
> > > I think the global lock will be an issue for the current non-sleepable
> > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > may need it in the future, so probably
> > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> >
> > I was re-working the patches and had a couple of questions.
> >
> > There are two data structures that get freed under RCU here:
> >
> > struct bpf_local_storage
> > struct bpf_local_storage_selem
> >
> > We can choose to free the bpf_local_storage_selem under
> > call_rcu_tasks_trace based on
> > whether the map it belongs to is sleepable with something like:
> >
> > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > else
> >     kfree_rcu(selem, rcu);
> >
> > Questions:
> >
> > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > always accessed in a
> >   classical RCU critical section? Or maybe I am missing something and
> > this also needs to be freed
> >   under trace RCU if any of the selems are from a sleepable map.
> >
> > * There is an issue with nested raw spinlocks, e.g. in
> > bpf_inode_storage.c:bpf_inode_storage_free
> >
> >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> >   /* Always unlink from map before unlinking from
> >   * local_storage.
> >   */
> >   bpf_selem_unlink_map(selem);
> >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> >                  local_storage, selem, false);
> >   }
> >   raw_spin_unlock_bh(&local_storage->lock);
> >
> > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > flag in place of kfree_rcu)
> > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > raw spin lock.
> >
> > I am moving the freeing code out of the spinlock, saving the selems on
> > a local list and then
> > doing the free RCU (trace or normal) callbacks at the end. WDYT?
>
> Depending on the urgency, another approach is to rely on my ongoing work

It's best to wait for your patches to land and keep this code simple.

> removing the call_rcu_tasks_trace() bottleneck.  This commit on branch
> "dev" in the -rcu tree allows boot-time setting of per-CPU callback
> queues for call_rcu_tasks_trace(), along with the other RCU-tasks flavors:
>
> 0b886cc4b10f ("rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing")
>
> Preceding commits actually set up the queues.  With these commits, you
> could boot with rcupdate.rcu_task_enqueue_lim=N, where N greater than
> or equal to the number of CPUs on your system, to get per-CPU queuing.
> These commits probably still have a bug or three, but on the other hand,
> they have survived a couple of weeks worth of rcutorture runs.

Thank you so much, this would make my life a lot easier :)

- KP

>
> This week's work will allow automatic transition between single-queue
> and per-CPU-queue operation based on lock contention and the number of
> callbacks queued.
>
> My current plan is to get this into the next merge window (v5.17).
>
> Thoughts?
>
>                                                         Thanx, Paul
>
> > - KP
> >
> > >
> > > [ ... ]
> > >
> > > > > [  143.376587] =============================
> > > > > [  143.377068] WARNING: suspicious RCU usage
> > > > > [  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
> > > > > [  143.378378] -----------------------------
> > > > > [  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
> > > > > [  143.379914]
> > > > > [  143.379914] other info that might help us debug this:
> > > > > [  143.379914]
> > > > > [  143.380838]
> > > > > [  143.380838] rcu_scheduler_active = 2, debug_locks = 1
> > > > > [  143.381602] 4 locks held by mv/1781:
> > > > > [  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
> > > > > [  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
> > > > > [  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
> > > > > [  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
> > > > > [  143.386459]
> > > > > [  143.386459] stack backtrace:
> > > > > [  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
> > > > > [  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> > > > > [  143.389146] Call Trace:
> > > > > [  143.389446]  dump_stack_lvl+0x5b/0x82
> > > > > [  143.389901]  dump_stack+0x10/0x12
> > > > > [  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
> > > > > [  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
> > > > > [  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
> > > > > [  143.392085]  bpf_selem_unlink+0x1b/0x30
> > > > > [  143.392554]  bpf_inode_storage_delete+0x57/0xa0
> > > > > [  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
> > > > > [  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
> > > > > [  143.394413]  bpf_lsm_inode_rename+0x5/0x10
> > > >
> > > > I am not sure what line 114 is (it is a blank line in bpf-next), but
> > > > you might be missing a rcu_read_lock_trace_held() in the second argument
> > > > of rcu_dereference_check().
> > > Right, this path is only under rcu_read_lock_trace().
KP Singh Nov. 23, 2021, 11:14 p.m. UTC | #16
On Tue, Nov 23, 2021 at 11:30 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > I think the global lock will be an issue for the current non-sleepable
> > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > may need it in the future, so probably
> > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > >
> > > I was re-working the patches and had a couple of questions.
> > >
> > > There are two data structures that get freed under RCU here:
> > >
> > > struct bpf_local_storage
> > > struct bpf_local_storage_selem
> > >
> > > We can choose to free the bpf_local_storage_selem under
> > > call_rcu_tasks_trace based on
> > > whether the map it belongs to is sleepable with something like:
> > >
> > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> Paul's current work (mentioned by his previous email) will improve the
> performance of call_rcu_tasks_trace, so it probably can avoid the
> new BPF_F_SLEEPABLE flag and make it easier to use.
>
> > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > else
> > >     kfree_rcu(selem, rcu);
> > >
> > > Questions:
> > >
> > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > >   always accessed in a  classical RCU critical section?
> >>    Or maybe I am missing something and this also needs to be freed
> > >   under trace RCU if any of the selems are from a sleepable map.
> In the inode_storage_lookup() of this patch:
>
> +#define bpf_local_storage_rcu_lock_held()                      \
> +       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> +        rcu_read_lock_bh_held())
>
> @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
>         if (!bsb)
>                 return NULL;
>
> -       inode_storage = rcu_dereference(bsb->storage);
> +       inode_storage = rcu_dereference_protected(bsb->storage,
> +                                                 bpf_local_storage_rcu_lock_held());
>
> Thus, it is not always in classical RCU critical.

I was planning on adding a classical RCU read-side critical section
whenever we called the lookup functions.
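
Something along these lines, a rough sketch only (reusing
inode_storage_lookup() from this patch; do_something_with() is just a
placeholder for whatever the caller does with the returned storage):

	struct bpf_local_storage_data *sdata;

	rcu_read_lock();
	sdata = inode_storage_lookup(inode, map, true);
	if (sdata)
		do_something_with(sdata); /* placeholder: only valid under the lock */
	rcu_read_unlock();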

Would that have worked? (for the sake of learning).

>
> > >
> > > * There is an issue with nested raw spinlocks, e.g. in
> > > bpf_inode_storage.c:bpf_inode_storage_free
> > >
> > >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > >   /* Always unlink from map before unlinking from
> > >   * local_storage.
> > >   */
> > >   bpf_selem_unlink_map(selem);
> > >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> > >                  local_storage, selem, false);
> > >   }
> > >   raw_spin_unlock_bh(&local_storage->lock);
> > >
> > > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > > flag in place of kfree_rcu)
> > > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > > raw spin lock.
> > >
> > > I am moving the freeing code out of the spinlock, saving the selems on
> > > a local list and then doing the free RCU (trace or normal) callbacks
> > > at the end. WDYT?
> There could be more than one selem to save.
>
> I think the splat is from CONFIG_PROVE_RAW_LOCK_NESTING=y.
>
> Just happened to bump into Paul briefly offline, his work probably can
> also avoid the spin_lock in call_rcu_tasks_trace().
>
> I would ignore this splat for now which should go away when it is
> merged with Paul's work in the 5.17 merge cycle.

Agreed.

>
> > Depending on the urgency, another approach is to rely on my ongoing work
> > removing the call_rcu_tasks_trace() bottleneck.  This commit on branch
> > "dev" in the -rcu tree allows boot-time setting of per-CPU callback
> > queues for call_rcu_tasks_trace(), along with the other RCU-tasks flavors:
> >
> > 0b886cc4b10f ("rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing")
> >
> > Preceding commits actually set up the queues.  With these commits, you
> > could boot with rcupdate.rcu_task_enqueue_lim=N, where N greater than
> > or equal to the number of CPUs on your system, to get per-CPU queuing.
> > These commits probably still have a bug or three, but on the other hand,
> > they have survived a couple of weeks worth of rcutorture runs.
> >
> > This week's work will allow automatic transition between single-queue
> > and per-CPU-queue operation based on lock contention and the number of
> > callbacks queued.
> >
> > My current plan is to get this into the next merge window (v5.17).
> That would be great.

+1 :)
Martin KaFai Lau Nov. 24, 2021, 12:18 a.m. UTC | #17
On Wed, Nov 24, 2021 at 12:14:29AM +0100, KP Singh wrote:
> On Tue, Nov 23, 2021 at 11:30 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> > > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > I think the global lock will be an issue for the current non-sleepable
> > > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > > may need it in the future, so probably
> > > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > > >
> > > > I was re-working the patches and had a couple of questions.
> > > >
> > > > There are two data structures that get freed under RCU here:
> > > >
> > > > struct bpf_local_storage
> > > > struct bpf_local_storage_selem
> > > >
> > > > We can choose to free the bpf_local_storage_selem under
> > > > call_rcu_tasks_trace based on
> > > > whether the map it belongs to is sleepable with something like:
> > > >
> > > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> > Paul's current work (mentioned by his previous email) will improve the
> > performance of call_rcu_tasks_trace, so it probably can avoid the
> > new BPF_F_SLEEPABLE flag and make it easier to use.
> >
> > > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > else
> > > >     kfree_rcu(selem, rcu);
> > > >
> > > > Questions:
> > > >
> > > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > > >   always accessed in a  classical RCU critical section?
> > >>    Or maybe I am missing something and this also needs to be freed
> > > >   under trace RCU if any of the selems are from a sleepable map.
> > In the inode_storage_lookup() of this patch:
> >
> > +#define bpf_local_storage_rcu_lock_held()                      \
> > +       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> > +        rcu_read_lock_bh_held())
> >
> > @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
> >         if (!bsb)
> >                 return NULL;
> >
> > -       inode_storage = rcu_dereference(bsb->storage);
> > +       inode_storage = rcu_dereference_protected(bsb->storage,
> > +                                                 bpf_local_storage_rcu_lock_held());
> >
> > Thus, it is not always in classical RCU critical.
> 
> I was planning on adding a classical RCU read side critical section
> whenever we called the lookup functions.
> 
> Would that have worked? (for the sake of learning).
ah. ic. You meant local_storage itself could be accessed under
rcu_read_lock() if we wanted to, since it is not exposed to the
sleepable bpf_prog, which is under rcu_read_lock_trace()?

On a quick thought it should work, but then we would need to figure
out all the places that need an extra rcu_read_lock().  What is the
reason for doing this?
KP Singh Nov. 24, 2021, 10:20 p.m. UTC | #18
On Tue, Nov 23, 2021 at 11:30 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > I think the global lock will be an issue for the current non-sleepable
> > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > may need it in the future, so probably
> > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > >
> > > I was re-working the patches and had a couple of questions.
> > >
> > > There are two data structures that get freed under RCU here:
> > >
> > > struct bpf_local_storage
> > > struct bpf_local_storage_selem
> > >
> > > We can choose to free the bpf_local_storage_selem under
> > > call_rcu_tasks_trace based on
> > > whether the map it belongs to is sleepable with something like:
> > >
> > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> Paul's current work (mentioned by his previous email) will improve the
> performance of call_rcu_tasks_trace, so it probably can avoid the
> new BPF_F_SLEEPABLE flag and make it easier to use.
>
> > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > else
> > >     kfree_rcu(selem, rcu);
> > >
> > > Questions:
> > >
> > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > >   always accessed in a  classical RCU critical section?
> >>    Or maybe I am missing something and this also needs to be freed
> > >   under trace RCU if any of the selems are from a sleepable map.
> In the inode_storage_lookup() of this patch:
>
> +#define bpf_local_storage_rcu_lock_held()                      \
> +       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> +        rcu_read_lock_bh_held())
>
> @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
>         if (!bsb)
>                 return NULL;
>
> -       inode_storage = rcu_dereference(bsb->storage);
> +       inode_storage = rcu_dereference_protected(bsb->storage,
> +                                                 bpf_local_storage_rcu_lock_held());
>
> Thus, it is not always in classical RCU critical.
>
> > >
> > > * There is an issue with nested raw spinlocks, e.g. in
> > > bpf_inode_storage.c:bpf_inode_storage_free
> > >
> > >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > >   /* Always unlink from map before unlinking from
> > >   * local_storage.
> > >   */
> > >   bpf_selem_unlink_map(selem);
> > >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> > >                  local_storage, selem, false);
> > >   }
> > >   raw_spin_unlock_bh(&local_storage->lock);
> > >
> > > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > > flag in place of kfree_rcu)
> > > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > > raw spin lock.
> > >
> > > I am moving the freeing code out of the spinlock, saving the selems on
> > > a local list and then doing the free RCU (trace or normal) callbacks
> > > at the end. WDYT?
> There could be more than one selem to save.

Yes, that's why I was saving them on a local list and then calling
kfree_rcu() or call_rcu_tasks_trace() after unlocking the raw spinlock:

INIT_HLIST_HEAD(&free_list);
raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
	bpf_selem_unlink_map(selem);
	free_local_storage = bpf_selem_unlink_storage_nolock(
		local_storage, selem, false);
	hlist_add_head(&selem->snode, &free_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);

/* The elements need to be freed outside the raw spinlock because spin
 * locks cannot nest inside raw spin locks and call_rcu_tasks_trace()
 * grabs a spinlock when the RCU code calls into the scheduler.
 *
 * free_local_storage should always be true as long as
 * local_storage->list was non-empty.
 */
hlist_for_each_entry_safe(selem, n, &free_list, snode) {
	if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
	else
		kfree_rcu(selem, rcu);
}

But... we won't need this anymore.

>
> I think the splat is from CONFIG_PROVE_RAW_LOCK_NESTING=y.
>
> Just happened to bump into Paul briefly offline, his work probably can
> also avoid the spin_lock in call_rcu_tasks_trace().
>
> I would ignore this splat for now which should go away when it is
> merged with Paul's work in the 5.17 merge cycle.
>
> > Depending on the urgency, another approach is to rely on my ongoing work
> > removing the call_rcu_tasks_trace() bottleneck.  This commit on branch
> > "dev" in the -rcu tree allows boot-time setting of per-CPU callback
> > queues for call_rcu_tasks_trace(), along with the other RCU-tasks flavors:
> >
> > 0b886cc4b10f ("rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing")
> >
> > Preceding commits actually set up the queues.  With these commits, you
> > could boot with rcupdate.rcu_task_enqueue_lim=N, where N greater than
> > or equal to the number of CPUs on your system, to get per-CPU queuing.
> > These commits probably still have a bug or three, but on the other hand,
> > they have survived a couple of weeks worth of rcutorture runs.
> >
> > This week's work will allow automatic transition between single-queue
> > and per-CPU-queue operation based on lock contention and the number of
> > callbacks queued.
> >
> > My current plan is to get this into the next merge window (v5.17).
> That would be great.
Paul E. McKenney Nov. 25, 2021, 3:47 a.m. UTC | #19
On Wed, Nov 24, 2021 at 12:11:46AM +0100, KP Singh wrote:
> On Tue, Nov 23, 2021 at 7:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > >
> > > > On Wed, Sep 01, 2021 at 01:26:05PM -0700, Paul E. McKenney wrote:
> > > > > On Tue, Aug 31, 2021 at 11:32:17PM -0700, Martin KaFai Lau wrote:
> > > > > > On Tue, Aug 31, 2021 at 09:38:01PM +0200, KP Singh wrote:
> > > > > > [ ... ]
> > > > > >
> > > > > > > > > > > > > @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
> > > > > > > > > > > > >           SDATA(selem))
> > > > > > > > > > > > >               RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
> > > > > > > > > > > > >
> > > > > > > > > > > > > -     kfree_rcu(selem, rcu);
> > > > > > > > > > > > > +     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > > > > > > > Although the common use case is usually storage_get() much more often
> > > > > > > > > > > > than storage_delete(), do you aware any performance impact for
> > > > > > > > > > > > the bpf prog that does a lot of storage_delete()?
> > > > > > > > > > >
> > > > > > > > > > > I have not really measured the impact on deletes, My understanding is
> > > > > > > > > > > that it should
> > > > > > > > > > > not impact the BPF program, but yes, if there are some critical
> > > > > > > > > > > sections that are prolonged
> > > > > > > > > > > due to a sleepable program "sleeping" too long, then it would pile up
> > > > > > > > > > > the callbacks.
> > > > > > > > > > >
> > > > > > > > > > > But this is not something new, as we have a similar thing in BPF
> > > > > > > > > > > trampolines. If this really
> > > > > > > > > > > becomes an issue, we could add a flag BPF_F_SLEEPABLE_STORAGE and only maps
> > > > > > > > > > > with this flag would be allowed in sleepable progs.
> > > > > > > > > > Agree that is similar to trampoline updates but not sure it is comparable
> > > > > > > > > > in terms of the frequency of elems being deleted here.  e.g. many
> > > > > > > > > > short lived tcp connections created by external traffic.
> > > > > > > > > >
> > > > > > > > > > Adding a BPF_F_SLEEPABLE_STORAGE later won't work.  It will break
> > > > > > > > > > existing sleepable bpf prog.
> > > > > > > > > >
> > > > > > > > > > I don't know enough on call_rcu_tasks_trace() here, so the
> > > > > > > > > > earlier question on perf/callback-pile-up implications in order to
> > > > > > > > > > decide if extra logic or knob is needed here or not.
> > > > > > > > >
> > > > > > > > > I will defer to the others, maybe Alexei and Paul,
> > > > > > > >
> > > > > > > > > we could also just
> > > > > > > > > add the flag to not affect existing performance characteristics?
> > > > > > > > I would see if it is really necessary first.  Other sleepable
> > > > > > > > supported maps do not need a flag.  Adding one here for local
> > > > > > > > storage will be confusing especially if it turns out to be
> > > > > > > > unnecessary.
> > > > > > > >
> > > > > > > > Could you run some tests first which can guide the decision?
> > > > > > >
> > > > > > > I think the performance impact would happen only in the worst case which
> > > > > > > needs some work to simulate. What do you think about:
> > > > > > >
> > > > > > > A bprm_committed_creds program that processes a large argv
> > > > > > > and also gets a storage on the inode.
> > > > > > >
> > > > > > > A file_open program that tries to delete the local storage on the inode.
> > > > > > >
> > > > > > > Trigger this code in parallel. i.e. lots of programs that execute with a very
> > > > > > > large argv and then in parallel the executable being opened to trigger the
> > > > > > > delete.
> > > > > > >
> > > > > > > Do you have any other ideas? Is there something we could re-use from
> > > > > > > the selftests?
> > > > > >
> > > > > > There is a bench framework in tools/testing/selftests/bpf/benchs/
> > > > > > that has a parallel thread setup which could be useful.
> > > > > >
> > > > > > Don't know how to simulate the "sleeping" too long which
> > > > > > then pile-up callbacks.  This is not bpf specific.
> > > > > > Paul, I wonder if you have similar test to trigger this to
> > > > > > compare between call_rcu_tasks_trace() and call_rcu()?
> > > > >
> > > > > It is definitely the case that call_rcu() is way more scalable than
> > > > > is call_rcu_tasks_trace().  Something about call_rcu_tasks_trace()
> > > > > acquiring a global lock. ;-)
> > > > >
> > > > > So actually testing it makes a lot of sense.
> > > > >
> > > > > I do have an rcuscale module, but it is set up more for synchronous grace
> > > > > periods such as synchronize_rcu() and synchronize_rcu_tasks_trace().  It
> > > > > has the beginnings of support for call_rcu() and call_rcu_tasks_trace(),
> > > > > but I would not yet trust them.
> > > > >
> > > > > But I also have a test for global locking:
> > > > >
> > > > > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> > > > >
> > > > > This gives a median lock overhead of 960ns.  Running a single CPU rather
> > > > > than 16 of them:
> > > > >
> > > > > $ tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs "NOPREEMPT" --kconfig "CONFIG_NR_CPUS=16" --bootargs "refscale.scale_type=lock refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" --trust-make
> > > > >
> > > > > This gives a median lock overhead of 4.1ns, which is way faster.
> > > > > And the greater the number of CPUs, the greater the lock overhead.
> > > > Thanks for the explanation and numbers!
> > > >
> > > > I think the global lock will be an issue for the current non-sleepable
> > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > may need it in the future, so probably
> > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > >
> > > I was re-working the patches and had a couple of questions.
> > >
> > > There are two data structures that get freed under RCU here:
> > >
> > > struct bpf_local_storage
> > > struct bpf_local_storage_selem
> > >
> > > We can choose to free the bpf_local_storage_selem under
> > > call_rcu_tasks_trace based on
> > > whether the map it belongs to is sleepable with something like:
> > >
> > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > else
> > >     kfree_rcu(selem, rcu);
> > >
> > > Questions:
> > >
> > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > > always accessed in a
> > >   classical RCU critical section? Or maybe I am missing something and
> > > this also needs to be freed
> > >   under trace RCU if any of the selems are from a sleepable map.
> > >
> > > * There is an issue with nested raw spinlocks, e.g. in
> > > bpf_inode_storage.c:bpf_inode_storage_free
> > >
> > >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > >   /* Always unlink from map before unlinking from
> > >   * local_storage.
> > >   */
> > >   bpf_selem_unlink_map(selem);
> > >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> > >                  local_storage, selem, false);
> > >   }
> > >   raw_spin_unlock_bh(&local_storage->lock);
> > >
> > > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > > flag in place of kfree_rcu)
> > > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > > raw spin lock.
> > >
> > > I am moving the freeing code out of the spinlock, saving the selems on
> > > a local list and then
> > > doing the free RCU (trace or normal) callbacks at the end. WDYT?
> >
> > Depending on the urgency, another approach is to rely on my ongoing work
> 
> It's best to wait for your patches to land and keep this code simple.
> 
> > removing the call_rcu_tasks_trace() bottleneck.  This commit on branch
> > "dev" in the -rcu tree allows boot-time setting of per-CPU callback
> > queues for call_rcu_tasks_trace(), along with the other RCU-tasks flavors:
> >
> > 0b886cc4b10f ("rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing")
> >
> > Preceding commits actually set up the queues.  With these commits, you
> > could boot with rcupdate.rcu_task_enqueue_lim=N, where N greater than
> > or equal to the number of CPUs on your system, to get per-CPU queuing.
> > These commits probably still have a bug or three, but on the other hand,
> > they have survived a couple of weeks worth of rcutorture runs.
> 
> Thank you so much, this would make my life a lot easier :)

And the code that automatically increases the number of callback queues
when it senses sufficient lock contention is now passing modest rcutorture
testing.  I have set things up so that, failing additional problems,
Stephen Rothwell should be taking it into the next -next releases.

Automatically decreasing based on few callbacks is a task for next
week.  ;-)

						Thanx, Paul

> - KP
> 
> >
> > This week's work will allow automatic transition between single-queue
> > and per-CPU-queue operation based on lock contention and the number of
> > callbacks queued.
> >
> > My current plan is to get this into the next merge window (v5.17).
> >
> > Thoughts?
> >
> >                                                         Thanx, Paul
> >
> > > - KP
> > >
> > > >
> > > > [ ... ]
> > > >
> > > > > > [  143.376587] =============================
> > > > > > [  143.377068] WARNING: suspicious RCU usage
> > > > > > [  143.377541] 5.14.0-rc5-01271-g68e5bda2b18e #4966 Tainted: G           O
> > > > > > [  143.378378] -----------------------------
> > > > > > [  143.378857] kernel/bpf/bpf_local_storage.c:114 suspicious rcu_dereference_check() usage!
> > > > > > [  143.379914]
> > > > > > [  143.379914] other info that might help us debug this:
> > > > > > [  143.379914]
> > > > > > [  143.380838]
> > > > > > [  143.380838] rcu_scheduler_active = 2, debug_locks = 1
> > > > > > [  143.381602] 4 locks held by mv/1781:
> > > > > > [  143.382025]  #0: ffff888121e7c438 (sb_writers#6){.+.+}-{0:0}, at: do_renameat2+0x2f5/0xa80
> > > > > > [  143.383009]  #1: ffff88812ce68760 (&type->i_mutex_dir_key#5/1){+.+.}-{3:3}, at: lock_rename+0x1f4/0x250
> > > > > > [  143.384144]  #2: ffffffff843fbc60 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x45/0x160
> > > > > > [  143.385326]  #3: ffff88811d8348b8 (&storage->lock){..-.}-{2:2}, at: __bpf_selem_unlink_storage+0x7d/0x170
> > > > > > [  143.386459]
> > > > > > [  143.386459] stack backtrace:
> > > > > > [  143.386983] CPU: 2 PID: 1781 Comm: mv Tainted: G           O      5.14.0-rc5-01271-g68e5bda2b18e #4966
> > > > > > [  143.388071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> > > > > > [  143.389146] Call Trace:
> > > > > > [  143.389446]  dump_stack_lvl+0x5b/0x82
> > > > > > [  143.389901]  dump_stack+0x10/0x12
> > > > > > [  143.390302]  lockdep_rcu_suspicious+0x15c/0x167
> > > > > > [  143.390854]  bpf_selem_unlink_storage_nolock+0x2e1/0x6d0
> > > > > > [  143.391501]  __bpf_selem_unlink_storage+0xb7/0x170
> > > > > > [  143.392085]  bpf_selem_unlink+0x1b/0x30
> > > > > > [  143.392554]  bpf_inode_storage_delete+0x57/0xa0
> > > > > > [  143.393112]  bpf_prog_31e277fe2c132665_inode_rename+0x9c/0x268
> > > > > > [  143.393814]  bpf_trampoline_6442476301_0+0x4e/0x1000
> > > > > > [  143.394413]  bpf_lsm_inode_rename+0x5/0x10
> > > > >
> > > > > I am not sure what line 114 is (it is a blank line in bpf-next), but
> > > > > you might be missing a rcu_read_lock_trace_held() in the second argument
> > > > > of rcu_dereference_check().
> > > > Right, this path is only under rcu_read_lock_trace().
Martin KaFai Lau Nov. 30, 2021, 2:34 a.m. UTC | #20
On Wed, Nov 24, 2021 at 11:20:40PM +0100, KP Singh wrote:
> On Tue, Nov 23, 2021 at 11:30 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> > > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > I think the global lock will be an issue for the current non-sleepable
> > > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > > may need it in the future, so probably
> > > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > > >
> > > > I was re-working the patches and had a couple of questions.
> > > >
> > > > There are two data structures that get freed under RCU here:
> > > >
> > > > struct bpf_local_storage
> > > > struct bpf_local_storage_selem
> > > >
> > > > We can choose to free the bpf_local_storage_selem under
> > > > call_rcu_tasks_trace based on
> > > > whether the map it belongs to is sleepable with something like:
> > > >
> > > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> > Paul's current work (mentioned by his previous email) will improve the
> > performance of call_rcu_tasks_trace, so it probably can avoid the
> > new BPF_F_SLEEPABLE flag and make it easier to use.
> >
> > > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > else
> > > >     kfree_rcu(selem, rcu);
> > > >
> > > > Questions:
> > > >
> > > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > > >   always accessed in a  classical RCU critical section?
> > >>    Or maybe I am missing something and this also needs to be freed
> > > >   under trace RCU if any of the selems are from a sleepable map.
> > In the inode_storage_lookup() of this patch:
> >
> > +#define bpf_local_storage_rcu_lock_held()                      \
> > +       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> > +        rcu_read_lock_bh_held())
> >
> > @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
> >         if (!bsb)
> >                 return NULL;
> >
> > -       inode_storage = rcu_dereference(bsb->storage);
> > +       inode_storage = rcu_dereference_protected(bsb->storage,
> > +                                                 bpf_local_storage_rcu_lock_held());
> >
> > Thus, it is not always in classical RCU critical.
> >
> > > >
> > > > * There is an issue with nested raw spinlocks, e.g. in
> > > > bpf_inode_storage.c:bpf_inode_storage_free
> > > >
> > > >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > > >   /* Always unlink from map before unlinking from
> > > >   * local_storage.
> > > >   */
> > > >   bpf_selem_unlink_map(selem);
> > > >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> > > >                  local_storage, selem, false);
> > > >   }
> > > >   raw_spin_unlock_bh(&local_storage->lock);
> > > >
> > > > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > > > flag in place of kfree_rcu)
> > > > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > > > raw spin lock.
> > > >
> > > > I am moving the freeing code out of the spinlock, saving the selems on
> > > > a local list and then doing the free RCU (trace or normal) callbacks
> > > > at the end. WDYT?
> > There could be more than one selem to save.
> 
> Yes, that's why I was saving them on a local list and then calling
> kfree_rcu or call_rcu_tasks_trace after unlocking the raw_spin_lock
> 
> INIT_HLIST_HEAD(&free_list);
> raw_spin_lock_irqsave(&local_storage->lock, flags);
> hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
>     bpf_selem_unlink_map(selem);
>     free_local_storage = bpf_selem_unlink_storage_nolock(
>     local_storage, selem, false);
>     hlist_add_head(&selem->snode, &free_list);
> }
> raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> 
> /* The element needs to be freed outside the raw spinlock because spin
> * locks cannot nest inside raw spin locks and call_rcu_tasks_trace
> * grabs a spinlock when the RCU code calls into the scheduler.
> *
> * free_local_storage should always be true as long as
> * local_storage->list was non-empty.
> */
> hlist_for_each_entry_safe(selem, n, &free_list, snode) {
>     if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
>         call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
>     else
>         kfree_rcu(selem, rcu);
> }
> 
> But... we won't need this anymore.
Yep, Paul's work (thanks!) will make this piece simpler. 

KP, this set functionally does not depend on Paul's changes.
Do you want to spin a new version so that it can be reviewed in parallel?
When the rcu-tasks changes land in -next, they can probably
be merged into bpf-next first, before landing the sleepable
bpf storage work.
KP Singh Nov. 30, 2021, 4:22 p.m. UTC | #21
On Tue, Nov 30, 2021 at 3:34 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Wed, Nov 24, 2021 at 11:20:40PM +0100, KP Singh wrote:
> > On Tue, Nov 23, 2021 at 11:30 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> > > > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > > I think the global lock will be an issue for the current non-sleepable
> > > > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > > > may need it in the future, so probably
> > > > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > > > >
> > > > > I was re-working the patches and had a couple of questions.
> > > > >
> > > > > There are two data structures that get freed under RCU here:
> > > > >
> > > > > struct bpf_local_storage
> > > > > struct bpf_local_storage_selem
> > > > >
> > > > > We can choose to free the bpf_local_storage_selem under
> > > > > call_rcu_tasks_trace based on
> > > > > whether the map it belongs to is sleepable with something like:
> > > > >
> > > > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> > > Paul's current work (mentioned by his previous email) will improve the
> > > performance of call_rcu_tasks_trace, so it probably can avoid the
> > > new BPF_F_SLEEPABLE flag and make it easier to use.
> > >
> > > > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > else
> > > > >     kfree_rcu(selem, rcu);
> > > > >
> > > > > Questions:
> > > > >
> > > > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > > > >   always accessed in a  classical RCU critical section?
> > > >>    Or maybe I am missing something and this also needs to be freed
> > > > >   under trace RCU if any of the selems are from a sleepable map.
> > > In the inode_storage_lookup() of this patch:
> > >
> > > +#define bpf_local_storage_rcu_lock_held()                      \
> > > +       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> > > +        rcu_read_lock_bh_held())
> > >
> > > @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
> > >         if (!bsb)
> > >                 return NULL;
> > >
> > > -       inode_storage = rcu_dereference(bsb->storage);
> > > +       inode_storage = rcu_dereference_protected(bsb->storage,
> > > +                                                 bpf_local_storage_rcu_lock_held());
> > >
> > > Thus, it is not always in classical RCU critical.
> > >
> > > > >
> > > > > * There is an issue with nested raw spinlocks, e.g. in
> > > > > bpf_inode_storage.c:bpf_inode_storage_free
> > > > >
> > > > >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > > > >   /* Always unlink from map before unlinking from
> > > > >   * local_storage.
> > > > >   */
> > > > >   bpf_selem_unlink_map(selem);
> > > > >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> > > > >                  local_storage, selem, false);
> > > > >   }
> > > > >   raw_spin_unlock_bh(&local_storage->lock);
> > > > >
> > > > > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > > > > flag in place of kfree_rcu)
> > > > > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > > > > raw spin lock.
> > > > >
> > > > > I am moving the freeing code out of the spinlock, saving the selems on
> > > > > a local list and then doing the free RCU (trace or normal) callbacks
> > > > > at the end. WDYT?
> > > There could be more than one selem to save.
> >
> > Yes, that's why I was saving them on a local list and then calling
> > kfree_rcu or call_rcu_tasks_trace after unlocking the raw_spin_lock
> >
> > INIT_HLIST_HEAD(&free_list);
> > raw_spin_lock_irqsave(&local_storage->lock, flags);
> > hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> >     bpf_selem_unlink_map(selem);
> >     free_local_storage = bpf_selem_unlink_storage_nolock(
> >     local_storage, selem, false);
> >     hlist_add_head(&selem->snode, &free_list);
> > }
> > raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> >
> > /* The element needs to be freed outside the raw spinlock because spin
> > * locks cannot nest inside raw spin locks and call_rcu_tasks_trace
> > * grabs a spinlock when the RCU code calls into the scheduler.
> > *
> > * free_local_storage should always be true as long as
> > * local_storage->list was non-empty.
> > */
> > hlist_for_each_entry_safe(selem, n, &free_list, snode) {
> >     if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> >         call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> >     else
> >         kfree_rcu(selem, rcu);
> > }
> >
> > But... we won't need this anymore.
> Yep, Paul's work (thanks!) will make this piece simpler.

+100

>
> KP, this set functionally does not depend on Paul's changes.
> Do you want to spin a new version so that it can be reviewed in parallel?

Sure, I will fix the remaining issues (i.e. with RCU locks and renames) and
spin a new version.

> When the rcu-task changes land in -next, it can probably
> be merged into bpf-next first before landing the sleepable
> bpf storage work.
Paul E. McKenney Nov. 30, 2021, 10:51 p.m. UTC | #22
On Tue, Nov 30, 2021 at 05:22:25PM +0100, KP Singh wrote:
> On Tue, Nov 30, 2021 at 3:34 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Wed, Nov 24, 2021 at 11:20:40PM +0100, KP Singh wrote:
> > > On Tue, Nov 23, 2021 at 11:30 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > >
> > > > On Tue, Nov 23, 2021 at 10:22:04AM -0800, Paul E. McKenney wrote:
> > > > > On Tue, Nov 23, 2021 at 06:11:14PM +0100, KP Singh wrote:
> > > > > > On Thu, Sep 2, 2021 at 6:45 AM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > > > I think the global lock will be an issue for the current non-sleepable
> > > > > > > netdev bpf-prog which could be triggered by external traffic,  so a flag
> > > > > > > is needed here to provide a fast path.  I suspect other non-prealloc map
> > > > > > > may need it in the future, so probably
> > > > > > > s/BPF_F_SLEEPABLE_STORAGE/BPF_F_SLEEPABLE/ instead.
> > > > > >
> > > > > > I was re-working the patches and had a couple of questions.
> > > > > >
> > > > > > There are two data structures that get freed under RCU here:
> > > > > >
> > > > > > struct bpf_local_storage
> > > > > > struct bpf_local_storage_selem
> > > > > >
> > > > > > We can choose to free the bpf_local_storage_selem under
> > > > > > call_rcu_tasks_trace based on
> > > > > > whether the map it belongs to is sleepable with something like:
> > > > > >
> > > > > > if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> > > > Paul's current work (mentioned by his previous email) will improve the
> > > > performance of call_rcu_tasks_trace, so it probably can avoid the
> > > > new BPF_F_SLEEPABLE flag and make it easier to use.
> > > >
> > > > > >     call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > > > > > else
> > > > > >     kfree_rcu(selem, rcu);
> > > > > >
> > > > > > Questions:
> > > > > >
> > > > > > * Can we free bpf_local_storage under kfree_rcu by ensuring it's
> > > > > >   always accessed in a  classical RCU critical section?
> > > > >>    Or maybe I am missing something and this also needs to be freed
> > > > > >   under trace RCU if any of the selems are from a sleepable map.
> > > > In the inode_storage_lookup() of this patch:
> > > >
> > > > +#define bpf_local_storage_rcu_lock_held()                      \
> > > > +       (rcu_read_lock_held() || rcu_read_lock_trace_held() ||  \
> > > > +        rcu_read_lock_bh_held())
> > > >
> > > > @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
> > > >         if (!bsb)
> > > >                 return NULL;
> > > >
> > > > -       inode_storage = rcu_dereference(bsb->storage);
> > > > +       inode_storage = rcu_dereference_protected(bsb->storage,
> > > > +                                                 bpf_local_storage_rcu_lock_held());
> > > >
> > > > Thus, it is not always in classical RCU critical.
> > > >
> > > > > >
> > > > > > * There is an issue with nested raw spinlocks, e.g. in
> > > > > > bpf_inode_storage.c:bpf_inode_storage_free
> > > > > >
> > > > > >   hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > > > > >   /* Always unlink from map before unlinking from
> > > > > >   * local_storage.
> > > > > >   */
> > > > > >   bpf_selem_unlink_map(selem);
> > > > > >   free_inode_storage = bpf_selem_unlink_storage_nolock(
> > > > > >                  local_storage, selem, false);
> > > > > >   }
> > > > > >   raw_spin_unlock_bh(&local_storage->lock);
> > > > > >
> > > > > > in bpf_selem_unlink_storage_nolock (if we add the above logic with the
> > > > > > flag in place of kfree_rcu)
> > > > > > call_rcu_tasks_trace grabs a spinlock and these cannot be nested in a
> > > > > > raw spin lock.
> > > > > >
> > > > > > I am moving the freeing code out of the spinlock, saving the selems on
> > > > > > a local list and then doing the free RCU (trace or normal) callbacks
> > > > > > at the end. WDYT?
> > > > There could be more than one selem to save.
> > >
> > > Yes, that's why I was saving them on a local list and then calling
> > > kfree_rcu or call_rcu_tasks_trace after unlocking the raw_spin_lock
> > >
> > > INIT_HLIST_HEAD(&free_list);
> > > raw_spin_lock_irqsave(&local_storage->lock, flags);
> > > hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
> > >     bpf_selem_unlink_map(selem);
> > >     free_local_storage = bpf_selem_unlink_storage_nolock(
> > >     local_storage, selem, false);
> > >     hlist_add_head(&selem->snode, &free_list);
> > > }
> > > raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> > >
> > > /* The element needs to be freed outside the raw spinlock because spin
> > > * locks cannot nest inside raw spin locks and call_rcu_tasks_trace
> > > * grabs a spinlock when the RCU code calls into the scheduler.
> > > *
> > > * free_local_storage should always be true as long as
> > > * local_storage->list was non-empty.
> > > */
> > > hlist_for_each_entry_safe(selem, n, &free_list, snode) {
> > >     if (selem->sdata.smap->map.map_flags & BPF_F_SLEEPABLE_STORAGE)
> > >         call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
> > >     else
> > >         kfree_rcu(selem, rcu);
> > > }
> > >
> > > But... we won't need this anymore.
> > Yep, Paul's work (thanks!) will make this piece simpler.
> 
> +100
> 
> >
> > KP, this set functionally does not depend on Paul's changes.
> > Do you want to spin a new version so that it can be reviewed in parallel?
> 
> Sure, I will fix the remaining issues (i.e. with RCU locks and renames) and
> spin a new version.
> 
> > When the rcu-task changes land in -next, it can probably
> > be merged into bpf-next first before landing the sleepable
> > bpf storage work.

And I just now got both the expand-queues and shrink-queues code at
least pretending to work, and it will be picked up in the next -next.
I was probably too late for today's edition, but there is always tomorrow.

There are probably still bugs, but it is passing much nastier tests than
a couple of weeks ago, so here is hoping...

							Thanx, Paul
Paul E. McKenney Dec. 4, 2021, 1:01 a.m. UTC | #23
On Tue, Nov 30, 2021 at 02:51:29PM -0800, Paul E. McKenney wrote:
> On Tue, Nov 30, 2021 at 05:22:25PM +0100, KP Singh wrote:
> [...]
> 
> And I just now got both the expand-queues and shrink-queues code at
> least pretending to work, and it will be picked up in the next -next.
> I was probably too late for today's edition, but there is always tomorrow.
> 
> There are probably still bugs, but it is passing much nastier tests than
> a couple of weeks ago, so here is hoping...

And this is now in -next.  Please let me know how it goes!

								Thanx, Paul
KP Singh Dec. 5, 2021, 2:27 a.m. UTC | #24
On Sat, Dec 4, 2021 at 2:01 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Tue, Nov 30, 2021 at 02:51:29PM -0800, Paul E. McKenney wrote:

[...]

> > There are probably still bugs, but it is passing much nastier tests than
> > a couple of weeks ago, so here is hoping...
>
> And this is now in -next.  Please let me know how it goes!

I was able to rebase this on linux-next which has Paul's changes and
ran it with:

root@kpsingh:~# cat /proc/cmdline
console=ttyS0,115200 root=/dev/sda rw nokaslr rcupdate.rcu_task_enqueue_lim=4

The warning about the spinlock nested inside the raw-spinlock critical
section is also gone, and I don't see any other warnings. I will send
the rebased series after doing a few more checks.

- KP

>
>                                                                 Thanx, Paul
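
To make concrete what the verifier change at the bottom of this page
enables, here is a minimal sketch of a sleepable LSM program that keeps
per-task state in a BPF_MAP_TYPE_TASK_STORAGE map. It is written in the
style of the BPF selftests; the file, map and program names and the value
layout are illustrative, not part of this series.

/* task_storage_sleepable.bpf.c -- illustrative sketch only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

struct task_stats {
	__u64 opens;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_stats);
} task_stats_map SEC(".maps");

/* The "lsm.s/" prefix makes this program sleepable; it runs under the
 * tasks-trace RCU read section taken in __bpf_prog_enter_sleepable().
 */
SEC("lsm.s/file_open")
int BPF_PROG(count_opens, struct file *file)
{
	struct task_stats *stats;

	stats = bpf_task_storage_get(&task_stats_map,
				     bpf_get_current_task_btf(), 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (stats)
		__sync_fetch_and_add(&stats->opens, 1);

	return 0;
}

Without this series the verifier rejects the bpf_task_storage_get() call
above, because BPF_MAP_TYPE_TASK_STORAGE is not in the list of map types
allowed for sleepable programs in check_map_prog_compatibility().
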
Paul E. McKenney Dec. 5, 2021, 3:52 a.m. UTC | #25
On Sun, Dec 05, 2021 at 03:27:47AM +0100, KP Singh wrote:
> On Sat, Dec 4, 2021 at 2:01 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Tue, Nov 30, 2021 at 02:51:29PM -0800, Paul E. McKenney wrote:
> 
> [...]
> 
> > > There are probably still bugs, but it is passing much nastier tests than
> > > a couple of weeks ago, so here is hoping...
> >
> > And this is now in -next.  Please let me know how it goes!
> 
> I was able to rebase this on linux-next which has Paul's changes and
> ran it with:
> 
> root@kpsingh:~# cat /proc/cmdline
> console=ttyS0,115200 root=/dev/sda rw nokaslr rcupdate.rcu_task_enqueue_lim=4
> 
> The warning about the spinlock nested inside the raw-spinlock critical
> section is also gone, and I don't see any other warnings. I will send
> the rebased series after doing a few more checks.

Thank you, and here is hoping for continued testing success!  ;-)

							Thanx, Paul
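
And a correspondingly minimal userspace loader for the sketch above, using
only plain libbpf calls (the object-file and program names refer to the
hypothetical example above, so this too is a sketch rather than part of
the series):

/* load_task_storage_sleepable.c -- illustrative sketch only */
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;

	obj = bpf_object__open_file("task_storage_sleepable.bpf.o", NULL);
	if (libbpf_get_error(obj)) {
		fprintf(stderr, "failed to open BPF object\n");
		return 1;
	}
	if (bpf_object__load(obj)) {
		fprintf(stderr, "failed to load BPF object\n");
		return 1;
	}
	prog = bpf_object__find_program_by_name(obj, "count_opens");
	if (!prog) {
		fprintf(stderr, "program not found\n");
		return 1;
	}
	link = bpf_program__attach(prog);	/* attaches the LSM hook */
	if (libbpf_get_error(link)) {
		fprintf(stderr, "failed to attach\n");
		return 1;
	}

	/* Stay attached while file opens are being counted per task. */
	pause();

	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}
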

Patch

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 24496bc28e7b..8453488b334d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -16,6 +16,9 @@ 
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
 
+#define bpf_local_storage_rcu_lock_held()			\
+	(rcu_read_lock_held() || rcu_read_lock_trace_held() ||	\
+		rcu_read_lock_bh_held())
 struct bpf_local_storage_map_bucket {
 	struct hlist_head list;
 	raw_spinlock_t lock;
@@ -161,4 +164,6 @@  struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags);
 
+void bpf_local_storage_free_rcu(struct rcu_head *rcu);
+
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96ceed0e0fb5..409ef5d3efee 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -17,6 +17,7 @@ 
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
 
 DEFINE_BPF_STORAGE_CACHE(inode_cache);
 
@@ -44,7 +45,8 @@  static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
 	if (!bsb)
 		return NULL;
 
-	inode_storage = rcu_dereference(bsb->storage);
+	inode_storage = rcu_dereference_protected(bsb->storage,
+						  bpf_local_storage_rcu_lock_held());
 	if (!inode_storage)
 		return NULL;
 
@@ -97,7 +99,8 @@  void bpf_inode_storage_free(struct inode *inode)
 	 * local_storage->list was non-empty.
 	 */
 	if (free_inode_storage)
-		kfree_rcu(local_storage, rcu);
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     bpf_local_storage_free_rcu);
 }
 
 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -172,6 +175,7 @@  BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 {
 	struct bpf_local_storage_data *sdata;
 
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
 		return (unsigned long)NULL;
 
@@ -204,6 +208,7 @@  BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 BPF_CALL_2(bpf_inode_storage_delete,
 	   struct bpf_map *, map, struct inode *, inode)
 {
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (!inode)
 		return -EINVAL;
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..7760bc4e9565 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -11,6 +11,8 @@ 
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
 
 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
 
@@ -81,6 +83,22 @@  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
+void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage *local_storage;
+
+	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+	kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	kfree_rcu(selem, rcu);
+}
+
 /* local_storage->lock must be held and selem->local_storage == local_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
@@ -118,12 +136,12 @@  bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 		 *
 		 * Although the unlock will be done under
 		 * rcu_read_lock(),  it is more intutivie to
-		 * read if kfree_rcu(local_storage, rcu) is done
+		 * read if the freeing of the storage is done
 		 * after the raw_spin_unlock_bh(&local_storage->lock).
 		 *
 		 * Hence, a "bool free_local_storage" is returned
-		 * to the caller which then calls the kfree_rcu()
-		 * after unlock.
+		 * to the caller which then frees the storage after
+		 * all the RCU grace periods have expired.
 		 */
 	}
 	hlist_del_init_rcu(&selem->snode);
@@ -131,7 +149,7 @@  bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	kfree_rcu(selem, rcu);
+	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
 
 	return free_local_storage;
 }
@@ -154,7 +172,8 @@  static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage)
-		kfree_rcu(local_storage, rcu);
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     bpf_local_storage_free_rcu);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -213,7 +232,8 @@  bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 	struct bpf_local_storage_elem *selem;
 
 	/* Fast path (cache hit) */
-	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+	sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
+					  bpf_local_storage_rcu_lock_held());
 	if (sdata && rcu_access_pointer(sdata->smap) == smap)
 		return sdata;
 
@@ -306,7 +326,8 @@  int bpf_local_storage_alloc(void *owner,
 		 * bucket->list, first_selem can be freed immediately
 		 * (instead of kfree_rcu) because
 		 * bpf_local_storage_map_free() does a
-		 * synchronize_rcu() before walking the bucket->list.
+		 * synchronize_rcu_mult (waiting for both sleepable and
+		 * normal programs) before walking the bucket->list.
 		 * Hence, no one is accessing selem from the
 		 * bucket->list under rcu_read_lock().
 		 */
@@ -485,8 +506,12 @@  void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
 	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
 	 * RCU read section to finish before proceeding. New RCU
 	 * read sections should be prevented via bpf_map_inc_not_zero.
+	 *
+	 * Waiting for a tasks-trace RCU grace period is expensive, but
+	 * bpf_local_storage_map_free is not called from a BPF program or
+	 * other hot paths; it is only called when the owning object is freed.
 	 */
-	synchronize_rcu();
+	synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
 
 	/* bpf prog and the userspace can no longer access this map
 	 * now.  No new selem (of this map) can be added
@@ -530,7 +555,7 @@  void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
 	 *
 	 * Hence, wait another rcu grace period for the storage to be freed.
 	 */
-	synchronize_rcu();
+	synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
 
 	kvfree(smap->buckets);
 	kfree(smap);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 3ce75758d394..87f0daeff314 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,6 +17,7 @@ 
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
 
 DEFINE_BPF_STORAGE_CACHE(task_cache);
 
@@ -59,7 +60,8 @@  task_storage_lookup(struct task_struct *task, struct bpf_map *map,
 	struct bpf_local_storage *task_storage;
 	struct bpf_local_storage_map *smap;
 
-	task_storage = rcu_dereference(task->bpf_storage);
+	task_storage = rcu_dereference_protected(task->bpf_storage,
+						 bpf_local_storage_rcu_lock_held());
 	if (!task_storage)
 		return NULL;
 
@@ -229,6 +231,7 @@  BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 {
 	struct bpf_local_storage_data *sdata;
 
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
 		return (unsigned long)NULL;
 
@@ -260,6 +263,7 @@  BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
 {
 	int ret;
 
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (!task)
 		return -EINVAL;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 206c221453cf..4e0b807a0104 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11457,6 +11457,9 @@  static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			}
 			break;
 		case BPF_MAP_TYPE_RINGBUF:
+		case BPF_MAP_TYPE_INODE_STORAGE:
+		case BPF_MAP_TYPE_SK_STORAGE:
+		case BPF_MAP_TYPE_TASK_STORAGE:
 			break;
 		default:
 			verbose(env,
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 68d2cbf8331a..b56578579a0d 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -13,6 +13,7 @@ 
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
 
 DEFINE_BPF_STORAGE_CACHE(sk_cache);
 
@@ -22,7 +23,8 @@  bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
 	struct bpf_local_storage *sk_storage;
 	struct bpf_local_storage_map *smap;
 
-	sk_storage = rcu_dereference(sk->sk_bpf_storage);
+	sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
+					       bpf_local_storage_rcu_lock_held());
 	if (!sk_storage)
 		return NULL;
 
@@ -258,6 +260,7 @@  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 {
 	struct bpf_local_storage_data *sdata;
 
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
 		return (unsigned long)NULL;
 
@@ -288,6 +291,7 @@  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 
 BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
 {
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (!sk || !sk_fullsock(sk))
 		return -EINVAL;
 
@@ -416,6 +420,7 @@  static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
 	   void *, value, u64, flags)
 {
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (in_hardirq() || in_nmi())
 		return (unsigned long)NULL;
 
@@ -425,6 +430,7 @@  BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
 	   struct sock *, sk)
 {
+	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
 	if (in_hardirq() || in_nmi())
 		return -EPERM;