
[v3] cgroup/bpf: fast path skb BPF filtering

Message ID 462ce9402621f5e32f08cc8acbf3d9da4d7d69ca.1639579508.git.asml.silence@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [v3] cgroup/bpf: fast path skb BPF filtering

Checks

Context                  Check    Description
bpf/vmtest-bpf-next-PR   success  PR summary
bpf/vmtest-bpf-next      success  VM_Test
netdev/tree_selection    success  Guessing tree name failed - patch did not apply, async

Commit Message

Pavel Begunkov Dec. 15, 2021, 2:49 p.m. UTC
Add a per-socket fast path for the case where no BPF skb filtering is
attached, which sheds a nice chunk of send/recv overhead when it applies.
Testing udp with a 128 byte payload and/or zerocopy with any payload size
showed a 2-3% improvement in requests/s on the tx side with fast NICs
across the network, and around 4% with a dummy device. The same goes for
rx; it was not measured, but the numbers should be comparable.
In my understanding, this should affect a good share of machines; at the
very least it includes my laptops and several servers I checked.

The core of the problem is that even though cgroup_bpf_enabled_key guards
against __cgroup_bpf_run_filter_skb() overhead, there are cases where we
have several cgroups and attaching a BPF program to one of them also makes
all the others go through the slow path, even when they don't have any BPF
attached. It's even worse because, apparently, systemd or some other early
init loads some BPF and so triggers exactly this situation for normal
networking.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---

v2: replace bitmask approach with empty_prog_array (suggested by Martin)
v3: add "bpf_" prefix to empty_prog_array (Martin)

 include/linux/bpf-cgroup.h | 24 +++++++++++++++++++++---
 include/linux/bpf.h        | 13 +++++++++++++
 kernel/bpf/cgroup.c        | 18 ++----------------
 kernel/bpf/core.c          | 16 ++++------------
 4 files changed, 40 insertions(+), 31 deletions(-)
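As a rough illustration of the approach (a userspace sketch only, not kernel
code; all names below are made up for the example): every cgroup with nothing
attached shares one global empty prog array, so "is anything attached?"
collapses into a single pointer comparison against that sentinel instead of
taking the RCU read lock and walking an array.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel structures; illustrative names only. */
struct prog { int id; };
struct prog_array {
        struct prog *items[8];                  /* NULL-terminated */
};

/* One shared sentinel used by every "nothing attached" slot. */
static const struct prog_array empty_prog_array = { .items = { NULL } };
static struct prog dummy_prog = { .id = 1 };

struct cgroup_like {
        const struct prog_array *effective;     /* points at sentinel when empty */
};

/* Fast path: a single pointer comparison, no locking, no iteration. */
static bool progs_attached(const struct cgroup_like *cg)
{
        return cg->effective != &empty_prog_array;
}

int main(void)
{
        struct prog_array real = { .items = { &dummy_prog, NULL } };
        struct cgroup_like idle = { .effective = &empty_prog_array };
        struct cgroup_like busy = { .effective = &real };

        printf("idle attached: %d\n", progs_attached(&idle));  /* 0 */
        printf("busy attached: %d\n", progs_attached(&busy));  /* 1 */
        return 0;
}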

Comments

Jakub Kicinski Dec. 15, 2021, 4:40 p.m. UTC | #1
On Wed, 15 Dec 2021 14:49:18 +0000 Pavel Begunkov wrote:
> +static inline bool
> +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
> +				 enum cgroup_bpf_attach_type type)
> +{
> +	struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
> +
> +	return array == &bpf_empty_prog_array.hdr;
> +}
> +
> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype)				       \
> +({									       \
> +	struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data);	       \
> +									       \
> +	!__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype));	       \
> +})
> +

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e7a163a3146b..0d2195c6fb2a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1161,6 +1161,19 @@ struct bpf_prog_array {
>  	struct bpf_prog_array_item items[];
>  };
>  
> +struct bpf_empty_prog_array {
> +	struct bpf_prog_array hdr;
> +	struct bpf_prog *null_prog;
> +};
> +
> +/* to avoid allocating empty bpf_prog_array for cgroups that
> + * don't have bpf program attached use one global 'bpf_empty_prog_array'
> + * It will not be modified the caller of bpf_prog_array_alloc()
> + * (since caller requested prog_cnt == 0)
> + * that pointer should be 'freed' by bpf_prog_array_free()
> + */
> +extern struct bpf_empty_prog_array bpf_empty_prog_array;

mumble mumble, this adds more "fun" dependencies [1]. Maybe I'm going
about this all wrong, maybe I should be pulling out struct cgroup_bpf
so that cgroup.h does not need bpf-cgroup, not breaking bpf <-> bpf-cgroup.
Alexei, WDYT?

[1] https://lore.kernel.org/all/20211215061916.715513-2-kuba@kernel.org/
Stanislav Fomichev Dec. 15, 2021, 4:51 p.m. UTC | #2
On 12/15, Pavel Begunkov wrote:
> Add per socket fast path for not enabled BPF skb filtering, which sheds
> a nice chunk of send/recv overhead when affected. Testing udp with 128
> byte payload and/or zerocopy with any payload size showed 2-3%
> improvement in requests/s on the tx side using fast NICs across network,
> and around 4% for dummy device. Same goes for rx, not measured, but
> numbers should be relatable.
> In my understanding, this should affect a good share of machines, and at
> least it includes my laptops and some checked servers.

> The core of the problem is that even though there is
> cgroup_bpf_enabled_key guarding from __cgroup_bpf_run_filter_skb()
> overhead, there are cases where we have several cgroups and loading a
> BPF program to one also makes all others to go through the slow path
> even when they don't have any BPF attached. It's even worse, because
> apparently systemd or some other early init loads some BPF and so
> triggers exactly this situation for normal networking.

> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---

> v2: replace bitmask appoach with empty_prog_array (suggested by Martin)
> v3: add "bpf_" prefix to empty_prog_array (Martin)

>   include/linux/bpf-cgroup.h | 24 +++++++++++++++++++++---
>   include/linux/bpf.h        | 13 +++++++++++++
>   kernel/bpf/cgroup.c        | 18 ++----------------
>   kernel/bpf/core.c          | 16 ++++------------
>   4 files changed, 40 insertions(+), 31 deletions(-)

> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 11820a430d6c..c6dacdbdf565 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map  
> *map, void *key, void *value);
>   int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>   				     void *value, u64 flags);

> +static inline bool
> +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
> +				 enum cgroup_bpf_attach_type type)
> +{
> +	struct bpf_prog_array *array =  
> rcu_access_pointer(cgrp_bpf->effective[type]);
> +
> +	return array == &bpf_empty_prog_array.hdr;
> +}
> +
> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype)				       \
> +({									       \
> +	struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data);	       \
> +									       \
> +	!__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype));	       \
> +})
> +
>   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by  
> cgroup_bpf_enabled. */
>   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
>   ({									      \
>   	int __ret = 0;							      \
> -	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))		      \
> +	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&		      \
> +	    CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS)) 	      \

Why not add this prog-array-is-empty check to
__cgroup_bpf_run_filter_skb itself? The result of sock_cgroup_ptr() is
already there and you can use it. Maybe move things around if you want
it to happen earlier.
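For reference, a minimal sketch of that alternative (simplified; not the
actual kernel/bpf/cgroup.c implementation, and the existing body of the
function is elided):

/* Sketch only: do the emptiness check inside the function, where the
 * cgroup pointer is already being looked up, instead of in the wrappers.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb,
                                enum cgroup_bpf_attach_type atype)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

        /* Early out before any skb offset fixups or program setup. */
        if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, atype))
                return 0;

        /* ... existing slow path: prepare the ctx and run the effective array ... */
        return 0;
}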

>   		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
>   						    CGROUP_INET_INGRESS); \
>   									      \
> @@ -235,9 +252,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map  
> *map, void *key,
>   	int __ret = 0;							       \
>   	if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
>   		typeof(sk) __sk = sk_to_full_sk(sk);			       \
> -		if (sk_fullsock(__sk))					       \
> +		if (sk_fullsock(__sk) &&				       \
> +		    CGROUP_BPF_TYPE_ENABLED(__sk, CGROUP_INET_EGRESS))	       \
>   			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
> -						      CGROUP_INET_EGRESS); \
> +						      CGROUP_INET_EGRESS);     \
>   	}								       \
>   	__ret;								       \
>   })
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e7a163a3146b..0d2195c6fb2a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1161,6 +1161,19 @@ struct bpf_prog_array {
>   	struct bpf_prog_array_item items[];
>   };

> +struct bpf_empty_prog_array {
> +	struct bpf_prog_array hdr;
> +	struct bpf_prog *null_prog;
> +};
> +
> +/* to avoid allocating empty bpf_prog_array for cgroups that
> + * don't have bpf program attached use one global 'bpf_empty_prog_array'
> + * It will not be modified the caller of bpf_prog_array_alloc()
> + * (since caller requested prog_cnt == 0)
> + * that pointer should be 'freed' by bpf_prog_array_free()
> + */
> +extern struct bpf_empty_prog_array bpf_empty_prog_array;
> +
>   struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
>   void bpf_prog_array_free(struct bpf_prog_array *progs);
>   int bpf_prog_array_length(struct bpf_prog_array *progs);
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 43eb3501721b..99e85f44e257 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1354,20 +1354,6 @@ int __cgroup_bpf_run_filter_sysctl(struct  
> ctl_table_header *head,
>   }

>   #ifdef CONFIG_NET
> -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
> -					     enum cgroup_bpf_attach_type attach_type)
> -{
> -	struct bpf_prog_array *prog_array;
> -	bool empty;
> -
> -	rcu_read_lock();
> -	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
> -	empty = bpf_prog_array_is_empty(prog_array);
> -	rcu_read_unlock();
> -
> -	return empty;
> -}
> -
>   static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int  
> max_optlen,
>   			     struct bpf_sockopt_buf *buf)
>   {
> @@ -1430,7 +1416,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock  
> *sk, int *level,
>   	 * attached to the hook so we don't waste time allocating
>   	 * memory and locking the socket.
>   	 */
> -	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
> +	if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, CGROUP_SETSOCKOPT))
>   		return 0;

>   	/* Allocate a bit more than the initial user buffer for
> @@ -1526,7 +1512,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock  
> *sk, int level,
>   	 * attached to the hook so we don't waste time allocating
>   	 * memory and locking the socket.
>   	 */
> -	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
> +	if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, CGROUP_GETSOCKOPT))
>   		return retval;

>   	ctx.optlen = max_optlen;
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 2405e39d800f..fa76d1d839ad 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1967,18 +1967,10 @@ static struct bpf_prog_dummy {
>   	},
>   };

> -/* to avoid allocating empty bpf_prog_array for cgroups that
> - * don't have bpf program attached use one global 'empty_prog_array'
> - * It will not be modified the caller of bpf_prog_array_alloc()
> - * (since caller requested prog_cnt == 0)
> - * that pointer should be 'freed' by bpf_prog_array_free()
> - */
> -static struct {
> -	struct bpf_prog_array hdr;
> -	struct bpf_prog *null_prog;
> -} empty_prog_array = {
> +struct bpf_empty_prog_array bpf_empty_prog_array = {
>   	.null_prog = NULL,
>   };
> +EXPORT_SYMBOL(bpf_empty_prog_array);

>   struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
>   {
> @@ -1988,12 +1980,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32  
> prog_cnt, gfp_t flags)
>   			       (prog_cnt + 1),
>   			       flags);

> -	return &empty_prog_array.hdr;
> +	return &bpf_empty_prog_array.hdr;
>   }

>   void bpf_prog_array_free(struct bpf_prog_array *progs)
>   {
> -	if (!progs || progs == &empty_prog_array.hdr)
> +	if (!progs || progs == &bpf_empty_prog_array.hdr)
>   		return;
>   	kfree_rcu(progs, rcu);
>   }
> --
> 2.34.0
Pavel Begunkov Dec. 15, 2021, 5:18 p.m. UTC | #3
On 12/15/21 16:51, sdf@google.com wrote:
> On 12/15, Pavel Begunkov wrote:
>> Add per socket fast path for not enabled BPF skb filtering, which sheds
>> a nice chunk of send/recv overhead when affected. Testing udp with 128
>> byte payload and/or zerocopy with any payload size showed 2-3%
>> improvement in requests/s on the tx side using fast NICs across network,
>> and around 4% for dummy device. Same goes for rx, not measured, but
>> numbers should be relatable.
>> In my understanding, this should affect a good share of machines, and at
>> least it includes my laptops and some checked servers.
> 
>> The core of the problem is that even though there is
>> cgroup_bpf_enabled_key guarding from __cgroup_bpf_run_filter_skb()
>> overhead, there are cases where we have several cgroups and loading a
>> BPF program to one also makes all others to go through the slow path
>> even when they don't have any BPF attached. It's even worse, because
>> apparently systemd or some other early init loads some BPF and so
>> triggers exactly this situation for normal networking.
> 
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
> 
>> v2: replace bitmask appoach with empty_prog_array (suggested by Martin)
>> v3: add "bpf_" prefix to empty_prog_array (Martin)
> 
>>   include/linux/bpf-cgroup.h | 24 +++++++++++++++++++++---
>>   include/linux/bpf.h        | 13 +++++++++++++
>>   kernel/bpf/cgroup.c        | 18 ++----------------
>>   kernel/bpf/core.c          | 16 ++++------------
>>   4 files changed, 40 insertions(+), 31 deletions(-)
> 
>> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> index 11820a430d6c..c6dacdbdf565 100644
>> --- a/include/linux/bpf-cgroup.h
>> +++ b/include/linux/bpf-cgroup.h
>> @@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
>>   int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>>                        void *value, u64 flags);
> 
>> +static inline bool
>> +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
>> +                 enum cgroup_bpf_attach_type type)
>> +{
>> +    struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
>> +
>> +    return array == &bpf_empty_prog_array.hdr;
>> +}
>> +
>> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype)                       \
>> +({                                           \
>> +    struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data);           \
>> +                                           \
>> +    !__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype));           \
>> +})
>> +
>>   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
>>   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
>>   ({                                          \
>>       int __ret = 0;                                  \
>> -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
>> +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
>> +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
> 
> Why not add this __cgroup_bpf_run_filter_skb check to
> __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already there
> and you can use it. Maybe move the things around if you want
> it to happen earlier.

For inlining. Just wanted to get it done right, otherwise I'll likely be
coming back to it in a few months complaining that I see measurable
overhead from the function call :)
Stanislav Fomichev Dec. 15, 2021, 5:33 p.m. UTC | #4
On 12/15, Pavel Begunkov wrote:
> On 12/15/21 16:51, sdf@google.com wrote:
> > On 12/15, Pavel Begunkov wrote:
> > > Add per socket fast path for not enabled BPF skb filtering, which  
> sheds
> > > a nice chunk of send/recv overhead when affected. Testing udp with 128
> > > byte payload and/or zerocopy with any payload size showed 2-3%
> > > improvement in requests/s on the tx side using fast NICs across  
> network,
> > > and around 4% for dummy device. Same goes for rx, not measured, but
> > > numbers should be relatable.
> > > In my understanding, this should affect a good share of machines, and  
> at
> > > least it includes my laptops and some checked servers.
> >
> > > The core of the problem is that even though there is
> > > cgroup_bpf_enabled_key guarding from __cgroup_bpf_run_filter_skb()
> > > overhead, there are cases where we have several cgroups and loading a
> > > BPF program to one also makes all others to go through the slow path
> > > even when they don't have any BPF attached. It's even worse, because
> > > apparently systemd or some other early init loads some BPF and so
> > > triggers exactly this situation for normal networking.
> >
> > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> > > ---
> >
> > > v2: replace bitmask appoach with empty_prog_array (suggested by Martin)
> > > v3: add "bpf_" prefix to empty_prog_array (Martin)
> >
> > >   include/linux/bpf-cgroup.h | 24 +++++++++++++++++++++---
> > >   include/linux/bpf.h        | 13 +++++++++++++
> > >   kernel/bpf/cgroup.c        | 18 ++----------------
> > >   kernel/bpf/core.c          | 16 ++++------------
> > >   4 files changed, 40 insertions(+), 31 deletions(-)
> >
> > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > > index 11820a430d6c..c6dacdbdf565 100644
> > > --- a/include/linux/bpf-cgroup.h
> > > +++ b/include/linux/bpf-cgroup.h
> > > @@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > >   int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > >                        void *value, u64 flags);
> >
> > > +static inline bool
> > > +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
> > > +                 enum cgroup_bpf_attach_type type)
> > > +{
> > > +    struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
> > > +
> > > +    return array == &bpf_empty_prog_array.hdr;
> > > +}
> > > +
> > > +#define CGROUP_BPF_TYPE_ENABLED(sk, atype)                       \
> > > +({                                           \
> > > +    struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data);           \
> > > +                                           \
> > > +    !__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype));           \
> > > +})
> > > +
> > >   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> > >   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
> > >   ({                                          \
> > >       int __ret = 0;                                  \
> > > -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
> > > +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
> > > +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
> >
> > Why not add this __cgroup_bpf_run_filter_skb check to
> > __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already  
> there
> > and you can use it. Maybe move the things around if you want
> > it to happen earlier.

> For inlining. Just wanted to get it done right, otherwise I'll likely be
> returning to it back in a few months complaining that I see measurable
> overhead from the function call :)

Do you expect that direct call to bring any visible overhead?
Would be nice to compare that inlined case vs
__cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
while you're at it (plus move offset initialization down?).
Pavel Begunkov Dec. 15, 2021, 5:38 p.m. UTC | #5
On 12/15/21 16:40, Jakub Kicinski wrote:
> On Wed, 15 Dec 2021 14:49:18 +0000 Pavel Begunkov wrote:
>> +static inline bool
>> +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
>> +				 enum cgroup_bpf_attach_type type)
>> +{
>> +	struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
>> +
>> +	return array == &bpf_empty_prog_array.hdr;
>> +}
>> +
>> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype)				       \
>> +({									       \
>> +	struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data);	       \
>> +									       \
>> +	!__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype));	       \
>> +})
>> +
> 
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index e7a163a3146b..0d2195c6fb2a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1161,6 +1161,19 @@ struct bpf_prog_array {
>>   	struct bpf_prog_array_item items[];
>>   };
>>   
>> +struct bpf_empty_prog_array {
>> +	struct bpf_prog_array hdr;
>> +	struct bpf_prog *null_prog;
>> +};
>> +
>> +/* to avoid allocating empty bpf_prog_array for cgroups that
>> + * don't have bpf program attached use one global 'bpf_empty_prog_array'
>> + * It will not be modified the caller of bpf_prog_array_alloc()
>> + * (since caller requested prog_cnt == 0)
>> + * that pointer should be 'freed' by bpf_prog_array_free()
>> + */
>> +extern struct bpf_empty_prog_array bpf_empty_prog_array;
> 
> mumble mumble, this adds more "fun" dependencies [1] Maybe I'm going

Header dependencies? It's declared right after struct bpf_prog_array,
and the other member is a pointer, so not sure what can go wrong.


> about this all wrong, maybe I should be pulling out struct cgroup_bpf
> so that cgroup.h does not need bpf-cgroup, not breaking bpf <-> bpf-cgroup.
> Alexei, WDYT?
> 
> [1] https://lore.kernel.org/all/20211215061916.715513-2-kuba@kernel.org/
>
Pavel Begunkov Dec. 15, 2021, 5:53 p.m. UTC | #6
On 12/15/21 17:33, sdf@google.com wrote:
> On 12/15, Pavel Begunkov wrote:
>> On 12/15/21 16:51, sdf@google.com wrote:
>> > On 12/15, Pavel Begunkov wrote:
>> > >   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
>> > >   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
>> > >   ({                                          \
>> > >       int __ret = 0;                                  \
>> > > -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
>> > > +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
>> > > +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
>> >
>> > Why not add this __cgroup_bpf_run_filter_skb check to
>> > __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already there
>> > and you can use it. Maybe move the things around if you want
>> > it to happen earlier.
> 
>> For inlining. Just wanted to get it done right, otherwise I'll likely be
>> returning to it back in a few months complaining that I see measurable
>> overhead from the function call :)
> 
> Do you expect that direct call to bring any visible overhead?
> Would be nice to compare that inlined case vs
> __cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
> while you're at it (plus move offset initialization down?).

Sorry, but that would be a waste of time. I naively hope it will be visible
with net at some moment (if not already), that's how it was with io_uring,
that's what I see in the block layer. And anyway, if just one inlined call
doesn't make a difference on its own, ten of them will.
Stanislav Fomichev Dec. 15, 2021, 6:24 p.m. UTC | #7
On 12/15, Pavel Begunkov wrote:
> On 12/15/21 17:33, sdf@google.com wrote:
> > On 12/15, Pavel Begunkov wrote:
> > > On 12/15/21 16:51, sdf@google.com wrote:
> > > > On 12/15, Pavel Begunkov wrote:
> > > > >   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> > > > >   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
> > > > >   ({                                          \
> > > > >       int __ret = 0;                                  \
> > > > > -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
> > > > > +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
> > > > > +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
> > > >
> > > > Why not add this __cgroup_bpf_run_filter_skb check to
> > > > __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already  
> there
> > > > and you can use it. Maybe move the things around if you want
> > > > it to happen earlier.
> >
> > > For inlining. Just wanted to get it done right, otherwise I'll likely  
> be
> > > returning to it back in a few months complaining that I see measurable
> > > overhead from the function call :)
> >
> > Do you expect that direct call to bring any visible overhead?
> > Would be nice to compare that inlined case vs
> > __cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
> > while you're at it (plus move offset initialization down?).

> Sorry but that would be waste of time. I naively hope it will be visible
> with net at some moment (if not already), that's how it was with io_uring,
> that's what I see in the block layer. And in anyway, if just one inlined
> won't make a difference, then 10 will.

I can probably do more experiments on my side once your patch is
accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
If you claim there is visible overhead for a direct call then there
should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
well.
Pavel Begunkov Dec. 15, 2021, 6:54 p.m. UTC | #8
On 12/15/21 18:24, sdf@google.com wrote:
> On 12/15, Pavel Begunkov wrote:
>> On 12/15/21 17:33, sdf@google.com wrote:
>> > On 12/15, Pavel Begunkov wrote:
>> > > On 12/15/21 16:51, sdf@google.com wrote:
>> > > > On 12/15, Pavel Begunkov wrote:
>> > > > >   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
>> > > > >   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
>> > > > >   ({                                          \
>> > > > >       int __ret = 0;                                  \
>> > > > > -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
>> > > > > +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
>> > > > > +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
>> > > >
>> > > > Why not add this __cgroup_bpf_run_filter_skb check to
>> > > > __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already there
>> > > > and you can use it. Maybe move the things around if you want
>> > > > it to happen earlier.
>> >
>> > > For inlining. Just wanted to get it done right, otherwise I'll likely be
>> > > returning to it back in a few months complaining that I see measurable
>> > > overhead from the function call :)
>> >
>> > Do you expect that direct call to bring any visible overhead?
>> > Would be nice to compare that inlined case vs
>> > __cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
>> > while you're at it (plus move offset initialization down?).
> 
>> Sorry but that would be waste of time. I naively hope it will be visible
>> with net at some moment (if not already), that's how it was with io_uring,
>> that's what I see in the block layer. And in anyway, if just one inlined
>> won't make a difference, then 10 will.
> 
> I can probably do more experiments on my side once your patch is
> accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
> If you claim there is visible overhead for a direct call then there
> should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
> well.

Interesting, sounds like getsockopt might be performance sensitive for
someone.

FWIW, I forgot to mention that for testing tx I'm using io_uring
(for both zc and not) with good submission batching.
Stanislav Fomichev Dec. 15, 2021, 7:15 p.m. UTC | #9
On Wed, Dec 15, 2021 at 10:54 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 12/15/21 18:24, sdf@google.com wrote:
> > On 12/15, Pavel Begunkov wrote:
> >> On 12/15/21 17:33, sdf@google.com wrote:
> >> > On 12/15, Pavel Begunkov wrote:
> >> > > On 12/15/21 16:51, sdf@google.com wrote:
> >> > > > On 12/15, Pavel Begunkov wrote:
> >> > > > >   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> >> > > > >   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
> >> > > > >   ({                                          \
> >> > > > >       int __ret = 0;                                  \
> >> > > > > -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
> >> > > > > +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
> >> > > > > +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
> >> > > >
> >> > > > Why not add this __cgroup_bpf_run_filter_skb check to
> >> > > > __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already there
> >> > > > and you can use it. Maybe move the things around if you want
> >> > > > it to happen earlier.
> >> >
> >> > > For inlining. Just wanted to get it done right, otherwise I'll likely be
> >> > > returning to it back in a few months complaining that I see measurable
> >> > > overhead from the function call :)
> >> >
> >> > Do you expect that direct call to bring any visible overhead?
> >> > Would be nice to compare that inlined case vs
> >> > __cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
> >> > while you're at it (plus move offset initialization down?).
> >
> >> Sorry but that would be waste of time. I naively hope it will be visible
> >> with net at some moment (if not already), that's how it was with io_uring,
> >> that's what I see in the block layer. And in anyway, if just one inlined
> >> won't make a difference, then 10 will.
> >
> > I can probably do more experiments on my side once your patch is
> > accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
> > If you claim there is visible overhead for a direct call then there
> > should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
> > well.
>
> Interesting, sounds getsockopt might be performance sensitive to
> someone.
>
> FWIW, I forgot to mention that for testing tx I'm using io_uring
> (for both zc and not) with good submission batching.

Yeah, last time I saw 2-3% as well, but it was due to kmalloc, see
more details in 9cacf81f8161, it was pretty visible under perf.
That's why I'm a bit skeptical of your claims of direct calls being
somehow visible in these 2-3% (even skb pulls/pushes are not 2-3%?).
But tbf I don't understand how it all plays out with the io_uring.

(mostly trying to understand where there is some gain left on the
table for TCP_ZEROCOPY_RECEIVE).
Pavel Begunkov Dec. 15, 2021, 7:55 p.m. UTC | #10
On 12/15/21 19:15, Stanislav Fomichev wrote:
> On Wed, Dec 15, 2021 at 10:54 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 12/15/21 18:24, sdf@google.com wrote:
>>> On 12/15, Pavel Begunkov wrote:
>>>> On 12/15/21 17:33, sdf@google.com wrote:
>>>>> On 12/15, Pavel Begunkov wrote:
>>>>>> On 12/15/21 16:51, sdf@google.com wrote:
>>>>>>> On 12/15, Pavel Begunkov wrote:
>>>>>>>>   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
>>>>>>>>   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
>>>>>>>>   ({                                          \
>>>>>>>>       int __ret = 0;                                  \
>>>>>>>> -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
>>>>>>>> +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
>>>>>>>> +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
>>>>>>>
>>>>>>> Why not add this __cgroup_bpf_run_filter_skb check to
>>>>>>> __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already there
>>>>>>> and you can use it. Maybe move the things around if you want
>>>>>>> it to happen earlier.
>>>>>
>>>>>> For inlining. Just wanted to get it done right, otherwise I'll likely be
>>>>>> returning to it back in a few months complaining that I see measurable
>>>>>> overhead from the function call :)
>>>>>
>>>>> Do you expect that direct call to bring any visible overhead?
>>>>> Would be nice to compare that inlined case vs
>>>>> __cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
>>>>> while you're at it (plus move offset initialization down?).
>>>
>>>> Sorry but that would be waste of time. I naively hope it will be visible
>>>> with net at some moment (if not already), that's how it was with io_uring,
>>>> that's what I see in the block layer. And in anyway, if just one inlined
>>>> won't make a difference, then 10 will.
>>>
>>> I can probably do more experiments on my side once your patch is
>>> accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
>>> If you claim there is visible overhead for a direct call then there
>>> should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
>>> well.
>>
>> Interesting, sounds getsockopt might be performance sensitive to
>> someone.
>>
>> FWIW, I forgot to mention that for testing tx I'm using io_uring
>> (for both zc and not) with good submission batching.
> 
> Yeah, last time I saw 2-3% as well, but it was due to kmalloc, see
> more details in 9cacf81f8161, it was pretty visible under perf.
> That's why I'm a bit skeptical of your claims of direct calls being
> somehow visible in these 2-3% (even skb pulls/pushes are not 2-3%?).

migrate_disable/enable together were taking somewhere between
1% and 1.5% in profiling, I don't remember the exact number. The rest
should be from rcu_read_lock/unlock() in BPF_PROG_RUN_ARRAY_CG_FLAGS()
and other extra bits on the way.
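For context, the per-packet slow path being skipped looks roughly like the
following; this is a simplified sketch of the pattern only, not a copy of
BPF_PROG_RUN_ARRAY_CG_FLAGS(), and the helper name is made up:

/* Rough sketch, illustrative name and structure only. */
static int run_cgroup_skb_progs_slowpath(struct cgroup *cgrp,
                                         enum cgroup_bpf_attach_type atype,
                                         struct sk_buff *skb)
{
        const struct bpf_prog_array_item *item;
        int ret = 1;

        rcu_read_lock();                /* pin the effective array */
        migrate_disable();              /* keep the prog on this CPU */
        item = &rcu_dereference(cgrp->bpf.effective[atype])->items[0];
        while (item->prog) {            /* zero iterations when nothing attached */
                ret &= bpf_prog_run(item->prog, skb);
                item++;
        }
        migrate_enable();
        rcu_read_unlock();
        return ret;
}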

I'm skeptical I'll be able to measure inlining one function,
variability between boots/runs is usually greater and would hide it.

> But tbf I don't understand how it all plays out with the io_uring.

1 syscall per N requests (N=32 IIRC), 1 fdget() per N, no payload
page referencing (for zc), and so on


> (mostly trying to understand where there is some gain left on the
> table for TCP_ZEROCOPY_RECEIVE).
Stanislav Fomichev Dec. 15, 2021, 10:07 p.m. UTC | #11
On Wed, Dec 15, 2021 at 11:55 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 12/15/21 19:15, Stanislav Fomichev wrote:
> > On Wed, Dec 15, 2021 at 10:54 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 12/15/21 18:24, sdf@google.com wrote:
> >>> On 12/15, Pavel Begunkov wrote:
> >>>> On 12/15/21 17:33, sdf@google.com wrote:
> >>>>> On 12/15, Pavel Begunkov wrote:
> >>>>>> On 12/15/21 16:51, sdf@google.com wrote:
> >>>>>>> On 12/15, Pavel Begunkov wrote:
> >>>>>>>>   /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> >>>>>>>>   #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                  \
> >>>>>>>>   ({                                          \
> >>>>>>>>       int __ret = 0;                                  \
> >>>>>>>> -    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))              \
> >>>>>>>> +    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&              \
> >>>>>>>> +        CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS))           \
> >>>>>>>
> >>>>>>> Why not add this __cgroup_bpf_run_filter_skb check to
> >>>>>>> __cgroup_bpf_run_filter_skb? Result of sock_cgroup_ptr() is already there
> >>>>>>> and you can use it. Maybe move the things around if you want
> >>>>>>> it to happen earlier.
> >>>>>
> >>>>>> For inlining. Just wanted to get it done right, otherwise I'll likely be
> >>>>>> returning to it back in a few months complaining that I see measurable
> >>>>>> overhead from the function call :)
> >>>>>
> >>>>> Do you expect that direct call to bring any visible overhead?
> >>>>> Would be nice to compare that inlined case vs
> >>>>> __cgroup_bpf_prog_array_is_empty inside of __cgroup_bpf_run_filter_skb
> >>>>> while you're at it (plus move offset initialization down?).
> >>>
> >>>> Sorry but that would be waste of time. I naively hope it will be visible
> >>>> with net at some moment (if not already), that's how it was with io_uring,
> >>>> that's what I see in the block layer. And in anyway, if just one inlined
> >>>> won't make a difference, then 10 will.
> >>>
> >>> I can probably do more experiments on my side once your patch is
> >>> accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
> >>> If you claim there is visible overhead for a direct call then there
> >>> should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
> >>> well.
> >>
> >> Interesting, sounds getsockopt might be performance sensitive to
> >> someone.
> >>
> >> FWIW, I forgot to mention that for testing tx I'm using io_uring
> >> (for both zc and not) with good submission batching.
> >
> > Yeah, last time I saw 2-3% as well, but it was due to kmalloc, see
> > more details in 9cacf81f8161, it was pretty visible under perf.
> > That's why I'm a bit skeptical of your claims of direct calls being
> > somehow visible in these 2-3% (even skb pulls/pushes are not 2-3%?).
>
> migrate_disable/enable together were taking somewhat in-between
> 1% and 1.5% in profiling, don't remember the exact number. The rest
> should be from rcu_read_lock/unlock() in BPF_PROG_RUN_ARRAY_CG_FLAGS()
> and other extra bits on the way.

You probably have a preemptible kernel and preemptible RCU, which most
likely explains why you see the overhead and I won't (non-preemptible
kernel in our env, rcu_read_lock is essentially a nop, just a compiler
barrier).

> I'm skeptical I'll be able to measure inlining one function,
> variability between boots/runs is usually greater and would hide it.

Right, that's why I suggested to mirror what we do in set/getsockopt
instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
to you, Martin and the rest.
Pavel Begunkov Dec. 16, 2021, 1:21 p.m. UTC | #12
On 12/15/21 22:07, Stanislav Fomichev wrote:
> On Wed, Dec 15, 2021 at 11:55 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 12/15/21 19:15, Stanislav Fomichev wrote:
>>> On Wed, Dec 15, 2021 at 10:54 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>>
>>>> On 12/15/21 18:24, sdf@google.com wrote:
[...]
>>>>> I can probably do more experiments on my side once your patch is
>>>>> accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
>>>>> If you claim there is visible overhead for a direct call then there
>>>>> should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
>>>>> well.
>>>>
>>>> Interesting, sounds getsockopt might be performance sensitive to
>>>> someone.
>>>>
>>>> FWIW, I forgot to mention that for testing tx I'm using io_uring
>>>> (for both zc and not) with good submission batching.
>>>
>>> Yeah, last time I saw 2-3% as well, but it was due to kmalloc, see
>>> more details in 9cacf81f8161, it was pretty visible under perf.
>>> That's why I'm a bit skeptical of your claims of direct calls being
>>> somehow visible in these 2-3% (even skb pulls/pushes are not 2-3%?).
>>
>> migrate_disable/enable together were taking somewhat in-between
>> 1% and 1.5% in profiling, don't remember the exact number. The rest
>> should be from rcu_read_lock/unlock() in BPF_PROG_RUN_ARRAY_CG_FLAGS()
>> and other extra bits on the way.
> 
> You probably have a preemptiple kernel and preemptible rcu which most
> likely explains why you see the overhead and I won't (non-preemptible
> kernel in our env, rcu_read_lock is essentially a nop, just a compiler
> barrier).

Right. For reference, I tried out a non-preemptible kernel; perf shows the
function taking 0.8% with a NIC and 1.2% with a dummy netdev.


>> I'm skeptical I'll be able to measure inlining one function,
>> variability between boots/runs is usually greater and would hide it.
> 
> Right, that's why I suggested to mirror what we do in set/getsockopt
> instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
> to you, Martin and the rest.
Martin KaFai Lau Dec. 16, 2021, 6:14 p.m. UTC | #13
On Thu, Dec 16, 2021 at 01:21:26PM +0000, Pavel Begunkov wrote:
> On 12/15/21 22:07, Stanislav Fomichev wrote:
> > On Wed, Dec 15, 2021 at 11:55 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> > > 
> > > On 12/15/21 19:15, Stanislav Fomichev wrote:
> > > > On Wed, Dec 15, 2021 at 10:54 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> > > > > 
> > > > > On 12/15/21 18:24, sdf@google.com wrote:
> [...]
> > > > > > I can probably do more experiments on my side once your patch is
> > > > > > accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
> > > > > > If you claim there is visible overhead for a direct call then there
> > > > > > should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
> > > > > > well.
> > > > > 
> > > > > Interesting, sounds getsockopt might be performance sensitive to
> > > > > someone.
> > > > > 
> > > > > FWIW, I forgot to mention that for testing tx I'm using io_uring
> > > > > (for both zc and not) with good submission batching.
> > > > 
> > > > Yeah, last time I saw 2-3% as well, but it was due to kmalloc, see
> > > > more details in 9cacf81f8161, it was pretty visible under perf.
> > > > That's why I'm a bit skeptical of your claims of direct calls being
> > > > somehow visible in these 2-3% (even skb pulls/pushes are not 2-3%?).
> > > 
> > > migrate_disable/enable together were taking somewhat in-between
> > > 1% and 1.5% in profiling, don't remember the exact number. The rest
> > > should be from rcu_read_lock/unlock() in BPF_PROG_RUN_ARRAY_CG_FLAGS()
> > > and other extra bits on the way.
> > 
> > You probably have a preemptiple kernel and preemptible rcu which most
> > likely explains why you see the overhead and I won't (non-preemptible
> > kernel in our env, rcu_read_lock is essentially a nop, just a compiler
> > barrier).
> 
> Right. For reference tried out non-preemptible, perf shows the function
> taking 0.8% with a NIC and 1.2% with a dummy netdev.
> 
> 
> > > I'm skeptical I'll be able to measure inlining one function,
> > > variability between boots/runs is usually greater and would hide it.
> > 
> > Right, that's why I suggested to mirror what we do in set/getsockopt
> > instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
> > to you, Martin and the rest.
I also suggested in v2 trying to stay with one way for the fullsock context,
but that is for code readability reasons.

How about calling CGROUP_BPF_TYPE_ENABLED() just next to cgroup_bpf_enabled()
in BPF_CGROUP_RUN_PROG_*SOCKOPT_*() instead?
It is because both cgroup_bpf_enabled() and CGROUP_BPF_TYPE_ENABLED()
want to check if there is bpf to run before proceeding with everything else,
and then I don't need to jump to the non-inline function itself to see
whether there is another prog-array-empty check.

Stan, do you have concerns about an extra inlined sock_cgroup_ptr()
when there is a bpf prog to run for set/getsockopt()?  From looking at
__cgroup_bpf_run_filter_*sockopt(), I think it should be mostly noise.
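A sketch of what that placement could look like for the setsockopt wrapper;
hypothetical and simplified (the real macro's argument list and retval/optlen
plumbing are elided behind __VA_ARGS__ here):

/* Illustrative only -- not the actual BPF_CGROUP_RUN_PROG_SETSOCKOPT(). */
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT_SKETCH(sk, ...)                         \
({                                                                             \
        int __ret = 0;                                                         \
        if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) &&                           \
            CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_SETSOCKOPT))                  \
                __ret = __cgroup_bpf_run_filter_setsockopt(sk, ##__VA_ARGS__); \
        __ret;                                                                 \
})

With the check done at the macro level like this, the prog-array-empty early
return inside __cgroup_bpf_run_filter_setsockopt() could then go away,
matching what the skb wrappers do in this patch.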
Stanislav Fomichev Dec. 16, 2021, 6:24 p.m. UTC | #14
On Thu, Dec 16, 2021 at 10:14 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Thu, Dec 16, 2021 at 01:21:26PM +0000, Pavel Begunkov wrote:
> > On 12/15/21 22:07, Stanislav Fomichev wrote:
> > > On Wed, Dec 15, 2021 at 11:55 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> > > >
> > > > On 12/15/21 19:15, Stanislav Fomichev wrote:
> > > > > On Wed, Dec 15, 2021 at 10:54 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> > > > > >
> > > > > > On 12/15/21 18:24, sdf@google.com wrote:
> > [...]
> > > > > > > I can probably do more experiments on my side once your patch is
> > > > > > > accepted. I'm mostly concerned with getsockopt(TCP_ZEROCOPY_RECEIVE).
> > > > > > > If you claim there is visible overhead for a direct call then there
> > > > > > > should be visible benefit to using CGROUP_BPF_TYPE_ENABLED there as
> > > > > > > well.
> > > > > >
> > > > > > Interesting, sounds getsockopt might be performance sensitive to
> > > > > > someone.
> > > > > >
> > > > > > FWIW, I forgot to mention that for testing tx I'm using io_uring
> > > > > > (for both zc and not) with good submission batching.
> > > > >
> > > > > Yeah, last time I saw 2-3% as well, but it was due to kmalloc, see
> > > > > more details in 9cacf81f8161, it was pretty visible under perf.
> > > > > That's why I'm a bit skeptical of your claims of direct calls being
> > > > > somehow visible in these 2-3% (even skb pulls/pushes are not 2-3%?).
> > > >
> > > > migrate_disable/enable together were taking somewhat in-between
> > > > 1% and 1.5% in profiling, don't remember the exact number. The rest
> > > > should be from rcu_read_lock/unlock() in BPF_PROG_RUN_ARRAY_CG_FLAGS()
> > > > and other extra bits on the way.
> > >
> > > You probably have a preemptiple kernel and preemptible rcu which most
> > > likely explains why you see the overhead and I won't (non-preemptible
> > > kernel in our env, rcu_read_lock is essentially a nop, just a compiler
> > > barrier).
> >
> > Right. For reference tried out non-preemptible, perf shows the function
> > taking 0.8% with a NIC and 1.2% with a dummy netdev.
> >
> >
> > > > I'm skeptical I'll be able to measure inlining one function,
> > > > variability between boots/runs is usually greater and would hide it.
> > >
> > > Right, that's why I suggested to mirror what we do in set/getsockopt
> > > instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
> > > to you, Martin and the rest.
> I also suggested to try to stay with one way for fullsock context in v2
> but it is for code readability reason.
>
> How about calling CGROUP_BPF_TYPE_ENABLED() just next to cgroup_bpf_enabled()
> in BPF_CGROUP_RUN_PROG_*SOCKOPT_*() instead ?

SG!

> It is because both cgroup_bpf_enabled() and CGROUP_BPF_TYPE_ENABLED()
> want to check if there is bpf to run before proceeding everything else
> and then I don't need to jump to the non-inline function itself to see
> if there is other prog array empty check.
>
> Stan, do you have concern on an extra inlined sock_cgroup_ptr()
> when there is bpf prog to run for set/getsockopt()?  I think
> it should be mostly noise from looking at
> __cgroup_bpf_run_filter_*sockopt()?

Yeah, my concern is also mostly about readability/consistency. Either
__cgroup_bpf_prog_array_is_empty everywhere or this new
CGROUP_BPF_TYPE_ENABLED everywhere. I'm slightly leaning towards
__cgroup_bpf_prog_array_is_empty because I don't believe direct
function calls add any visible overhead and macros are ugly :-) But
either way is fine as long as it looks consistent.
Pavel Begunkov Jan. 24, 2022, 3:46 p.m. UTC | #15
On 12/16/21 18:24, Stanislav Fomichev wrote:
> On Thu, Dec 16, 2021 at 10:14 AM Martin KaFai Lau <kafai@fb.com> wrote:
>> On Thu, Dec 16, 2021 at 01:21:26PM +0000, Pavel Begunkov wrote:
>>> On 12/15/21 22:07, Stanislav Fomichev wrote:
>>>>> I'm skeptical I'll be able to measure inlining one function,
>>>>> variability between boots/runs is usually greater and would hide it.
>>>>
>>>> Right, that's why I suggested to mirror what we do in set/getsockopt
>>>> instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
>>>> to you, Martin and the rest.
>> I also suggested to try to stay with one way for fullsock context in v2
>> but it is for code readability reason.
>>
>> How about calling CGROUP_BPF_TYPE_ENABLED() just next to cgroup_bpf_enabled()
>> in BPF_CGROUP_RUN_PROG_*SOCKOPT_*() instead ?
> 
> SG!
> 
>> It is because both cgroup_bpf_enabled() and CGROUP_BPF_TYPE_ENABLED()
>> want to check if there is bpf to run before proceeding everything else
>> and then I don't need to jump to the non-inline function itself to see
>> if there is other prog array empty check.
>>
>> Stan, do you have concern on an extra inlined sock_cgroup_ptr()
>> when there is bpf prog to run for set/getsockopt()?  I think
>> it should be mostly noise from looking at
>> __cgroup_bpf_run_filter_*sockopt()?
> 
> Yeah, my concern is also mostly about readability/consistency. Either
> __cgroup_bpf_prog_array_is_empty everywhere or this new
> CGROUP_BPF_TYPE_ENABLED everywhere. I'm slightly leaning towards
> __cgroup_bpf_prog_array_is_empty because I don't believe direct
> function calls add any visible overhead and macros are ugly :-) But
> either way is fine as long as it looks consistent.

Martin, Stanislav, do you think it's good to go? Any other concerns?
It feels like it might end in bikeshedding, and it would be great to finally
get it done, especially since I find the issue to be pretty simple.
Stanislav Fomichev Jan. 24, 2022, 6:25 p.m. UTC | #16
On Mon, Jan 24, 2022 at 7:49 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 12/16/21 18:24, Stanislav Fomichev wrote:
> > On Thu, Dec 16, 2021 at 10:14 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >> On Thu, Dec 16, 2021 at 01:21:26PM +0000, Pavel Begunkov wrote:
> >>> On 12/15/21 22:07, Stanislav Fomichev wrote:
> >>>>> I'm skeptical I'll be able to measure inlining one function,
> >>>>> variability between boots/runs is usually greater and would hide it.
> >>>>
> >>>> Right, that's why I suggested to mirror what we do in set/getsockopt
> >>>> instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
> >>>> to you, Martin and the rest.
> >> I also suggested to try to stay with one way for fullsock context in v2
> >> but it is for code readability reason.
> >>
> >> How about calling CGROUP_BPF_TYPE_ENABLED() just next to cgroup_bpf_enabled()
> >> in BPF_CGROUP_RUN_PROG_*SOCKOPT_*() instead ?
> >
> > SG!
> >
> >> It is because both cgroup_bpf_enabled() and CGROUP_BPF_TYPE_ENABLED()
> >> want to check if there is bpf to run before proceeding everything else
> >> and then I don't need to jump to the non-inline function itself to see
> >> if there is other prog array empty check.
> >>
> >> Stan, do you have concern on an extra inlined sock_cgroup_ptr()
> >> when there is bpf prog to run for set/getsockopt()?  I think
> >> it should be mostly noise from looking at
> >> __cgroup_bpf_run_filter_*sockopt()?
> >
> > Yeah, my concern is also mostly about readability/consistency. Either
> > __cgroup_bpf_prog_array_is_empty everywhere or this new
> > CGROUP_BPF_TYPE_ENABLED everywhere. I'm slightly leaning towards
> > __cgroup_bpf_prog_array_is_empty because I don't believe direct
> > function calls add any visible overhead and macros are ugly :-) But
> > either way is fine as long as it looks consistent.
>
> Martin, Stanislav, do you think it's good to go? Any other concerns?
> It feels it might end with bikeshedding and would be great to finally
> get it done, especially since I find the issue to be pretty simple.

I'll leave it up to the bpf maintainers/reviewers. Personally, I'd
still prefer a respin with a consistent
__cgroup_bpf_prog_array_is_empty or CGROUP_BPF_TYPE_ENABLED everywhere
(shouldn't be a lot of effort?)
Pavel Begunkov Jan. 25, 2022, 6:54 p.m. UTC | #17
On 1/24/22 18:25, Stanislav Fomichev wrote:
> On Mon, Jan 24, 2022 at 7:49 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 12/16/21 18:24, Stanislav Fomichev wrote:
>>> On Thu, Dec 16, 2021 at 10:14 AM Martin KaFai Lau <kafai@fb.com> wrote:
>>>> On Thu, Dec 16, 2021 at 01:21:26PM +0000, Pavel Begunkov wrote:
>>>>> On 12/15/21 22:07, Stanislav Fomichev wrote:
>>>>>>> I'm skeptical I'll be able to measure inlining one function,
>>>>>>> variability between boots/runs is usually greater and would hide it.
>>>>>>
>>>>>> Right, that's why I suggested to mirror what we do in set/getsockopt
>>>>>> instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
>>>>>> to you, Martin and the rest.
>>>> I also suggested to try to stay with one way for fullsock context in v2
>>>> but it is for code readability reason.
>>>>
>>>> How about calling CGROUP_BPF_TYPE_ENABLED() just next to cgroup_bpf_enabled()
>>>> in BPF_CGROUP_RUN_PROG_*SOCKOPT_*() instead ?
>>>
>>> SG!
>>>
>>>> It is because both cgroup_bpf_enabled() and CGROUP_BPF_TYPE_ENABLED()
>>>> want to check if there is bpf to run before proceeding everything else
>>>> and then I don't need to jump to the non-inline function itself to see
>>>> if there is other prog array empty check.
>>>>
>>>> Stan, do you have concern on an extra inlined sock_cgroup_ptr()
>>>> when there is bpf prog to run for set/getsockopt()?  I think
>>>> it should be mostly noise from looking at
>>>> __cgroup_bpf_run_filter_*sockopt()?
>>>
>>> Yeah, my concern is also mostly about readability/consistency. Either
>>> __cgroup_bpf_prog_array_is_empty everywhere or this new
>>> CGROUP_BPF_TYPE_ENABLED everywhere. I'm slightly leaning towards
>>> __cgroup_bpf_prog_array_is_empty because I don't believe direct
>>> function calls add any visible overhead and macros are ugly :-) But
>>> either way is fine as long as it looks consistent.
>>
>> Martin, Stanislav, do you think it's good to go? Any other concerns?
>> It feels it might end with bikeshedding and would be great to finally
>> get it done, especially since I find the issue to be pretty simple.
> 
> I'll leave it up to the bpf maintainers/reviewers. Personally, I'd
> still prefer a respin with a consistent
> __cgroup_bpf_prog_array_is_empty or CGROUP_BPF_TYPE_ENABLED everywhere
> (shouldn't be a lot of effort?)

I can make CGROUP_BPF_TYPE_ENABLED() used everywhere, np.

I'll leave out unification with cgroup_bpf_enabled() as I don't
really understand the fullsock dancing in
BPF_CGROUP_RUN_PROG_INET_EGRESS(). Any idea whether it's needed
and/or how to shove it out of the inlined checks?
Stanislav Fomichev Jan. 25, 2022, 9:27 p.m. UTC | #18
On Tue, Jan 25, 2022 at 10:55 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 1/24/22 18:25, Stanislav Fomichev wrote:
> > On Mon, Jan 24, 2022 at 7:49 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 12/16/21 18:24, Stanislav Fomichev wrote:
> >>> On Thu, Dec 16, 2021 at 10:14 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >>>> On Thu, Dec 16, 2021 at 01:21:26PM +0000, Pavel Begunkov wrote:
> >>>>> On 12/15/21 22:07, Stanislav Fomichev wrote:
> >>>>>>> I'm skeptical I'll be able to measure inlining one function,
> >>>>>>> variability between boots/runs is usually greater and would hide it.
> >>>>>>
> >>>>>> Right, that's why I suggested to mirror what we do in set/getsockopt
> >>>>>> instead of the new extra CGROUP_BPF_TYPE_ENABLED. But I'll leave it up
> >>>>>> to you, Martin and the rest.
> >>>> I also suggested to try to stay with one way for fullsock context in v2
> >>>> but it is for code readability reason.
> >>>>
> >>>> How about calling CGROUP_BPF_TYPE_ENABLED() just next to cgroup_bpf_enabled()
> >>>> in BPF_CGROUP_RUN_PROG_*SOCKOPT_*() instead ?
> >>>
> >>> SG!
> >>>
> >>>> It is because both cgroup_bpf_enabled() and CGROUP_BPF_TYPE_ENABLED()
> >>>> want to check if there is bpf to run before proceeding everything else
> >>>> and then I don't need to jump to the non-inline function itself to see
> >>>> if there is other prog array empty check.
> >>>>
> >>>> Stan, do you have concern on an extra inlined sock_cgroup_ptr()
> >>>> when there is bpf prog to run for set/getsockopt()?  I think
> >>>> it should be mostly noise from looking at
> >>>> __cgroup_bpf_run_filter_*sockopt()?
> >>>
> >>> Yeah, my concern is also mostly about readability/consistency. Either
> >>> __cgroup_bpf_prog_array_is_empty everywhere or this new
> >>> CGROUP_BPF_TYPE_ENABLED everywhere. I'm slightly leaning towards
> >>> __cgroup_bpf_prog_array_is_empty because I don't believe direct
> >>> function calls add any visible overhead and macros are ugly :-) But
> >>> either way is fine as long as it looks consistent.
> >>
> >> Martin, Stanislav, do you think it's good to go? Any other concerns?
> >> It feels it might end with bikeshedding and would be great to finally
> >> get it done, especially since I find the issue to be pretty simple.
> >
> > I'll leave it up to the bpf maintainers/reviewers. Personally, I'd
> > still prefer a respin with a consistent
> > __cgroup_bpf_prog_array_is_empty or CGROUP_BPF_TYPE_ENABLED everywhere
> > (shouldn't be a lot of effort?)
>
> I can make CGROUP_BPF_TYPE_ENABLED() used everywhere, np.
>
> I'll leave out unification with cgroup_bpf_enabled() as don't
> really understand the fullsock dancing in
> BPF_CGROUP_RUN_PROG_INET_EGRESS(). Any idea whether it's needed
> and/or how to shove it out of inlined checks?

I'm not sure we can do anything better than whatever you did in your
patch. This request_sk->full_sk conversion is needed because
request_sk doesn't really have any cgroup association and we need to
pull it from the listener ("full_sk"). So you have to get full_sk and
then run CGROUP_BPF_TYPE_ENABLED on it.
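Written out as a plain function, that dance is roughly the following; a
hypothetical helper for illustration only, mirroring what the egress macro in
the patch below expands to:

/* Hypothetical helper, not part of the patch: roughly what
 * BPF_CGROUP_RUN_PROG_INET_EGRESS() does after this change.
 */
static inline int cgroup_bpf_egress_sketch(struct sock *sk, struct sk_buff *skb)
{
        struct sock *full_sk;

        if (!cgroup_bpf_enabled(CGROUP_INET_EGRESS) || !sk || sk != skb->sk)
                return 0;

        full_sk = sk_to_full_sk(sk);    /* request_sock -> listening socket */
        if (!sk_fullsock(full_sk))      /* still no cgroup data to look at */
                return 0;
        if (!CGROUP_BPF_TYPE_ENABLED(full_sk, CGROUP_INET_EGRESS))
                return 0;               /* fast path: nothing attached */

        return __cgroup_bpf_run_filter_skb(full_sk, skb, CGROUP_INET_EGRESS);
}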

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 11820a430d6c..c6dacdbdf565 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -219,11 +219,28 @@  int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 				     void *value, u64 flags);
 
+static inline bool
+__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
+				 enum cgroup_bpf_attach_type type)
+{
+	struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
+
+	return array == &bpf_empty_prog_array.hdr;
+}
+
+#define CGROUP_BPF_TYPE_ENABLED(sk, atype)				       \
+({									       \
+	struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data);	       \
+									       \
+	!__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype));	       \
+})
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
 	int __ret = 0;							      \
-	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))		      \
+	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk &&		      \
+	    CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS)) 	      \
 		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
 						    CGROUP_INET_INGRESS); \
 									      \
@@ -235,9 +252,10 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
 		typeof(sk) __sk = sk_to_full_sk(sk);			       \
-		if (sk_fullsock(__sk))					       \
+		if (sk_fullsock(__sk) &&				       \
+		    CGROUP_BPF_TYPE_ENABLED(__sk, CGROUP_INET_EGRESS))	       \
 			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
-						      CGROUP_INET_EGRESS); \
+						      CGROUP_INET_EGRESS);     \
 	}								       \
 	__ret;								       \
 })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e7a163a3146b..0d2195c6fb2a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1161,6 +1161,19 @@  struct bpf_prog_array {
 	struct bpf_prog_array_item items[];
 };
 
+struct bpf_empty_prog_array {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'bpf_empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+extern struct bpf_empty_prog_array bpf_empty_prog_array;
+
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43eb3501721b..99e85f44e257 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1354,20 +1354,6 @@  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 
 #ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
-					     enum cgroup_bpf_attach_type attach_type)
-{
-	struct bpf_prog_array *prog_array;
-	bool empty;
-
-	rcu_read_lock();
-	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
-	empty = bpf_prog_array_is_empty(prog_array);
-	rcu_read_unlock();
-
-	return empty;
-}
-
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
 			     struct bpf_sockopt_buf *buf)
 {
@@ -1430,7 +1416,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
 	 */
-	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
+	if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, CGROUP_SETSOCKOPT))
 		return 0;
 
 	/* Allocate a bit more than the initial user buffer for
@@ -1526,7 +1512,7 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
 	 */
-	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
+	if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, CGROUP_GETSOCKOPT))
 		return retval;
 
 	ctx.optlen = max_optlen;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2405e39d800f..fa76d1d839ad 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1967,18 +1967,10 @@  static struct bpf_prog_dummy {
 	},
 };
 
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
-	struct bpf_prog_array hdr;
-	struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
 	.null_prog = NULL,
 };
+EXPORT_SYMBOL(bpf_empty_prog_array);
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 {
@@ -1988,12 +1980,12 @@  struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 			       (prog_cnt + 1),
 			       flags);
 
-	return &empty_prog_array.hdr;
+	return &bpf_empty_prog_array.hdr;
 }
 
 void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
-	if (!progs || progs == &empty_prog_array.hdr)
+	if (!progs || progs == &bpf_empty_prog_array.hdr)
 		return;
 	kfree_rcu(progs, rcu);
 }