[bpf-next,v2,1/2] bpf: try to avoid kzalloc in cgroup/{s,g}etsockopt

Message ID 20210104221454.2204239-2-sdf@google.com (mailing list archive)
State Superseded
Delegated to: BPF
Series: bpf: misc performance improvements for cgroup hooks

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 5 maintainers not CCed: songliubraving@fb.com kpsingh@kernel.org john.fastabend@gmail.com yhs@fb.com andrii@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 4950 this patch: 4950
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Please don't use multiple blank lines
netdev/build_allmodconfig_warn success Errors and warnings before: 5063 this patch: 5063
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Stanislav Fomichev Jan. 4, 2021, 10:14 p.m. UTC
When we attach a bpf program to cgroup/getsockopt, any other getsockopt()
syscall starts incurring kzalloc/kfree cost. While, in general, that's
not an issue, sometimes it is, as in the case of TCP_ZEROCOPY_RECEIVE.
TCP_ZEROCOPY_RECEIVE (ab)uses the getsockopt system call to implement a
fast path for incoming TCP, and we don't want extra allocations in
there.
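
For reference, even a minimal pass-through program attached to the
cgroup is enough to send every getsockopt() in that cgroup through this
path (illustrative only, libbpf-style, not part of this patch):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("cgroup/getsockopt")
  int getsockopt_passthrough(struct bpf_sockopt *ctx)
  {
          /* Don't touch optval/optlen/retval; returning 1 lets the
           * kernel's own getsockopt() result be copied back to the
           * user unchanged.
           */
          return 1;
  }

  char _license[] SEC("license") = "GPL";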

Let's add a small buffer on the stack and use it for small (the majority
of) {s,g}etsockopt values. I've started with 128 bytes to cover the
options we care about (TCP_ZEROCOPY_RECEIVE, which is 32 bytes currently,
with a planned extension to 64).

It seems natural to do the same for setsockopt, but it's a bit more
involved when the BPF program modifies the data (where we have to
kmalloc). The assumption is that for the majority of setsockopt calls
(which either handle pure BPF options or apply policy) this will bring
some benefit as well.
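
In short (condensed; the full change is in the patch at the bottom of
this page), the allocation helper prefers the buffer embedded in the
on-stack context and only falls back to kzalloc for large values:

  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
  {
          if (max_optlen <= sizeof(ctx->buf)) {
                  /* Small value: serve it from ctx->buf, no allocation. */
                  ctx->optval = ctx->buf;
                  ctx->optval_end = ctx->optval + max_optlen;
                  return max_optlen;
          }

          /* Large value: same kzalloc path as before. */
          ctx->optval = kzalloc(max_optlen, GFP_USER);
          if (!ctx->optval)
                  return -ENOMEM;
          ctx->optval_end = ctx->optval + max_optlen;
          return max_optlen;
  }

On the setsockopt side, the data is copied out to a separately
kzalloc'ed buffer only when the BPF program modified it and it has to
be handed to the kernel setsockopt handler (see
__cgroup_bpf_run_filter_setsockopt in the patch).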

I collected some performance numbers (on a 65k MTU localhost in a VM) using:
$ perf record -g -- ./tcp_mmap -s -z
$ ./tcp_mmap -H ::1 -z
$ ...
$ perf report --symbol-filter=__cgroup_bpf_run_filter_getsockopt

Without this patch:
     4.81%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_>
            |
             --4.74%--__cgroup_bpf_run_filter_getsockopt
                       |
                       |--1.06%--__kmalloc
                       |
                       |--0.71%--lock_sock_nested
                       |
                       |--0.62%--__might_fault
                       |
                        --0.52%--release_sock

With the patch applied:
     3.29%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.22%--__cgroup_bpf_run_filter_getsockopt
                       |
                       |--0.66%--lock_sock_nested
                       |
                       |--0.57%--__might_fault
                       |
                        --0.56%--release_sock

So it saves about 1% of the system call time. Unfortunately, we still get
2-3% of overhead due to another socket lock/unlock :-(

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/cgroup.c    | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

Comments

Martin KaFai Lau Jan. 5, 2021, 12:03 a.m. UTC | #1
On Mon, Jan 04, 2021 at 02:14:53PM -0800, Stanislav Fomichev wrote:
> When we attach a bpf program to cgroup/getsockopt any other getsockopt()
> syscall starts incurring kzalloc/kfree cost. While, in general, it's
> not an issue, sometimes it is, like in the case of TCP_ZEROCOPY_RECEIVE.
> TCP_ZEROCOPY_RECEIVE (ab)uses getsockopt system call to implement
> fastpath for incoming TCP, we don't want to have extra allocations in
> there.
> 
> Let add a small buffer on the stack and use it for small (majority)
> {s,g}etsockopt values. I've started with 128 bytes to cover
> the options we care about (TCP_ZEROCOPY_RECEIVE which is 32 bytes
> currently, with some planned extension to 64).
> 
> It seems natural to do the same for setsockopt, but it's a bit more
> involved when the BPF program modifies the data (where we have to
> kmalloc). The assumption is that for the majority of setsockopt
> calls (which are doing pure BPF options or apply policy) this
> will bring some benefit as well.
> 
> Collected some performance numbers using (on a 65k MTU localhost in a VM):
> $ perf record -g -- ./tcp_mmap -s -z
> $ ./tcp_mmap -H ::1 -z
> $ ...
> $ perf report --symbol-filter=__cgroup_bpf_run_filter_getsockopt
> 
> Without this patch:
>      4.81%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_>
>             |
>              --4.74%--__cgroup_bpf_run_filter_getsockopt
>                        |
>                        |--1.06%--__kmalloc
>                        |
>                        |--0.71%--lock_sock_nested
>                        |
>                        |--0.62%--__might_fault
>                        |
>                         --0.52%--release_sock
> 
> With the patch applied:
>      3.29%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
>             |
>              --3.22%--__cgroup_bpf_run_filter_getsockopt
>                        |
>                        |--0.66%--lock_sock_nested
>                        |
>                        |--0.57%--__might_fault
>                        |
>                         --0.56%--release_sock
> 
> So it saves about 1% of the system call. Unfortunately, we still get
> 2-3% of overhead due to another socket lock/unlock :-(
That could be a future exercise to optimize the fast path sockopts. ;)

[ ... ]

> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -16,6 +16,7 @@
>  #include <linux/bpf-cgroup.h>
>  #include <net/sock.h>
>  #include <net/bpf_sk_storage.h>
> +#include <net/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
To be more specific, it should be <uapi/linux/tcp.h>.

>  
>  #include "../cgroup/cgroup-internal.h"
>  
> @@ -1298,6 +1299,7 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
>  	return empty;
>  }
>  
> +
Extra newline.

>  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  {
>  	if (unlikely(max_optlen < 0))
> @@ -1310,6 +1312,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  		max_optlen = PAGE_SIZE;
>  	}
>  
> +	if (max_optlen <= sizeof(ctx->buf)) {
> +		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
> +		 * bytes avoid the cost of kzalloc.
> +		 */
If it needs to respin, it will be good to have a few words here on why
it only BUILD_BUG checks for "struct tcp_zerocopy_receive".

> +		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
> +			     BPF_SOCKOPT_KERN_BUF_SIZE);
> +
> +		ctx->optval = ctx->buf;
> +		ctx->optval_end = ctx->optval + max_optlen;
> +		return max_optlen;
> +	}
> +
Stanislav Fomichev Jan. 5, 2021, 12:19 a.m. UTC | #2
On Mon, Jan 4, 2021 at 4:03 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Mon, Jan 04, 2021 at 02:14:53PM -0800, Stanislav Fomichev wrote:
> > When we attach a bpf program to cgroup/getsockopt any other getsockopt()
> > syscall starts incurring kzalloc/kfree cost. While, in general, it's
> > not an issue, sometimes it is, like in the case of TCP_ZEROCOPY_RECEIVE.
> > TCP_ZEROCOPY_RECEIVE (ab)uses getsockopt system call to implement
> > fastpath for incoming TCP, we don't want to have extra allocations in
> > there.
> >
> > Let add a small buffer on the stack and use it for small (majority)
> > {s,g}etsockopt values. I've started with 128 bytes to cover
> > the options we care about (TCP_ZEROCOPY_RECEIVE which is 32 bytes
> > currently, with some planned extension to 64).
> >
> > It seems natural to do the same for setsockopt, but it's a bit more
> > involved when the BPF program modifies the data (where we have to
> > kmalloc). The assumption is that for the majority of setsockopt
> > calls (which are doing pure BPF options or apply policy) this
> > will bring some benefit as well.
> >
> > Collected some performance numbers using (on a 65k MTU localhost in a VM):
> > $ perf record -g -- ./tcp_mmap -s -z
> > $ ./tcp_mmap -H ::1 -z
> > $ ...
> > $ perf report --symbol-filter=__cgroup_bpf_run_filter_getsockopt
> >
> > Without this patch:
> >      4.81%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_>
> >             |
> >              --4.74%--__cgroup_bpf_run_filter_getsockopt
> >                        |
> >                        |--1.06%--__kmalloc
> >                        |
> >                        |--0.71%--lock_sock_nested
> >                        |
> >                        |--0.62%--__might_fault
> >                        |
> >                         --0.52%--release_sock
> >
> > With the patch applied:
> >      3.29%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
> >             |
> >              --3.22%--__cgroup_bpf_run_filter_getsockopt
> >                        |
> >                        |--0.66%--lock_sock_nested
> >                        |
> >                        |--0.57%--__might_fault
> >                        |
> >                         --0.56%--release_sock
> >
> > So it saves about 1% of the system call. Unfortunately, we still get
> > 2-3% of overhead due to another socket lock/unlock :-(
> That could be a future exercise to optimize the fast path sockopts. ;)
Yeah, I couldn't think of anything simple so far. The only idea I have
is to allow custom implementation for tcp/udp (where we do lock_sock)
and then have existing BPF_CGROUP_RUN_PROG_{S,G}ETSOCKOPT
in net/socket.c as a fallback. Need to experiment more with it.

> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -16,6 +16,7 @@
> >  #include <linux/bpf-cgroup.h>
> >  #include <net/sock.h>
> >  #include <net/bpf_sk_storage.h>
> > +#include <net/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
> To be more specific, it should be <uapi/linux/tcp.h>.
Sure, let's do that. I went with net/tcp.h because
most of the code under net/* doesn't include uapi directly.

> >
> >  #include "../cgroup/cgroup-internal.h"
> >
> > @@ -1298,6 +1299,7 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
> >       return empty;
> >  }
> >
> > +
> Extra newline.
Oops, thanks, will fix.

> >  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >  {
> >       if (unlikely(max_optlen < 0))
> > @@ -1310,6 +1312,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >               max_optlen = PAGE_SIZE;
> >       }
> >
> > +     if (max_optlen <= sizeof(ctx->buf)) {
> > +             /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
> > +              * bytes avoid the cost of kzalloc.
> > +              */
> If it needs to respin, it will be good to have a few words here on why
> it only BUILD_BUG checks for "struct tcp_zerocopy_receive".
Sounds good, will add. I'll wait a day to let others comment and will respin.

> > +             BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
> > +                          BPF_SOCKOPT_KERN_BUF_SIZE);
> > +
> > +             ctx->optval = ctx->buf;
> > +             ctx->optval_end = ctx->optval + max_optlen;
> > +             return max_optlen;
> > +     }
> > +
Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 29c27656165b..54a4225f36d8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1281,6 +1281,8 @@  struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
+#define BPF_SOCKOPT_KERN_BUF_SIZE	64
+
 struct bpf_sockopt_kern {
 	struct sock	*sk;
 	u8		*optval;
@@ -1289,6 +1291,7 @@  struct bpf_sockopt_kern {
 	s32		optname;
 	s32		optlen;
 	s32		retval;
+	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];
 };
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6ec088a96302..e6a5c7aec1ec 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -16,6 +16,7 @@ 
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
+#include <net/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -1298,6 +1299,7 @@  static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 	return empty;
 }
 
+
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 {
 	if (unlikely(max_optlen < 0))
@@ -1310,6 +1312,18 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 		max_optlen = PAGE_SIZE;
 	}
 
+	if (max_optlen <= sizeof(ctx->buf)) {
+		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+		 * bytes avoid the cost of kzalloc.
+		 */
+		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
+			     BPF_SOCKOPT_KERN_BUF_SIZE);
+
+		ctx->optval = ctx->buf;
+		ctx->optval_end = ctx->optval + max_optlen;
+		return max_optlen;
+	}
+
 	ctx->optval = kzalloc(max_optlen, GFP_USER);
 	if (!ctx->optval)
 		return -ENOMEM;
@@ -1321,9 +1335,16 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 
 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
 {
+	if (ctx->optval == ctx->buf)
+		return;
 	kfree(ctx->optval);
 }
 
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
+{
+	return ctx->optval != ctx->buf;
+}
+
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 				       int *optname, char __user *optval,
 				       int *optlen, char **kernel_optval)
@@ -1390,7 +1411,24 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		 */
 		if (ctx.optlen != 0) {
 			*optlen = ctx.optlen;
-			*kernel_optval = ctx.optval;
+			/* We've used bpf_sockopt_kern->buf as an intermediary
+			 * storage, but the BPF program indicates that we need
+			 * to pass this data to the kernel setsockopt handler.
+			 * No way to export on-stack buf, have to allocate a
+			 * new buffer.
+			 */
+			if (!sockopt_buf_allocated(&ctx)) {
+				void *p = kzalloc(ctx.optlen, GFP_USER);
+
+				if (!p) {
+					ret = -ENOMEM;
+					goto out;
+				}
+				memcpy(p, ctx.optval, ctx.optlen);
+				*kernel_optval = p;
+			} else {
+				*kernel_optval = ctx.optval;
+			}
 		}
 	}