diff mbox series

[bpf-next,4/4] bpf: selftests: Add dctcp fallback test

Message ID 20210805050144.1352078-1-kafai@fb.com (mailing list archive)
State Awaiting Upstream
Delegated to: BPF
Headers show
Series bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 8 maintainers not CCed: john.fastabend@gmail.com linux-kselftest@vger.kernel.org toke@redhat.com shuah@kernel.org songliubraving@fb.com sdf@google.com yhs@fb.com kpsingh@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: const array should probably be static const
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Martin KaFai Lau Aug. 5, 2021, 5:01 a.m. UTC
This patch makes the bpf_dctcp test fall back to cubic by
using setsockopt(TCP_CONGESTION) when the tcp flow is not
ecn ready.

It also checks setsockopt() is not available to release().

The settimeo() from the network_helpers.h is used, so the local
one is removed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 tools/testing/selftests/bpf/bpf_tcp_helpers.h |   4 +
 .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 101 ++++++++++++++----
 tools/testing/selftests/bpf/progs/bpf_dctcp.c |  20 ++++
 .../selftests/bpf/progs/bpf_dctcp_release.c   |  26 +++++
 4 files changed, 128 insertions(+), 23 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp_release.c

Comments

Daniel Borkmann Aug. 6, 2021, 4:07 p.m. UTC | #1
On 8/5/21 7:01 AM, Martin KaFai Lau wrote:
> This patch makes the bpf_dctcp test to fallback to cubic by
> using setsockopt(TCP_CONGESTION) when the tcp flow is not
> ecn ready.
> 
> It also checks setsockopt() is not available to release().
> 
> The settimeo() from the network_helpers.h is used, so the local
> one is removed.
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
[...]
> diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
> index fd42247da8b4..48df7ffbefdb 100644
> --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
> +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
> @@ -17,6 +17,9 @@
>   
>   char _license[] SEC("license") = "GPL";
>   
> +volatile const char fallback[TCP_CA_NAME_MAX];
> +const char bpf_dctcp[] = "bpf_dctcp";
> +char cc_res[TCP_CA_NAME_MAX];
>   int stg_result = 0;
>   
>   struct {
> @@ -57,6 +60,23 @@ void BPF_PROG(dctcp_init, struct sock *sk)
>   	struct dctcp *ca = inet_csk_ca(sk);
>   	int *stg;
>   
> +	if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
> +		/* Switch to fallback */
> +		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> +			       (void *)fallback, sizeof(fallback));
> +		/* Switch back to myself which the bpf trampoline
> +		 * stopped calling dctcp_init recursively.
> +		 */
> +		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> +			       (void *)bpf_dctcp, sizeof(bpf_dctcp));
> +		/* Switch back to fallback */
> +		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> +			       (void *)fallback, sizeof(fallback));
> +		bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
> +			       (void *)cc_res, sizeof(cc_res));
> +		return;

Is there a possibility where we later on instead of return refetch ca ptr via
ca = inet_csk_ca(sk) and mangle its struct dctcp fields whereas we're actually
messing with the new ca's internal fields (potentially crashing the kernel e.g.
if there was a pointer in the private struct of the new ca that we'd be corrupting)?

> +	}
> +
>   	ca->prior_rcv_nxt = tp->rcv_nxt;
>   	ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
>   	ca->loss_cwnd = 0;
> diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
> new file mode 100644
> index 000000000000..d836f7c372f0
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2021 Facebook */
> +
> +#include <stddef.h>
> +#include <linux/bpf.h>
> +#include <linux/types.h>
> +#include <linux/stddef.h>
> +#include <linux/tcp.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include "bpf_tcp_helpers.h"
> +
> +char _license[] SEC("license") = "GPL";
> +const char cubic[] = "cubic";
> +
> +void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk)
> +{
> +	bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> +		       (void *)cubic, sizeof(cubic));
> +}
> +
> +SEC(".struct_ops")
> +struct tcp_congestion_ops dctcp_rel = {
> +	.release	= (void *)dctcp_nouse_release,
> +	.name		= "bpf_dctcp_rel",
> +};
>
Martin KaFai Lau Aug. 6, 2021, 5:42 p.m. UTC | #2
On Fri, Aug 06, 2021 at 06:07:01PM +0200, Daniel Borkmann wrote:
> On 8/5/21 7:01 AM, Martin KaFai Lau wrote:
> > This patch makes the bpf_dctcp test to fallback to cubic by
> > using setsockopt(TCP_CONGESTION) when the tcp flow is not
> > ecn ready.
> > 
> > It also checks setsockopt() is not available to release().
> > 
> > The settimeo() from the network_helpers.h is used, so the local
> > one is removed.
> > 
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> [...]
> > diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > index fd42247da8b4..48df7ffbefdb 100644
> > --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > @@ -17,6 +17,9 @@
> >   char _license[] SEC("license") = "GPL";
> > +volatile const char fallback[TCP_CA_NAME_MAX];
> > +const char bpf_dctcp[] = "bpf_dctcp";
> > +char cc_res[TCP_CA_NAME_MAX];
> >   int stg_result = 0;
> >   struct {
> > @@ -57,6 +60,23 @@ void BPF_PROG(dctcp_init, struct sock *sk)
> >   	struct dctcp *ca = inet_csk_ca(sk);
> >   	int *stg;
> > +	if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
> > +		/* Switch to fallback */
> > +		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> > +			       (void *)fallback, sizeof(fallback));
> > +		/* Switch back to myself which the bpf trampoline
> > +		 * stopped calling dctcp_init recursively.
> > +		 */
> > +		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> > +			       (void *)bpf_dctcp, sizeof(bpf_dctcp));
> > +		/* Switch back to fallback */
> > +		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
> > +			       (void *)fallback, sizeof(fallback));
> > +		bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
> > +			       (void *)cc_res, sizeof(cc_res));
> > +		return;
> 
> Is there a possibility where we later on instead of return refetch ca ptr via
> ca = inet_csk_ca(sk) and mangle its struct dctcp fields whereas we're actually
> messing with the new ca's internal fields (potentially crashing the kernel e.g.
> if there was a pointer in the private struct of the new ca that we'd be corrupting)?
Even without switching to another tcp-cc,
a buggy bpf-tcp-cc (e.g. one setting an incorrect cwnd) could also
slow down (or stall) the flow a lot by putting wrong values in its own
icsk_ca_priv.

About the potential pointer value in icsk_ca_priv,
the bpf-tcp-cc can only use the icsk_ca_priv as SCALAR, so switching
to another bpf-tcp-cc should be fine.

If a bpf-tcp-cc is switching to a kernel-tcp-cc, that kernel-tcp-cc
could potentially store a pointer in icsk_ca_priv.  The only case I
know of is tcp_cdg.c: icsk_ca_priv is not large enough for it, so it
has to resort to kcalloc and stores that pointer in icsk_ca_priv.
The other kernel-tcp-cc store their data inline in icsk_ca_priv.
ICSK_CA_PRIV_SIZE has been increased a few times to
store new data inline instead of doing another kmalloc, so
storing inline should be the common case. [cc: Eric]

It could disallow switching to a kernel-tcp-cc, but I think
that would just end up being too limiting and would force people
to create a bpf-tcp-cc shell to mimic the kernel-tcp-cc
during fallback.  Considering that only very few kernel-tcp-cc
store a pointer in icsk_ca_priv, how about imposing a white/black
list for bpf_setsockopt(TCP_CONGESTION), e.g. disallow switching
to tcp_cdg?  In the near future, the tagging feature that
Yonghong is working on can be used to tag the specific kernel-tcp-cc
structs that are switchable from the bpf side (most of them should
be switchable). [cc: Yonghong]

WDYT?

Thanks for the review!
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index e49b7c450b42..5a024646918b 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -12,6 +12,10 @@ 
 SEC("struct_ops/"#name) \
 BPF_PROG(name, args)
 
+#ifndef SOL_TCP
+#define SOL_TCP 6
+#endif
+
 #define tcp_jiffies32 ((__u32)bpf_jiffies64())
 
 struct sock_common {
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index efe1e979affb..b0ba8fa9d0ec 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -4,37 +4,18 @@ 
 #include <linux/err.h>
 #include <netinet/tcp.h>
 #include <test_progs.h>
+#include "network_helpers.h"
 #include "bpf_dctcp.skel.h"
 #include "bpf_cubic.skel.h"
 #include "bpf_tcp_nogpl.skel.h"
+#include "bpf_dctcp_release.skel.h"
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
 static const unsigned int total_bytes = 10 * 1024 * 1024;
-static const struct timeval timeo_sec = { .tv_sec = 10 };
-static const size_t timeo_optlen = sizeof(timeo_sec);
 static int expected_stg = 0xeB9F;
 static int stop, duration;
 
-static int settimeo(int fd)
-{
-	int err;
-
-	err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
-			 timeo_optlen);
-	if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n",
-		  errno))
-		return -1;
-
-	err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
-			 timeo_optlen);
-	if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n",
-		  errno))
-		return -1;
-
-	return 0;
-}
-
 static int settcpca(int fd, const char *tcp_ca)
 {
 	int err;
@@ -61,7 +42,7 @@  static void *server(void *arg)
 		goto done;
 	}
 
-	if (settimeo(fd)) {
+	if (settimeo(fd, 0)) {
 		err = -errno;
 		goto done;
 	}
@@ -114,7 +95,7 @@  static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
 	}
 
 	if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
-	    settimeo(lfd) || settimeo(fd))
+	    settimeo(lfd, 0) || settimeo(fd, 0))
 		goto done;
 
 	/* bind, listen and start server thread to accept */
@@ -267,6 +248,76 @@  static void test_invalid_license(void)
 	libbpf_set_print(old_print_fn);
 }
 
+static void test_dctcp_fallback(void)
+{
+	int err, lfd = -1, cli_fd = -1, srv_fd = -1;
+	struct network_helper_opts opts = {
+		.cc = "cubic",
+	};
+	struct bpf_dctcp *dctcp_skel;
+	struct bpf_link *link = NULL;
+	char srv_cc[16];
+	socklen_t cc_len = sizeof(srv_cc);
+
+	dctcp_skel = bpf_dctcp__open();
+	if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel"))
+		return;
+	strcpy(dctcp_skel->rodata->fallback, "cubic");
+	if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load"))
+		goto done;
+
+	link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp);
+	if (!ASSERT_OK_PTR(link, "dctcp link"))
+		goto done;
+
+	lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+	if (!ASSERT_GE(lfd, 0, "lfd") ||
+	    !ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp"))
+		goto done;
+
+	cli_fd = connect_to_fd_opts(lfd, &opts);
+	if (!ASSERT_GE(cli_fd, 0, "cli_fd"))
+		goto done;
+
+	srv_fd = accept(lfd, NULL, 0);
+	if (!ASSERT_GE(srv_fd, 0, "srv_fd"))
+		goto done;
+	ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res");
+
+	err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len);
+	if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)"))
+		goto done;
+	ASSERT_STREQ(srv_cc, "cubic", "srv_fd cc");
+
+done:
+	bpf_link__destroy(link);
+	bpf_dctcp__destroy(dctcp_skel);
+	if (lfd != -1)
+		close(lfd);
+	if (srv_fd != -1)
+		close(srv_fd);
+	if (cli_fd != -1)
+		close(cli_fd);
+}
+
+static void test_rel_setsockopt(void)
+{
+	struct bpf_dctcp_release *rel_skel;
+	libbpf_print_fn_t old_print_fn;
+
+	err_str = "unknown func bpf_setsockopt";
+	found = false;
+
+	old_print_fn = libbpf_set_print(libbpf_debug_print);
+	rel_skel = bpf_dctcp_release__open_and_load();
+	libbpf_set_print(old_print_fn);
+
+	ASSERT_ERR_PTR(rel_skel, "rel_skel");
+	ASSERT_TRUE(found, "expected_err_msg");
+
+	bpf_dctcp_release__destroy(rel_skel);
+}
+
 void test_bpf_tcp_ca(void)
 {
 	if (test__start_subtest("dctcp"))
@@ -275,4 +326,8 @@  void test_bpf_tcp_ca(void)
 		test_cubic();
 	if (test__start_subtest("invalid_license"))
 		test_invalid_license();
+	if (test__start_subtest("dctcp_fallback"))
+		test_dctcp_fallback();
+	if (test__start_subtest("rel_setsockopt"))
+		test_rel_setsockopt();
 }
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index fd42247da8b4..48df7ffbefdb 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -17,6 +17,9 @@ 
 
 char _license[] SEC("license") = "GPL";
 
+volatile const char fallback[TCP_CA_NAME_MAX];
+const char bpf_dctcp[] = "bpf_dctcp";
+char cc_res[TCP_CA_NAME_MAX];
 int stg_result = 0;
 
 struct {
@@ -57,6 +60,23 @@  void BPF_PROG(dctcp_init, struct sock *sk)
 	struct dctcp *ca = inet_csk_ca(sk);
 	int *stg;
 
+	if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
+		/* Switch to fallback */
+		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+			       (void *)fallback, sizeof(fallback));
+		/* Switch back to myself which the bpf trampoline
+		 * stopped calling dctcp_init recursively.
+		 */
+		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+			       (void *)bpf_dctcp, sizeof(bpf_dctcp));
+		/* Switch back to fallback */
+		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+			       (void *)fallback, sizeof(fallback));
+		bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+			       (void *)cc_res, sizeof(cc_res));
+		return;
+	}
+
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
 	ca->loss_cwnd = 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
new file mode 100644
index 000000000000..d836f7c372f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
@@ -0,0 +1,26 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+const char cubic[] = "cubic";
+
+void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk)
+{
+	bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+		       (void *)cubic, sizeof(cubic));
+}
+
+SEC(".struct_ops")
+struct tcp_congestion_ops dctcp_rel = {
+	.release	= (void *)dctcp_nouse_release,
+	.name		= "bpf_dctcp_rel",
+};