Message ID | 20210805050144.1352078-1-kafai@fb.com (mailing list archive) |
---|---|
State | Awaiting Upstream |
Delegated to: | BPF |
Headers | show |
Series | bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt | expand |
On 8/5/21 7:01 AM, Martin KaFai Lau wrote: > This patch makes the bpf_dctcp test to fallback to cubic by > using setsockopt(TCP_CONGESTION) when the tcp flow is not > ecn ready. > > It also checks setsockopt() is not available to release(). > > The settimeo() from the network_helpers.h is used, so the local > one is removed. > > Signed-off-by: Martin KaFai Lau <kafai@fb.com> [...] > diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c > index fd42247da8b4..48df7ffbefdb 100644 > --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c > +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c > @@ -17,6 +17,9 @@ > > char _license[] SEC("license") = "GPL"; > > +volatile const char fallback[TCP_CA_NAME_MAX]; > +const char bpf_dctcp[] = "bpf_dctcp"; > +char cc_res[TCP_CA_NAME_MAX]; > int stg_result = 0; > > struct { > @@ -57,6 +60,23 @@ void BPF_PROG(dctcp_init, struct sock *sk) > struct dctcp *ca = inet_csk_ca(sk); > int *stg; > > + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { > + /* Switch to fallback */ > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > + (void *)fallback, sizeof(fallback)); > + /* Switch back to myself which the bpf trampoline > + * stopped calling dctcp_init recursively. > + */ > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > + (void *)bpf_dctcp, sizeof(bpf_dctcp)); > + /* Switch back to fallback */ > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > + (void *)fallback, sizeof(fallback)); > + bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION, > + (void *)cc_res, sizeof(cc_res)); > + return; Is there a possibility where we later on instead of return refetch ca ptr via ca = inet_csk_ca(sk) and mangle its struct dctcp fields whereas we're actually messing with the new ca's internal fields (potentially crashing the kernel e.g. if there was a pointer in the private struct of the new ca that we'd be corrupting)? 
> + } > + > ca->prior_rcv_nxt = tp->rcv_nxt; > ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); > ca->loss_cwnd = 0; > diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c > new file mode 100644 > index 000000000000..d836f7c372f0 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c > @@ -0,0 +1,26 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright (c) 2021 Facebook */ > + > +#include <stddef.h> > +#include <linux/bpf.h> > +#include <linux/types.h> > +#include <linux/stddef.h> > +#include <linux/tcp.h> > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_tracing.h> > +#include "bpf_tcp_helpers.h" > + > +char _license[] SEC("license") = "GPL"; > +const char cubic[] = "cubic"; > + > +void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) > +{ > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > + (void *)cubic, sizeof(cubic)); > +} > + > +SEC(".struct_ops") > +struct tcp_congestion_ops dctcp_rel = { > + .release = (void *)dctcp_nouse_release, > + .name = "bpf_dctcp_rel", > +}; >
On Fri, Aug 06, 2021 at 06:07:01PM +0200, Daniel Borkmann wrote: > On 8/5/21 7:01 AM, Martin KaFai Lau wrote: > > This patch makes the bpf_dctcp test to fallback to cubic by > > using setsockopt(TCP_CONGESTION) when the tcp flow is not > > ecn ready. > > > > It also checks setsockopt() is not available to release(). > > > > The settimeo() from the network_helpers.h is used, so the local > > one is removed. > > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com> > [...] > > diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c > > index fd42247da8b4..48df7ffbefdb 100644 > > --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c > > +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c > > @@ -17,6 +17,9 @@ > > char _license[] SEC("license") = "GPL"; > > +volatile const char fallback[TCP_CA_NAME_MAX]; > > +const char bpf_dctcp[] = "bpf_dctcp"; > > +char cc_res[TCP_CA_NAME_MAX]; > > int stg_result = 0; > > struct { > > @@ -57,6 +60,23 @@ void BPF_PROG(dctcp_init, struct sock *sk) > > struct dctcp *ca = inet_csk_ca(sk); > > int *stg; > > + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { > > + /* Switch to fallback */ > > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > > + (void *)fallback, sizeof(fallback)); > > + /* Switch back to myself which the bpf trampoline > > + * stopped calling dctcp_init recursively. > > + */ > > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > > + (void *)bpf_dctcp, sizeof(bpf_dctcp)); > > + /* Switch back to fallback */ > > + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, > > + (void *)fallback, sizeof(fallback)); > > + bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION, > > + (void *)cc_res, sizeof(cc_res)); > > + return; > > Is there a possibility where we later on instead of return refetch ca ptr via > ca = inet_csk_ca(sk) and mangle its struct dctcp fields whereas we're actually > messing with the new ca's internal fields (potentially crashing the kernel e.g. 
> if there was a pointer in the private struct of the new ca that we'd be corrupting)? Without switching to another tcp-cc, if the bpf-tcp-cc was buggy (e.g. setting incorrect cwnd), it could also slow down (or stall) the flow a lot by putting wrong values in its own icsk_ca_priv. About the potential pointer value in icsk_ca_priv, the bpf-tcp-cc can only use the icsk_ca_priv as SCALAR, so switching to another bpf-tcp-cc should be fine. If a bpf-tcp-cc is switching to a kernel-tcp-cc, that kernel-tcp-cc could potentially store a pointer in icsk_ca_priv. The only case I know of is tcp_cdg.c, where icsk_ca_priv is not large enough, so it has to resort to kcalloc and store the resulting pointer in icsk_ca_priv. The other kernel-tcp-ccs store their data inline in icsk_ca_priv. The ICSK_CA_PRIV_SIZE has been increased a few times to store new data inline instead of doing another kmalloc, so this should be the common case. [cc: Eric] We could disallow switching to a kernel-tcp-cc, but I think it will just end up being too limiting and force people to create a bpf-tcp-cc shell to mimic the kernel-tcp-cc during fallback. Considering that only very few kernel-tcp-ccs store a pointer in icsk_ca_priv, how about imposing a white/black list for bpf_setsockopt(TCP_CONGESTION), e.g. disallow switching to tcp_cdg? In the near future, the tagging feature that Yonghong is working on can be used to tag the specific kernel-tcp-cc structs that are switchable from the bpf side (most of them should be switchable). [cc: Yonghong] WDYT? Thanks for the review!
diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index e49b7c450b42..5a024646918b 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -12,6 +12,10 @@ SEC("struct_ops/"#name) \ BPF_PROG(name, args) +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + #define tcp_jiffies32 ((__u32)bpf_jiffies64()) struct sock_common { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index efe1e979affb..b0ba8fa9d0ec 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -4,37 +4,18 @@ #include <linux/err.h> #include <netinet/tcp.h> #include <test_progs.h> +#include "network_helpers.h" #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" #include "bpf_tcp_nogpl.skel.h" +#include "bpf_dctcp_release.skel.h" #define min(a, b) ((a) < (b) ? (a) : (b)) static const unsigned int total_bytes = 10 * 1024 * 1024; -static const struct timeval timeo_sec = { .tv_sec = 10 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int expected_stg = 0xeB9F; static int stop, duration; -static int settimeo(int fd) -{ - int err; - - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen); - if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n", - errno)) - return -1; - - err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen); - if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n", - errno)) - return -1; - - return 0; -} - static int settcpca(int fd, const char *tcp_ca) { int err; @@ -61,7 +42,7 @@ static void *server(void *arg) goto done; } - if (settimeo(fd)) { + if (settimeo(fd, 0)) { err = -errno; goto done; } @@ -114,7 +95,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd) || 
settimeo(fd)) + settimeo(lfd, 0) || settimeo(fd, 0)) goto done; /* bind, listen and start server thread to accept */ @@ -267,6 +248,76 @@ static void test_invalid_license(void) libbpf_set_print(old_print_fn); } +static void test_dctcp_fallback(void) +{ + int err, lfd = -1, cli_fd = -1, srv_fd = -1; + struct network_helper_opts opts = { + .cc = "cubic", + }; + struct bpf_dctcp *dctcp_skel; + struct bpf_link *link = NULL; + char srv_cc[16]; + socklen_t cc_len = sizeof(srv_cc); + + dctcp_skel = bpf_dctcp__open(); + if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) + return; + strcpy(dctcp_skel->rodata->fallback, "cubic"); + if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) + goto done; + + link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (!ASSERT_OK_PTR(link, "dctcp link")) + goto done; + + lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(lfd, 0, "lfd") || + !ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp")) + goto done; + + cli_fd = connect_to_fd_opts(lfd, &opts); + if (!ASSERT_GE(cli_fd, 0, "cli_fd")) + goto done; + + srv_fd = accept(lfd, NULL, 0); + if (!ASSERT_GE(srv_fd, 0, "srv_fd")) + goto done; + ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res"); + + err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len); + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) + goto done; + ASSERT_STREQ(srv_cc, "cubic", "srv_fd cc"); + +done: + bpf_link__destroy(link); + bpf_dctcp__destroy(dctcp_skel); + if (lfd != -1) + close(lfd); + if (srv_fd != -1) + close(srv_fd); + if (cli_fd != -1) + close(cli_fd); +} + +static void test_rel_setsockopt(void) +{ + struct bpf_dctcp_release *rel_skel; + libbpf_print_fn_t old_print_fn; + + err_str = "unknown func bpf_setsockopt"; + found = false; + + old_print_fn = libbpf_set_print(libbpf_debug_print); + rel_skel = bpf_dctcp_release__open_and_load(); + libbpf_set_print(old_print_fn); + + ASSERT_ERR_PTR(rel_skel, "rel_skel"); + ASSERT_TRUE(found, 
"expected_err_msg"); + + bpf_dctcp_release__destroy(rel_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -275,4 +326,8 @@ void test_bpf_tcp_ca(void) test_cubic(); if (test__start_subtest("invalid_license")) test_invalid_license(); + if (test__start_subtest("dctcp_fallback")) + test_dctcp_fallback(); + if (test__start_subtest("rel_setsockopt")) + test_rel_setsockopt(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index fd42247da8b4..48df7ffbefdb 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -17,6 +17,9 @@ char _license[] SEC("license") = "GPL"; +volatile const char fallback[TCP_CA_NAME_MAX]; +const char bpf_dctcp[] = "bpf_dctcp"; +char cc_res[TCP_CA_NAME_MAX]; int stg_result = 0; struct { @@ -57,6 +60,23 @@ void BPF_PROG(dctcp_init, struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); int *stg; + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { + /* Switch to fallback */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)); + /* Switch back to myself which the bpf trampoline + * stopped calling dctcp_init recursively. 
+ */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)bpf_dctcp, sizeof(bpf_dctcp)); + /* Switch back to fallback */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)); + bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)cc_res, sizeof(cc_res)); + return; + } + ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c new file mode 100644 index 000000000000..d836f7c372f0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <stddef.h> +#include <linux/bpf.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/tcp.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +const char cubic[] = "cubic"; + +void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +{ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)cubic, sizeof(cubic)); +} + +SEC(".struct_ops") +struct tcp_congestion_ops dctcp_rel = { + .release = (void *)dctcp_nouse_release, + .name = "bpf_dctcp_rel", +};
This patch makes the bpf_dctcp test fall back to cubic by using setsockopt(TCP_CONGESTION) when the tcp flow is not ecn ready. It also checks that setsockopt() is not available to release(). The settimeo() from the network_helpers.h is used, so the local one is removed. Signed-off-by: Martin KaFai Lau <kafai@fb.com> --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 4 + .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 101 ++++++++++++++---- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 20 ++++ .../selftests/bpf/progs/bpf_dctcp_release.c | 26 +++++ 4 files changed, 128 insertions(+), 23 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp_release.c