Message ID | 20220503171437.666326-5-maximmi@nvidia.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | New BPF helpers to accelerate synproxy | expand |
On Tue, May 3, 2022 at 10:15 AM Maxim Mikityanskiy <maximmi@nvidia.com> wrote: > > This commit adds selftests for the new BPF helpers: > bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}. > > xdp_synproxy_kern.c is a BPF program that generates SYN cookies on > allowed TCP ports and sends SYNACKs to clients, accelerating synproxy > iptables module. > > xdp_synproxy.c is a userspace control application that allows to > configure the following options in runtime: list of allowed ports, MSS, > window scale, TTL. > > A selftest is added to prog_tests that leverages the above programs to > test the functionality of the new helpers. > > Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com> > Reviewed-by: Tariq Toukan <tariqt@nvidia.com> > --- selftests should use "selftests/bpf: " subject prefix, not "bpf: ", please update so it's more obvious that this patch touches selftests and not kernel-side BPF functionality. > tools/testing/selftests/bpf/.gitignore | 1 + > tools/testing/selftests/bpf/Makefile | 5 +- > .../selftests/bpf/prog_tests/xdp_synproxy.c | 109 +++ > .../selftests/bpf/progs/xdp_synproxy_kern.c | 750 ++++++++++++++++++ > tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++ > 5 files changed, 1281 insertions(+), 2 deletions(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c > create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c > create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c > > diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore > index 595565eb68c0..ca2f47f45670 100644 > --- a/tools/testing/selftests/bpf/.gitignore > +++ b/tools/testing/selftests/bpf/.gitignore > @@ -43,3 +43,4 @@ test_cpp > *.tmp > xdpxceiver > xdp_redirect_multi > +xdp_synproxy > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > index bafdc5373a13..8ae602843b16 100644 > --- a/tools/testing/selftests/bpf/Makefile > +++ b/tools/testing/selftests/bpf/Makefile > @@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ > - xdpxceiver xdp_redirect_multi > + xdpxceiver xdp_redirect_multi xdp_synproxy > > -TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read > +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy > > # Emit succinct information message describing current building step > # $1 - generic step name (e.g., CC, LINK, etc); > @@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ > cap_helpers.c > TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ > $(OUTPUT)/liburandom_read.so \ > + $(OUTPUT)/xdp_synproxy \ this is the right way to make external binary available to test_progs flavors, but is there anything inherently requiring external binary instead of having a helper function doing the same? urandom_read has to be a separate binary. > ima_setup.sh \ > $(wildcard progs/btf_dump_test_case_*.c) > TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE > diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c > new file mode 100644 > index 000000000000..e08b28e25047 > --- /dev/null > +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c > @@ -0,0 +1,109 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#include <test_progs.h> > +#include <network_helpers.h> > + > +#define SYS(cmd) ({ \ > + if (!ASSERT_OK(system(cmd), (cmd))) \ > + goto out; \ > +}) > + > +#define SYS_OUT(cmd) ({ \ > + FILE *f = popen((cmd), "r"); \ > + if (!ASSERT_OK_PTR(f, (cmd))) \ > + goto out; \ > + f; \ > +}) > + > +static bool expect_str(char *buf, size_t size, const char *str) > +{ > + if (size != strlen(str)) > + return false; > + return !memcmp(buf, str, size); > +} > + > +void test_xdp_synproxy(void) > +{ > + int server_fd = -1, client_fd = -1, accept_fd = -1; > + struct nstoken *ns = NULL; > + FILE *ctrl_file = NULL; > + char buf[1024]; > + size_t size; > + > + SYS("ip netns add synproxy"); > + > + SYS("ip link add tmp0 type veth peer name tmp1"); > + SYS("ip link set tmp1 netns synproxy"); > + SYS("ip link set tmp0 up"); > + SYS("ip addr replace 198.18.0.1/24 dev tmp0"); > + > + // When checksum offload is enabled, the XDP program sees wrong > + // checksums and drops packets. > + SYS("ethtool -K tmp0 tx off"); > + // Workaround required for veth. don't use C++ comments, please stick to /* */ > + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); > + > + ns = open_netns("synproxy"); > + if (!ASSERT_OK_PTR(ns, "setns")) > + goto out; > + > + SYS("ip link set lo up"); > + SYS("ip link set tmp1 up"); > + SYS("ip addr replace 198.18.0.2/24 dev tmp1"); > + SYS("sysctl -w net.ipv4.tcp_syncookies=2"); > + SYS("sysctl -w net.ipv4.tcp_timestamps=1"); > + SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0"); > + SYS("iptables -t raw -I PREROUTING \ > + -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack"); > + SYS("iptables -t filter -A INPUT \ > + -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \ > + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460"); > + SYS("iptables -t filter -A INPUT \ > + -i tmp1 -m state --state INVALID -j DROP"); > + > + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ > + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); > + size = fread(buf, 1, sizeof(buf), ctrl_file); buf is uninitialized so if fread fail strlen() can cause SIGSEGV or some other failure mode > + pclose(ctrl_file); > + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"), > + "initial SYNACKs")) > + goto out; > + > + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); > + if (!ASSERT_GE(server_fd, 0, "start_server")) > + goto out; > + > + close_netns(ns); > + ns = NULL; > + > + client_fd = connect_to_fd(server_fd, 10000); > + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) > + goto out; > + > + accept_fd = accept(server_fd, NULL, NULL); > + if (!ASSERT_GE(accept_fd, 0, "accept")) > + goto out; > + > + ns = open_netns("synproxy"); > + if (!ASSERT_OK_PTR(ns, "setns")) > + goto out; > + > + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); > + size = fread(buf, 1, sizeof(buf), ctrl_file); > + pclose(ctrl_file); > + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"), > + "SYNACKs after connection")) please use ASSERT_STREQ instead, same above > + goto out; > + > +out: > + if (accept_fd >= 0) > + close(accept_fd); > + if (client_fd >= 0) > + close(client_fd); > + if (server_fd >= 0) > + close(server_fd); > + if (ns) > + close_netns(ns); > + > + system("ip link del tmp0"); > + system("ip netns del synproxy"); > +} > diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c > new file mode 100644 > index 000000000000..9ae85b189072 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c > @@ -0,0 +1,750 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB Can you please elaborate on what Linux-OpenIB license is and why GPL-2.0 isn't enough? We usually have GPL-2.0 or LGPL-2.1 OR BSD-2-Clause > +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ > + > +#include "vmlinux.h" > + > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_endian.h> > +#include <asm/errno.h> > + [...] > + > +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, > + __u32 len, __u8 proto, > + __u32 csum) > +{ > + __u64 s = csum; > + > + s += (__u32)saddr; > + s += (__u32)daddr; > +#if defined(__BIG_ENDIAN__) > + s += proto + len; > +#elif defined(__LITTLE_ENDIAN__) I've got few nudges in libbpf code base previously to use #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ instead (I don't remember the exact reason now, but there was a reason). Let's do the same here for consistency? > + s += (proto + len) << 8; > +#else > +#error Unknown endian > +#endif > + s = (s & 0xffffffff) + (s >> 32); > + s = (s & 0xffffffff) + (s >> 32); > + > + return csum_fold((__u32)s); > +} > + > +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, > + const struct in6_addr *daddr, > + __u32 len, __u8 proto, __u32 csum) > +{ > + __u64 sum = csum; > + int i; > + > +#pragma unroll > + for (i = 0; i < 4; i++) > + sum += (__u32)saddr->in6_u.u6_addr32[i]; > + > +#pragma unroll why unroll? BPF verifier handles such loops just fine, even if compiler decides to not unroll them > + for (i = 0; i < 4; i++) > + sum += (__u32)daddr->in6_u.u6_addr32[i]; > + > + // Don't combine additions to avoid 32-bit overflow. > + sum += bpf_htonl(len); > + sum += bpf_htonl(proto); > + > + sum = (sum & 0xffffffff) + (sum >> 32); > + sum = (sum & 0xffffffff) + (sum >> 32); > + > + return csum_fold((__u32)sum); > +} > + > +static __always_inline __u64 tcp_clock_ns(void) __always_inline isn't mandatory, you can just have static __u64 tcp_clock_ns() here and let compiler decide on inlining? same for below > +{ > + return bpf_ktime_get_ns(); > +} > + > +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) > +{ > + return ns / (NSEC_PER_SEC / TCP_TS_HZ); > +} > + > +static __always_inline __u32 tcp_time_stamp_raw(void) > +{ > + return tcp_ns_to_ts(tcp_clock_ns()); > +} > + [...] > +static __always_inline void values_inc_synacks(void) > +{ > + __u32 key = 1; > + __u32 *value; > + > + value = bpf_map_lookup_elem(&values, &key); > + if (value) > + __sync_fetch_and_add(value, 1); > +} > + > +static __always_inline bool check_port_allowed(__u16 port) > +{ > + __u32 i; > + > + for (i = 0; i < MAX_ALLOWED_PORTS; i++) { > + __u32 key = i; > + __u16 *value; > + > + value = bpf_map_lookup_elem(&allowed_ports, &key); > + > + if (!value) > + break; > + // 0 is a terminator value. Check it first to avoid matching on > + // a forbidden port == 0 and returning true. please no C++ comments (everywhere) > + if (*value == 0) > + break; > + > + if (*value == port) > + return true; > + } > + > + return false; > +} > + [...]
On 2022-05-07 00:34, Andrii Nakryiko wrote: > On Tue, May 3, 2022 at 10:15 AM Maxim Mikityanskiy <maximmi@nvidia.com> wrote: >> >> This commit adds selftests for the new BPF helpers: >> bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}. >> >> xdp_synproxy_kern.c is a BPF program that generates SYN cookies on >> allowed TCP ports and sends SYNACKs to clients, accelerating synproxy >> iptables module. >> >> xdp_synproxy.c is a userspace control application that allows to >> configure the following options in runtime: list of allowed ports, MSS, >> window scale, TTL. >> >> A selftest is added to prog_tests that leverages the above programs to >> test the functionality of the new helpers. >> >> Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com> >> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> >> --- > > selftests should use "selftests/bpf: " subject prefix, not "bpf: ", > please update so it's more obvious that this patch touches selftests > and not kernel-side BPF functionality. > >> tools/testing/selftests/bpf/.gitignore | 1 + >> tools/testing/selftests/bpf/Makefile | 5 +- >> .../selftests/bpf/prog_tests/xdp_synproxy.c | 109 +++ >> .../selftests/bpf/progs/xdp_synproxy_kern.c | 750 ++++++++++++++++++ >> tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++ >> 5 files changed, 1281 insertions(+), 2 deletions(-) >> create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c >> create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c >> create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c >> >> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore >> index 595565eb68c0..ca2f47f45670 100644 >> --- a/tools/testing/selftests/bpf/.gitignore >> +++ b/tools/testing/selftests/bpf/.gitignore >> @@ -43,3 +43,4 @@ test_cpp >> *.tmp >> xdpxceiver >> xdp_redirect_multi >> +xdp_synproxy >> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile >> index bafdc5373a13..8ae602843b16 100644 >> --- a/tools/testing/selftests/bpf/Makefile >> +++ b/tools/testing/selftests/bpf/Makefile >> @@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \ >> TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ >> flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ >> test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ >> - xdpxceiver xdp_redirect_multi >> + xdpxceiver xdp_redirect_multi xdp_synproxy >> >> -TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read >> +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy >> >> # Emit succinct information message describing current building step >> # $1 - generic step name (e.g., CC, LINK, etc); >> @@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ >> cap_helpers.c >> TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ >> $(OUTPUT)/liburandom_read.so \ >> + $(OUTPUT)/xdp_synproxy \ > > this is the right way to make external binary available to test_progs > flavors, but is there anything inherently requiring external binary > instead of having a helper function doing the same? urandom_read has > to be a separate binary. If you remember v1, it used to be a sample, but I was asked to convert it to a selftest, because samples are deprecated. The intention of having this separate binary is to have a sample reference implementation that can be used in real-world scenarios with minor or no changes. >> ima_setup.sh \ >> $(wildcard progs/btf_dump_test_case_*.c) >> TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE >> diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c >> new file mode 100644 >> index 000000000000..e08b28e25047 >> --- /dev/null >> +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c >> @@ -0,0 +1,109 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +#include <test_progs.h> >> +#include <network_helpers.h> >> + >> +#define SYS(cmd) ({ \ >> + if (!ASSERT_OK(system(cmd), (cmd))) \ >> + goto out; \ >> +}) >> + >> +#define SYS_OUT(cmd) ({ \ >> + FILE *f = popen((cmd), "r"); \ >> + if (!ASSERT_OK_PTR(f, (cmd))) \ >> + goto out; \ >> + f; \ >> +}) >> + >> +static bool expect_str(char *buf, size_t size, const char *str) >> +{ >> + if (size != strlen(str)) >> + return false; >> + return !memcmp(buf, str, size); >> +} >> + >> +void test_xdp_synproxy(void) >> +{ >> + int server_fd = -1, client_fd = -1, accept_fd = -1; >> + struct nstoken *ns = NULL; >> + FILE *ctrl_file = NULL; >> + char buf[1024]; >> + size_t size; >> + >> + SYS("ip netns add synproxy"); >> + >> + SYS("ip link add tmp0 type veth peer name tmp1"); >> + SYS("ip link set tmp1 netns synproxy"); >> + SYS("ip link set tmp0 up"); >> + SYS("ip addr replace 198.18.0.1/24 dev tmp0"); > >> + >> + // When checksum offload is enabled, the XDP program sees wrong >> + // checksums and drops packets. >> + SYS("ethtool -K tmp0 tx off"); >> + // Workaround required for veth. > > don't use C++ comments, please stick to /* */ > >> + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); >> + >> + ns = open_netns("synproxy"); >> + if (!ASSERT_OK_PTR(ns, "setns")) >> + goto out; >> + >> + SYS("ip link set lo up"); >> + SYS("ip link set tmp1 up"); >> + SYS("ip addr replace 198.18.0.2/24 dev tmp1"); >> + SYS("sysctl -w net.ipv4.tcp_syncookies=2"); >> + SYS("sysctl -w net.ipv4.tcp_timestamps=1"); >> + SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0"); >> + SYS("iptables -t raw -I PREROUTING \ >> + -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack"); >> + SYS("iptables -t filter -A INPUT \ >> + -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \ >> + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460"); >> + SYS("iptables -t filter -A INPUT \ >> + -i tmp1 -m state --state INVALID -j DROP"); >> + >> + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ >> + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); >> + size = fread(buf, 1, sizeof(buf), ctrl_file); > > buf is uninitialized so if fread fail strlen() can cause SIGSEGV or > some other failure mode No, it will exit on the assert below (size won't be equal to strlen(str)). > >> + pclose(ctrl_file); >> + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"), >> + "initial SYNACKs")) >> + goto out; >> + >> + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); >> + if (!ASSERT_GE(server_fd, 0, "start_server")) >> + goto out; >> + >> + close_netns(ns); >> + ns = NULL; >> + >> + client_fd = connect_to_fd(server_fd, 10000); >> + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) >> + goto out; >> + >> + accept_fd = accept(server_fd, NULL, NULL); >> + if (!ASSERT_GE(accept_fd, 0, "accept")) >> + goto out; >> + >> + ns = open_netns("synproxy"); >> + if (!ASSERT_OK_PTR(ns, "setns")) >> + goto out; >> + >> + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); >> + size = fread(buf, 1, sizeof(buf), ctrl_file); >> + pclose(ctrl_file); >> + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"), >> + "SYNACKs after connection")) > > please use ASSERT_STREQ instead, same above It doesn't fit here for two reasons: * It doesn't consider size (and ignoring size will cause a UB on errors because of the uninitialized buf). * buf is not '\0'-terminated, and ASSERT_STREQ uses strcmp. > >> + goto out; >> + >> +out: >> + if (accept_fd >= 0) >> + close(accept_fd); >> + if (client_fd >= 0) >> + close(client_fd); >> + if (server_fd >= 0) >> + close(server_fd); >> + if (ns) >> + close_netns(ns); >> + >> + system("ip link del tmp0"); >> + system("ip netns del synproxy"); >> +} >> diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c >> new file mode 100644 >> index 000000000000..9ae85b189072 >> --- /dev/null >> +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c >> @@ -0,0 +1,750 @@ >> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > Can you please elaborate on what Linux-OpenIB license is and why > GPL-2.0 isn't enough? We usually have GPL-2.0 or LGPL-2.1 OR > BSD-2-Clause That's the license boilerplate we use in the mlx5e driver. I'll check with the relevant people whether we can submit it as GPL-2.0 solely. >> +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ >> + >> +#include "vmlinux.h" >> + >> +#include <bpf/bpf_helpers.h> >> +#include <bpf/bpf_endian.h> >> +#include <asm/errno.h> >> + > > [...] > >> + >> +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, >> + __u32 len, __u8 proto, >> + __u32 csum) >> +{ >> + __u64 s = csum; >> + >> + s += (__u32)saddr; >> + s += (__u32)daddr; >> +#if defined(__BIG_ENDIAN__) >> + s += proto + len; >> +#elif defined(__LITTLE_ENDIAN__) > > I've got few nudges in libbpf code base previously to use > > #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ > > instead (I don't remember the exact reason now, but there was a > reason). Let's do the same here for consistency? OK. samples/bpf/xdpsock_user.c also still uses __BIG_ENDIAN__. >> + s += (proto + len) << 8; >> +#else >> +#error Unknown endian >> +#endif >> + s = (s & 0xffffffff) + (s >> 32); >> + s = (s & 0xffffffff) + (s >> 32); >> + >> + return csum_fold((__u32)s); >> +} >> + >> +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, >> + const struct in6_addr *daddr, >> + __u32 len, __u8 proto, __u32 csum) >> +{ >> + __u64 sum = csum; >> + int i; >> + >> +#pragma unroll >> + for (i = 0; i < 4; i++) >> + sum += (__u32)saddr->in6_u.u6_addr32[i]; >> + >> +#pragma unroll > > why unroll? BPF verifier handles such loops just fine, even if > compiler decides to not unroll them Optimization, see csum_ipv6_magic in net/ipv6/ip6_checksum.c that has this loop unrolled manually. >> + for (i = 0; i < 4; i++) >> + sum += (__u32)daddr->in6_u.u6_addr32[i]; >> + >> + // Don't combine additions to avoid 32-bit overflow. >> + sum += bpf_htonl(len); >> + sum += bpf_htonl(proto); >> + >> + sum = (sum & 0xffffffff) + (sum >> 32); >> + sum = (sum & 0xffffffff) + (sum >> 32); >> + >> + return csum_fold((__u32)sum); >> +} >> + >> +static __always_inline __u64 tcp_clock_ns(void) > > __always_inline isn't mandatory, you can just have static __u64 > tcp_clock_ns() here and let compiler decide on inlining? same for > below Do you mean just these three functions, or all functions below, or actually all functions in this file? It's not mandatory, but these are simple one-liners, it would be unpleasant to waste an extra call in performance-critical code if the compiler decides not to inline them. >> +{ >> + return bpf_ktime_get_ns(); >> +} >> + >> +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) >> +{ >> + return ns / (NSEC_PER_SEC / TCP_TS_HZ); >> +} >> + >> +static __always_inline __u32 tcp_time_stamp_raw(void) >> +{ >> + return tcp_ns_to_ts(tcp_clock_ns()); >> +} >> + > > [...] > >> +static __always_inline void values_inc_synacks(void) >> +{ >> + __u32 key = 1; >> + __u32 *value; >> + >> + value = bpf_map_lookup_elem(&values, &key); >> + if (value) >> + __sync_fetch_and_add(value, 1); >> +} >> + >> +static __always_inline bool check_port_allowed(__u16 port) >> +{ >> + __u32 i; >> + >> + for (i = 0; i < MAX_ALLOWED_PORTS; i++) { >> + __u32 key = i; >> + __u16 *value; >> + >> + value = bpf_map_lookup_elem(&allowed_ports, &key); >> + >> + if (!value) >> + break; >> + // 0 is a terminator value. Check it first to avoid matching on >> + // a forbidden port == 0 and returning true. > > please no C++ comments (everywhere) > >> + if (*value == 0) >> + break; >> + >> + if (*value == port) >> + return true; >> + } >> + >> + return false; >> +} >> + > > [...]
On Tue, May 10, 2022 at 12:21 PM Maxim Mikityanskiy <maximmi@nvidia.com> wrote: > > On 2022-05-07 00:34, Andrii Nakryiko wrote: > > On Tue, May 3, 2022 at 10:15 AM Maxim Mikityanskiy <maximmi@nvidia.com> wrote: > >> > >> This commit adds selftests for the new BPF helpers: > >> bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}. > >> > >> xdp_synproxy_kern.c is a BPF program that generates SYN cookies on > >> allowed TCP ports and sends SYNACKs to clients, accelerating synproxy > >> iptables module. > >> > >> xdp_synproxy.c is a userspace control application that allows to > >> configure the following options in runtime: list of allowed ports, MSS, > >> window scale, TTL. > >> > >> A selftest is added to prog_tests that leverages the above programs to > >> test the functionality of the new helpers. > >> > >> Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com> > >> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> > >> --- > > > > selftests should use "selftests/bpf: " subject prefix, not "bpf: ", > > please update so it's more obvious that this patch touches selftests > > and not kernel-side BPF functionality. > > > >> tools/testing/selftests/bpf/.gitignore | 1 + > >> tools/testing/selftests/bpf/Makefile | 5 +- > >> .../selftests/bpf/prog_tests/xdp_synproxy.c | 109 +++ > >> .../selftests/bpf/progs/xdp_synproxy_kern.c | 750 ++++++++++++++++++ > >> tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++ > >> 5 files changed, 1281 insertions(+), 2 deletions(-) > >> create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c > >> create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c > >> create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c > >> > >> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore > >> index 595565eb68c0..ca2f47f45670 100644 > >> --- a/tools/testing/selftests/bpf/.gitignore > >> +++ b/tools/testing/selftests/bpf/.gitignore > >> @@ -43,3 +43,4 @@ test_cpp > >> *.tmp > >> xdpxceiver > >> xdp_redirect_multi > >> +xdp_synproxy > >> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > >> index bafdc5373a13..8ae602843b16 100644 > >> --- a/tools/testing/selftests/bpf/Makefile > >> +++ b/tools/testing/selftests/bpf/Makefile > >> @@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > >> TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > >> flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > >> test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ > >> - xdpxceiver xdp_redirect_multi > >> + xdpxceiver xdp_redirect_multi xdp_synproxy > >> > >> -TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read > >> +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy > >> > >> # Emit succinct information message describing current building step > >> # $1 - generic step name (e.g., CC, LINK, etc); > >> @@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ > >> cap_helpers.c > >> TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ > >> $(OUTPUT)/liburandom_read.so \ > >> + $(OUTPUT)/xdp_synproxy \ > > > > this is the right way to make external binary available to test_progs > > flavors, but is there anything inherently requiring external binary > > instead of having a helper function doing the same? urandom_read has > > to be a separate binary. > > If you remember v1, it used to be a sample, but I was asked to convert > it to a selftest, because samples are deprecated. The intention of > having this separate binary is to have a sample reference implementation > that can be used in real-world scenarios with minor or no changes. > Ok, I'll let others chime in if they care enough about this. Selftests are first and foremost a test and not an almost production-ready collection of tools, but fine by me. > >> ima_setup.sh \ > >> $(wildcard progs/btf_dump_test_case_*.c) > >> TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE > >> diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c > >> new file mode 100644 > >> index 000000000000..e08b28e25047 > >> --- /dev/null > >> +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c > >> @@ -0,0 +1,109 @@ > >> +// SPDX-License-Identifier: GPL-2.0 > >> +#include <test_progs.h> > >> +#include <network_helpers.h> > >> + > >> +#define SYS(cmd) ({ \ > >> + if (!ASSERT_OK(system(cmd), (cmd))) \ > >> + goto out; \ > >> +}) > >> + > >> +#define SYS_OUT(cmd) ({ \ > >> + FILE *f = popen((cmd), "r"); \ > >> + if (!ASSERT_OK_PTR(f, (cmd))) \ > >> + goto out; \ > >> + f; \ > >> +}) > >> + > >> +static bool expect_str(char *buf, size_t size, const char *str) > >> +{ > >> + if (size != strlen(str)) > >> + return false; > >> + return !memcmp(buf, str, size); > >> +} > >> + > >> +void test_xdp_synproxy(void) > >> +{ > >> + int server_fd = -1, client_fd = -1, accept_fd = -1; > >> + struct nstoken *ns = NULL; > >> + FILE *ctrl_file = NULL; > >> + char buf[1024]; > >> + size_t size; > >> + > >> + SYS("ip netns add synproxy"); > >> + > >> + SYS("ip link add tmp0 type veth peer name tmp1"); > >> + SYS("ip link set tmp1 netns synproxy"); > >> + SYS("ip link set tmp0 up"); > >> + SYS("ip addr replace 198.18.0.1/24 dev tmp0"); > > > >> + > >> + // When checksum offload is enabled, the XDP program sees wrong > >> + // checksums and drops packets. > >> + SYS("ethtool -K tmp0 tx off"); > >> + // Workaround required for veth. > > > > don't use C++ comments, please stick to /* */ > > > >> + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); > >> + > >> + ns = open_netns("synproxy"); > >> + if (!ASSERT_OK_PTR(ns, "setns")) > >> + goto out; > >> + > >> + SYS("ip link set lo up"); > >> + SYS("ip link set tmp1 up"); > >> + SYS("ip addr replace 198.18.0.2/24 dev tmp1"); > >> + SYS("sysctl -w net.ipv4.tcp_syncookies=2"); > >> + SYS("sysctl -w net.ipv4.tcp_timestamps=1"); > >> + SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0"); > >> + SYS("iptables -t raw -I PREROUTING \ > >> + -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack"); > >> + SYS("iptables -t filter -A INPUT \ > >> + -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \ > >> + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460"); > >> + SYS("iptables -t filter -A INPUT \ > >> + -i tmp1 -m state --state INVALID -j DROP"); > >> + > >> + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ > >> + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); > >> + size = fread(buf, 1, sizeof(buf), ctrl_file); > > > > buf is uninitialized so if fread fail strlen() can cause SIGSEGV or > > some other failure mode > > No, it will exit on the assert below (size won't be equal to strlen(str)). it's better to use ASSERT_STREQ() which will also emit expected and actual strings if they don't match. So maybe check size first, and then ASSERT_STREQ() instead of custom expect_str() "helper"? > > > > >> + pclose(ctrl_file); > >> + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"), > >> + "initial SYNACKs")) > >> + goto out; > >> + > >> + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); > >> + if (!ASSERT_GE(server_fd, 0, "start_server")) > >> + goto out; > >> + > >> + close_netns(ns); > >> + ns = NULL; > >> + > >> + client_fd = connect_to_fd(server_fd, 10000); > >> + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) > >> + goto out; > >> + > >> + accept_fd = accept(server_fd, NULL, NULL); > >> + if (!ASSERT_GE(accept_fd, 0, "accept")) > >> + goto out; > >> + > >> + ns = open_netns("synproxy"); > >> + if (!ASSERT_OK_PTR(ns, "setns")) > >> + goto out; > >> + > >> + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); > >> + size = fread(buf, 1, sizeof(buf), ctrl_file); > >> + pclose(ctrl_file); > >> + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"), > >> + "SYNACKs after connection")) > > > > please use ASSERT_STREQ instead, same above > > It doesn't fit here for two reasons: > > * It doesn't consider size (and ignoring size will cause a UB on errors > because of the uninitialized buf). > > * buf is not '\0'-terminated, and ASSERT_STREQ uses strcmp. can it be non-zero-terminated in normal case? see above about checking for errors separately > > > > >> + goto out; > >> + > >> +out: > >> + if (accept_fd >= 0) > >> + close(accept_fd); > >> + if (client_fd >= 0) > >> + close(client_fd); > >> + if (server_fd >= 0) > >> + close(server_fd); > >> + if (ns) > >> + close_netns(ns); > >> + > >> + system("ip link del tmp0"); > >> + system("ip netns del synproxy"); > >> +} > >> diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c > >> new file mode 100644 > >> index 000000000000..9ae85b189072 > >> --- /dev/null > >> +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c > >> @@ -0,0 +1,750 @@ > >> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > > > Can you please elaborate on what Linux-OpenIB license is and why > > GPL-2.0 isn't enough? We usually have GPL-2.0 or LGPL-2.1 OR > > BSD-2-Clause > > That's the license boilerplate we use in the mlx5e driver. I'll check > with the relevant people whether we can submit it as GPL-2.0 solely. > ok > >> +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ > >> + > >> +#include "vmlinux.h" > >> + > >> +#include <bpf/bpf_helpers.h> > >> +#include <bpf/bpf_endian.h> > >> +#include <asm/errno.h> > >> + > > > > [...] > > > >> + > >> +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, > >> + __u32 len, __u8 proto, > >> + __u32 csum) > >> +{ > >> + __u64 s = csum; > >> + > >> + s += (__u32)saddr; > >> + s += (__u32)daddr; > >> +#if defined(__BIG_ENDIAN__) > >> + s += proto + len; > >> +#elif defined(__LITTLE_ENDIAN__) > > > > I've got few nudges in libbpf code base previously to use > > > > #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ > > > > instead (I don't remember the exact reason now, but there was a > > reason). Let's do the same here for consistency? > > OK. > > samples/bpf/xdpsock_user.c also still uses __BIG_ENDIAN__. > > >> + s += (proto + len) << 8; > >> +#else > >> +#error Unknown endian > >> +#endif > >> + s = (s & 0xffffffff) + (s >> 32); > >> + s = (s & 0xffffffff) + (s >> 32); > >> + > >> + return csum_fold((__u32)s); > >> +} > >> + > >> +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, > >> + const struct in6_addr *daddr, > >> + __u32 len, __u8 proto, __u32 csum) > >> +{ > >> + __u64 sum = csum; > >> + int i; > >> + > >> +#pragma unroll > >> + for (i = 0; i < 4; i++) > >> + sum += (__u32)saddr->in6_u.u6_addr32[i]; > >> + > >> +#pragma unroll > > > > why unroll? BPF verifier handles such loops just fine, even if > > compiler decides to not unroll them > > Optimization, see csum_ipv6_magic in net/ipv6/ip6_checksum.c that has > this loop unrolled manually. > > >> + for (i = 0; i < 4; i++) > >> + sum += (__u32)daddr->in6_u.u6_addr32[i]; > >> + > >> + // Don't combine additions to avoid 32-bit overflow. > >> + sum += bpf_htonl(len); > >> + sum += bpf_htonl(proto); > >> + > >> + sum = (sum & 0xffffffff) + (sum >> 32); > >> + sum = (sum & 0xffffffff) + (sum >> 32); > >> + > >> + return csum_fold((__u32)sum); > >> +} > >> + > >> +static __always_inline __u64 tcp_clock_ns(void) > > > > __always_inline isn't mandatory, you can just have static __u64 > > tcp_clock_ns() here and let compiler decide on inlining? same for > > below > > Do you mean just these three functions, or all functions below, or > actually all functions in this file? > > It's not mandatory, but these are simple one-liners, it would be > unpleasant to waste an extra call in performance-critical code if the > compiler decides not to inline them. > my point was that it's not mandatory anymore. Given this is a hybrid high-performance sample and selftest, I don't care. If it was just a test, there is no point in micro-optimizing this (similar for loop unrolling). > >> +{ > >> + return bpf_ktime_get_ns(); > >> +} > >> + > >> +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) > >> +{ > >> + return ns / (NSEC_PER_SEC / TCP_TS_HZ); > >> +} > >> + > >> +static __always_inline __u32 tcp_time_stamp_raw(void) > >> +{ > >> + return tcp_ns_to_ts(tcp_clock_ns()); > >> +} > >> + > > > > [...] > > [...]
On 2022-05-11 03:10, Andrii Nakryiko wrote: > On Tue, May 10, 2022 at 12:21 PM Maxim Mikityanskiy <maximmi@nvidia.com> wrote: >> >> On 2022-05-07 00:34, Andrii Nakryiko wrote: >>> On Tue, May 3, 2022 at 10:15 AM Maxim Mikityanskiy <maximmi@nvidia.com> wrote: >>>> >>>> This commit adds selftests for the new BPF helpers: >>>> bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}. >>>> >>>> xdp_synproxy_kern.c is a BPF program that generates SYN cookies on >>>> allowed TCP ports and sends SYNACKs to clients, accelerating synproxy >>>> iptables module. >>>> >>>> xdp_synproxy.c is a userspace control application that allows to >>>> configure the following options in runtime: list of allowed ports, MSS, >>>> window scale, TTL. >>>> >>>> A selftest is added to prog_tests that leverages the above programs to >>>> test the functionality of the new helpers. >>>> >>>> Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com> >>>> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> >>>> --- >>> >>> selftests should use "selftests/bpf: " subject prefix, not "bpf: ", >>> please update so it's more obvious that this patch touches selftests >>> and not kernel-side BPF functionality. >>> >>>> tools/testing/selftests/bpf/.gitignore | 1 + >>>> tools/testing/selftests/bpf/Makefile | 5 +- >>>> .../selftests/bpf/prog_tests/xdp_synproxy.c | 109 +++ >>>> .../selftests/bpf/progs/xdp_synproxy_kern.c | 750 ++++++++++++++++++ >>>> tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++ >>>> 5 files changed, 1281 insertions(+), 2 deletions(-) >>>> create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c >>>> create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c >>>> create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c >>>> >>>> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore >>>> index 595565eb68c0..ca2f47f45670 100644 >>>> --- a/tools/testing/selftests/bpf/.gitignore >>>> +++ b/tools/testing/selftests/bpf/.gitignore >>>> @@ -43,3 +43,4 @@ test_cpp >>>> *.tmp >>>> xdpxceiver >>>> xdp_redirect_multi >>>> +xdp_synproxy >>>> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile >>>> index bafdc5373a13..8ae602843b16 100644 >>>> --- a/tools/testing/selftests/bpf/Makefile >>>> +++ b/tools/testing/selftests/bpf/Makefile >>>> @@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \ >>>> TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ >>>> flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ >>>> test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ >>>> - xdpxceiver xdp_redirect_multi >>>> + xdpxceiver xdp_redirect_multi xdp_synproxy >>>> >>>> -TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read >>>> +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy >>>> >>>> # Emit succinct information message describing current building step >>>> # $1 - generic step name (e.g., CC, LINK, etc); >>>> @@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ >>>> cap_helpers.c >>>> TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ >>>> $(OUTPUT)/liburandom_read.so \ >>>> + $(OUTPUT)/xdp_synproxy \ >>> >>> this is the right way to make external binary available to test_progs >>> flavors, but is there anything inherently requiring external binary >>> instead of having a helper function doing the same? urandom_read has >>> to be a separate binary. >> >> If you remember v1, it used to be a sample, but I was asked to convert >> it to a selftest, because samples are deprecated. The intention of >> having this separate binary is to have a sample reference implementation >> that can be used in real-world scenarios with minor or no changes. >> > > Ok, I'll let others chime in if they care enough about this. Selftests > are first and foremost a test and not an almost production-ready > collection of tools, but fine by me. > >>>> ima_setup.sh \ >>>> $(wildcard progs/btf_dump_test_case_*.c) >>>> TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE >>>> diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c >>>> new file mode 100644 >>>> index 000000000000..e08b28e25047 >>>> --- /dev/null >>>> +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c >>>> @@ -0,0 +1,109 @@ >>>> +// SPDX-License-Identifier: GPL-2.0 >>>> +#include <test_progs.h> >>>> +#include <network_helpers.h> >>>> + >>>> +#define SYS(cmd) ({ \ >>>> + if (!ASSERT_OK(system(cmd), (cmd))) \ >>>> + goto out; \ >>>> +}) >>>> + >>>> +#define SYS_OUT(cmd) ({ \ >>>> + FILE *f = popen((cmd), "r"); \ >>>> + if (!ASSERT_OK_PTR(f, (cmd))) \ >>>> + goto out; \ >>>> + f; \ >>>> +}) >>>> + >>>> +static bool expect_str(char *buf, size_t size, const char *str) >>>> +{ >>>> + if (size != strlen(str)) >>>> + return false; >>>> + return !memcmp(buf, str, size); >>>> +} >>>> + >>>> +void test_xdp_synproxy(void) >>>> +{ >>>> + int server_fd = -1, client_fd = -1, accept_fd = -1; >>>> + struct nstoken *ns = NULL; >>>> + FILE *ctrl_file = NULL; >>>> + char buf[1024]; >>>> + size_t size; >>>> + >>>> + SYS("ip netns add synproxy"); >>>> + >>>> + SYS("ip link add tmp0 type veth peer name tmp1"); >>>> + SYS("ip link set tmp1 netns synproxy"); >>>> + SYS("ip link set tmp0 up"); >>>> + SYS("ip addr replace 198.18.0.1/24 dev tmp0"); >>> >>>> + >>>> + // When checksum offload is enabled, the XDP program sees wrong >>>> + // checksums and drops packets. >>>> + SYS("ethtool -K tmp0 tx off"); >>>> + // Workaround required for veth. >>> >>> don't use C++ comments, please stick to /* */ >>> >>>> + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); >>>> + >>>> + ns = open_netns("synproxy"); >>>> + if (!ASSERT_OK_PTR(ns, "setns")) >>>> + goto out; >>>> + >>>> + SYS("ip link set lo up"); >>>> + SYS("ip link set tmp1 up"); >>>> + SYS("ip addr replace 198.18.0.2/24 dev tmp1"); >>>> + SYS("sysctl -w net.ipv4.tcp_syncookies=2"); >>>> + SYS("sysctl -w net.ipv4.tcp_timestamps=1"); >>>> + SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0"); >>>> + SYS("iptables -t raw -I PREROUTING \ >>>> + -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack"); >>>> + SYS("iptables -t filter -A INPUT \ >>>> + -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \ >>>> + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460"); >>>> + SYS("iptables -t filter -A INPUT \ >>>> + -i tmp1 -m state --state INVALID -j DROP"); >>>> + >>>> + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ >>>> + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); >>>> + size = fread(buf, 1, sizeof(buf), ctrl_file); >>> >>> buf is uninitialized so if fread fail strlen() can cause SIGSEGV or >>> some other failure mode >> >> No, it will exit on the assert below (size won't be equal to strlen(str)). > > it's better to use ASSERT_STREQ() which will also emit expected and > actual strings if they don't match. So maybe check size first, and > then ASSERT_STREQ() instead of custom expect_str() "helper"? See below, the command's output is not a string. If I extend my expect_str function to print the expected and actual outputs, would that work for you? >> >>> >>>> + pclose(ctrl_file); >>>> + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"), >>>> + "initial SYNACKs")) >>>> + goto out; >>>> + >>>> + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); >>>> + if (!ASSERT_GE(server_fd, 0, "start_server")) >>>> + goto out; >>>> + >>>> + close_netns(ns); >>>> + ns = NULL; >>>> + >>>> + client_fd = connect_to_fd(server_fd, 10000); >>>> + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) >>>> + goto out; >>>> + >>>> + accept_fd = accept(server_fd, NULL, NULL); >>>> + if (!ASSERT_GE(accept_fd, 0, "accept")) >>>> + goto out; >>>> + >>>> + ns = open_netns("synproxy"); >>>> + if (!ASSERT_OK_PTR(ns, "setns")) >>>> + goto out; >>>> + >>>> + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); >>>> + size = fread(buf, 1, sizeof(buf), ctrl_file); >>>> + pclose(ctrl_file); >>>> + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"), >>>> + "SYNACKs after connection")) >>> >>> please use ASSERT_STREQ instead, same above >> >> It doesn't fit here for two reasons: >> >> * It doesn't consider size (and ignoring size will cause a UB on errors >> because of the uninitialized buf). >> >> * buf is not '\0'-terminated, and ASSERT_STREQ uses strcmp. > > can it be non-zero-terminated in normal case? see above about checking > for errors separately fread(buf, x, y, file) just reads up to x*y bytes into buf. It doesn't treat it as a zero-terminated string, it can be any binary sequence of bytes. So, yes, in normal case it's going to be some printable characters WITHOUT any terminating '\0' (no one prints a '\0' to the terminal in normal cases). In bad cases it could be anything, including '\0' in the middle of the buffer, which would terminate strcmp early and can cause false positives. >> >>> >>>> + goto out; >>>> + >>>> +out: >>>> + if (accept_fd >= 0) >>>> + close(accept_fd); >>>> + if (client_fd >= 0) >>>> + close(client_fd); >>>> + if (server_fd >= 0) >>>> + close(server_fd); >>>> + if (ns) >>>> + close_netns(ns); >>>> + >>>> + system("ip link del tmp0"); >>>> + system("ip netns del synproxy"); >>>> +} >>>> diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c >>>> new file mode 100644 >>>> index 000000000000..9ae85b189072 >>>> --- /dev/null >>>> +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c >>>> @@ -0,0 +1,750 @@ >>>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB >>> >>> Can you please elaborate on what Linux-OpenIB license is and why >>> GPL-2.0 isn't enough? We usually have GPL-2.0 or LGPL-2.1 OR >>> BSD-2-Clause >> >> That's the license boilerplate we use in the mlx5e driver. I'll check >> with the relevant people whether we can submit it as GPL-2.0 solely. >> > > ok > >>>> +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ >>>> + >>>> +#include "vmlinux.h" >>>> + >>>> +#include <bpf/bpf_helpers.h> >>>> +#include <bpf/bpf_endian.h> >>>> +#include <asm/errno.h> >>>> + >>> >>> [...] >>> >>>> + >>>> +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, >>>> + __u32 len, __u8 proto, >>>> + __u32 csum) >>>> +{ >>>> + __u64 s = csum; >>>> + >>>> + s += (__u32)saddr; >>>> + s += (__u32)daddr; >>>> +#if defined(__BIG_ENDIAN__) >>>> + s += proto + len; >>>> +#elif defined(__LITTLE_ENDIAN__) >>> >>> I've got few nudges in libbpf code base previously to use >>> >>> #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ >>> #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ >>> >>> instead (I don't remember the exact reason now, but there was a >>> reason). Let's do the same here for consistency? >> >> OK. >> >> samples/bpf/xdpsock_user.c also still uses __BIG_ENDIAN__. >> >>>> + s += (proto + len) << 8; >>>> +#else >>>> +#error Unknown endian >>>> +#endif >>>> + s = (s & 0xffffffff) + (s >> 32); >>>> + s = (s & 0xffffffff) + (s >> 32); >>>> + >>>> + return csum_fold((__u32)s); >>>> +} >>>> + >>>> +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, >>>> + const struct in6_addr *daddr, >>>> + __u32 len, __u8 proto, __u32 csum) >>>> +{ >>>> + __u64 sum = csum; >>>> + int i; >>>> + >>>> +#pragma unroll >>>> + for (i = 0; i < 4; i++) >>>> + sum += (__u32)saddr->in6_u.u6_addr32[i]; >>>> + >>>> +#pragma unroll >>> >>> why unroll? BPF verifier handles such loops just fine, even if >>> compiler decides to not unroll them >> >> Optimization, see csum_ipv6_magic in net/ipv6/ip6_checksum.c that has >> this loop unrolled manually. >> >>>> + for (i = 0; i < 4; i++) >>>> + sum += (__u32)daddr->in6_u.u6_addr32[i]; >>>> + >>>> + // Don't combine additions to avoid 32-bit overflow. >>>> + sum += bpf_htonl(len); >>>> + sum += bpf_htonl(proto); >>>> + >>>> + sum = (sum & 0xffffffff) + (sum >> 32); >>>> + sum = (sum & 0xffffffff) + (sum >> 32); >>>> + >>>> + return csum_fold((__u32)sum); >>>> +} >>>> + >>>> +static __always_inline __u64 tcp_clock_ns(void) >>> >>> __always_inline isn't mandatory, you can just have static __u64 >>> tcp_clock_ns() here and let compiler decide on inlining? same for >>> below >> >> Do you mean just these three functions, or all functions below, or >> actually all functions in this file? >> >> It's not mandatory, but these are simple one-liners, it would be >> unpleasant to waste an extra call in performance-critical code if the >> compiler decides not to inline them. >> > > my point was that it's not mandatory anymore. Given this is a hybrid > high-performance sample and selftest, I don't care. If it was just a > test, there is no point in micro-optimizing this (similar for loop > unrolling). > >>>> +{ >>>> + return bpf_ktime_get_ns(); >>>> +} >>>> + >>>> +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) >>>> +{ >>>> + return ns / (NSEC_PER_SEC / TCP_TS_HZ); >>>> +} >>>> + >>>> +static __always_inline __u32 tcp_time_stamp_raw(void) >>>> +{ >>>> + return tcp_ns_to_ts(tcp_clock_ns()); >>>> +} >>>> + >>> >>> [...] >>> > > [...]
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 595565eb68c0..ca2f47f45670 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -43,3 +43,4 @@ test_cpp *.tmp xdpxceiver xdp_redirect_multi +xdp_synproxy diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index bafdc5373a13..8ae602843b16 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xdpxceiver xdp_redirect_multi + xdpxceiver xdp_redirect_multi xdp_synproxy -TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy # Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); @@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ cap_helpers.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ + $(OUTPUT)/xdp_synproxy \ ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c new file mode 100644 index 000000000000..e08b28e25047 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> + +#define SYS(cmd) ({ \ + if (!ASSERT_OK(system(cmd), (cmd))) \ + goto out; \ +}) + +#define SYS_OUT(cmd) ({ \ + FILE *f = popen((cmd), "r"); \ + if (!ASSERT_OK_PTR(f, (cmd))) \ + goto out; \ + f; \ +}) + +static bool expect_str(char *buf, size_t size, const char *str) +{ + if (size != strlen(str)) + return false; + return !memcmp(buf, str, size); +} + +void test_xdp_synproxy(void) +{ + int server_fd = -1, client_fd = -1, accept_fd = -1; + struct nstoken *ns = NULL; + FILE *ctrl_file = NULL; + char buf[1024]; + size_t size; + + SYS("ip netns add synproxy"); + + SYS("ip link add tmp0 type veth peer name tmp1"); + SYS("ip link set tmp1 netns synproxy"); + SYS("ip link set tmp0 up"); + SYS("ip addr replace 198.18.0.1/24 dev tmp0"); + + // When checksum offload is enabled, the XDP program sees wrong + // checksums and drops packets. + SYS("ethtool -K tmp0 tx off"); + // Workaround required for veth. + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); + + ns = open_netns("synproxy"); + if (!ASSERT_OK_PTR(ns, "setns")) + goto out; + + SYS("ip link set lo up"); + SYS("ip link set tmp1 up"); + SYS("ip addr replace 198.18.0.2/24 dev tmp1"); + SYS("sysctl -w net.ipv4.tcp_syncookies=2"); + SYS("sysctl -w net.ipv4.tcp_timestamps=1"); + SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0"); + SYS("iptables -t raw -I PREROUTING \ + -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack"); + SYS("iptables -t filter -A INPUT \ + -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \ + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460"); + SYS("iptables -t filter -A INPUT \ + -i tmp1 -m state --state INVALID -j DROP"); + + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); + size = fread(buf, 1, sizeof(buf), ctrl_file); + pclose(ctrl_file); + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"), + "initial SYNACKs")) + goto out; + + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto out; + + close_netns(ns); + ns = NULL; + + client_fd = connect_to_fd(server_fd, 10000); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto out; + + accept_fd = accept(server_fd, NULL, NULL); + if (!ASSERT_GE(accept_fd, 0, "accept")) + goto out; + + ns = open_netns("synproxy"); + if (!ASSERT_OK_PTR(ns, "setns")) + goto out; + + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); + size = fread(buf, 1, sizeof(buf), ctrl_file); + pclose(ctrl_file); + if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"), + "SYNACKs after connection")) + goto out; + +out: + if (accept_fd >= 0) + close(accept_fd); + if (client_fd >= 0) + close(client_fd); + if (server_fd >= 0) + close(server_fd); + if (ns) + close_netns(ns); + + system("ip link del tmp0"); + system("ip netns del synproxy"); +} diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c new file mode 100644 index 000000000000..9ae85b189072 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <asm/errno.h> + +#define NSEC_PER_SEC 1000000000L + +#define ETH_ALEN 6 +#define ETH_P_IP 0x0800 +#define ETH_P_IPV6 0x86DD + +#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) + +#define IP_DF 0x4000 +#define IP_MF 0x2000 +#define IP_OFFSET 0x1fff + +#define NEXTHDR_TCP 6 + +#define TCPOPT_NOP 1 +#define TCPOPT_EOL 0 +#define TCPOPT_MSS 2 +#define TCPOPT_WINDOW 3 +#define TCPOPT_SACK_PERM 4 +#define TCPOPT_TIMESTAMP 8 + +#define TCPOLEN_MSS 4 +#define TCPOLEN_WINDOW 3 +#define TCPOLEN_SACK_PERM 2 +#define TCPOLEN_TIMESTAMP 10 + +#define TCP_TS_HZ 1000 +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK (1 << 4) +#define TS_OPT_ECN (1 << 5) +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) +#define TCP_MAX_WSCALE 14U + +#define IPV4_MAXLEN 60 +#define TCP_MAXLEN 60 + +#define DEFAULT_MSS4 1460 +#define DEFAULT_MSS6 1440 +#define DEFAULT_WSCALE 7 +#define DEFAULT_TTL 64 +#define MAX_ALLOWED_PORTS 8 + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 2); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u16); + __uint(max_entries, MAX_ALLOWED_PORTS); +} allowed_ports SEC(".maps"); + +extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, + struct bpf_sock_tuple *bpf_tuple, + __u32 len_tuple, + struct bpf_ct_opts *opts, + __u32 len_opts) __ksym; + +extern void bpf_ct_release(struct nf_conn *ct) __ksym; + +static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) +{ + __u8 tmp[ETH_ALEN]; + + __builtin_memcpy(tmp, a, ETH_ALEN); + __builtin_memcpy(a, b, ETH_ALEN); + __builtin_memcpy(b, tmp, ETH_ALEN); +} + +static __always_inline __u16 csum_fold(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (__u16)~csum; +} + +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, + __u32 csum) +{ + __u64 s = csum; + + s += (__u32)saddr; + s += (__u32)daddr; +#if defined(__BIG_ENDIAN__) + s += proto + len; +#elif defined(__LITTLE_ENDIAN__) + s += (proto + len) << 8; +#else +#error Unknown endian +#endif + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __u32 csum) +{ + __u64 sum = csum; + int i; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)saddr->in6_u.u6_addr32[i]; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)daddr->in6_u.u6_addr32[i]; + + // Don't combine additions to avoid 32-bit overflow. + sum += bpf_htonl(len); + sum += bpf_htonl(proto); + + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + + return csum_fold((__u32)sum); +} + +static __always_inline __u64 tcp_clock_ns(void) +{ + return bpf_ktime_get_ns(); +} + +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) +{ + return ns / (NSEC_PER_SEC / TCP_TS_HZ); +} + +static __always_inline __u32 tcp_time_stamp_raw(void) +{ + return tcp_ns_to_ts(tcp_clock_ns()); +} + +struct tcpopt_context { + __u8 *ptr; + __u8 *end; + void *data_end; + __be32 *tsecr; + __u8 wscale; + bool option_timestamp; + bool option_sack; +}; + +static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +{ + __u8 opcode, opsize; + + if (ctx->ptr >= ctx->end) + return 1; + if (ctx->ptr >= ctx->data_end) + return 1; + + opcode = ctx->ptr[0]; + + if (opcode == TCPOPT_EOL) + return 1; + if (opcode == TCPOPT_NOP) { + ++ctx->ptr; + return 0; + } + + if (ctx->ptr + 1 >= ctx->end) + return 1; + if (ctx->ptr + 1 >= ctx->data_end) + return 1; + opsize = ctx->ptr[1]; + if (opsize < 2) + return 1; + + if (ctx->ptr + opsize > ctx->end) + return 1; + + switch (opcode) { + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) + ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { + ctx->option_timestamp = true; + /* Client's tsval becomes our tsecr. */ + *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + ctx->option_sack = true; + break; + } + + ctx->ptr += opsize; + + return 0; +} + +static int tscookie_tcpopt_parse_batch(__u32 index, void *context) +{ + int i; + + for (i = 0; i < 7; i++) + if (tscookie_tcpopt_parse(context)) + return 1; + return 0; +} + +static __always_inline bool tscookie_init(struct tcphdr *tcp_header, + __u16 tcp_len, __be32 *tsval, + __be32 *tsecr, void *data_end) +{ + struct tcpopt_context loop_ctx = { + .ptr = (__u8 *)(tcp_header + 1), + .end = (__u8 *)tcp_header + tcp_len, + .data_end = data_end, + .tsecr = tsecr, + .wscale = TS_OPT_WSCALE_MASK, + .option_timestamp = false, + .option_sack = false, + }; + u32 cookie; + + bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); + + if (!loop_ctx.option_timestamp) + return false; + + cookie = tcp_time_stamp_raw() & ~TSMASK; + cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; + if (loop_ctx.option_sack) + cookie |= TS_OPT_SACK; + if (tcp_header->ece && tcp_header->cwr) + cookie |= TS_OPT_ECN; + *tsval = bpf_htonl(cookie); + + return true; +} + +static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, + __u8 *ttl, bool ipv6) +{ + __u32 key = 0; + __u64 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value && *value != 0) { + if (ipv6) + *mss = (*value >> 32) & 0xffff; + else + *mss = *value & 0xffff; + *wscale = (*value >> 16) & 0xf; + *ttl = (*value >> 24) & 0xff; + return; + } + + *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; + *wscale = DEFAULT_WSCALE; + *ttl = DEFAULT_TTL; +} + +static __always_inline void values_inc_synacks(void) +{ + __u32 key = 1; + __u32 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value) + __sync_fetch_and_add(value, 1); +} + +static __always_inline bool check_port_allowed(__u16 port) +{ + __u32 i; + + for (i = 0; i < MAX_ALLOWED_PORTS; i++) { + __u32 key = i; + __u16 *value; + + value = bpf_map_lookup_elem(&allowed_ports, &key); + + if (!value) + break; + // 0 is a terminator value. Check it first to avoid matching on + // a forbidden port == 0 and returning true. + if (*value == 0) + break; + + if (*value == port) + return true; + } + + return false; +} + +struct header_pointers { + struct ethhdr *eth; + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + struct tcphdr *tcp; + __u16 tcp_len; +}; + +static __always_inline int tcp_dissect(void *data, void *data_end, + struct header_pointers *hdr) +{ + hdr->eth = data; + if (hdr->eth + 1 > data_end) + return XDP_DROP; + + switch (bpf_ntohs(hdr->eth->h_proto)) { + case ETH_P_IP: + hdr->ipv6 = NULL; + + hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv4 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) + return XDP_DROP; + if (hdr->ipv4->version != 4) + return XDP_DROP; + + if (hdr->ipv4->protocol != IPPROTO_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; + break; + case ETH_P_IPV6: + hdr->ipv4 = NULL; + + hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv6 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv6->version != 6) + return XDP_DROP; + + // XXX: Extension headers are not supported and could circumvent + // XDP SYN flood protection. + if (hdr->ipv6->nexthdr != NEXTHDR_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); + break; + default: + // XXX: VLANs will circumvent XDP SYN flood protection. + return XDP_PASS; + } + + if (hdr->tcp + 1 > data_end) + return XDP_DROP; + hdr->tcp_len = hdr->tcp->doff * 4; + if (hdr->tcp_len < sizeof(*hdr->tcp)) + return XDP_DROP; + + return XDP_TX; +} + +static __always_inline int tcp_lookup(struct xdp_md *ctx, struct header_pointers *hdr) +{ + struct bpf_ct_opts ct_lookup_opts = { + .netns_id = BPF_F_CURRENT_NETNS, + .l4proto = IPPROTO_TCP, + }; + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + __u32 tup_size; + + if (hdr->ipv4) { + // TCP doesn't normally use fragments, and XDP can't reassemble them. + if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) + return XDP_DROP; + + tup.ipv4.saddr = hdr->ipv4->saddr; + tup.ipv4.daddr = hdr->ipv4->daddr; + tup.ipv4.sport = hdr->tcp->source; + tup.ipv4.dport = hdr->tcp->dest; + tup_size = sizeof(tup.ipv4); + } else if (hdr->ipv6) { + __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); + __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); + tup.ipv6.sport = hdr->tcp->source; + tup.ipv6.dport = hdr->tcp->dest; + tup_size = sizeof(tup.ipv6); + } else { + // The verifier can't track that either ipv4 or ipv6 is not NULL. + return XDP_ABORTED; + } + ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); + if (ct) { + unsigned long status = ct->status; + + bpf_ct_release(ct); + if (status & IPS_CONFIRMED_BIT) + return XDP_PASS; + } else if (ct_lookup_opts.error != -ENOENT) { + return XDP_ABORTED; + } + + // error == -ENOENT || !(status & IPS_CONFIRMED_BIT) + return XDP_TX; +} + +static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, + __u8 wscale) +{ + __be32 *start = buf; + + *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + + if (!tsopt) + return buf - start; + + if (tsopt[0] & bpf_htonl(1 << 4)) + *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *buf++ = tsopt[0]; + *buf++ = tsopt[1]; + + if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + wscale); + + return buf - start; +} + +static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, + __u32 cookie, __be32 *tsopt, + __u16 mss, __u8 wscale) +{ + void *tcp_options; + + tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) + tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; + tcp_header->doff = 5; // doff is part of tcp_flag_word. + swap(tcp_header->source, tcp_header->dest); + tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); + tcp_header->seq = bpf_htonl(cookie); + tcp_header->window = 0; + tcp_header->urg_ptr = 0; + tcp_header->check = 0; // Rely on hardware checksum offload. + + tcp_options = (void *)(tcp_header + 1); + tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); +} + +static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, false); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv4->saddr, hdr->ipv4->daddr); + hdr->ipv4->check = 0; // Rely on hardware checksum offload. + hdr->ipv4->tos = 0; + hdr->ipv4->id = 0; + hdr->ipv4->ttl = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); +} + +static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, true); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv6->saddr, hdr->ipv6->daddr); + *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); + hdr->ipv6->hop_limit = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); +} + +static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, + struct xdp_md *ctx, + void *data, void *data_end) +{ + __u32 old_pkt_size, new_pkt_size; + // Unlike clang 10, clang 11 and 12 generate code that doesn't pass the + // BPF verifier if tsopt is not volatile. Volatile forces it to store + // the pointer value and use it directly, otherwise tcp_mkoptions is + // (mis)compiled like this: + // if (!tsopt) + // return buf - start; + // reg = stored_return_value_of_tscookie_init; + // if (reg) + // tsopt = tsopt_buf; + // else + // tsopt = NULL; + // ... + // *buf++ = tsopt[1]; + // It creates a dead branch where tsopt is assigned NULL, but the + // verifier can't prove it's dead and blocks the program. + __be32 * volatile tsopt = NULL; + __be32 tsopt_buf[2] = {}; + __u16 ip_len; + __u32 cookie; + __s64 value; + + // Checksum is not yet verified, but both checksum failure and TCP + // header checks return XDP_DROP, so the order doesn't matter. + if (hdr->tcp->fin || hdr->tcp->rst) + return XDP_DROP; + + // Issue SYN cookies on allowed ports, drop SYN packets on blocked + // ports. + if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) + return XDP_DROP; + + if (hdr->ipv4) { + // Check the IPv4 and TCP checksums before creating a SYNACK. + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_fold(value) != 0) + return XDP_DROP; // Bad IPv4 checksum. + + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; // Bad TCP checksum. + + ip_len = sizeof(*hdr->ipv4); + + value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, + hdr->tcp_len); + } else if (hdr->ipv6) { + // Check the TCP checksum before creating a SYNACK. + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; // Bad TCP checksum. + + ip_len = sizeof(*hdr->ipv6); + + value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, + hdr->tcp_len); + } else { + return XDP_ABORTED; + } + + if (value < 0) + return XDP_ABORTED; + cookie = (__u32)value; + + if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, + &tsopt_buf[0], &tsopt_buf[1], data_end)) + tsopt = tsopt_buf; + + // Check that there is enough space for a SYNACK. It also covers + // the check that the destination of the __builtin_memmove below + // doesn't overflow. + if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + if (hdr->ipv4) { + if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { + struct tcphdr *new_tcp_header; + + new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); + __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); + hdr->tcp = new_tcp_header; + + hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; + } + + tcpv4_gen_synack(hdr, cookie, tsopt); + } else if (hdr->ipv6) { + tcpv6_gen_synack(hdr, cookie, tsopt); + } else { + return XDP_ABORTED; + } + + // Recalculate checksums. + hdr->tcp->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (hdr->ipv4) { + hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, + hdr->ipv4->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + + hdr->ipv4->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); + if (value < 0) + return XDP_ABORTED; + hdr->ipv4->check = csum_fold(value); + } else if (hdr->ipv6) { + hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, + &hdr->ipv6->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + } else { + return XDP_ABORTED; + } + + // Set the new packet size. + old_pkt_size = data_end - data; + new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; + if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) + return XDP_ABORTED; + + values_inc_synacks(); + + return XDP_TX; +} + +static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) +{ + int err; + + if (hdr->tcp->rst) + return XDP_DROP; + + if (hdr->ipv4) + err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); + else if (hdr->ipv6) + err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); + else + return XDP_ABORTED; + if (err) + return XDP_DROP; + + return XDP_PASS; +} + +SEC("xdp") +int syncookie_xdp(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct header_pointers hdr; + __s64 value; + int ret; + + struct bpf_ct_opts ct_lookup_opts = { + .netns_id = BPF_F_CURRENT_NETNS, + .l4proto = IPPROTO_TCP, + }; + + ret = tcp_dissect(data, data_end, &hdr); + if (ret != XDP_TX) + return ret; + + ret = tcp_lookup(ctx, &hdr); + if (ret != XDP_TX) + return ret; + + // Packet is TCP and doesn't belong to an established connection. + + if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1) + return XDP_DROP; + + // Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len + // to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. + if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len)) + return XDP_ABORTED; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + if (hdr.ipv4) { + hdr.eth = data; + hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth); + // IPV4_MAXLEN is needed when calculating checksum. + // At least sizeof(struct iphdr) is needed here to access ihl. + if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end) + return XDP_ABORTED; + hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4; + } else if (hdr.ipv6) { + hdr.eth = data; + hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth); + hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6); + } else { + return XDP_ABORTED; + } + + if ((void *)hdr.tcp + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + // We run out of registers, tcp_len gets spilled to the stack, and the + // verifier forgets its min and max values checked above in tcp_dissect. + hdr.tcp_len = hdr.tcp->doff * 4; + if (hdr.tcp_len < sizeof(*hdr.tcp)) + return XDP_ABORTED; + + return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) : + syncookie_handle_ack(&hdr); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c new file mode 100644 index 000000000000..2b9585f2bc00 --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_synproxy.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include <stdnoreturn.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <getopt.h> +#include <signal.h> +#include <sys/types.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <net/if.h> +#include <linux/if_link.h> +#include <linux/limits.h> + +static unsigned int ifindex; +static __u32 attached_prog_id; + +static void noreturn cleanup(int sig) +{ + DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, opts); + int prog_fd; + int err; + + if (attached_prog_id == 0) + exit(0); + + prog_fd = bpf_prog_get_fd_by_id(attached_prog_id); + if (prog_fd < 0) { + fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); + err = bpf_xdp_attach(ifindex, -1, 0, NULL); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); + fprintf(stderr, "Failed to detach XDP program\n"); + exit(1); + } + } else { + opts.old_prog_fd = prog_fd; + err = bpf_xdp_attach(ifindex, -1, XDP_FLAGS_REPLACE, &opts); + close(prog_fd); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd_opts: %s\n", strerror(-err)); + // Not an error if already replaced by someone else. + if (err != -EEXIST) { + fprintf(stderr, "Failed to detach XDP program\n"); + exit(1); + } + } + } + exit(0); +} + +static noreturn void usage(const char *progname) +{ + fprintf(stderr, "Usage: %s [--iface <iface>|--prog <prog_id>] [--mss4 <mss ipv4> --mss6 <mss ipv6> --wscale <wscale> --ttl <ttl>] [--ports <port1>,<port2>,...] [--single]\n", + progname); + exit(1); +} + +static unsigned long parse_arg_ul(const char *progname, const char *arg, unsigned long limit) +{ + unsigned long res; + char *endptr; + + errno = 0; + res = strtoul(arg, &endptr, 10); + if (errno != 0 || *endptr != '\0' || arg[0] == '\0' || res > limit) + usage(progname); + + return res; +} + +static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *prog_id, + __u64 *tcpipopts, char **ports, bool *single) +{ + static struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "iface", required_argument, NULL, 'i' }, + { "prog", required_argument, NULL, 'x' }, + { "mss4", required_argument, NULL, 4 }, + { "mss6", required_argument, NULL, 6 }, + { "wscale", required_argument, NULL, 'w' }, + { "ttl", required_argument, NULL, 't' }, + { "ports", required_argument, NULL, 'p' }, + { "single", no_argument, NULL, 's' }, + { NULL, 0, NULL, 0 }, + }; + unsigned long mss4, mss6, wscale, ttl; + unsigned int tcpipopts_mask = 0; + + if (argc < 2) + usage(argv[0]); + + *ifindex = 0; + *prog_id = 0; + *tcpipopts = 0; + *ports = NULL; + *single = false; + + while (true) { + int opt; + + opt = getopt_long(argc, argv, "", long_options, NULL); + if (opt == -1) + break; + + switch (opt) { + case 'h': + usage(argv[0]); + break; + case 'i': + *ifindex = if_nametoindex(optarg); + if (*ifindex == 0) + usage(argv[0]); + break; + case 'x': + *prog_id = parse_arg_ul(argv[0], optarg, UINT32_MAX); + if (*prog_id == 0) + usage(argv[0]); + break; + case 4: + mss4 = parse_arg_ul(argv[0], optarg, UINT16_MAX); + tcpipopts_mask |= 1 << 0; + break; + case 6: + mss6 = parse_arg_ul(argv[0], optarg, UINT16_MAX); + tcpipopts_mask |= 1 << 1; + break; + case 'w': + wscale = parse_arg_ul(argv[0], optarg, 14); + tcpipopts_mask |= 1 << 2; + break; + case 't': + ttl = parse_arg_ul(argv[0], optarg, UINT8_MAX); + tcpipopts_mask |= 1 << 3; + break; + case 'p': + *ports = optarg; + break; + case 's': + *single = true; + break; + default: + usage(argv[0]); + } + } + if (optind < argc) + usage(argv[0]); + + if (tcpipopts_mask == 0xf) { + if (mss4 == 0 || mss6 == 0 || wscale == 0 || ttl == 0) + usage(argv[0]); + *tcpipopts = (mss6 << 32) | (ttl << 24) | (wscale << 16) | mss4; + } else if (tcpipopts_mask != 0) { + usage(argv[0]); + } + + if (*ifindex != 0 && *prog_id != 0) + usage(argv[0]); + if (*ifindex == 0 && *prog_id == 0) + usage(argv[0]); +} + +static int syncookie_attach(const char *argv0, unsigned int ifindex) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + char xdp_filename[PATH_MAX]; + struct bpf_program *prog; + struct bpf_object *obj; + int prog_fd; + int err; + + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0); + obj = bpf_object__open_file(xdp_filename, NULL); + err = libbpf_get_error(obj); + if (err < 0) { + fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err)); + return err; + } + + err = bpf_object__load(obj); + if (err < 0) { + fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err)); + return err; + } + + prog = bpf_object__find_program_by_name(obj, "syncookie_xdp"); + if (!prog) { + fprintf(stderr, "Error: bpf_object__find_program_by_name: program syncookie_xdp was not found\n"); + return -ENOENT; + } + + prog_fd = bpf_program__fd(prog); + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err < 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + goto out; + } + attached_prog_id = info.id; + signal(SIGINT, cleanup); + signal(SIGTERM, cleanup); + err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); + attached_prog_id = 0; + goto out; + } + err = 0; +out: + bpf_object__close(obj); + return err; +} + +static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports_map_fd) +{ + struct bpf_prog_info prog_info; + __u32 map_ids[8]; + __u32 info_len; + int prog_fd; + int err; + int i; + + *values_map_fd = -1; + *ports_map_fd = -1; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); + return prog_fd; + } + + prog_info = (struct bpf_prog_info) { + .nr_map_ids = 8, + .map_ids = (__u64)map_ids, + }; + info_len = sizeof(prog_info); + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + if (err != 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + goto out; + } + + if (prog_info.type != BPF_PROG_TYPE_XDP) { + fprintf(stderr, "Error: BPF prog type is not BPF_PROG_TYPE_XDP\n"); + err = -ENOENT; + goto out; + } + if (prog_info.nr_map_ids < 2) { + fprintf(stderr, "Error: Found %u BPF maps, expected at least 2\n", + prog_info.nr_map_ids); + err = -ENOENT; + goto out; + } + + for (i = 0; i < prog_info.nr_map_ids; i++) { + struct bpf_map_info map_info = {}; + int map_fd; + + err = bpf_map_get_fd_by_id(map_ids[i]); + if (err < 0) { + fprintf(stderr, "Error: bpf_map_get_fd_by_id: %s\n", strerror(-err)); + goto err_close_map_fds; + } + map_fd = err; + + info_len = sizeof(map_info); + err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); + if (err != 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + close(map_fd); + goto err_close_map_fds; + } + if (strcmp(map_info.name, "values") == 0) { + *values_map_fd = map_fd; + continue; + } + if (strcmp(map_info.name, "allowed_ports") == 0) { + *ports_map_fd = map_fd; + continue; + } + close(map_fd); + } + + if (*values_map_fd != -1 && *ports_map_fd != -1) { + err = 0; + goto out; + } + + err = -ENOENT; + +err_close_map_fds: + if (*values_map_fd != -1) + close(*values_map_fd); + if (*ports_map_fd != -1) + close(*ports_map_fd); + *values_map_fd = -1; + *ports_map_fd = -1; + +out: + close(prog_fd); + return err; +} + +int main(int argc, char *argv[]) +{ + int values_map_fd, ports_map_fd; + __u64 tcpipopts; + bool firstiter; + __u64 prevcnt; + __u32 prog_id; + char *ports; + bool single; + int err = 0; + + parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports, &single); + + if (prog_id == 0) { + err = bpf_xdp_query_id(ifindex, 0, &prog_id); + if (err < 0) { + fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", strerror(-err)); + goto out; + } + if (prog_id == 0) { + err = syncookie_attach(argv[0], ifindex); + if (err < 0) + goto out; + prog_id = attached_prog_id; + } + } + + err = syncookie_open_bpf_maps(prog_id, &values_map_fd, &ports_map_fd); + if (err < 0) + goto out; + + if (ports) { + __u16 port_last = 0; + __u32 port_idx = 0; + char *p = ports; + + fprintf(stderr, "Replacing allowed ports\n"); + + while (p && *p != '\0') { + char *token = strsep(&p, ","); + __u16 port; + + port = parse_arg_ul(argv[0], token, UINT16_MAX); + err = bpf_map_update_elem(ports_map_fd, &port_idx, &port, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + fprintf(stderr, "Failed to add port %u (index %u)\n", + port, port_idx); + goto out_close_maps; + } + fprintf(stderr, "Added port %u\n", port); + port_idx++; + } + err = bpf_map_update_elem(ports_map_fd, &port_idx, &port_last, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + fprintf(stderr, "Failed to add the terminator value 0 (index %u)\n", + port_idx); + goto out_close_maps; + } + } + + if (tcpipopts) { + __u32 key = 0; + + fprintf(stderr, "Replacing TCP/IP options\n"); + + err = bpf_map_update_elem(values_map_fd, &key, &tcpipopts, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + goto out_close_maps; + } + } + + if ((ports || tcpipopts) && attached_prog_id == 0 && !single) + goto out_close_maps; + + prevcnt = 0; + firstiter = true; + while (true) { + __u32 key = 1; + __u64 value; + + err = bpf_map_lookup_elem(values_map_fd, &key, &value); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_lookup_elem: %s\n", strerror(-err)); + goto out_close_maps; + } + if (firstiter) { + prevcnt = value; + firstiter = false; + } + if (single) { + printf("Total SYNACKs generated: %llu\n", value); + break; + } + printf("SYNACKs generated: %llu (total %llu)\n", value - prevcnt, value); + prevcnt = value; + sleep(1); + } + +out_close_maps: + close(values_map_fd); + close(ports_map_fd); +out: + return err == 0 ? 0 : 1; +}