
[v2,bpf-next,15/15] selftests/bpf: bpf_setsockopt tests

Message ID 20220803204736.3082620-1-kafai@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: net: Remove duplicated code from bpf_setsockopt() | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 10 maintainers not CCed: john.fastabend@gmail.com song@kernel.org martin.lau@linux.dev linux-kselftest@vger.kernel.org kpsingh@kernel.org jolsa@kernel.org mykolal@fb.com shuah@kernel.org haoluo@google.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst
  WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
  WARNING: const array should probably be static const
  WARNING: externs should be avoided in .c files
  WARNING: line length of 82 exceeds 80 columns
  WARNING: line length of 84 exceeds 80 columns
  WARNING: line length of 85 exceeds 80 columns
  WARNING: line length of 86 exceeds 80 columns
  WARNING: line length of 87 exceeds 80 columns
  WARNING: line length of 88 exceeds 80 columns
  WARNING: line length of 93 exceeds 80 columns
  WARNING: line length of 95 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-16
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc

Commit Message

Martin KaFai Lau Aug. 3, 2022, 8:47 p.m. UTC
This patch adds tests to exercise optnames that are allowed
in bpf_setsockopt().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 .../selftests/bpf/prog_tests/setget_sockopt.c | 125 ++++
 .../selftests/bpf/progs/setget_sockopt.c      | 547 ++++++++++++++++++
 2 files changed, 672 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
 create mode 100644 tools/testing/selftests/bpf/progs/setget_sockopt.c

Comments

Stanislav Fomichev Aug. 3, 2022, 11:30 p.m. UTC | #1
On 08/03, Martin KaFai Lau wrote:
> This patch adds tests to exercise optnames that are allowed
> in bpf_setsockopt().

> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>   .../selftests/bpf/prog_tests/setget_sockopt.c | 125 ++++
>   .../selftests/bpf/progs/setget_sockopt.c      | 547 ++++++++++++++++++
>   2 files changed, 672 insertions(+)
>   create mode 100644  
> tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
>   create mode 100644 tools/testing/selftests/bpf/progs/setget_sockopt.c

> diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
> new file mode 100644
> index 000000000000..018611e6b248
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
> @@ -0,0 +1,125 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) Meta Platforms, Inc. and affiliates. */
> +
> +#define _GNU_SOURCE
> +#include <sched.h>
> +#include <linux/socket.h>
> +#include <net/if.h>
> +
> +#include "test_progs.h"
> +#include "cgroup_helpers.h"
> +#include "network_helpers.h"
> +
> +#include "setget_sockopt.skel.h"
> +
> +#define CG_NAME "/setget-sockopt-test"
> +
> +static const char addr4_str[] = "127.0.0.1";
> +static const char addr6_str[] = "::1";
> +static struct setget_sockopt *skel;
> +static int cg_fd;
> +
> +static int create_netns(void)
> +{
> +	if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
> +		return -1;
> +
> +	if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
> +		return -1;
> +
> +	if (!ASSERT_OK(system("ip link add dev binddevtest1 type veth peer name binddevtest2"),
> +		       "add veth"))
> +		return -1;
> +
> +	if (!ASSERT_OK(system("ip link set dev binddevtest1 up"),
> +		       "bring veth up"))
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static void test_tcp(int family)
> +{
> +	struct setget_sockopt__bss *bss = skel->bss;
> +	int sfd, cfd;
> +
> +	memset(bss, 0, sizeof(*bss));
> +
> +	sfd = start_server(family, SOCK_STREAM,
> +			   family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> +	if (!ASSERT_GE(sfd, 0, "start_server"))
> +		return;
> +
> +	cfd = connect_to_fd(sfd, 0);
> +	if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
> +		close(sfd);
> +		return;
> +	}
> +	close(sfd);
> +	close(cfd);
> +
> +	ASSERT_EQ(bss->nr_listen, 1, "nr_listen");
> +	ASSERT_EQ(bss->nr_connect, 1, "nr_connect");
> +	ASSERT_EQ(bss->nr_active, 1, "nr_active");
> +	ASSERT_EQ(bss->nr_passive, 1, "nr_passive");
> +	ASSERT_EQ(bss->nr_socket_post_create, 2, "nr_socket_post_create");
> +	ASSERT_EQ(bss->nr_binddev, 2, "nr_bind");
> +}
> +
> +static void test_udp(int family)
> +{
> +	struct setget_sockopt__bss *bss = skel->bss;
> +	int sfd;
> +
> +	memset(bss, 0, sizeof(*bss));
> +
> +	sfd = start_server(family, SOCK_DGRAM,
> +			   family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> +	if (!ASSERT_GE(sfd, 0, "start_server"))
> +		return;
> +	close(sfd);
> +
> +	ASSERT_GE(bss->nr_socket_post_create, 1, "nr_socket_post_create");
> +	ASSERT_EQ(bss->nr_binddev, 1, "nr_bind");
> +}
> +
> +void test_setget_sockopt(void)
> +{
> +	cg_fd = test__join_cgroup(CG_NAME);
> +	if (cg_fd < 0)
> +		return;
> +
> +	if (create_netns())
> +		goto done;
> +
> +	skel = setget_sockopt__open();
> +	if (!ASSERT_OK_PTR(skel, "open skel"))
> +		goto done;
> +
> +	strcpy(skel->rodata->veth, "binddevtest1");
> +	skel->rodata->veth_ifindex = if_nametoindex("binddevtest1");
> +	if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex"))
> +		goto done;
> +
> +	if (!ASSERT_OK(setget_sockopt__load(skel), "load skel"))
> +		goto done;
> +
> +	skel->links.skops_sockopt =
> +		bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
> +	if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
> +		goto done;
> +
> +	skel->links.socket_post_create =
> +		bpf_program__attach_cgroup(skel->progs.socket_post_create, cg_fd);
> +	if (!ASSERT_OK_PTR(skel->links.socket_post_create, "attach_cgroup"))
> +		goto done;
> +
> +	test_tcp(AF_INET6);
> +	test_tcp(AF_INET);
> +	test_udp(AF_INET6);
> +	test_udp(AF_INET);
> +
> +done:
> +	setget_sockopt__destroy(skel);
> +	close(cg_fd);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
> new file mode 100644
> index 000000000000..560cf4b92d65
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
> @@ -0,0 +1,547 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) Meta Platforms, Inc. and affiliates. */
> +
> +#include <stddef.h>
> +#include <stdbool.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <linux/in.h>
> +#include <linux/ipv6.h>
> +#include <linux/tcp.h>
> +#include <linux/socket.h>
> +#include <linux/bpf.h>
> +#include <linux/if.h>
> +#include <linux/types.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <errno.h>
> +
> +#ifndef SO_TXREHASH
> +#define SO_TXREHASH 74
> +#endif
> +
> +#ifndef TCP_NAGLE_OFF
> +#define TCP_NAGLE_OFF 1
> +#endif
> +
> +#ifndef ARRAY_SIZE
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> +#endif
> +
> +extern unsigned long CONFIG_HZ __kconfig;
> +
> +const volatile char veth[IFNAMSIZ];
> +const volatile int veth_ifindex;
> +
> +int nr_listen;
> +int nr_passive;
> +int nr_active;
> +int nr_connect;
> +int nr_binddev;
> +int nr_socket_post_create;
> +
> +struct sockopt_test {
> +	int opt;
> +	int new;
> +	int restore;
> +	int expected;
> +	int tcp_expected;
> +	int flip:1;
> +};
> +
> +static const char cubic_cc[] = "cubic";
> +static const char reno_cc[] = "reno";
> +
> +static const struct sockopt_test sol_socket_tests[] = {
> +	{ .opt = SO_REUSEADDR, .flip = 1, },
> +	{ .opt = SO_SNDBUF, .new = 8123, .expected = 8123 * 2, },
> +	{ .opt = SO_RCVBUF, .new = 8123, .expected = 8123 * 2, },
> +	{ .opt = SO_KEEPALIVE, .flip = 1, },
> +	{ .opt = SO_PRIORITY, .new = 0xeb9f, .expected = 0xeb9f, },
> +	{ .opt = SO_REUSEPORT, .flip = 1, },
> +	{ .opt = SO_RCVLOWAT, .new = 8123, .expected = 8123, },
> +	{ .opt = SO_MARK, .new = 0xeb9f, .expected = 0xeb9f, },
> +	{ .opt = SO_MAX_PACING_RATE, .new = 0xeb9f, .expected = 0xeb9f, },
> +	{ .opt = SO_TXREHASH, .flip = 1, },
> +	{ .opt = 0, },
> +};
> +
> +static const struct sockopt_test sol_tcp_tests[] = {
> +	{ .opt = TCP_NODELAY, .flip = 1, },
> +	{ .opt = TCP_MAXSEG, .new = 1314, .expected = 1314, },
> +	{ .opt = TCP_KEEPIDLE, .new = 123, .expected = 123, .restore = 321, },
> +	{ .opt = TCP_KEEPINTVL, .new = 123, .expected = 123, .restore = 321, },
> +	{ .opt = TCP_KEEPCNT, .new = 123, .expected = 123, .restore = 124, },
> +	{ .opt = TCP_SYNCNT, .new = 123, .expected = 123, .restore = 124, },
> +	{ .opt = TCP_WINDOW_CLAMP, .new = 8123, .expected = 8123, .restore = 8124, },
> +	{ .opt = TCP_CONGESTION, },
> +	{ .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, },
> +	{ .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, },
> +	{ .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, },
> +	{ .opt = TCP_SAVE_SYN, .new = 1, .expected = 1, },
> +	{ .opt = 0, },
> +};
> +
> +static const struct sockopt_test sol_ip_tests[] = {
> +	{ .opt = IP_TOS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, },
> +	{ .opt = 0, },
> +};
> +
> +static const struct sockopt_test sol_ipv6_tests[] = {
> +	{ .opt = IPV6_TCLASS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, },
> +	{ .opt = IPV6_AUTOFLOWLABEL, .flip = 1, },
> +	{ .opt = 0, },
> +};
> +
> +struct sock_common {
> +	unsigned short	skc_family;
> +	unsigned long	skc_flags;
> +	unsigned char	skc_reuse:4;
> +	unsigned char	skc_reuseport:1;
> +	unsigned char	skc_ipv6only:1;
> +	unsigned char	skc_net_refcnt:1;
> +} __attribute__((preserve_access_index));
> +
> +struct sock {
> +	struct sock_common	__sk_common;
> +	__u16			sk_type;
> +	__u16			sk_protocol;
> +	int			sk_rcvlowat;
> +	__u32			sk_mark;
> +	unsigned long		sk_max_pacing_rate;
> +	unsigned int		keepalive_time;
> +	unsigned int		keepalive_intvl;
> +} __attribute__((preserve_access_index));
> +
> +struct tcp_options_received {
> +	__u16 user_mss;
> +} __attribute__((preserve_access_index));

I'm assuming you're not using vmlinux.h here because it doesn't bring
in most of the defines? Should we add the missing stuff to bpf_tracing_net.h
instead?

> +struct ipv6_pinfo {
> +	__u16			recverr:1,
> +				sndflow:1,
> +				repflow:1,
> +				pmtudisc:3,
> +				padding:1,
> +				srcprefs:3,
> +				dontfrag:1,
> +				autoflowlabel:1,
> +				autoflowlabel_set:1,
> +				mc_all:1,
> +				recverr_rfc4884:1,
> +				rtalert_isolate:1;
> +}  __attribute__((preserve_access_index));
> +
> +struct inet_sock {
> +	/* sk and pinet6 have to be the first two members of inet_sock */
> +	struct sock		sk;
> +	struct ipv6_pinfo	*pinet6;
> +} __attribute__((preserve_access_index));
> +
> +struct inet_connection_sock {
> +	__u32			  icsk_user_timeout;
> +	__u8			  icsk_syn_retries;
> +} __attribute__((preserve_access_index));
> +
> +struct tcp_sock {
> +	struct inet_connection_sock	inet_conn;
> +	struct tcp_options_received rx_opt;
> +	__u8	save_syn:2,
> +		syn_data:1,
> +		syn_fastopen:1,
> +		syn_fastopen_exp:1,
> +		syn_fastopen_ch:1,
> +		syn_data_acked:1,
> +		is_cwnd_limited:1;
> +	__u32	window_clamp;
> +	__u8	nonagle     : 4,
> +		thin_lto    : 1,
> +		recvmsg_inq : 1,
> +		repair      : 1,
> +		frto        : 1;
> +	__u32	notsent_lowat;
> +	__u8	keepalive_probes;
> +	unsigned int		keepalive_time;
> +	unsigned int		keepalive_intvl;
> +} __attribute__((preserve_access_index));
> +
> +struct socket {
> +	struct sock *sk;
> +} __attribute__((preserve_access_index));
> +
> +struct loop_ctx {
> +	void *ctx;
> +	struct sock *sk;
> +};
> +
> +static int __bpf_getsockopt(void *ctx, struct sock *sk,
> +			    int level, int opt, int *optval,
> +			    int optlen)
> +{
> +	if (level == SOL_SOCKET) {
> +		switch (opt) {
> +		case SO_REUSEADDR:
> +			*optval = !!(sk->__sk_common.skc_reuse);
> +			break;
> +		case SO_KEEPALIVE:
> +			*optval = !!(sk->__sk_common.skc_flags & (1UL << 3));
> +			break;
> +		case SO_RCVLOWAT:
> +			*optval = sk->sk_rcvlowat;
> +			break;

What's the idea with the options above? Why not allow them in
bpf_getsockopt instead?

> +		case SO_MARK:
> +			*optval = sk->sk_mark;
> +			break;

SO_MARK should be handled by bpf_getsockopt ?

> +		case SO_MAX_PACING_RATE:
> +			*optval = sk->sk_max_pacing_rate;
> +			break;
> +		default:
> +			return bpf_getsockopt(ctx, level, opt, optval, optlen);
> +		}
> +		return 0;
> +	}
> +
> +	if (level == IPPROTO_TCP) {
> +		struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk);
> +
> +		if (!tp)
> +			return -1;
> +
> +		switch (opt) {
> +		case TCP_NODELAY:
> +			*optval = !!(tp->nonagle & TCP_NAGLE_OFF);
> +			break;
> +		case TCP_MAXSEG:
> +			*optval = tp->rx_opt.user_mss;
> +			break;
> +		case TCP_KEEPIDLE:
> +			*optval = tp->keepalive_time / CONFIG_HZ;
> +			break;
> +		case TCP_SYNCNT:
> +			*optval = tp->inet_conn.icsk_syn_retries;
> +			break;
> +		case TCP_KEEPINTVL:
> +			*optval = tp->keepalive_intvl / CONFIG_HZ;
> +			break;
> +		case TCP_KEEPCNT:
> +			*optval = tp->keepalive_probes;
> +			break;
> +		case TCP_WINDOW_CLAMP:
> +			*optval = tp->window_clamp;
> +			break;
> +		case TCP_THIN_LINEAR_TIMEOUTS:
> +			*optval = tp->thin_lto;
> +			break;
> +		case TCP_USER_TIMEOUT:
> +			*optval = tp->inet_conn.icsk_user_timeout;
> +			break;
> +		case TCP_NOTSENT_LOWAT:
> +			*optval = tp->notsent_lowat;
> +			break;
> +		case TCP_SAVE_SYN:
> +			*optval = tp->save_syn;
> +			break;
> +		default:
> +			return bpf_getsockopt(ctx, level, opt, optval, optlen);
> +		}
> +		return 0;
> +	}
> +
> +	if (level == IPPROTO_IPV6) {
> +		switch (opt) {
> +		case IPV6_AUTOFLOWLABEL: {
> +			__u16 proto = sk->sk_protocol;
> +			struct inet_sock *inet_sk;
> +
> +			if (proto == IPPROTO_TCP)
> +				inet_sk = (struct inet_sock *)bpf_skc_to_tcp_sock(sk);
> +			else
> +				inet_sk = (struct inet_sock *)bpf_skc_to_udp6_sock(sk);
> +
> +			if (!inet_sk)
> +				return -1;
> +
> +			*optval = !!inet_sk->pinet6->autoflowlabel;
> +			break;
> +		}
> +		default:
> +			return bpf_getsockopt(ctx, level, opt, optval, optlen);
> +		}
> +		return 0;
> +	}
> +
> +	return bpf_getsockopt(ctx, level, opt, optval, optlen);
> +}
> +
> +static int bpf_test_sockopt_flip(void *ctx, struct sock *sk,
> +				 const struct sockopt_test *t,
> +				 int level)
> +{
> +	int old, tmp, new, opt = t->opt;
> +
> +	opt = t->opt;
> +
> +	if (__bpf_getsockopt(ctx, sk, level, opt, &old, sizeof(old)))
> +		return 1;
> +	/* kernel initialized txrehash to 255 */
> +	if (level == SOL_SOCKET && opt == SO_TXREHASH && old != 0 && old != 1)
> +		old = 1;
> +
> +	new = !old;
> +	if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> +		return 1;
> +	if (__bpf_getsockopt(ctx, sk, level, opt, &tmp, sizeof(tmp)) ||
> +	    tmp != new)
> +		return 1;
> +
> +	if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old)))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
> +				const struct sockopt_test *t,
> +				int level)
> +{
> +	int old, tmp, new, expected, opt;
> +
> +	opt = t->opt;
> +	new = t->new;
> +	if (sk->sk_type == SOCK_STREAM && t->tcp_expected)
> +		expected = t->tcp_expected;
> +	else
> +		expected = t->expected;
> +
> +	if (__bpf_getsockopt(ctx, sk, level, opt, &old, sizeof(old)) ||
> +	    old == new)
> +		return 1;
> +
> +	if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> +		return 1;
> +	if (__bpf_getsockopt(ctx, sk, level, opt, &tmp, sizeof(tmp)) ||
> +	    tmp != expected)
> +		return 1;
> +
> +	if (t->restore)
> +		old = t->restore;
> +	if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old)))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> +	const struct sockopt_test *t;
> +
> +	if (i >= ARRAY_SIZE(sol_socket_tests))
> +		return 1;
> +
> +	t = &sol_socket_tests[i];
> +	if (!t->opt)
> +		return 1;
> +
> +	if (t->flip)
> +		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, SOL_SOCKET);
> +
> +	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
> +}
> +
> +static int bpf_test_ip_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> +	const struct sockopt_test *t;
> +
> +	if (i >= ARRAY_SIZE(sol_ip_tests))
> +		return 1;
> +
> +	t = &sol_ip_tests[i];
> +	if (!t->opt)
> +		return 1;
> +
> +	if (t->flip)
> +		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IP);
> +
> +	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IP);
> +}
> +
> +static int bpf_test_ipv6_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> +	const struct sockopt_test *t;
> +
> +	if (i >= ARRAY_SIZE(sol_ipv6_tests))
> +		return 1;
> +
> +	t = &sol_ipv6_tests[i];
> +	if (!t->opt)
> +		return 1;
> +
> +	if (t->flip)
> +		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IPV6);
> +
> +	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IPV6);
> +}
> +
> +static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> +	const struct sockopt_test *t;
> +	struct sock *sk;
> +	void *ctx;
> +
> +	if (i >= ARRAY_SIZE(sol_tcp_tests))
> +		return 1;
> +
> +	t = &sol_tcp_tests[i];
> +	if (!t->opt)
> +		return 1;
> +
> +	ctx = lc->ctx;
> +	sk = lc->sk;
> +
> +	if (t->opt == TCP_CONGESTION) {
> +		char old_cc[16], tmp_cc[16];
> +		const char *new_cc;
> +
> +		if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
> +			return 1;
> +		if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc))
> +			new_cc = reno_cc;
> +		else
> +			new_cc = cubic_cc;
> +		if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, (void *)new_cc,
> +				   sizeof(new_cc)))
> +			return 1;
> +		if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, tmp_cc, sizeof(tmp_cc)))
> +			return 1;
> +		if (bpf_strncmp(tmp_cc, sizeof(tmp_cc), new_cc))
> +			return 1;
> +		if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
> +			return 1;
> +		return 0;
> +	}
> +
> +	if (t->flip)
> +		return bpf_test_sockopt_flip(ctx, sk, t, IPPROTO_TCP);
> +
> +	return bpf_test_sockopt_int(ctx, sk, t, IPPROTO_TCP);
> +}
> +
> +static int bpf_test_sockopt(void *ctx, struct sock *sk)
> +{
> +	struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
> +	__u16 family, proto;
> +	int n;
> +
> +	family = sk->__sk_common.skc_family;
> +	proto = sk->sk_protocol;
> +
> +	n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
> +	if (n != ARRAY_SIZE(sol_socket_tests))
> +		return -1;
> +
> +	if (proto == IPPROTO_TCP) {
> +		n = bpf_loop(ARRAY_SIZE(sol_tcp_tests), bpf_test_tcp_sockopt, &lc, 0);
> +		if (n != ARRAY_SIZE(sol_tcp_tests))
> +			return -1;
> +	}
> +
> +	if (family == AF_INET) {
> +		n = bpf_loop(ARRAY_SIZE(sol_ip_tests), bpf_test_ip_sockopt, &lc, 0);
> +		if (n != ARRAY_SIZE(sol_ip_tests))
> +			return -1;
> +	} else {
> +		n = bpf_loop(ARRAY_SIZE(sol_ipv6_tests), bpf_test_ipv6_sockopt, &lc, 0);
> +		if (n != ARRAY_SIZE(sol_ipv6_tests))
> +			return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int binddev_test(void *ctx)
> +{
> +	const char empty_ifname[] = "";
> +	int ifindex, zero = 0;
> +
> +	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
> +			   (void *)veth, sizeof(veth)))
> +		return -1;
> +	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
> +			   &ifindex, sizeof(int)) ||
> +	    ifindex != veth_ifindex)
> +		return -1;
> +
> +	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
> +			   (void *)empty_ifname, sizeof(empty_ifname)))
> +		return -1;
> +	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
> +			   &ifindex, sizeof(int)) ||
> +	    ifindex != 0)
> +		return -1;
> +
> +	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
> +			   (void *)&veth_ifindex, sizeof(int)))
> +		return -1;
> +	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
> +			   &ifindex, sizeof(int)) ||
> +	    ifindex != veth_ifindex)
> +		return -1;
> +
> +	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
> +			   &zero, sizeof(int)))
> +		return -1;
> +	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
> +			   &ifindex, sizeof(int)) ||
> +	    ifindex != 0)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +SEC("lsm_cgroup/socket_post_create")
> +int BPF_PROG(socket_post_create, struct socket *sock, int family,
> +	     int type, int protocol, int kern)
> +{
> +	struct sock *sk = sock->sk;
> +
> +	if (!sk)
> +		return 1;
> +
> +	nr_socket_post_create += !bpf_test_sockopt(sk, sk);
> +	nr_binddev += !binddev_test(sk);
> +
> +	return 1;
> +}
> +
> +SEC("sockops")
> +int skops_sockopt(struct bpf_sock_ops *skops)
> +{
> +	struct bpf_sock *bpf_sk = skops->sk;
> +	struct sock *sk;
> +
> +	if (!bpf_sk)
> +		return 1;
> +
> +	sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
> +	if (!sk)
> +		return 1;
> +
> +	switch (skops->op) {
> +	case BPF_SOCK_OPS_TCP_LISTEN_CB:
> +		nr_listen += !bpf_test_sockopt(skops, sk);
> +		break;
> +	case BPF_SOCK_OPS_TCP_CONNECT_CB:
> +		nr_connect += !bpf_test_sockopt(skops, sk);
> +		break;
> +	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
> +		nr_active += !bpf_test_sockopt(skops, sk);
> +		break;
> +	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
> +		nr_passive += !bpf_test_sockopt(skops, sk);
> +		break;
> +	}
> +
> +	return 1;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> --
> 2.30.2
Martin KaFai Lau Aug. 4, 2022, 12:04 a.m. UTC | #2
On Wed, Aug 03, 2022 at 04:30:54PM -0700, sdf@google.com wrote:
> > +struct sock_common {
> > +	unsigned short	skc_family;
> > +	unsigned long	skc_flags;
> > +	unsigned char	skc_reuse:4;
> > +	unsigned char	skc_reuseport:1;
> > +	unsigned char	skc_ipv6only:1;
> > +	unsigned char	skc_net_refcnt:1;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct sock {
> > +	struct sock_common	__sk_common;
> > +	__u16			sk_type;
> > +	__u16			sk_protocol;
> > +	int			sk_rcvlowat;
> > +	__u32			sk_mark;
> > +	unsigned long		sk_max_pacing_rate;
> > +	unsigned int		keepalive_time;
> > +	unsigned int		keepalive_intvl;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct tcp_options_received {
> > +	__u16 user_mss;
> > +} __attribute__((preserve_access_index));
> 
> I'm assuming you're not using vmlinux.h here because it doesn't bring
> in most of the defines? Should we add the missing stuff to bpf_tracing_net.h
> instead?
Ah, actually my first attempt was to use vmlinux.h, and I had
all the defines ready for addition to bpf_tracing_net.h.

However, I hit an issue in reading a bitfield.  That is why the
bitfields in the tcp_sock below are sandwiched between __u32 members.
I think it is likely LLVM and/or CO-RE related. Yonghong is
helping to investigate it.

In the meantime, I define these mini structs here.
Once the bitfield issue is resolved, we can go back to
using vmlinux.h.
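
As an aside, a minimal sketch of the local-definition approach, not taken
from this patch: only the fields the program actually touches need to be
declared, and with preserve_access_index clang emits CO-RE relocations so
libbpf matches the fields by name against the running kernel's BTF at load
time, just as it would for types pulled from vmlinux.h:

	/* hypothetical, trimmed mirror of the kernel's struct sock */
	struct sock {
		int	sk_rcvlowat;
		__u32	sk_mark;
	} __attribute__((preserve_access_index));

A program holding a PTR_TO_BTF_ID sock pointer can then load these fields
directly, which is what the mini structs in the patch below are used for.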

> 
> > +struct ipv6_pinfo {
> > +	__u16			recverr:1,
> > +				sndflow:1,
> > +				repflow:1,
> > +				pmtudisc:3,
> > +				padding:1,
> > +				srcprefs:3,
> > +				dontfrag:1,
> > +				autoflowlabel:1,
> > +				autoflowlabel_set:1,
> > +				mc_all:1,
> > +				recverr_rfc4884:1,
> > +				rtalert_isolate:1;
> > +}  __attribute__((preserve_access_index));
> > +
> > +struct inet_sock {
> > +	/* sk and pinet6 has to be the first two members of inet_sock */
> > +	struct sock		sk;
> > +	struct ipv6_pinfo	*pinet6;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_connection_sock {
> > +	__u32			  icsk_user_timeout;
> > +	__u8			  icsk_syn_retries;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct tcp_sock {
> > +	struct inet_connection_sock	inet_conn;
> > +	struct tcp_options_received rx_opt;
> > +	__u8	save_syn:2,
> > +		syn_data:1,
> > +		syn_fastopen:1,
> > +		syn_fastopen_exp:1,
> > +		syn_fastopen_ch:1,
> > +		syn_data_acked:1,
> > +		is_cwnd_limited:1;
> > +	__u32	window_clamp;
> > +	__u8	nonagle     : 4,
> > +		thin_lto    : 1,
> > +		recvmsg_inq : 1,
> > +		repair      : 1,
> > +		frto        : 1;
> > +	__u32	notsent_lowat;
> > +	__u8	keepalive_probes;
> > +	unsigned int		keepalive_time;
> > +	unsigned int		keepalive_intvl;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct socket {
> > +	struct sock *sk;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct loop_ctx {
> > +	void *ctx;
> > +	struct sock *sk;
> > +};
> > +
> > +static int __bpf_getsockopt(void *ctx, struct sock *sk,
> > +			    int level, int opt, int *optval,
> > +			    int optlen)
> > +{
> > +	if (level == SOL_SOCKET) {
> > +		switch (opt) {
> > +		case SO_REUSEADDR:
> > +			*optval = !!(sk->__sk_common.skc_reuse);
> > +			break;
> > +		case SO_KEEPALIVE:
> > +			*optval = !!(sk->__sk_common.skc_flags & (1UL << 3));
> > +			break;
> > +		case SO_RCVLOWAT:
> > +			*optval = sk->sk_rcvlowat;
> > +			break;
> 
> What's the idea with the options above? Why not allow them in
> bpf_getsockopt instead?
I am planning to refactor bpf_getsockopt() also,
so I am trying to avoid adding more duplicated code at this point
while these values can be read directly from the sk through PTR_TO_BTF_ID.

btw, since we are on bpf_getsockopt(), do you still see a use for
bpf_getsockopt() for those 'integer-value' optnames that can be
easily read from the sk pointer?
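
For illustration, a hypothetical sketch of such a direct read from a
sockops program (not part of the patch), reusing the kind of mini
struct sock defined in this test:

	SEC("sockops")
	int dump_sk_vals(struct bpf_sock_ops *skops)
	{
		struct bpf_sock *bpf_sk = skops->sk;
		struct sock *sk;

		if (!bpf_sk)
			return 1;

		sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
		if (!sk)
			return 1;

		/* plain loads through the PTR_TO_BTF_ID pointer, no helper call */
		bpf_printk("rcvlowat=%d mark=%u",
			   sk->sk_rcvlowat, sk->sk_mark);
		return 1;
	}

No bpf_getsockopt() plumbing is needed for these integer values.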

> 
> > +		case SO_MARK:
> > +			*optval = sk->sk_mark;
> > +			break;
> 
> SO_MARK should be handled by bpf_getsockopt ?
Good point, will remove SO_MARK case.

Thanks for the review!
Stanislav Fomichev Aug. 4, 2022, 5:03 p.m. UTC | #3
On Wed, Aug 3, 2022 at 5:04 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Wed, Aug 03, 2022 at 04:30:54PM -0700, sdf@google.com wrote:
> > > +struct sock_common {
> > > +   unsigned short  skc_family;
> > > +   unsigned long   skc_flags;
> > > +   unsigned char   skc_reuse:4;
> > > +   unsigned char   skc_reuseport:1;
> > > +   unsigned char   skc_ipv6only:1;
> > > +   unsigned char   skc_net_refcnt:1;
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +struct sock {
> > > +   struct sock_common      __sk_common;
> > > +   __u16                   sk_type;
> > > +   __u16                   sk_protocol;
> > > +   int                     sk_rcvlowat;
> > > +   __u32                   sk_mark;
> > > +   unsigned long           sk_max_pacing_rate;
> > > +   unsigned int            keepalive_time;
> > > +   unsigned int            keepalive_intvl;
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +struct tcp_options_received {
> > > +   __u16 user_mss;
> > > +} __attribute__((preserve_access_index));
> >
> > I'm assuming you're not using vmlinux.h here because it doesn't bring
> > in most of the defines? Should we add the missing stuff to bpf_tracing_net.h
> > instead?
> Ah, actually my first attempt was to use vmlinux.h, and I had
> all the defines ready for addition to bpf_tracing_net.h.
>
> However, I hit an issue in reading a bitfield.  That is why the
> bitfields in the tcp_sock below are sandwiched between __u32 members.
> I think it is likely LLVM and/or CO-RE related. Yonghong is
> helping to investigate it.
>
> In the meantime, I define these mini structs here.
> Once the bitfield issue is resolved, we can go back to
> using vmlinux.h.

Oh, interesting :-)

> > > +struct ipv6_pinfo {
> > > +   __u16                   recverr:1,
> > > +                           sndflow:1,
> > > +                           repflow:1,
> > > +                           pmtudisc:3,
> > > +                           padding:1,
> > > +                           srcprefs:3,
> > > +                           dontfrag:1,
> > > +                           autoflowlabel:1,
> > > +                           autoflowlabel_set:1,
> > > +                           mc_all:1,
> > > +                           recverr_rfc4884:1,
> > > +                           rtalert_isolate:1;
> > > +}  __attribute__((preserve_access_index));
> > > +
> > > +struct inet_sock {
> > > +   /* sk and pinet6 have to be the first two members of inet_sock */
> > > +   struct sock             sk;
> > > +   struct ipv6_pinfo       *pinet6;
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +struct inet_connection_sock {
> > > +   __u32                     icsk_user_timeout;
> > > +   __u8                      icsk_syn_retries;
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +struct tcp_sock {
> > > +   struct inet_connection_sock     inet_conn;
> > > +   struct tcp_options_received rx_opt;
> > > +   __u8    save_syn:2,
> > > +           syn_data:1,
> > > +           syn_fastopen:1,
> > > +           syn_fastopen_exp:1,
> > > +           syn_fastopen_ch:1,
> > > +           syn_data_acked:1,
> > > +           is_cwnd_limited:1;
> > > +   __u32   window_clamp;
> > > +   __u8    nonagle     : 4,
> > > +           thin_lto    : 1,
> > > +           recvmsg_inq : 1,
> > > +           repair      : 1,
> > > +           frto        : 1;
> > > +   __u32   notsent_lowat;
> > > +   __u8    keepalive_probes;
> > > +   unsigned int            keepalive_time;
> > > +   unsigned int            keepalive_intvl;
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +struct socket {
> > > +   struct sock *sk;
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +struct loop_ctx {
> > > +   void *ctx;
> > > +   struct sock *sk;
> > > +};
> > > +
> > > +static int __bpf_getsockopt(void *ctx, struct sock *sk,
> > > +                       int level, int opt, int *optval,
> > > +                       int optlen)
> > > +{
> > > +   if (level == SOL_SOCKET) {
> > > +           switch (opt) {
> > > +           case SO_REUSEADDR:
> > > +                   *optval = !!(sk->__sk_common.skc_reuse);
> > > +                   break;
> > > +           case SO_KEEPALIVE:
> > > +                   *optval = !!(sk->__sk_common.skc_flags & (1UL << 3));
> > > +                   break;
> > > +           case SO_RCVLOWAT:
> > > +                   *optval = sk->sk_rcvlowat;
> > > +                   break;
> >
> > What's the idea with the options above? Why not allow them in
> > bpf_getsockopt instead?
> I am planning to refactor bpf_getsockopt() also,
> so I am trying to avoid adding more duplicated code at this point
> while these values can be read directly from the sk through PTR_TO_BTF_ID.
>
> btw, since we are on bpf_getsockopt(), do you still see a use for
> bpf_getsockopt() for those 'integer-value' optnames that can be
> easily read from the sk pointer?

Writing is still done via bpf_setsockopt, so having the same interface
to read the settings seems useful?
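
For example, a hypothetical fragment inside a sockops program such as
skops_sockopt() in this patch (where skops is the struct bpf_sock_ops
context) that relies on that symmetry, using SO_MARK since bpf_getsockopt()
already handles it as noted below; error paths trimmed:

	int mark = 0xeb9f, readback = 0;

	/* write through the helper... */
	if (bpf_setsockopt(skops, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)))
		return 1;

	/* ...and read it back through the same helper interface */
	if (bpf_getsockopt(skops, SOL_SOCKET, SO_MARK, &readback, sizeof(readback)))
		return 1;

	if (readback != mark)
		return 1;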




> > > +           case SO_MARK:
> > > +                   *optval = sk->sk_mark;
> > > +                   break;
> >
> > SO_MARK should be handled by bpf_getsockopt ?
> Good point, will remove SO_MARK case.
>
> Thanks for the review!
Martin KaFai Lau Aug. 4, 2022, 7:17 p.m. UTC | #4
On Thu, Aug 04, 2022 at 10:03:58AM -0700, Stanislav Fomichev wrote:
> > I am planning to refactor bpf_getsockopt() also,
> > so I am trying to avoid adding more duplicated code at this point
> > while these values can be read directly from the sk through PTR_TO_BTF_ID.
> >
> > btw, since we are on bpf_getsockopt(), do you still see a use for
> > bpf_getsockopt() for those 'integer-value' optnames that can be
> > easily read from the sk pointer?
> 
> Writing is still done via bpf_setsockopt, so having the same interface
> to read the settings seems useful?
Makes sense.  It probably will be less surprising to have a
symmetrical optname expectation on set/getsockopt.  It will be
cheaper to add to bpf_getsockopt() anyway once it is cleaned up.
I am asking because I just don't have a new use case (adding optnames)
for bpf_getsockopt() after the bpf_skc_to_*() helpers were
introduced.

> > > > +           case SO_MARK:
> > > > +                   *optval = sk->sk_mark;
> > > > +                   break;
> > >
> > > SO_MARK should be handled by bpf_getsockopt ?
> > Good point, will remove SO_MARK case.
> >
> > Thanks for the review!

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
new file mode 100644
index 000000000000..018611e6b248
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
@@ -0,0 +1,125 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <linux/socket.h>
+#include <net/if.h>
+
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+
+#include "setget_sockopt.skel.h"
+
+#define CG_NAME "/setget-sockopt-test"
+
+static const char addr4_str[] = "127.0.0.1";
+static const char addr6_str[] = "::1";
+static struct setget_sockopt *skel;
+static int cg_fd;
+
+static int create_netns(void)
+{
+	if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
+		return -1;
+
+	if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
+		return -1;
+
+	if (!ASSERT_OK(system("ip link add dev binddevtest1 type veth peer name binddevtest2"),
+		       "add veth"))
+		return -1;
+
+	if (!ASSERT_OK(system("ip link set dev binddevtest1 up"),
+		       "bring veth up"))
+		return -1;
+
+	return 0;
+}
+
+static void test_tcp(int family)
+{
+	struct setget_sockopt__bss *bss = skel->bss;
+	int sfd, cfd;
+
+	memset(bss, 0, sizeof(*bss));
+
+	sfd = start_server(family, SOCK_STREAM,
+			   family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+	if (!ASSERT_GE(sfd, 0, "start_server"))
+		return;
+
+	cfd = connect_to_fd(sfd, 0);
+	if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {
+		close(sfd);
+		return;
+	}
+	close(sfd);
+	close(cfd);
+
+	ASSERT_EQ(bss->nr_listen, 1, "nr_listen");
+	ASSERT_EQ(bss->nr_connect, 1, "nr_connect");
+	ASSERT_EQ(bss->nr_active, 1, "nr_active");
+	ASSERT_EQ(bss->nr_passive, 1, "nr_passive");
+	ASSERT_EQ(bss->nr_socket_post_create, 2, "nr_socket_post_create");
+	ASSERT_EQ(bss->nr_binddev, 2, "nr_bind");
+}
+
+static void test_udp(int family)
+{
+	struct setget_sockopt__bss *bss = skel->bss;
+	int sfd;
+
+	memset(bss, 0, sizeof(*bss));
+
+	sfd = start_server(family, SOCK_DGRAM,
+			   family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+	if (!ASSERT_GE(sfd, 0, "start_server"))
+		return;
+	close(sfd);
+
+	ASSERT_GE(bss->nr_socket_post_create, 1, "nr_socket_post_create");
+	ASSERT_EQ(bss->nr_binddev, 1, "nr_bind");
+}
+
+void test_setget_sockopt(void)
+{
+	cg_fd = test__join_cgroup(CG_NAME);
+	if (cg_fd < 0)
+		return;
+
+	if (create_netns())
+		goto done;
+
+	skel = setget_sockopt__open();
+	if (!ASSERT_OK_PTR(skel, "open skel"))
+		goto done;
+
+	strcpy(skel->rodata->veth, "binddevtest1");
+	skel->rodata->veth_ifindex = if_nametoindex("binddevtest1");
+	if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex"))
+		goto done;
+
+	if (!ASSERT_OK(setget_sockopt__load(skel), "load skel"))
+		goto done;
+
+	skel->links.skops_sockopt =
+		bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
+	if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
+		goto done;
+
+	skel->links.socket_post_create =
+		bpf_program__attach_cgroup(skel->progs.socket_post_create, cg_fd);
+	if (!ASSERT_OK_PTR(skel->links.socket_post_create, "attach_cgroup"))
+		goto done;
+
+	test_tcp(AF_INET6);
+	test_tcp(AF_INET);
+	test_udp(AF_INET6);
+	test_udp(AF_INET);
+
+done:
+	setget_sockopt__destroy(skel);
+	close(cg_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
new file mode 100644
index 000000000000..560cf4b92d65
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
@@ -0,0 +1,547 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <linux/in.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
+#include <linux/bpf.h>
+#include <linux/if.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <errno.h>
+
+#ifndef SO_TXREHASH
+#define SO_TXREHASH 74
+#endif
+
+#ifndef TCP_NAGLE_OFF
+#define TCP_NAGLE_OFF 1
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+extern unsigned long CONFIG_HZ __kconfig;
+
+const volatile char veth[IFNAMSIZ];
+const volatile int veth_ifindex;
+
+int nr_listen;
+int nr_passive;
+int nr_active;
+int nr_connect;
+int nr_binddev;
+int nr_socket_post_create;
+
+struct sockopt_test {
+	int opt;
+	int new;
+	int restore;
+	int expected;
+	int tcp_expected;
+	int flip:1;
+};
+
+static const char cubic_cc[] = "cubic";
+static const char reno_cc[] = "reno";
+
+static const struct sockopt_test sol_socket_tests[] = {
+	{ .opt = SO_REUSEADDR, .flip = 1, },
+	{ .opt = SO_SNDBUF, .new = 8123, .expected = 8123 * 2, },
+	{ .opt = SO_RCVBUF, .new = 8123, .expected = 8123 * 2, },
+	{ .opt = SO_KEEPALIVE, .flip = 1, },
+	{ .opt = SO_PRIORITY, .new = 0xeb9f, .expected = 0xeb9f, },
+	{ .opt = SO_REUSEPORT, .flip = 1, },
+	{ .opt = SO_RCVLOWAT, .new = 8123, .expected = 8123, },
+	{ .opt = SO_MARK, .new = 0xeb9f, .expected = 0xeb9f, },
+	{ .opt = SO_MAX_PACING_RATE, .new = 0xeb9f, .expected = 0xeb9f, },
+	{ .opt = SO_TXREHASH, .flip = 1, },
+	{ .opt = 0, },
+};
+
+static const struct sockopt_test sol_tcp_tests[] = {
+	{ .opt = TCP_NODELAY, .flip = 1, },
+	{ .opt = TCP_MAXSEG, .new = 1314, .expected = 1314, },
+	{ .opt = TCP_KEEPIDLE, .new = 123, .expected = 123, .restore = 321, },
+	{ .opt = TCP_KEEPINTVL, .new = 123, .expected = 123, .restore = 321, },
+	{ .opt = TCP_KEEPCNT, .new = 123, .expected = 123, .restore = 124, },
+	{ .opt = TCP_SYNCNT, .new = 123, .expected = 123, .restore = 124, },
+	{ .opt = TCP_WINDOW_CLAMP, .new = 8123, .expected = 8123, .restore = 8124, },
+	{ .opt = TCP_CONGESTION, },
+	{ .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, },
+	{ .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, },
+	{ .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, },
+	{ .opt = TCP_SAVE_SYN, .new = 1, .expected = 1, },
+	{ .opt = 0, },
+};
+
+static const struct sockopt_test sol_ip_tests[] = {
+	{ .opt = IP_TOS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, },
+	{ .opt = 0, },
+};
+
+static const struct sockopt_test sol_ipv6_tests[] = {
+	{ .opt = IPV6_TCLASS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, },
+	{ .opt = IPV6_AUTOFLOWLABEL, .flip = 1, },
+	{ .opt = 0, },
+};
+
+struct sock_common {
+	unsigned short	skc_family;
+	unsigned long	skc_flags;
+	unsigned char	skc_reuse:4;
+	unsigned char	skc_reuseport:1;
+	unsigned char	skc_ipv6only:1;
+	unsigned char	skc_net_refcnt:1;
+} __attribute__((preserve_access_index));
+
+struct sock {
+	struct sock_common	__sk_common;
+	__u16			sk_type;
+	__u16			sk_protocol;
+	int			sk_rcvlowat;
+	__u32			sk_mark;
+	unsigned long		sk_max_pacing_rate;
+	unsigned int		keepalive_time;
+	unsigned int		keepalive_intvl;
+} __attribute__((preserve_access_index));
+
+struct tcp_options_received {
+	__u16 user_mss;
+} __attribute__((preserve_access_index));
+
+struct ipv6_pinfo {
+	__u16			recverr:1,
+				sndflow:1,
+				repflow:1,
+				pmtudisc:3,
+				padding:1,
+				srcprefs:3,
+				dontfrag:1,
+				autoflowlabel:1,
+				autoflowlabel_set:1,
+				mc_all:1,
+				recverr_rfc4884:1,
+				rtalert_isolate:1;
+}  __attribute__((preserve_access_index));
+
+struct inet_sock {
+	/* sk and pinet6 have to be the first two members of inet_sock */
+	struct sock		sk;
+	struct ipv6_pinfo	*pinet6;
+} __attribute__((preserve_access_index));
+
+struct inet_connection_sock {
+	__u32			  icsk_user_timeout;
+	__u8			  icsk_syn_retries;
+} __attribute__((preserve_access_index));
+
+struct tcp_sock {
+	struct inet_connection_sock	inet_conn;
+	struct tcp_options_received rx_opt;
+	__u8	save_syn:2,
+		syn_data:1,
+		syn_fastopen:1,
+		syn_fastopen_exp:1,
+		syn_fastopen_ch:1,
+		syn_data_acked:1,
+		is_cwnd_limited:1;
+	__u32	window_clamp;
+	__u8	nonagle     : 4,
+		thin_lto    : 1,
+		recvmsg_inq : 1,
+		repair      : 1,
+		frto        : 1;
+	__u32	notsent_lowat;
+	__u8	keepalive_probes;
+	unsigned int		keepalive_time;
+	unsigned int		keepalive_intvl;
+} __attribute__((preserve_access_index));
+
+struct socket {
+	struct sock *sk;
+} __attribute__((preserve_access_index));
+
+struct loop_ctx {
+	void *ctx;
+	struct sock *sk;
+};
+
+static int __bpf_getsockopt(void *ctx, struct sock *sk,
+			    int level, int opt, int *optval,
+			    int optlen)
+{
+	if (level == SOL_SOCKET) {
+		switch (opt) {
+		case SO_REUSEADDR:
+			*optval = !!(sk->__sk_common.skc_reuse);
+			break;
+		case SO_KEEPALIVE:
+			*optval = !!(sk->__sk_common.skc_flags & (1UL << 3));
+			break;
+		case SO_RCVLOWAT:
+			*optval = sk->sk_rcvlowat;
+			break;
+		case SO_MARK:
+			*optval = sk->sk_mark;
+			break;
+		case SO_MAX_PACING_RATE:
+			*optval = sk->sk_max_pacing_rate;
+			break;
+		default:
+			return bpf_getsockopt(ctx, level, opt, optval, optlen);
+		}
+		return 0;
+	}
+
+	if (level == IPPROTO_TCP) {
+		struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk);
+
+		if (!tp)
+			return -1;
+
+		switch (opt) {
+		case TCP_NODELAY:
+			*optval = !!(tp->nonagle & TCP_NAGLE_OFF);
+			break;
+		case TCP_MAXSEG:
+			*optval = tp->rx_opt.user_mss;
+			break;
+		case TCP_KEEPIDLE:
+			*optval = tp->keepalive_time / CONFIG_HZ;
+			break;
+		case TCP_SYNCNT:
+			*optval = tp->inet_conn.icsk_syn_retries;
+			break;
+		case TCP_KEEPINTVL:
+			*optval = tp->keepalive_intvl / CONFIG_HZ;
+			break;
+		case TCP_KEEPCNT:
+			*optval = tp->keepalive_probes;
+			break;
+		case TCP_WINDOW_CLAMP:
+			*optval = tp->window_clamp;
+			break;
+		case TCP_THIN_LINEAR_TIMEOUTS:
+			*optval = tp->thin_lto;
+			break;
+		case TCP_USER_TIMEOUT:
+			*optval = tp->inet_conn.icsk_user_timeout;
+			break;
+		case TCP_NOTSENT_LOWAT:
+			*optval = tp->notsent_lowat;
+			break;
+		case TCP_SAVE_SYN:
+			*optval = tp->save_syn;
+			break;
+		default:
+			return bpf_getsockopt(ctx, level, opt, optval, optlen);
+		}
+		return 0;
+	}
+
+	if (level == IPPROTO_IPV6) {
+		switch (opt) {
+		case IPV6_AUTOFLOWLABEL: {
+			__u16 proto = sk->sk_protocol;
+			struct inet_sock *inet_sk;
+
+			if (proto == IPPROTO_TCP)
+				inet_sk = (struct inet_sock *)bpf_skc_to_tcp_sock(sk);
+			else
+				inet_sk = (struct inet_sock *)bpf_skc_to_udp6_sock(sk);
+
+			if (!inet_sk)
+				return -1;
+
+			*optval = !!inet_sk->pinet6->autoflowlabel;
+			break;
+		}
+		default:
+			return bpf_getsockopt(ctx, level, opt, optval, optlen);
+		}
+		return 0;
+	}
+
+	return bpf_getsockopt(ctx, level, opt, optval, optlen);
+}
+
+static int bpf_test_sockopt_flip(void *ctx, struct sock *sk,
+				 const struct sockopt_test *t,
+				 int level)
+{
+	int old, tmp, new, opt = t->opt;
+
+	opt = t->opt;
+
+	if (__bpf_getsockopt(ctx, sk, level, opt, &old, sizeof(old)))
+		return 1;
+	/* kernel initialized txrehash to 255 */
+	if (level == SOL_SOCKET && opt == SO_TXREHASH && old != 0 && old != 1)
+		old = 1;
+
+	new = !old;
+	if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
+		return 1;
+	if (__bpf_getsockopt(ctx, sk, level, opt, &tmp, sizeof(tmp)) ||
+	    tmp != new)
+		return 1;
+
+	if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old)))
+		return 1;
+
+	return 0;
+}
+
+static int bpf_test_sockopt_int(void *ctx, struct sock *sk,
+				const struct sockopt_test *t,
+				int level)
+{
+	int old, tmp, new, expected, opt;
+
+	opt = t->opt;
+	new = t->new;
+	if (sk->sk_type == SOCK_STREAM && t->tcp_expected)
+		expected = t->tcp_expected;
+	else
+		expected = t->expected;
+
+	if (__bpf_getsockopt(ctx, sk, level, opt, &old, sizeof(old)) ||
+	    old == new)
+		return 1;
+
+	if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
+		return 1;
+	if (__bpf_getsockopt(ctx, sk, level, opt, &tmp, sizeof(tmp)) ||
+	    tmp != expected)
+		return 1;
+
+	if (t->restore)
+		old = t->restore;
+	if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old)))
+		return 1;
+
+	return 0;
+}
+
+static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+
+	if (i >= ARRAY_SIZE(sol_socket_tests))
+		return 1;
+
+	t = &sol_socket_tests[i];
+	if (!t->opt)
+		return 1;
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, SOL_SOCKET);
+
+	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
+}
+
+static int bpf_test_ip_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+
+	if (i >= ARRAY_SIZE(sol_ip_tests))
+		return 1;
+
+	t = &sol_ip_tests[i];
+	if (!t->opt)
+		return 1;
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IP);
+
+	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IP);
+}
+
+static int bpf_test_ipv6_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+
+	if (i >= ARRAY_SIZE(sol_ipv6_tests))
+		return 1;
+
+	t = &sol_ipv6_tests[i];
+	if (!t->opt)
+		return 1;
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IPV6);
+
+	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IPV6);
+}
+
+static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+	struct sock *sk;
+	void *ctx;
+
+	if (i >= ARRAY_SIZE(sol_tcp_tests))
+		return 1;
+
+	t = &sol_tcp_tests[i];
+	if (!t->opt)
+		return 1;
+
+	ctx = lc->ctx;
+	sk = lc->sk;
+
+	if (t->opt == TCP_CONGESTION) {
+		char old_cc[16], tmp_cc[16];
+		const char *new_cc;
+
+		if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
+			return 1;
+		if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc))
+			new_cc = reno_cc;
+		else
+			new_cc = cubic_cc;
+		if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, (void *)new_cc,
+				   sizeof(new_cc)))
+			return 1;
+		if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, tmp_cc, sizeof(tmp_cc)))
+			return 1;
+		if (bpf_strncmp(tmp_cc, sizeof(tmp_cc), new_cc))
+			return 1;
+		if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
+			return 1;
+		return 0;
+	}
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(ctx, sk, t, IPPROTO_TCP);
+
+	return bpf_test_sockopt_int(ctx, sk, t, IPPROTO_TCP);
+}
+
+static int bpf_test_sockopt(void *ctx, struct sock *sk)
+{
+	struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
+	__u16 family, proto;
+	int n;
+
+	family = sk->__sk_common.skc_family;
+	proto = sk->sk_protocol;
+
+	n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
+	if (n != ARRAY_SIZE(sol_socket_tests))
+		return -1;
+
+	if (proto == IPPROTO_TCP) {
+		n = bpf_loop(ARRAY_SIZE(sol_tcp_tests), bpf_test_tcp_sockopt, &lc, 0);
+		if (n != ARRAY_SIZE(sol_tcp_tests))
+			return -1;
+	}
+
+	if (family == AF_INET) {
+		n = bpf_loop(ARRAY_SIZE(sol_ip_tests), bpf_test_ip_sockopt, &lc, 0);
+		if (n != ARRAY_SIZE(sol_ip_tests))
+			return -1;
+	} else {
+		n = bpf_loop(ARRAY_SIZE(sol_ipv6_tests), bpf_test_ipv6_sockopt, &lc, 0);
+		if (n != ARRAY_SIZE(sol_ipv6_tests))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int binddev_test(void *ctx)
+{
+	const char empty_ifname[] = "";
+	int ifindex, zero = 0;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   (void *)veth, sizeof(veth)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != veth_ifindex)
+		return -1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   (void *)empty_ifname, sizeof(empty_ifname)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != 0)
+		return -1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   (void *)&veth_ifindex, sizeof(int)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != veth_ifindex)
+		return -1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &zero, sizeof(int)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != 0)
+		return -1;
+
+	return 0;
+}
+
+SEC("lsm_cgroup/socket_post_create")
+int BPF_PROG(socket_post_create, struct socket *sock, int family,
+	     int type, int protocol, int kern)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return 1;
+
+	nr_socket_post_create += !bpf_test_sockopt(sk, sk);
+	nr_binddev += !binddev_test(sk);
+
+	return 1;
+}
+
+SEC("sockops")
+int skops_sockopt(struct bpf_sock_ops *skops)
+{
+	struct bpf_sock *bpf_sk = skops->sk;
+	struct sock *sk;
+
+	if (!bpf_sk)
+		return 1;
+
+	sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
+	if (!sk)
+		return 1;
+
+	switch (skops->op) {
+	case BPF_SOCK_OPS_TCP_LISTEN_CB:
+		nr_listen += !bpf_test_sockopt(skops, sk);
+		break;
+	case BPF_SOCK_OPS_TCP_CONNECT_CB:
+		nr_connect += !bpf_test_sockopt(skops, sk);
+		break;
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		nr_active += !bpf_test_sockopt(skops, sk);
+		break;
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		nr_passive += !bpf_test_sockopt(skops, sk);
+		break;
+	}
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";