diff mbox series

[RFC,net-next,1/3] txhash: Make rethinking txhash behavior configurable via sysctl

Message ID 20210809185314.38187-2-tom@herbertland.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series txhash: Make hash rethink configurable and change the default | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 12 maintainers not CCed: christian.brauner@ubuntu.com daniel@iogearbox.net dsahern@kernel.org ptikhomirov@virtuozzo.com changbin.du@intel.com ap420073@gmail.com maheshb@google.com dvyukov@google.com edumazet@google.com kuba@kernel.org masahiroy@kernel.org yoshfuji@linux-ipv6.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff fail Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 7161 this patch: 727
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 141 lines checked
netdev/build_allmodconfig_warn fail Errors and warnings before: 12044 this patch: 1168
netdev/header_inline success Link

Commit Message

Tom Herbert Aug. 9, 2021, 6:53 p.m. UTC
Add a per ns sysctl that controls the txhash rethink behavior,
sk_rethink_txhash. This sysctl value is a mask rethink modes that are:
rethink at negative advice, rethink at SYN RTO, and rethink at non-SYN
RTO. A value of zero disables hash rethink. The default mask is set
to rethink with all three modes (retains current default behavior)
---
 include/net/netns/core.h    |  2 ++
 include/net/sock.h          | 26 +++++++++++++++++---------
 include/uapi/linux/socket.h | 13 +++++++++++++
 net/core/net_namespace.c    |  6 ++++++
 net/core/sysctl_net_core.c  |  7 +++++++
 net/ipv4/tcp_input.c        |  2 +-
 net/ipv4/tcp_timer.c        |  5 ++++-
 7 files changed, 50 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 36c2d998a43c..503f43bfc1d3 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,8 @@  struct netns_core {
 
 	int	sysctl_somaxconn;
 
+	unsigend int sysctl_txrehash_mode;
+
 #ifdef CONFIG_PROC_FS
 	int __percpu *sock_inuse;
 	struct prot_inuse __percpu *prot_inuse;
diff --git a/include/net/sock.h b/include/net/sock.h
index 6e761451c927..6ef5314e8eed 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -577,6 +577,12 @@  static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 			   __tmp | SK_USER_DATA_NOCOPY);		\
 })
 
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+	return read_pnet(&sk->sk_net);
+}
+
 /*
  * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
  * or not whether his port will be reused by someone else. SK_FORCE_REUSE
@@ -1940,12 +1946,20 @@  static inline void sk_set_txhash(struct sock *sk)
 	WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
 }
 
-static inline bool sk_rethink_txhash(struct sock *sk)
+static inline bool sk_rethink_txhash(struct sock *sk, unsigned int level)
 {
-	if (sk->sk_txhash) {
+	unsigned int rehash_mode;
+
+	if (!sk->sk_txhash)
+		return false;
+
+	rehash_mode = READ_ONCE(sock_net(sk)->core.sysctl_txrehash_mode);
+
+	if (level & rehash_mode) {
 		sk_set_txhash(sk);
 		return true;
 	}
+
 	return false;
 }
 
@@ -1986,7 +2000,7 @@  static inline void __dst_negative_advice(struct sock *sk)
 
 static inline void dst_negative_advice(struct sock *sk)
 {
-	sk_rethink_txhash(sk);
+	sk_rethink_txhash(sk, SOCK_TXREHASH_MODE_NEG_ADVICE);
 	__dst_negative_advice(sk);
 }
 
@@ -2591,12 +2605,6 @@  static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
-static inline
-struct net *sock_net(const struct sock *sk)
-{
-	return read_pnet(&sk->sk_net);
-}
-
 static inline
 void sock_net_set(struct sock *sk, struct net *net)
 {
diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h
index eb0a9a5b6e71..2c2cef795a9b 100644
--- a/include/uapi/linux/socket.h
+++ b/include/uapi/linux/socket.h
@@ -31,4 +31,17 @@  struct __kernel_sockaddr_storage {
 
 #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK)
 
+#define SOCK_TXREHASH_MODE_DISABLE	0
+
+/* Flag bits for individual rehash function modes */
+#define SOCK_TXREHASH_MODE_NEG_ADVICE	0x1
+#define SOCK_TXREHASH_MODE_SYN_RTO	0x2
+#define SOCK_TXREHASH_MODE_RTO		0x4
+
+#define SOCK_TXREHASH_MODE_DEFAULT	-1U
+
+#define SOCK_TXREHASH_MODE_MASK	(SOCK_TXREHASH_MODE_NEG_ADVICE |	\
+				 SOCK_TXREHASH_MODE_SYN_RTO |		\
+				 SOCK_TXREHASH_MODE_RTO)
+
 #endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9b5a767eddd5..03d3767e6728 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -366,6 +366,12 @@  static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 static int __net_init net_defaults_init_net(struct net *net)
 {
 	net->core.sysctl_somaxconn = SOMAXCONN;
+
+	/* Default rethink mode is aggrssive (i.e. rethink on first RTO) */
+	net->core.sysctl_txrehash_mode = SOCK_TXREHASH_MODE_NEG_ADVICE |
+					 SOCK_TXREHASH_MODE_SYN_RTO |
+					 SOCK_TXREHASH_MODE_RTO;
+
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8496c1142c9..7e828a892bf5 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -592,6 +592,13 @@  static struct ctl_table netns_core_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.proc_handler	= proc_dointvec_minmax
 	},
+	{
+		.procname	= "txrehash_mode",
+		.data		= &init_net.core.sysctl_txrehash_mode,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f7bd7ae7d7a..08eeb2523393 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4442,7 +4442,7 @@  static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
 	 * DSACK state and change the txhash to re-route speculatively.
 	 */
 	if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
-	    sk_rethink_txhash(sk))
+	    sk_rethink_txhash(sk, SOCK_TXREHASH_MODE_RTO))
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
 }
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 20cf4a98c69d..53ae43ab5ebe 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -234,6 +234,7 @@  static int tcp_write_timeout(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	bool expired = false, do_reset;
+	unsigned int rehash_mode;
 	int retry_until;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
@@ -241,6 +242,7 @@  static int tcp_write_timeout(struct sock *sk)
 			__dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		expired = icsk->icsk_retransmits >= retry_until;
+		rehash_mode = SOCK_TXREHASH_MODE_SYN_RTO;
 	} else {
 		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
 			/* Black hole detection */
@@ -260,6 +262,7 @@  static int tcp_write_timeout(struct sock *sk)
 			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
+		rehash_mode = SOCK_TXREHASH_MODE_RTO;
 	}
 	if (!expired)
 		expired = retransmits_timed_out(sk, retry_until,
@@ -277,7 +280,7 @@  static int tcp_write_timeout(struct sock *sk)
 		return 1;
 	}
 
-	if (sk_rethink_txhash(sk)) {
+	if (sk_rethink_txhash(sk, rehash_mode)) {
 		tp->timeout_rehash++;
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
 	}