
[v2,bpf-next,06/13] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

Message ID: 20201207132456.65472-7-kuniyu@amazon.co.jp (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: Socket migration for SO_REUSEPORT.

Checks

Context                         Check    Description
netdev/cover_letter             success
netdev/fixes_present            success
netdev/patch_count              success
netdev/tree_selection           success  Clearly marked for bpf-next
netdev/subject_prefix           success
netdev/source_inline            success  Was 0 now: 0
netdev/verify_signedoff         success
netdev/module_param             success  Was 0 now: 0
netdev/build_32bit              success  Errors and warnings before: 1541 this patch: 1541
netdev/kdoc                     success  Errors and warnings before: 0 this patch: 0
netdev/verify_fixes             success
netdev/checkpatch               warning  CHECK: extern prototypes should be avoided in .h files;
                                         WARNING: line length of 100 exceeds 80 columns;
                                         WARNING: line length of 83 exceeds 80 columns
netdev/build_allmodconfig_warn  success  Errors and warnings before: 1520 this patch: 1520
netdev/header_inline            success
netdev/stable                   success  Stable not CCed

Commit Message

Iwashima, Kuniyuki Dec. 7, 2020, 1:24 p.m. UTC
This patch makes reuseport_detach_sock() return a pointer to struct sock,
which is used only by inet_unhash(). If the returned pointer is not NULL,
inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
sockets from the closing listener to the selected one.
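
(For readers looking at this patch in isolation: inet_csk_reqsk_queue_migrate()
is added by an earlier patch in this series. As a rough sketch only, not the
exact code of that patch, the idea is to splice the closing listener's accept
queue onto the new listener's and move the accounted backlog with it; the field
names below follow the existing request_sock_queue layout, and lockdep
annotations are omitted:)

  void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
  {
          struct request_sock_queue *old_q = &inet_csk(sk)->icsk_accept_queue;
          struct request_sock_queue *new_q = &inet_csk(nsk)->icsk_accept_queue;

          spin_lock(&old_q->rskq_lock);
          spin_lock(&new_q->rskq_lock);

          if (old_q->rskq_accept_head) {
                  /* Splice the closing listener's queue in front of nsk's. */
                  if (new_q->rskq_accept_head)
                          old_q->rskq_accept_tail->dl_next = new_q->rskq_accept_head;
                  else
                          new_q->rskq_accept_tail = old_q->rskq_accept_tail;

                  new_q->rskq_accept_head = old_q->rskq_accept_head;
                  old_q->rskq_accept_head = NULL;
                  old_q->rskq_accept_tail = NULL;

                  /* Move the pending-request accounting as well. */
                  WRITE_ONCE(nsk->sk_ack_backlog,
                             nsk->sk_ack_backlog + sk->sk_ack_backlog);
                  WRITE_ONCE(sk->sk_ack_backlog, 0);
          }

          spin_unlock(&new_q->rskq_lock);
          spin_unlock(&old_q->rskq_lock);
  }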

By default, the kernel selects a new listener randomly. In order to pick a
different socket every time, we select the last element of socks[] as the
new listener. This behaviour is based on how the kernel moves sockets in
socks[] (see also [1]).

Basically, in order to redistribute sockets evenly, we have to use an eBPF
program introduced in a later commit. However, as a side effect of this
default selection, the kernel can redistribute old requests evenly to new
listeners in the specific case where the application replaces its listeners
generation by generation.

For example, suppose we call listen() for four sockets (A, B, C, D) and then
close() the first two in turn. The sockets move in socks[] as below.

  socks[0] : A <-.      socks[0] : D          socks[0] : D
  socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
  socks[2] : C   |      socks[2] : C --'
  socks[3] : D --'

Then, if C and D have newer settings than A and B, and each socket has a
request (a, b, c, d) in its accept queue, we can redistribute the old
requests evenly to the new listeners.

  socks[0] : A (a) <-.      socks[0] : D (a + d)      socks[0] : D (a + d)
  socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
  socks[2] : C (c)   |      socks[2] : C (c) --'
  socks[3] : D (d) --'
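
(For concreteness, the generational replacement assumed in this example could
be driven from userspace roughly as below; this is only an illustrative sketch,
not part of the patch, with an arbitrary port and no error handling:)

  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <unistd.h>

  static int new_listener(void)
  {
          struct sockaddr_in addr = {
                  .sin_family = AF_INET,
                  .sin_port = htons(8080),        /* arbitrary example port */
                  .sin_addr.s_addr = htonl(INADDR_ANY),
          };
          int one = 1;
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
          bind(fd, (struct sockaddr *)&addr, sizeof(addr));
          listen(fd, 128);
          return fd;
  }

  int main(void)
  {
          int a = new_listener(), b = new_listener();     /* old generation */
          int c = new_listener(), d = new_listener();     /* new generation */

          /* Retire the old generation; with this patch, requests queued on A
           * move to D (the last element of socks[]) and requests on B move to C.
           */
          close(a);
          close(b);

          (void)c;
          (void)d;
          pause();        /* C and D keep serving */
          return 0;
  }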

In the diagrams above, (A, D) or (B, C) can have different application
settings, but they MUST have the same settings at the socket API level;
otherwise, unexpected errors may happen. For instance, if only the new
listeners have TCP_SAVE_SYN enabled, the old requests do not hold SYN data,
so the application will face an inconsistency and hit an error.
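
(A hedged illustration, reusing the hypothetical new_listener() sketch above:
whatever socket-API-level options the new generation relies on must already be
set on the old generation, e.g.:)

  int one = 1;

  /* TCP_SAVE_SYN (from <netinet/tcp.h>) has to be enabled on every
   * generation of listeners, old and new alike, before any of them is
   * closed; otherwise migrated requests may lack the saved SYN data.
   */
  setsockopt(fd, IPPROTO_TCP, TCP_SAVE_SYN, &one, sizeof(one));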

Therefore, if there are different kinds of sockets, we must attach an eBPF
program described in later commits.
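
(The attach point that lets an eBPF program choose the migration target is only
defined by the later commits, so no example of it is given here. For flavour
only, a plain SEC("sk_reuseport") selection program against the already-existing
API looks roughly like the following; the map name and the fixed key are made up
for the illustration:)

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct {
          __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
          __uint(max_entries, 4);
          __type(key, __u32);
          __type(value, __u64);
  } reuseport_map SEC(".maps");           /* hypothetical map name */

  SEC("sk_reuseport")
  int select_listener(struct sk_reuseport_md *md)
  {
          __u32 key = 0;                  /* hypothetical policy: always slot 0 */

          if (bpf_sk_select_reuseport(md, &reuseport_map, &key, 0) == 0)
                  return SK_PASS;

          return SK_DROP;
  }

  char _license[] SEC("license") = "GPL";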

Link: https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dhdGQ@mail.gmail.com/
Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 include/net/sock_reuseport.h |  2 +-
 net/core/sock_reuseport.c    | 16 +++++++++++++---
 net/ipv4/inet_hashtables.c   |  9 +++++++--
 3 files changed, 21 insertions(+), 6 deletions(-)

Patch

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0e558ca7afbf..09a1b1539d4c 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -31,7 +31,7 @@  struct sock_reuseport {
 extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
 			      bool bind_inany);
-extern void reuseport_detach_sock(struct sock *sk);
+extern struct sock *reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
 					  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index c26f4256ff41..2de42f8103ea 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -184,9 +184,11 @@  int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 }
 EXPORT_SYMBOL(reuseport_add_sock);
 
-void reuseport_detach_sock(struct sock *sk)
+struct sock *reuseport_detach_sock(struct sock *sk)
 {
 	struct sock_reuseport *reuse;
+	struct bpf_prog *prog;
+	struct sock *nsk = NULL;
 	int i;
 
 	spin_lock_bh(&reuseport_lock);
@@ -215,17 +217,25 @@  void reuseport_detach_sock(struct sock *sk)
 
 		reuse->num_socks--;
 		reuse->socks[i] = reuse->socks[reuse->num_socks];
+		prog = rcu_dereference_protected(reuse->prog,
+						 lockdep_is_held(&reuseport_lock));
+
+		if (sk->sk_protocol == IPPROTO_TCP) {
+			if (reuse->num_socks && !prog)
+				nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
 
-		if (sk->sk_protocol == IPPROTO_TCP)
 			reuse->num_closed_socks++;
-		else
+		} else {
 			rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+		}
 	}
 
 	if (reuse->num_socks + reuse->num_closed_socks == 0)
 		call_rcu(&reuse->rcu, reuseport_free_rcu);
 
 	spin_unlock_bh(&reuseport_lock);
+
+	return nsk;
 }
 EXPORT_SYMBOL(reuseport_detach_sock);
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 45fb450b4522..545538a6bfac 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -681,6 +681,7 @@  void inet_unhash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb = NULL;
+	struct sock *nsk;
 	spinlock_t *lock;
 
 	if (sk_unhashed(sk))
@@ -696,8 +697,12 @@  void inet_unhash(struct sock *sk)
 	if (sk_unhashed(sk))
 		goto unlock;
 
-	if (rcu_access_pointer(sk->sk_reuseport_cb))
-		reuseport_detach_sock(sk);
+	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+		nsk = reuseport_detach_sock(sk);
+		if (nsk)
+			inet_csk_reqsk_queue_migrate(sk, nsk);
+	}
+
 	if (ilb) {
 		inet_unhash2(hashinfo, sk);
 		ilb->count--;