diff mbox series

[v2,bpf-next,04/13] tcp: Introduce inet_csk_reqsk_queue_migrate().

Message ID 20201207132456.65472-5-kuniyu@amazon.co.jp (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Socket migration for SO_REUSEPORT. | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 3983 this patch: 3983
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 4094 this patch: 4094
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Iwashima, Kuniyuki Dec. 7, 2020, 1:24 p.m. UTC
This patch defines a new function to migrate ESTABLISHED/SYN_RECV sockets.

Listening sockets hold incoming connections as a linked list of struct
request_sock in the accept queue, and each request has reference to its
full socket and listener. In inet_csk_reqsk_queue_migrate(), we only unlink
the requests from the closing listener's queue and relink them to the head
of the new listener's queue. We do not process each request and its
reference to the listener, so the migration completes in O(1) time
complexity.

Moreover, if TFO requests caused RST before 3WHS has completed, they are
held in the listener's TFO queue to prevent DDoS attack. Thus, we also
migrate the requests in the TFO queue in the same way.

After 3WHS has completed, there are three access patterns to incoming
sockets:

  (1) access to the full socket instead of request_sock
  (2) access to request_sock from access queue
  (3) access to request_sock from TFO queue

In the first case, the full socket does not have a reference to its request
socket and listener, so we do not need the correct listener set in the
request socket. In the second case, we always have the correct listener and
currently do not use req->rsk_listener. However, in the third case of
TCP_SYN_RECV sockets, we take special care in the next commit.

Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 include/net/inet_connection_sock.h |  1 +
 net/ipv4/inet_connection_sock.c    | 68 ++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
diff mbox series

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@  struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 				      struct request_sock *req,
 				      struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 				   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1451aa9712b0..5da38a756e4c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -992,6 +992,74 @@  struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
+{
+	struct request_sock_queue *old_accept_queue, *new_accept_queue;
+	struct fastopen_queue *old_fastopenq, *new_fastopenq;
+	spinlock_t *l1, *l2, *l3, *l4;
+
+	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
+	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
+	old_fastopenq = &old_accept_queue->fastopenq;
+	new_fastopenq = &new_accept_queue->fastopenq;
+
+	l1 = &old_accept_queue->rskq_lock;
+	l2 = &new_accept_queue->rskq_lock;
+	l3 = &old_fastopenq->lock;
+	l4 = &new_fastopenq->lock;
+
+	/* sk is never selected as the new listener from reuse->socks[],
+	 * so inversion deadlock does not happen here,
+	 * but change the order to avoid the warning of lockdep.
+	 */
+	if (sk < nsk) {
+		swap(l1, l2);
+		swap(l3, l4);
+	}
+
+	spin_lock(l1);
+	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+
+	if (old_accept_queue->rskq_accept_head) {
+		if (new_accept_queue->rskq_accept_head)
+			old_accept_queue->rskq_accept_tail->dl_next =
+				new_accept_queue->rskq_accept_head;
+		else
+			new_accept_queue->rskq_accept_tail = old_accept_queue->rskq_accept_tail;
+
+		new_accept_queue->rskq_accept_head = old_accept_queue->rskq_accept_head;
+		old_accept_queue->rskq_accept_head = NULL;
+		old_accept_queue->rskq_accept_tail = NULL;
+
+		WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + sk->sk_ack_backlog);
+		WRITE_ONCE(sk->sk_ack_backlog, 0);
+	}
+
+	spin_unlock(l2);
+	spin_unlock(l1);
+
+	spin_lock_bh(l3);
+	spin_lock_bh_nested(l4, SINGLE_DEPTH_NESTING);
+
+	new_fastopenq->qlen += old_fastopenq->qlen;
+	old_fastopenq->qlen = 0;
+
+	if (old_fastopenq->rskq_rst_head) {
+		if (new_fastopenq->rskq_rst_head)
+			old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+		else
+			old_fastopenq->rskq_rst_tail = new_fastopenq->rskq_rst_tail;
+
+		new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
+		old_fastopenq->rskq_rst_head = NULL;
+		old_fastopenq->rskq_rst_tail = NULL;
+	}
+
+	spin_unlock_bh(l4);
+	spin_unlock_bh(l3);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
+
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req, bool own_req)
 {