
[v1,bpf-next,04/11] tcp: Migrate TFO requests causing RST during TCP_SYN_RECV.

Message ID: 20201201144418.35045-5-kuniyu@amazon.co.jp (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: Socket migration for SO_REUSEPORT.

Checks

Context                         Check    Description
netdev/cover_letter             success
netdev/fixes_present            success
netdev/patch_count              success
netdev/tree_selection           success  Clearly marked for bpf-next
netdev/subject_prefix           success
netdev/source_inline            success  Was 0 now: 0
netdev/verify_signedoff         success
netdev/module_param             success  Was 0 now: 0
netdev/build_32bit              success  Errors and warnings before: 4 this patch: 4
netdev/kdoc                     success  Errors and warnings before: 0 this patch: 0
netdev/verify_fixes             success
netdev/checkpatch               warning  WARNING: line length of 82 exceeds 80 columns
                                         WARNING: line length of 84 exceeds 80 columns
                                         WARNING: line length of 91 exceeds 80 columns
                                         WARNING: line length of 93 exceeds 80 columns
netdev/build_allmodconfig_warn  success  Errors and warnings before: 4 this patch: 4
netdev/header_inline            success
netdev/stable                   success  Stable not CCed

Commit Message

Iwashima, Kuniyuki Dec. 1, 2020, 2:44 p.m. UTC
A TFO request socket is only freed after BOTH the 3WHS has completed (or
been aborted) and the child socket has been accepted (or its listener has
been closed). Hence, depending on the order, there can be two kinds of
request sockets in the accept queue:

  3WHS -> accept : TCP_ESTABLISHED
  accept -> 3WHS : TCP_SYN_RECV

Unlike a TCP_ESTABLISHED request socket, a TCP_SYN_RECV request socket is
not freed by accept(); it is freed later, in reqsk_fastopen_remove(), which
reaches the fastopen queue through request_sock.rsk_listener. So, to
complete TFO socket migration, we have to point rsk_listener at the current
(migrated-to) listener in accept(), before reqsk_fastopen_remove() runs.

Moreover, if a TFO request caused an RST before the 3WHS completed, it is
held in the listener's TFO queue for a while to mitigate DoS attacks. Thus,
we also have to migrate the requests in the TFO queue to the new listener.

Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 net/ipv4/inet_connection_sock.c | 35 ++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)
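
For context, here is a simplified sketch of reqsk_fastopen_remove()
(modeled on net/core/request_sock.c around the time of this series; timer
arming, refcounting details, and the fastopen_rsk clearing are elided, so
treat this as an illustration rather than the exact kernel code). The
fastopen queue is reached through req->rsk_listener, which is why accept()
must repoint rsk_listener at the migrated-to listener first:

void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
			   bool reset)
{
	struct sock *lsk = req->rsk_listener;
	struct fastopen_queue *fastopenq;

	/* The queue belongs to whatever socket rsk_listener points at. */
	fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;

	spin_lock_bh(&fastopenq->lock);
	fastopenq->qlen--;
	tcp_rsk(req)->tfo_listener = false;

	if (!req->sk && reset && lsk->sk_state == TCP_LISTEN) {
		/* The req triggered an RST before the 3WHS finished: park
		 * it on the listener's TFO RST queue for a while as a
		 * defence against spoofing attacks. The RST queue keeps a
		 * reference to the req.
		 */
		if (!fastopenq->rskq_rst_head)
			fastopenq->rskq_rst_head = req;
		else
			fastopenq->rskq_rst_tail->dl_next = req;
		req->dl_next = NULL;
		fastopenq->rskq_rst_tail = req;
		fastopenq->qlen++;
		spin_unlock_bh(&fastopenq->lock);
		return;
	}

	spin_unlock_bh(&fastopenq->lock);
	reqsk_put(req);		/* otherwise the request is released here */
}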

Comments

Eric Dumazet Dec. 1, 2020, 3:30 p.m. UTC | #1
On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> A TFO request socket is only freed after BOTH the 3WHS has completed (or
> been aborted) and the child socket has been accepted (or its listener has
> been closed). Hence, depending on the order, there can be two kinds of
> request sockets in the accept queue:
> 
>   3WHS -> accept : TCP_ESTABLISHED
>   accept -> 3WHS : TCP_SYN_RECV
> 
> Unlike a TCP_ESTABLISHED request socket, a TCP_SYN_RECV request socket is
> not freed by accept(); it is freed later, in reqsk_fastopen_remove(), which
> reaches the fastopen queue through request_sock.rsk_listener. So, to
> complete TFO socket migration, we have to point rsk_listener at the current
> (migrated-to) listener in accept(), before reqsk_fastopen_remove() runs.
> 
> Moreover, if a TFO request caused an RST before the 3WHS completed, it is
> held in the listener's TFO queue for a while to mitigate DoS attacks. Thus,
> we also have to migrate the requests in the TFO queue to the new listener.
> 
> Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
> ---
>  net/ipv4/inet_connection_sock.c | 35 ++++++++++++++++++++++++++++++++-
>  1 file changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index b27241ea96bd..361efe55b1ad 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -500,6 +500,16 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
>  	    tcp_rsk(req)->tfo_listener) {
>  		spin_lock_bh(&queue->fastopenq.lock);
>  		if (tcp_rsk(req)->tfo_listener) {
> +			if (req->rsk_listener != sk) {
> +				/* The TFO request was migrated to another listener, so
> +				 * reqsk_fastopen_remove() must use the new listener when
> +				 * holding requests that caused an RST.
> +				 */
> +				sock_put(req->rsk_listener);
> +				sock_hold(sk);
> +				req->rsk_listener = sk;
> +			}
> +
>  			/* We are still waiting for the final ACK from 3WHS
>  			 * so can't free req now. Instead, we set req->sk to
>  			 * NULL to signify that the child socket is taken
> @@ -954,7 +964,6 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
>  
>  	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
>  		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
> -		BUG_ON(sk != req->rsk_listener);
>  
>  		/* Paranoid, to prevent race condition if
>  		 * an inbound pkt destined for child is
> @@ -995,6 +1004,7 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
>  void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
>  {
>  	struct request_sock_queue *old_accept_queue, *new_accept_queue;
> +	struct fastopen_queue *old_fastopenq, *new_fastopenq;
>  
>  	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
>  	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> @@ -1019,6 +1029,29 @@ void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
>  
>  	spin_unlock(&new_accept_queue->rskq_lock);
>  	spin_unlock(&old_accept_queue->rskq_lock);
> +
> +	old_fastopenq = &old_accept_queue->fastopenq;
> +	new_fastopenq = &new_accept_queue->fastopenq;
> +
> +	spin_lock_bh(&old_fastopenq->lock);
> +	spin_lock_bh(&new_fastopenq->lock);


Same remark about lockdep being not happy with this (I guess)

> +
> +	new_fastopenq->qlen += old_fastopenq->qlen;
> +	old_fastopenq->qlen = 0;
> +
> +	if (old_fastopenq->rskq_rst_head) {
> +		if (new_fastopenq->rskq_rst_head)
> +			old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
> +		else
> +			new_fastopenq->rskq_rst_tail = old_fastopenq->rskq_rst_tail;
> +
> +		new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
> +		old_fastopenq->rskq_rst_head = NULL;
> +		old_fastopenq->rskq_rst_tail = NULL;
> +	}
> +
> +	spin_unlock_bh(&new_fastopenq->lock);
> +	spin_unlock_bh(&old_fastopenq->lock);
>  }
>  EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
>  
>
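
Eric's lockdep remark refers to the two spin_lock_bh() calls on locks of
the same lock class (both fastopenq->lock here, and likewise the two
rskq_lock acquisitions earlier in the function): lockdep will report the
second acquisition as a possible recursive deadlock. A common kernel
pattern for taking two same-class locks is to impose a stable order, e.g.
by address, and annotate the inner acquisition as nested. The helper below
is an illustrative sketch with an invented name, not code from this thread:

static void fastopenq_lock_two(struct fastopen_queue *q1,
			       struct fastopen_queue *q2)
{
	/* Order by address so two CPUs migrating in opposite directions
	 * cannot deadlock on each other.
	 */
	if (q1 > q2)
		swap(q1, q2);

	spin_lock_bh(&q1->lock);
	/* Taking the same lock class twice is intentional; tell lockdep. */
	spin_lock_bh_nested(&q2->lock, SINGLE_DEPTH_NESTING);
}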

Patch

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b27241ea96bd..361efe55b1ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -500,6 +500,16 @@  struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 	    tcp_rsk(req)->tfo_listener) {
 		spin_lock_bh(&queue->fastopenq.lock);
 		if (tcp_rsk(req)->tfo_listener) {
+			if (req->rsk_listener != sk) {
+			/* The TFO request was migrated to another listener, so
+			 * reqsk_fastopen_remove() must use the new listener when
+			 * holding requests that caused an RST.
+			 */
+				sock_put(req->rsk_listener);
+				sock_hold(sk);
+				req->rsk_listener = sk;
+			}
+
 			/* We are still waiting for the final ACK from 3WHS
 			 * so can't free req now. Instead, we set req->sk to
 			 * NULL to signify that the child socket is taken
@@ -954,7 +964,6 @@  static void inet_child_forget(struct sock *sk, struct request_sock *req,
 
 	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
 		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
-		BUG_ON(sk != req->rsk_listener);
 
 		/* Paranoid, to prevent race condition if
 		 * an inbound pkt destined for child is
@@ -995,6 +1004,7 @@  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
 {
 	struct request_sock_queue *old_accept_queue, *new_accept_queue;
+	struct fastopen_queue *old_fastopenq, *new_fastopenq;
 
 	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
 	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
@@ -1019,6 +1029,29 @@  void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
 
 	spin_unlock(&new_accept_queue->rskq_lock);
 	spin_unlock(&old_accept_queue->rskq_lock);
+
+	old_fastopenq = &old_accept_queue->fastopenq;
+	new_fastopenq = &new_accept_queue->fastopenq;
+
+	spin_lock_bh(&old_fastopenq->lock);
+	spin_lock_bh(&new_fastopenq->lock);
+
+	new_fastopenq->qlen += old_fastopenq->qlen;
+	old_fastopenq->qlen = 0;
+
+	if (old_fastopenq->rskq_rst_head) {
+		if (new_fastopenq->rskq_rst_head)
+			old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+		else
+			new_fastopenq->rskq_rst_tail = old_fastopenq->rskq_rst_tail;
+
+		new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
+		old_fastopenq->rskq_rst_head = NULL;
+		old_fastopenq->rskq_rst_tail = NULL;
+	}
+
+	spin_unlock_bh(&new_fastopenq->lock);
+	spin_unlock_bh(&old_fastopenq->lock);
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
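
As a sanity check of the RST-queue splice in inet_csk_reqsk_queue_migrate(),
here is a hypothetical, self-contained userspace model of the head/tail
bookkeeping (all names are invented for illustration). It exercises both
cases that matter: splicing into an empty queue, where the destination tail
must be set, and splicing in front of a non-empty queue:

#include <assert.h>
#include <stddef.h>

struct req {
	struct req *dl_next;
};

struct rst_queue {
	struct req *head;	/* models rskq_rst_head */
	struct req *tail;	/* models rskq_rst_tail */
	int qlen;
};

/* Move everything from old to new, prepending old's entries, mirroring
 * the rskq_rst_head/rskq_rst_tail logic in the patch above.
 */
static void splice_rst(struct rst_queue *old, struct rst_queue *new)
{
	new->qlen += old->qlen;
	old->qlen = 0;

	if (old->head) {
		if (new->head)
			old->tail->dl_next = new->head;
		else
			new->tail = old->tail;

		new->head = old->head;
		old->head = NULL;
		old->tail = NULL;
	}
}

int main(void)
{
	struct req a = { NULL }, b = { NULL };
	struct rst_queue oldq = { &a, &a, 1 };
	struct rst_queue newq = { NULL, NULL, 0 };

	/* Splice into an empty queue: the tail must be carried over. */
	splice_rst(&oldq, &newq);
	assert(newq.head == &a && newq.tail == &a && newq.qlen == 1);
	assert(!oldq.head && !oldq.tail && oldq.qlen == 0);

	/* Splice a second queue in front of a non-empty one. */
	oldq.head = oldq.tail = &b;
	oldq.qlen = 1;
	splice_rst(&oldq, &newq);
	assert(newq.head == &b && b.dl_next == &a && newq.tail == &a);
	assert(newq.qlen == 2);

	return 0;
}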