diff mbox series

[v5,bpf-next,05/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

Message ID 20210510034433.52818-6-kuniyu@amazon.co.jp (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Socket migration for SO_REUSEPORT. | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 6 maintainers not CCed: dsahern@kernel.org yhs@fb.com kpsingh@kernel.org yoshfuji@linux-ipv6.org john.fastabend@gmail.com songliubraving@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 3191 this patch: 3191
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 3427 this patch: 3427
netdev/header_inline success Link

Commit Message

Iwashima, Kuniyuki May 10, 2021, 3:44 a.m. UTC
When we call close() or shutdown() for listening sockets, each child socket
in the accept queue is freed at inet_csk_listen_stop(). If we can get a
new listener by reuseport_migrate_sock() and clone the request by
reqsk_clone(), we try to add it into the new listener's accept queue by
inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free() to
call sock_put() for its listener and free the cloned request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
ireq_opt/pktopts in struct inet_request_sock to NULL, but ipv6_opt can be
non-NULL. So, we have to set ipv6_opt of the old request to NULL to avoid
a double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 include/net/request_sock.h      |  2 ++
 net/core/request_sock.c         | 39 +++++++++++++++++++++++++++++++++
 net/ipv4/inet_connection_sock.c | 31 +++++++++++++++++++++++++-
 3 files changed, 71 insertions(+), 1 deletion(-)

Comments

Martin KaFai Lau May 15, 2021, 1:06 a.m. UTC | #1
On Mon, May 10, 2021 at 12:44:27PM +0900, Kuniyuki Iwashima wrote:
> diff --git a/net/core/request_sock.c b/net/core/request_sock.c
> index f35c2e998406..7879a3660c52 100644
> --- a/net/core/request_sock.c
> +++ b/net/core/request_sock.c
> @@ -130,3 +130,42 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
>  out:
>  	spin_unlock_bh(&fastopenq->lock);
>  }
> +
> +struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk)
> +{
> +	struct sock *req_sk, *nreq_sk;
> +	struct request_sock *nreq;
> +
> +	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
> +	if (!nreq) {
> +		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
> +		sock_put(sk);
> +		return NULL;
> +	}
> +
> +	req_sk = req_to_sk(req);
> +	nreq_sk = req_to_sk(nreq);
> +
> +	memcpy(nreq_sk, req_sk,
> +	       offsetof(struct sock, sk_dontcopy_begin));
> +	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
> +	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
> +
> +	sk_node_init(&nreq_sk->sk_node);
> +	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
> +#ifdef CONFIG_XPS
> +	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
> +#endif
> +	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
> +	refcount_set(&nreq_sk->sk_refcnt, 0);
> +
> +	nreq->rsk_listener = sk;
> +
> +	/* We need not acquire fastopenq->lock
> +	 * because the child socket is locked in inet_csk_listen_stop().
> +	 */
> +	if (tcp_rsk(nreq)->tfo_listener)
Should IPPROTO_TCP be tested first like other similar situations
in inet_connection_sock.c?

Also, reqsk_clone() is only used in inet_connection_sock.c.
Can it be moved to inet_connection_sock.c instead and renamed to
inet_reqsk_clone()?
Iwashima, Kuniyuki May 15, 2021, 4:14 a.m. UTC | #2
From:   Martin KaFai Lau <kafai@fb.com>
Date:   Fri, 14 May 2021 18:06:16 -0700
> On Mon, May 10, 2021 at 12:44:27PM +0900, Kuniyuki Iwashima wrote:
> > diff --git a/net/core/request_sock.c b/net/core/request_sock.c
> > index f35c2e998406..7879a3660c52 100644
> > --- a/net/core/request_sock.c
> > +++ b/net/core/request_sock.c
> > @@ -130,3 +130,42 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
> >  out:
> >  	spin_unlock_bh(&fastopenq->lock);
> >  }
> > +
> > +struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk)
> > +{
> > +	struct sock *req_sk, *nreq_sk;
> > +	struct request_sock *nreq;
> > +
> > +	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
> > +	if (!nreq) {
> > +		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
> > +		sock_put(sk);
> > +		return NULL;
> > +	}
> > +
> > +	req_sk = req_to_sk(req);
> > +	nreq_sk = req_to_sk(nreq);
> > +
> > +	memcpy(nreq_sk, req_sk,
> > +	       offsetof(struct sock, sk_dontcopy_begin));
> > +	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
> > +	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
> > +
> > +	sk_node_init(&nreq_sk->sk_node);
> > +	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
> > +#ifdef CONFIG_XPS
> > +	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
> > +#endif
> > +	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
> > +	refcount_set(&nreq_sk->sk_refcnt, 0);
> > +
> > +	nreq->rsk_listener = sk;
> > +
> > +	/* We need not acquire fastopenq->lock
> > +	 * because the child socket is locked in inet_csk_listen_stop().
> > +	 */
> > +	if (tcp_rsk(nreq)->tfo_listener)
> Should IPPROTO_TCP be tested first like other similar situations
> in inet_connection_sock.c?

I wrote it this way because migration happens only in TCP for now, but I
agree that testing IPPROTO_TCP first makes it less error-prone in the future.
So, I'll test it first in the next spin.

Thank you!


> 
> Also, reqsk_clone() is only used in inet_connection_sock.c.
> Can it be moved to inet_connection_sock.c instead and renamed to
> inet_reqsk_clone()?

I'll do that.
diff mbox series

Patch

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..c6d6cfd3c93b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -190,6 +190,8 @@  void reqsk_queue_alloc(struct request_sock_queue *queue);
 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 			   bool reset);
 
+struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk);
+
 static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
 {
 	return READ_ONCE(queue->rskq_accept_head) == NULL;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index f35c2e998406..7879a3660c52 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -130,3 +130,42 @@  void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 out:
 	spin_unlock_bh(&fastopenq->lock);
 }
+
+struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk)
+{
+	struct sock *req_sk, *nreq_sk;
+	struct request_sock *nreq;
+
+	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+	if (!nreq) {
+		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+		sock_put(sk);
+		return NULL;
+	}
+
+	req_sk = req_to_sk(req);
+	nreq_sk = req_to_sk(nreq);
+
+	memcpy(nreq_sk, req_sk,
+	       offsetof(struct sock, sk_dontcopy_begin));
+	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+	sk_node_init(&nreq_sk->sk_node);
+	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+	refcount_set(&nreq_sk->sk_refcnt, 0);
+
+	nreq->rsk_listener = sk;
+
+	/* We need not acquire fastopenq->lock
+	 * because the child socket is locked in inet_csk_listen_stop().
+	 */
+	if (tcp_rsk(nreq)->tfo_listener)
+		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+	return nreq;
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa806e9167ec..851992405826 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,6 +695,13 @@  int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
 /* return true if req was found in the ehash table */
 static bool reqsk_queue_unlink(struct request_sock *req)
 {
@@ -1036,14 +1043,36 @@  void inet_csk_listen_stop(struct sock *sk)
 	 * of the variants now.			--ANK
 	 */
 	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
-		struct sock *child = req->sk;
+		struct sock *child = req->sk, *nsk;
+		struct request_sock *nreq;
 
 		local_bh_disable();
 		bh_lock_sock(child);
 		WARN_ON(sock_owned_by_user(child));
 		sock_hold(child);
 
+		nsk = reuseport_migrate_sock(sk, child, NULL);
+		if (nsk) {
+			nreq = reqsk_clone(req, nsk);
+			if (nreq) {
+				refcount_set(&nreq->rsk_refcnt, 1);
+
+				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+					reqsk_migrate_reset(req);
+				} else {
+					reqsk_migrate_reset(nreq);
+					__reqsk_free(nreq);
+				}
+
+				/* inet_csk_reqsk_queue_add() has already
+				 * called inet_child_forget() on failure case.
+				 */
+				goto skip_child_forget;
+			}
+		}
+
 		inet_child_forget(sk, req, child);
+skip_child_forget:
 		reqsk_put(req);
 		bh_unlock_sock(child);
 		local_bh_enable();