diff mbox series

[net-next,11/13] mptcp: rework poll+nospace handling

Message ID 61e7462aa9ecfefd71888277134e9a5f9fd6946c.1605175834.git.pabeni@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series mptcp: improve multiple xmit streams support | expand

Commit Message

Paolo Abeni Nov. 12, 2020, 10:48 a.m. UTC
From: Florian Westphal <fw@strlen.de>

MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at
least one subflow and the mptcp socket itself are writeable.

mptcp_poll returns EPOLLOUT if the bit is set.

mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when last write
has used up all subflows or the mptcp socket wmem.

This reworks nospace handling as follows:

MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning.
This bit is set when the mptcp socket is not writeable.
The mptcp-level ack path schedule will then schedule the mptcp worker
to allow it to free already-acked data (and reduce wmem usage).

This will then wake userspace processes that wait for a POLLOUT event.

sendmsg will set MPTCP_NOSPACE only when it has to wait for more
wmem (blocking I/O case).

poll path will set MPTCP_NOSPACE in case the mptcp socket is
not writeable.

Normal tcp-level notification (SOCK_NOSPACE) is only enabled
in case the subflow socket has no available wmem.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 92 +++++++++++++++++++++++---------------------
 net/mptcp/protocol.h |  2 +-
 net/mptcp/subflow.c  | 11 +++---
 3 files changed, 54 insertions(+), 51 deletions(-)
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 86b4b6e2afbc..05b8112050b9 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -730,7 +730,7 @@  void mptcp_data_acked(struct sock *sk)
 {
 	mptcp_reset_timer(sk);
 
-	if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
+	if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
 	     mptcp_send_head(sk) ||
 	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
 		mptcp_schedule_work(sk);
@@ -830,20 +830,6 @@  static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
 	put_page(dfrag->page);
 }
 
-static bool mptcp_is_writeable(struct mptcp_sock *msk)
-{
-	struct mptcp_subflow_context *subflow;
-
-	if (!sk_stream_is_writeable((struct sock *)msk))
-		return false;
-
-	mptcp_for_each_subflow(msk, subflow) {
-		if (sk_stream_is_writeable(subflow->tcp_sock))
-			return true;
-	}
-	return false;
-}
-
 static void mptcp_clean_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -896,13 +882,8 @@  static void mptcp_clean_una_wakeup(struct sock *sk)
 	mptcp_clean_una(sk);
 
 	/* Only wake up writers if a subflow is ready */
-	if (mptcp_is_writeable(msk)) {
-		set_bit(MPTCP_SEND_SPACE, &msk->flags);
-		smp_mb__after_atomic();
-
-		/* set SEND_SPACE before sk_stream_write_space clears
-		 * NOSPACE
-		 */
+	if (sk_stream_is_writeable(sk)) {
+		clear_bit(MPTCP_NOSPACE, &msk->flags);
 		sk_stream_write_space(sk);
 	}
 }
@@ -1036,17 +1017,25 @@  static void mptcp_nospace(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
 
-	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+	set_bit(MPTCP_NOSPACE, &msk->flags);
 	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool ssk_writeable = sk_stream_is_writeable(ssk);
 		struct socket *sock = READ_ONCE(ssk->sk_socket);
 
+		if (ssk_writeable || !sock)
+			continue;
+
 		/* enables ssk->write_space() callbacks */
-		if (sock)
-			set_bit(SOCK_NOSPACE, &sock->flags);
+		set_bit(SOCK_NOSPACE, &sock->flags);
 	}
+
+	/* mptcp_data_acked() could run just before we set the NOSPACE bit,
+	 * so explicitly check for snd_una value
+	 */
+	mptcp_clean_una((struct sock *)msk);
 }
 
 static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
@@ -1150,12 +1139,6 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 	return NULL;
 }
 
-static void ssk_check_wmem(struct mptcp_sock *msk)
-{
-	if (unlikely(!mptcp_is_writeable(msk)))
-		mptcp_nospace(msk);
-}
-
 static void mptcp_push_release(struct sock *sk, struct sock *ssk,
 			       struct mptcp_sendmsg_info *info)
 {
@@ -1327,7 +1310,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 wait_for_memory:
 		mptcp_nospace(msk);
-		mptcp_clean_una(sk);
 		if (mptcp_timer_pending(sk))
 			mptcp_reset_timer(sk);
 		ret = sk_stream_wait_memory(sk, &timeo);
@@ -1339,7 +1321,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		mptcp_push_pending(sk, msg->msg_flags);
 
 out:
-	ssk_check_wmem(msk);
 	release_sock(sk);
 	return copied ? : ret;
 }
@@ -1916,7 +1897,6 @@  static int __mptcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&msk->conn_list);
 	INIT_LIST_HEAD(&msk->join_list);
 	INIT_LIST_HEAD(&msk->rtx_queue);
-	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
 	INIT_WORK(&msk->work, mptcp_worker);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
@@ -2612,13 +2592,6 @@  bool mptcp_finish_join(struct sock *ssk)
 	return true;
 }
 
-static bool mptcp_memory_free(const struct sock *sk, int wake)
-{
-	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
-}
-
 static struct proto mptcp_prot = {
 	.name		= "MPTCP",
 	.owner		= THIS_MODULE,
@@ -2639,7 +2612,6 @@  static struct proto mptcp_prot = {
 	.sockets_allocated	= &mptcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
-	.stream_memory_free	= mptcp_memory_free,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
 	.sysctl_mem	= sysctl_tcp_mem,
 	.obj_size	= sizeof(struct mptcp_sock),
@@ -2812,6 +2784,39 @@  static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
 	       0;
 }
 
+static bool __mptcp_check_writeable(struct mptcp_sock *msk)
+{
+	struct sock *sk = (struct sock *)msk;
+	bool mptcp_writable;
+
+	mptcp_clean_una(sk);
+	mptcp_writable = sk_stream_is_writeable(sk);
+	if (!mptcp_writable)
+		mptcp_nospace(msk);
+
+	return mptcp_writable;
+}
+
+static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
+{
+	struct sock *sk = (struct sock *)msk;
+	__poll_t ret = 0;
+	bool slow;
+
+	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+		return 0;
+
+	if (sk_stream_is_writeable(sk))
+		return EPOLLOUT | EPOLLWRNORM;
+
+	slow = lock_sock_fast(sk);
+	if (__mptcp_check_writeable(msk))
+		ret = EPOLLOUT | EPOLLWRNORM;
+
+	unlock_sock_fast(sk, slow);
+	return ret;
+}
+
 static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait)
 {
@@ -2830,8 +2835,7 @@  static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 
 	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
 		mask |= mptcp_check_readable(msk);
-		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
-			mask |= EPOLLOUT | EPOLLWRNORM;
+		mask |= mptcp_check_writeable(msk);
 	}
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fd9c666aed7f..8345011fc0ba 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -86,7 +86,7 @@ 
 
 /* MPTCP socket flags */
 #define MPTCP_DATA_READY	0
-#define MPTCP_SEND_SPACE	1
+#define MPTCP_NOSPACE		1
 #define MPTCP_WORK_RTX		2
 #define MPTCP_WORK_EOF		3
 #define MPTCP_FALLBACK_DONE	4
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 42581ffb0c7e..794259789194 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -997,17 +997,16 @@  static void subflow_data_ready(struct sock *sk)
 static void subflow_write_space(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct socket *sock = READ_ONCE(sk->sk_socket);
 	struct sock *parent = subflow->conn;
 
 	if (!sk_stream_is_writeable(sk))
 		return;
 
-	if (sk_stream_is_writeable(parent)) {
-		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
-		smp_mb__after_atomic();
-		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
-		sk_stream_write_space(parent);
-	}
+	if (sock && sk_stream_is_writeable(parent))
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+
+	sk_stream_write_space(parent);
 }
 
 static struct inet_connection_sock_af_ops *