
[mptcp-next,v2,6/7] mptcp: implement TCP_NOTSENT_LOWAT support.

Message ID aafd3d3f4e4f827ebff1235697afcc08182e964f.1707739536.git.pabeni@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Mat Martineau
Series mptcp: implement TCP_NOTSENT_LOWAT support

Checks

Context Check Description
matttbe/checkpatch success total: 0 errors, 0 warnings, 0 checks, 156 lines checked
matttbe/build success Build and static analysis OK
matttbe/KVM_Validation__normal success Success! ✅
matttbe/KVM_Validation__debug__except_selftest_mptcp_join_ success Success! ✅
matttbe/KVM_Validation__debug__only_selftest_mptcp_join_ success Success! ✅

Commit Message

Paolo Abeni Feb. 12, 2024, 3:19 p.m. UTC
Add support for the TCP_NOTSENT_LOWAT socket option, storing the
user-space provided value in a new msk field and using it to implement
the __mptcp_stream_memory_free() helper, similar to the TCP one.

To avoid adding more indirect calls in the fast path, open-code
a variant of sk_stream_memory_free() in mptcp_sendmsg() and add
direct calls to the mptcp stream memory free helper where possible.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/464
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 48 +++++++++++++++++++++++++++++++++++++++-----
 net/mptcp/protocol.h | 28 +++++++++++++++++++++++++-
 net/mptcp/sockopt.c  | 12 +++++++++++
 3 files changed, 82 insertions(+), 6 deletions(-)
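
For reference, the option is driven from user space exactly like its TCP
counterpart. The snippet below is an illustrative sketch only, not part of
this patch: it opens an MPTCP socket and caps the not-yet-sent backlog via
TCP_NOTSENT_LOWAT. The helper name and the 128 KiB threshold are arbitrary
examples, and error handling is omitted.

/* Illustrative user-space usage, not part of this patch.
 * Requires a kernel with IPPROTO_MPTCP support; error handling omitted.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP	262
#endif

int mptcp_connect_with_lowat(const struct sockaddr_in *addr)
{
	int lowat = 128 * 1024;		/* wake the writer below 128 KiB unsent */
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	/* same level/option name as plain TCP; the value ends up in the
	 * new msk->notsent_lowat field added by this patch
	 */
	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));

	connect(fd, (const struct sockaddr *)addr, sizeof(*addr));
	return fd;
}

Setting the option to 0 falls back to the net.ipv4.tcp_notsent_lowat sysctl,
per the mptcp_notsent_lowat() helper added in protocol.h.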

Comments

Mat Martineau Feb. 15, 2024, 10:31 p.m. UTC | #1
On Mon, 12 Feb 2024, Paolo Abeni wrote:

> Add support for the TCP_NOTSENT_LOWAT socket option, storing the
> user-space provided value in a new msk field and using it to implement
> the __mptcp_stream_memory_free() helper, similar to the TCP one.
>
> To avoid adding more indirect calls in the fast path, open-code
> a variant of sk_stream_memory_free() in mptcp_sendmsg() and add
> direct calls to the mptcp stream memory free helper where possible.
>
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/464
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> net/mptcp/protocol.c | 48 +++++++++++++++++++++++++++++++++++++++-----
> net/mptcp/protocol.h | 28 +++++++++++++++++++++++++-
> net/mptcp/sockopt.c  | 12 +++++++++++
> 3 files changed, 82 insertions(+), 6 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index def01e030121..cc5680c4c5b3 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1762,6 +1762,39 @@ static int do_copy_data_nocache(struct sock *sk, int copy,
> 	return 0;
> }
>
> +static bool mptcp_stream_memory_free(const struct sock *sk, int wake)
> +{
> +	const struct mptcp_sock *msk = mptcp_sk(sk);
> +	u32 notsent_bytes = READ_ONCE(msk->write_seq) -
> +			    READ_ONCE(msk->snd_nxt);
> +
> +	return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);

Hi Paolo -

This is identical to __mptcp_stream_memory_free() (other than 'inline' and 
combining notsent_bytes assignment w/ declaration). Prefer to avoid the 
code duplication!


- Mat
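
One way to avoid the duplication, sketched here purely for illustration
(this is not what the submitted patch does), would be to make the proto
callback a thin wrapper around the inline helper from protocol.h:

/* illustrative sketch only: reuse the inline helper instead of
 * duplicating its body in protocol.c
 */
static bool mptcp_stream_memory_free(const struct sock *sk, int wake)
{
	return __mptcp_stream_memory_free(sk, wake);
}

That keeps a single copy of the notsent_bytes computation while still giving
the proto table a non-inline symbol to point .stream_memory_free at.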


> +}
> +
> +/* open-code sk_stream_memory_free() plus sent limit computation to
> + * avoid indirect calls in fast-path.
> + * Called under the msk socket lock, so we can avoid a bunch of ONCE
> + * annotations.
> + */
> +static u32 mptcp_send_limit(const struct sock *sk)
> +{
> +	const struct mptcp_sock *msk = mptcp_sk(sk);
> +	u32 limit, not_sent;
> +
> +	if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
> +		return 0;
> +
> +	limit = mptcp_notsent_lowat(sk);
> +	if (limit == UINT_MAX)
> +		return UINT_MAX;
> +
> +	not_sent = msk->write_seq - msk->snd_nxt;
> +	if (not_sent >= limit)
> +		return 0;
> +
> +	return limit - not_sent;
> +}
> +
> static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> {
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> @@ -1806,6 +1839,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 		struct mptcp_data_frag *dfrag;
> 		bool dfrag_collapsed;
> 		size_t psize, offset;
> +		u32 copy_limit;
> +
> +		/* ensure fitting the notsent_lowat() constraint */
> +		copy_limit = mptcp_send_limit(sk);
> +		if (!copy_limit)
> +			goto wait_for_memory;
>
> 		/* reuse tail pfrag, if possible, or carve a new one from the
> 		 * page allocator
> @@ -1813,9 +1852,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 		dfrag = mptcp_pending_tail(sk);
> 		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
> 		if (!dfrag_collapsed) {
> -			if (!sk_stream_memory_free(sk))
> -				goto wait_for_memory;
> -
> 			if (!mptcp_page_frag_refill(sk, pfrag))
> 				goto wait_for_memory;
>
> @@ -1830,6 +1866,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 		offset = dfrag->offset + dfrag->data_len;
> 		psize = pfrag->size - offset;
> 		psize = min_t(size_t, psize, msg_data_left(msg));
> +		psize = min_t(size_t, psize, copy_limit);
> 		total_ts = psize + frag_truesize;
>
> 		if (!sk_wmem_schedule(sk, total_ts))
> @@ -3711,6 +3748,7 @@ static struct proto mptcp_prot = {
> 	.unhash		= mptcp_unhash,
> 	.get_port	= mptcp_get_port,
> 	.forward_alloc_get	= mptcp_forward_alloc_get,
> +	.stream_memory_free	= mptcp_stream_memory_free,
> 	.sockets_allocated	= &mptcp_sockets_allocated,
>
> 	.memory_allocated	= &tcp_memory_allocated,
> @@ -3884,12 +3922,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
> {
> 	struct sock *sk = (struct sock *)msk;
>
> -	if (sk_stream_is_writeable(sk))
> +	if (__mptcp_stream_is_writeable(sk, 1))
> 		return EPOLLOUT | EPOLLWRNORM;
>
> 	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> 	smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
> -	if (sk_stream_is_writeable(sk))
> +	if (__mptcp_stream_is_writeable(sk, 1))
> 		return EPOLLOUT | EPOLLWRNORM;
>
> 	return 0;
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 0aa77c03643d..62b84cc6f35e 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -307,6 +307,7 @@ struct mptcp_sock {
> 			in_accept_queue:1,
> 			free_first:1,
> 			rcvspace_init:1;
> +	u32		notsent_lowat;
> 	struct work_struct work;
> 	struct sk_buff  *ooo_last_skb;
> 	struct rb_root  out_of_order_queue;
> @@ -796,11 +797,36 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
> 	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
> }
>
> +static inline u32 mptcp_notsent_lowat(const struct sock *sk)
> +{
> +	struct net *net = sock_net(sk);
> +	u32 val;
> +
> +	val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
> +	return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
> +}
> +
> +static inline bool __mptcp_stream_memory_free(const struct sock *sk, int wake)
> +{
> +	const struct mptcp_sock *msk = mptcp_sk(sk);
> +	u32 notsent_bytes;
> +
> +	notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
> +	return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
> +}
> +
> +static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
> +{
> +	return __mptcp_stream_memory_free(sk, wake) &&
> +	       __sk_stream_is_writeable(sk, wake);
> +}
> +
> static inline void mptcp_write_space(struct sock *sk)
> {
> 	/* pairs with memory barrier in mptcp_poll */
> 	smp_mb();
> -	sk_stream_write_space(sk);
> +	if (__mptcp_stream_memory_free(sk, 1))
> +		sk_stream_write_space(sk);
> }
>
> static inline void __mptcp_sync_sndbuf(struct sock *sk)
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index ac37f6c5e2ed..1b38dac70719 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -812,6 +812,16 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
> 		return 0;
> 	case TCP_ULP:
> 		return -EOPNOTSUPP;
> +	case TCP_NOTSENT_LOWAT:
> +		ret = mptcp_get_int_option(msk, optval, optlen, &val);
> +		if (ret)
> +			return ret;
> +
> +		lock_sock(sk);
> +		WRITE_ONCE(msk->notsent_lowat, val);
> +		mptcp_write_space(sk);
> +		release_sock(sk);
> +		return 0;
> 	case TCP_CONGESTION:
> 		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
> 	case TCP_CORK:
> @@ -1345,6 +1355,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
> 		return mptcp_put_int_option(msk, optval, optlen, msk->cork);
> 	case TCP_NODELAY:
> 		return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
> +	case TCP_NOTSENT_LOWAT:
> +		return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
> 	}
> 	return -EOPNOTSUPP;
> }
> -- 
> 2.43.0
>
>
>

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index def01e030121..cc5680c4c5b3 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1762,6 +1762,39 @@  static int do_copy_data_nocache(struct sock *sk, int copy,
 	return 0;
 }
 
+static bool mptcp_stream_memory_free(const struct sock *sk, int wake)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	u32 notsent_bytes = READ_ONCE(msk->write_seq) -
+			    READ_ONCE(msk->snd_nxt);
+
+	return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
+}
+
+/* open-code sk_stream_memory_free() plus sent limit computation to
+ * avoid indirect calls in fast-path.
+ * Called under the msk socket lock, so we can avoid a bunch of ONCE
+ * annotations.
+ */
+static u32 mptcp_send_limit(const struct sock *sk)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	u32 limit, not_sent;
+
+	if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
+		return 0;
+
+	limit = mptcp_notsent_lowat(sk);
+	if (limit == UINT_MAX)
+		return UINT_MAX;
+
+	not_sent = msk->write_seq - msk->snd_nxt;
+	if (not_sent >= limit)
+		return 0;
+
+	return limit - not_sent;
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1806,6 +1839,12 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		struct mptcp_data_frag *dfrag;
 		bool dfrag_collapsed;
 		size_t psize, offset;
+		u32 copy_limit;
+
+		/* ensure fitting the notsent_lowat() constraint */
+		copy_limit = mptcp_send_limit(sk);
+		if (!copy_limit)
+			goto wait_for_memory;
 
 		/* reuse tail pfrag, if possible, or carve a new one from the
 		 * page allocator
@@ -1813,9 +1852,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		dfrag = mptcp_pending_tail(sk);
 		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
 		if (!dfrag_collapsed) {
-			if (!sk_stream_memory_free(sk))
-				goto wait_for_memory;
-
 			if (!mptcp_page_frag_refill(sk, pfrag))
 				goto wait_for_memory;
 
@@ -1830,6 +1866,7 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		offset = dfrag->offset + dfrag->data_len;
 		psize = pfrag->size - offset;
 		psize = min_t(size_t, psize, msg_data_left(msg));
+		psize = min_t(size_t, psize, copy_limit);
 		total_ts = psize + frag_truesize;
 
 		if (!sk_wmem_schedule(sk, total_ts))
@@ -3711,6 +3748,7 @@  static struct proto mptcp_prot = {
 	.unhash		= mptcp_unhash,
 	.get_port	= mptcp_get_port,
 	.forward_alloc_get	= mptcp_forward_alloc_get,
+	.stream_memory_free	= mptcp_stream_memory_free,
 	.sockets_allocated	= &mptcp_sockets_allocated,
 
 	.memory_allocated	= &tcp_memory_allocated,
@@ -3884,12 +3922,12 @@  static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
 
-	if (sk_stream_is_writeable(sk))
+	if (__mptcp_stream_is_writeable(sk, 1))
 		return EPOLLOUT | EPOLLWRNORM;
 
 	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 	smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
-	if (sk_stream_is_writeable(sk))
+	if (__mptcp_stream_is_writeable(sk, 1))
 		return EPOLLOUT | EPOLLWRNORM;
 
 	return 0;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 0aa77c03643d..62b84cc6f35e 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -307,6 +307,7 @@  struct mptcp_sock {
 			in_accept_queue:1,
 			free_first:1,
 			rcvspace_init:1;
+	u32		notsent_lowat;
 	struct work_struct work;
 	struct sk_buff  *ooo_last_skb;
 	struct rb_root  out_of_order_queue;
@@ -796,11 +797,36 @@  static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
 	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
 }
 
+static inline u32 mptcp_notsent_lowat(const struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	u32 val;
+
+	val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
+	return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
+}
+
+static inline bool __mptcp_stream_memory_free(const struct sock *sk, int wake)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	u32 notsent_bytes;
+
+	notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
+	return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
+}
+
+static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
+{
+	return __mptcp_stream_memory_free(sk, wake) &&
+	       __sk_stream_is_writeable(sk, wake);
+}
+
 static inline void mptcp_write_space(struct sock *sk)
 {
 	/* pairs with memory barrier in mptcp_poll */
 	smp_mb();
-	sk_stream_write_space(sk);
+	if (__mptcp_stream_memory_free(sk, 1))
+		sk_stream_write_space(sk);
 }
 
 static inline void __mptcp_sync_sndbuf(struct sock *sk)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index ac37f6c5e2ed..1b38dac70719 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -812,6 +812,16 @@  static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 		return 0;
 	case TCP_ULP:
 		return -EOPNOTSUPP;
+	case TCP_NOTSENT_LOWAT:
+		ret = mptcp_get_int_option(msk, optval, optlen, &val);
+		if (ret)
+			return ret;
+
+		lock_sock(sk);
+		WRITE_ONCE(msk->notsent_lowat, val);
+		mptcp_write_space(sk);
+		release_sock(sk);
+		return 0;
 	case TCP_CONGESTION:
 		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
 	case TCP_CORK:
@@ -1345,6 +1355,8 @@  static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 		return mptcp_put_int_option(msk, optval, optlen, msk->cork);
 	case TCP_NODELAY:
 		return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+	case TCP_NOTSENT_LOWAT:
+		return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
 	}
 	return -EOPNOTSUPP;
 }
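
The mptcp_check_writeable() change above is what surfaces the new limit to
poll()/epoll users: with TCP_NOTSENT_LOWAT set, EPOLLOUT is reported only
once the not-yet-sent backlog has drained below the notsent_lowat threshold
(with the same wake factor TCP uses), in addition to the existing send
buffer check. A minimal poll()-driven write loop, illustrative only and with
error handling trimmed, showing the expected user-space pattern:

/* Illustrative poll()-driven writer, not part of this patch. */
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>

ssize_t write_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
		ssize_t n;

		/* with TCP_NOTSENT_LOWAT set, POLLOUT fires only when the
		 * unsent backlog drops below the watermark
		 */
		if (poll(&pfd, 1, -1) <= 0)
			return -1;

		n = write(fd, buf + off, len - off);
		if (n < 0)
			return -1;
		off += n;
	}
	return (ssize_t)off;
}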