[mptcp-net] mptcp: wake-up readers only for in sequence data.

Message ID 5009954210af20796d1cf88ca630d19ec12e2132.1622132690.git.pabeni@redhat.com (mailing list archive)
State Accepted, archived
Commit 508f97cb7d94537ee43c3137607e089688c4772b
Delegated to: Matthieu Baerts
Series [mptcp-net] mptcp: wake-up readers only for in sequence data.

Commit Message

Paolo Abeni May 27, 2021, 4:27 p.m. UTC
Currently we rely on the subflow->data_avail field,
which is subject to races:

	ssk1
		skb len = 500 DSS(seq=1, len=1000, off=0)
		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL

	ssk2
		skb len = 500 DSS(seq = 501, len=1000)
		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL

	ssk1
		skb len = 500 DSS(seq = 1, len=1000, off =500)
		# still data_avail == MPTCP_SUBFLOW_DATA_AVAIL,
		# as the skb is covered by a pre-existing map,
		# which was in-sequence at reception time.

Instead we can explicitly check if some data has been received in-sequence,
propagating the info from __mptcp_move_skbs_from_subflow().

Additionally, add the 'ONCE' annotation to the 'data_avail' memory
access, as msk will read it outside the subflow socket lock.

Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 29 +++++++++++------------------
 net/mptcp/protocol.h |  1 -
 net/mptcp/subflow.c  | 23 +++++++++--------------
 3 files changed, 20 insertions(+), 33 deletions(-)
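
For illustration, here is a condensed sketch of the 'ONCE' pattern described
above. The helper names are hypothetical; the real accesses are spread across
net/mptcp/subflow.c and net/mptcp/protocol.c, as the patch below shows:

	/* Writer side: runs under the subflow (ssk) socket lock. */
	static void subflow_set_data_avail(struct mptcp_subflow_context *subflow)
	{
		/* Pairs with the lockless READ_ONCE() on the msk side. */
		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
	}

	/* Reader side: msk context, subflow socket lock not held. */
	static struct sock *subflow_recv_lookup_sketch(const struct mptcp_sock *msk)
	{
		struct mptcp_subflow_context *subflow;

		mptcp_for_each_subflow(msk, subflow) {
			/* Pairs with the WRITE_ONCE() above. */
			if (READ_ONCE(subflow->data_avail))
				return mptcp_subflow_tcp_sock(subflow);
		}
		return NULL;
	}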

Comments

Mat Martineau May 28, 2021, 12:56 a.m. UTC | #1
On Thu, 27 May 2021, Paolo Abeni wrote:

> Currently we relay on the subflow->data_avail field,
                ^^^^^ rely

> which is subject to races:
>
> 	ssk1
> 		skb len = 500 DSS(seq=1, len=1000, off=0)
> 		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL
>
> 	ssk2
> 		skb len = 500 DSS(seq = 501, len=1000)
> 		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL
>
> 	ssk1
> 		skb len = 500 DSS(seq = 1, len=1000, off =500)
> 		# still data_avail == MPTCP_SUBFLOW_DATA_AVAIL,
> 		# as the skb is covered by a pre-existing map,
> 		# which was in-sequence at reception time.
>
> Instead we can explicitly check if some has been received in-sequence,
> propagating the info from __mptcp_move_skbs_from_subflow().
>
> Additionally and the 'ONCE' annotation to the 'data_avail' memory
                ^^^ add (right?)

Fixing this one since it's a little confusing to parse, so figured I'd 
edit above too.

> access, as msk will read it outside the subflow socket lock.
>
> Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> net/mptcp/protocol.c | 29 +++++++++++------------------
> net/mptcp/protocol.h |  1 -
> net/mptcp/subflow.c  | 23 +++++++++--------------
> 3 files changed, 20 insertions(+), 33 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 786f09d83d35..1e77d2defd28 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -681,13 +681,13 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
> /* In most cases we will be able to lock the mptcp socket.  If its already
>  * owned, we need to defer to the work queue to avoid ABBA deadlock.
>  */
> -static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
> +static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
> {
> 	struct sock *sk = (struct sock *)msk;
> 	unsigned int moved = 0;
>
> 	if (inet_sk_state_load(sk) == TCP_CLOSE)
> -		return;
> +		return false;
>
> 	mptcp_data_lock(sk);
>
> @@ -702,6 +702,8 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
> 	if (mptcp_pending_data_fin(sk, NULL))
> 		mptcp_schedule_work(sk);
> 	mptcp_data_unlock(sk);
> +
> +	return moved > 0;
> }
>
> void mptcp_data_ready(struct sock *sk, struct sock *ssk)
> @@ -709,7 +711,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
> 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	int sk_rbuf, ssk_rbuf;
> -	bool wake;
>
> 	/* The peer can send data while we are shutting down this
> 	 * subflow at msk destruction time, but we must avoid enqueuing
> @@ -718,28 +719,20 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
> 	if (unlikely(subflow->disposable))
> 		return;
>
> -	/* move_skbs_to_msk below can legitly clear the data_avail flag,
> -	 * but we will need later to properly woke the reader, cache its
> -	 * value
> -	 */
> -	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
> -	if (wake)
> -		set_bit(MPTCP_DATA_READY, &msk->flags);
> -
> 	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
> 	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
> 	if (unlikely(ssk_rbuf > sk_rbuf))
> 		sk_rbuf = ssk_rbuf;
>
> -	/* over limit? can't append more skbs to msk */
> +	/* over limit? can't append more skbs to msk. Also, no need to wake up */
> 	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
> -		goto wake;
> -
> -	move_skbs_to_msk(msk, ssk);
> +		return;
>
> -wake:
> -	if (wake)
> +	/* Wake-up the reader only for in-sequence data */
> +	if (move_skbs_to_msk(msk, ssk)) {
> +		set_bit(MPTCP_DATA_READY, &msk->flags);
> 		sk->sk_data_ready(sk);
> +	}
> }
>
> static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
> @@ -871,7 +864,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
> 	sock_owned_by_me(sk);
>
> 	mptcp_for_each_subflow(msk, subflow) {
> -		if (subflow->data_avail)
> +		if (READ_ONCE(subflow->data_avail))
> 			return mptcp_subflow_tcp_sock(subflow);
> 	}
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 74184296b6af..160c2ab09f19 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -373,7 +373,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
> enum mptcp_data_avail {
> 	MPTCP_SUBFLOW_NODATA,
> 	MPTCP_SUBFLOW_DATA_AVAIL,
> -	MPTCP_SUBFLOW_OOO_DATA
> };

Now subflow->data_avail is effectively a bool.

Is it appropriate in -net to change it to a bool (in this patch, or by 
making it a series)? Or is it better to save that enum->bool conversion for 
net-next?

If we keep the enum, it makes sense to explicitly write or compare to 
MPTCP_SUBFLOW_NODATA rather than use 0 or have checks like "if 
(READ_ONCE(subflow->data_avail))", because all of those code lines are 
getting touched anyway.
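
For reference, a minimal sketch of that explicit-enum style (hypothetical; 
this is the suggested spelling, not what the applied patch does):

	if (!skb_peek(&ssk->sk_receive_queue))
		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
	if (READ_ONCE(subflow->data_avail) != MPTCP_SUBFLOW_NODATA)
		return true;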

>
> struct mptcp_delegated_action {
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 726bc3d083fa..783a542e5bb7 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -1098,7 +1098,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
> 	struct sk_buff *skb;
>
> 	if (!skb_peek(&ssk->sk_receive_queue))
> -		subflow->data_avail = 0;
> +		WRITE_ONCE(subflow->data_avail, 0);
> 	if (subflow->data_avail)

Ok, the one line of context immediately above isn't currently changed, but 
my enum->bool comment applies to that line too.


These review comments are only about code/commit formatting and wouldn't 
affect the compiled code, so I do want to point out that the functionality 
of the patch looks good to me. Could apply to export and squash some 
updates to get more testing exposure now. Let Matthieu know what you 
prefer.

Mat


> 		return true;
>
> @@ -1137,18 +1137,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
> 		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
> 		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
> 			 ack_seq);
> -		if (ack_seq == old_ack) {
> -			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
> -			break;
> -		} else if (after64(ack_seq, old_ack)) {
> -			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
> -			break;
> +		if (unlikely(before64(ack_seq, old_ack))) {
> +			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
> +			continue;
> 		}
>
> -		/* only accept in-sequence mapping. Old values are spurious
> -		 * retransmission
> -		 */
> -		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
> +		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
> +		break;
> 	}
> 	return true;
>
> @@ -1168,7 +1163,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
> 		subflow->reset_transient = 0;
> 		subflow->reset_reason = MPTCP_RST_EMPTCP;
> 		tcp_send_active_reset(ssk, GFP_ATOMIC);
> -		subflow->data_avail = 0;
> +		WRITE_ONCE(subflow->data_avail, 0);
> 		return false;
> 	}
>
> @@ -1178,7 +1173,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
> 	subflow->map_seq = READ_ONCE(msk->ack_seq);
> 	subflow->map_data_len = skb->len;
> 	subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
> -	subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
> +	WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
> 	return true;
> }
>
> @@ -1190,7 +1185,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
> 	if (subflow->map_valid &&
> 	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
> 		subflow->map_valid = 0;
> -		subflow->data_avail = 0;
> +		WRITE_ONCE(subflow->data_avail, 0);
>
> 		pr_debug("Done with mapping: seq=%u data_len=%u",
> 			 subflow->map_subflow_seq,
> -- 
> 2.26.3

--
Mat Martineau
Intel
Matthieu Baerts June 4, 2021, 7:58 p.m. UTC | #2
Hi Paolo, Mat,

On 27/05/2021 18:27, Paolo Abeni wrote:
> Currently we relay on the subflow->data_avail field,
> which is subject to races:
> 
> 	ssk1
> 		skb len = 500 DSS(seq=1, len=1000, off=0)
> 		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL
> 
> 	ssk2
> 		skb len = 500 DSS(seq = 501, len=1000)
> 		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL
> 
> 	ssk1
> 		skb len = 500 DSS(seq = 1, len=1000, off =500)
> 		# still data_avail == MPTCP_SUBFLOW_DATA_AVAIL,
> 		# as the skb is covered by a pre-existing map,
> 		# which was in-sequence at reception time.
> 
> Instead we can explicitly check if some has been received in-sequence,
> propagating the info from __mptcp_move_skbs_from_subflow().
> 
> Additionally and the 'ONCE' annotation to the 'data_avail' memory
> access, as msk will read it outside the subflow socket lock.
> 
> Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Thank you for the patch and the review!

Just applied in our tree:

- 508f97cb7d94: mptcp: wake-up readers only for in sequence data
- Results: 119541ecd3c9..02c88cfc4215

Builds and tests are now in progress:

https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20210604T195755
https://github.com/multipath-tcp/mptcp_net-next/actions/workflows/build-validation.yml?query=branch:export/20210604T195755

Cheers,
Matt
Matthieu Baerts June 5, 2021, 9:18 a.m. UTC | #3
Hi Paolo, Mat,

On 04/06/2021 21:58, Matthieu Baerts wrote:
> Hi Paolo, Mat,
> 
> On 27/05/2021 18:27, Paolo Abeni wrote:
>> Currently we relay on the subflow->data_avail field,
>> which is subject to races:
>>
>> 	ssk1
>> 		skb len = 500 DSS(seq=1, len=1000, off=0)
>> 		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL
>>
>> 	ssk2
>> 		skb len = 500 DSS(seq = 501, len=1000)
>> 		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL
>>
>> 	ssk1
>> 		skb len = 500 DSS(seq = 1, len=1000, off =500)
>> 		# still data_avail == MPTCP_SUBFLOW_DATA_AVAIL,
>> 		# as the skb is covered by a pre-existing map,
>> 		# which was in-sequence at reception time.
>>
>> Instead we can explicitly check if some has been received in-sequence,
>> propagating the info from __mptcp_move_skbs_from_subflow().
>>
>> Additionally and the 'ONCE' annotation to the 'data_avail' memory
>> access, as msk will read it outside the subflow socket lock.
>>
>> Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path")
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> 
> Thank you for the patch and the review!
> 
> Just applied in our tree:
> 
> - 508f97cb7d94: mptcp: wake-up readers only for in sequence data
> - Results: 119541ecd3c9..02c88cfc4215
> 
> Builds and tests are now in progress:
> 
> https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20210604T195755
> https://github.com/multipath-tcp/mptcp_net-next/actions/workflows/build-validation.yml?query=branch:export/20210604T195755

I have some issues with this patch. Please see:

  https://github.com/multipath-tcp/mptcp_net-next/issues/201

Cheers,
Matt

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 786f09d83d35..1e77d2defd28 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -681,13 +681,13 @@  static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 /* In most cases we will be able to lock the mptcp socket.  If its already
  * owned, we need to defer to the work queue to avoid ABBA deadlock.
  */
-static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
 
 	if (inet_sk_state_load(sk) == TCP_CLOSE)
-		return;
+		return false;
 
 	mptcp_data_lock(sk);
 
@@ -702,6 +702,8 @@  static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	if (mptcp_pending_data_fin(sk, NULL))
 		mptcp_schedule_work(sk);
 	mptcp_data_unlock(sk);
+
+	return moved > 0;
 }
 
 void mptcp_data_ready(struct sock *sk, struct sock *ssk)
@@ -709,7 +711,6 @@  void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	int sk_rbuf, ssk_rbuf;
-	bool wake;
 
 	/* The peer can send data while we are shutting down this
 	 * subflow at msk destruction time, but we must avoid enqueuing
@@ -718,28 +719,20 @@  void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	if (unlikely(subflow->disposable))
 		return;
 
-	/* move_skbs_to_msk below can legitly clear the data_avail flag,
-	 * but we will need later to properly woke the reader, cache its
-	 * value
-	 */
-	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
-	if (wake)
-		set_bit(MPTCP_DATA_READY, &msk->flags);
-
 	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
 	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
 	if (unlikely(ssk_rbuf > sk_rbuf))
 		sk_rbuf = ssk_rbuf;
 
-	/* over limit? can't append more skbs to msk */
+	/* over limit? can't append more skbs to msk. Also, no need to wake up */
 	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
-		goto wake;
-
-	move_skbs_to_msk(msk, ssk);
+		return;
 
-wake:
-	if (wake)
+	/* Wake-up the reader only for in-sequence data */
+	if (move_skbs_to_msk(msk, ssk)) {
+		set_bit(MPTCP_DATA_READY, &msk->flags);
 		sk->sk_data_ready(sk);
+	}
 }
 
 static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
@@ -871,7 +864,7 @@  static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
 	sock_owned_by_me(sk);
 
 	mptcp_for_each_subflow(msk, subflow) {
-		if (subflow->data_avail)
+		if (READ_ONCE(subflow->data_avail))
 			return mptcp_subflow_tcp_sock(subflow);
 	}
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 74184296b6af..160c2ab09f19 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -373,7 +373,6 @@  mptcp_subflow_rsk(const struct request_sock *rsk)
 enum mptcp_data_avail {
 	MPTCP_SUBFLOW_NODATA,
 	MPTCP_SUBFLOW_DATA_AVAIL,
-	MPTCP_SUBFLOW_OOO_DATA
 };
 
 struct mptcp_delegated_action {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 726bc3d083fa..783a542e5bb7 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1098,7 +1098,7 @@  static bool subflow_check_data_avail(struct sock *ssk)
 	struct sk_buff *skb;
 
 	if (!skb_peek(&ssk->sk_receive_queue))
-		subflow->data_avail = 0;
+		WRITE_ONCE(subflow->data_avail, 0);
 	if (subflow->data_avail)
 		return true;
 
@@ -1137,18 +1137,13 @@  static bool subflow_check_data_avail(struct sock *ssk)
 		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
 		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
 			 ack_seq);
-		if (ack_seq == old_ack) {
-			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
-			break;
-		} else if (after64(ack_seq, old_ack)) {
-			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
-			break;
+		if (unlikely(before64(ack_seq, old_ack))) {
+			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
+			continue;
 		}
 
-		/* only accept in-sequence mapping. Old values are spurious
-		 * retransmission
-		 */
-		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
+		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+		break;
 	}
 	return true;
 
@@ -1168,7 +1163,7 @@  static bool subflow_check_data_avail(struct sock *ssk)
 		subflow->reset_transient = 0;
 		subflow->reset_reason = MPTCP_RST_EMPTCP;
 		tcp_send_active_reset(ssk, GFP_ATOMIC);
-		subflow->data_avail = 0;
+		WRITE_ONCE(subflow->data_avail, 0);
 		return false;
 	}
 
@@ -1178,7 +1173,7 @@  static bool subflow_check_data_avail(struct sock *ssk)
 	subflow->map_seq = READ_ONCE(msk->ack_seq);
 	subflow->map_data_len = skb->len;
 	subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
-	subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
+	WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
 	return true;
 }
 
@@ -1190,7 +1185,7 @@  bool mptcp_subflow_data_available(struct sock *sk)
 	if (subflow->map_valid &&
 	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
 		subflow->map_valid = 0;
-		subflow->data_avail = 0;
+		WRITE_ONCE(subflow->data_avail, 0);
 
 		pr_debug("Done with mapping: seq=%u data_len=%u",
 			 subflow->map_subflow_seq,