[mptcp-next,v4,2/2] mptcp: retrans for redundant sends

Message ID: aa989fcaf2c6471ce82d59fffd0d09c78191ba2e.1670724098.git.geliang.tang@suse.com (mailing list archive)
State: Changes Requested
Delegated to: Mat Martineau
Series: BPF redundant scheduler, part 3

Commit Message

Geliang Tang Dec. 11, 2022, 2:03 a.m. UTC
Redundant sends need to work more like the MPTCP retransmit code path.
When the scheduler selects multiple subflows, the first subflow to send
is a "normal" transmit, and any other subflows would act like a retransmit
when accessing the dfrags.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 45 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 4 deletions(-)

Comments

Mat Martineau Dec. 15, 2022, 1:47 a.m. UTC | #1
On Sun, 11 Dec 2022, Geliang Tang wrote:

> [...]
> @@ -1659,18 +1687,27 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
> 			ret = __subflow_push_pending(sk, ssk, &info);
> 			if (ret <= 0)
> 				keep_pushing = false;
> +			i++;
> 			copied += ret;
> 			msk->last_snd = ssk;
> 		}
>
> 		mptcp_for_each_subflow(msk, subflow) {
> 			if (READ_ONCE(subflow->scheduled)) {
> +				if (i > 0) {
> +					WRITE_ONCE(msk->first_pending, head);
> +					if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags))
> +						mptcp_schedule_work(sk);

Hi Geliang -

Using the work queue for redundant sends is not going to perform well
enough.

MPTCP_WORK_RTX works OK for retransmissions because those are expected to
be infrequent. A redundant scheduler would be sending everything on
multiple subflows.

The retransmission code has some ideas for making redundant sends work, 
but redundant sending is different:

1. I think the scheduler will need to store some sequence numbers at the 
msk level so the same data is sent on different subflows, even on the 
__mptcp_subflow_push_pending() code path. This is a change for regular 
(not redundant) schedulers too, and lets the schedulers select which 
subflows to send on AND what data to send.

2. The redundant send code should not behave exactly like retransmit. For 
example, don't call mptcp_sched_get_retrans() or use the retransmit MIB 
counters. Maybe it's simpler to have a separate function rather than add a 
bunch of conditionals to __mptcp_retrans()?

3. When using the __mptcp_subflow_push_pending() code path, the 
MPTCP_DELEGATE_SEND technique should be used repeatedly until all 
redundantly scheduled subflows have sent their data.
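
To make points 1-3 concrete, here is a rough sketch of how they could fit 
together. Every name in it (sched_seq_start/end, __mptcp_redundant_send()) 
is a placeholder to show the shape of the idea, not a proposed final API:

	/* Point 1: hypothetical msk-level fields recording which part of the
	 * stream the scheduler selected, so every send path (including
	 * __mptcp_subflow_push_pending()) can transmit the same data.
	 */
	struct mptcp_sock {
		/* ... existing fields ... */
		u64	sched_seq_start;	/* first scheduled byte */
		u64	sched_seq_end;		/* one past the last scheduled byte */
	};

	/* Point 2: a separate helper for redundant sends instead of more
	 * conditionals in __mptcp_retrans(); it never calls
	 * mptcp_sched_get_retrans() and never bumps the retransmit MIB
	 * counters.
	 */
	static void __mptcp_redundant_send(struct sock *sk)
	{
		struct mptcp_sock *msk = mptcp_sk(sk);
		struct mptcp_subflow_context *subflow;

		mptcp_for_each_subflow(msk, subflow) {
			if (!READ_ONCE(subflow->scheduled))
				continue;

			/* Point 3: keep using MPTCP_DELEGATE_SEND until every
			 * scheduled subflow has pushed the whole
			 * [sched_seq_start, sched_seq_end) range.
			 */
			mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SEND);
		}
	}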


I'll check with other community members at the meeting tomorrow to see 
if some other ideas come up.



Thanks,

--
Mat Martineau
Intel
Geliang Tang Dec. 19, 2022, 1:17 p.m. UTC | #2
On Wed, Dec 14, 2022 at 05:47:12PM -0800, Mat Martineau wrote:
> On Sun, 11 Dec 2022, Geliang Tang wrote:
> 
> > [...]
> > 		mptcp_for_each_subflow(msk, subflow) {
> > 			if (READ_ONCE(subflow->scheduled)) {
> > +				if (i > 0) {
> > +					WRITE_ONCE(msk->first_pending, head);
> > +					if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags))
> > +						mptcp_schedule_work(sk);
> 
> Hi Geliang -
> 
> Using the work queue for redundant sends is not going to perform well
> enough.
>
> MPTCP_WORK_RTX works OK for retransmissions because those are expected
> to be infrequent. A redundant scheduler would be sending everything on
> multiple subflows.
> 
> The retransmission code has some ideas for making redundant sends work, but
> redundant sending is different:
> 
> 1. I think the scheduler will need to store some sequence numbers at the msk
> level so the same data is sent on different subflows, even on the
> __mptcp_subflow_push_pending() code path. This is a change for regular (not
> redundant) schedulers too, and lets the schedulers select which subflows to
> send on AND what data to send.

Sorry, Mat, I didn't get this idea yet. Please give me more details
about it. We should add sched_seq_start and sched_seq_end in struct
mptcp_sock, right? These sequence numbers should be set in the BPF
context by the users, so we need to add sched_seq_start and
sched_seq_end in struct mptcp_sched_data too. Something like:

        for (int i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
                if (!data->contexts[i])
                        break;

                mptcp_subflow_set_scheduled(data->contexts[i], true);
                data->sched_seq_start = SN;
                data->sched_seq_end = SN + LEN;
        }

How can the users know what sequence number (SN) to write? Is the
sequence number generated in the kernel?

We can use (msk->sched_seq_end - msk->sched_seq_start) to replace
"msk->snd_burst", but I still don't know how to check these sequence
numbers in __mptcp_subflow_push_pending().

Thanks,
-Geliang

Mat Martineau Dec. 20, 2022, 2:31 a.m. UTC | #3
On Mon, 19 Dec 2022, Geliang Tang wrote:

> On Wed, Dec 14, 2022 at 05:47:12PM -0800, Mat Martineau wrote:
>> On Sun, 11 Dec 2022, Geliang Tang wrote:
>> [...]
>> The retransmission code has some ideas for making redundant sends work, but
>> redundant sending is different:
>>
>> 1. I think the scheduler will need to store some sequence numbers at the msk
>> level so the same data is sent on different subflows, even on the
>> __mptcp_subflow_push_pending() code path. This is a change for regular (not
>> redundant) schedulers too, and lets the schedulers select which subflows to
>> send on AND what data to send.
>
> Sorry, Mat, I didn't get this idea yet. Please give me more details
> about it. We should add sched_seq_start and sched_seq_end in struct
> mptcp_sock, right?

Right.

> These sequence numbers should be set in the BPF
> context by the users, so we need to add sched_seq_start and
> sched_seq_end in struct mptcp_sched_data too. Something like:
>
>        for (int i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
>                if (!data->contexts[i])
>                        break;
>
>                mptcp_subflow_set_scheduled(data->contexts[i], true);
>                data->sched_seq_start = SN;
>                data->sched_seq_end = SN + LEN;
>        }
>
> How can the users know what sequence number (SN) to write? Is the
> sequence number generated in the kernel?
>

You're right, the users don't know that information - the wrappers (like 
mptcp_sched_get_send()) would have to get sched_seq_start from the 
msk->first_pending dfrag (dfrag->data_seq + dfrag->already_sent).

It would also be useful to have the wrapper set sched_seq_end using 
dfrag->data_len before calling the scheduler function (BPF or in-kernel), 
so the scheduler knows the maximum length for the segment. sched_seq_end 
would need to be validated to be between sched_seq_start and the end of 
the dfrag.
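
As a sketch (the helper name and the sched_seq_* fields on struct 
mptcp_sched_data are the hypothetical additions we have been discussing, 
nothing final):

	static void mptcp_sched_seed_seq(struct mptcp_sock *msk,
					 struct mptcp_sched_data *data)
	{
		struct mptcp_data_frag *dfrag = msk->first_pending;
		u64 dfrag_end;

		if (!dfrag)
			return;

		dfrag_end = dfrag->data_seq + dfrag->data_len;

		/* seed the window from the first pending dfrag */
		data->sched_seq_start = dfrag->data_seq + dfrag->already_sent;
		/* maximum segment length the scheduler may pick */
		data->sched_seq_end = dfrag_end;

		/* ... invoke the BPF or in-kernel scheduler here ... */

		/* sched_seq_end must land between sched_seq_start and the end
		 * of the dfrag; clamp anything else back to the dfrag end.
		 */
		if (!after64(data->sched_seq_end, data->sched_seq_start) ||
		    after64(data->sched_seq_end, dfrag_end))
			data->sched_seq_end = dfrag_end;
	}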

> We can use (msk->sched_seq_end - msk->sched_seq_start) to replace
> "msk->snd_burst", but I still don't know how to check these sequence
> numbers in __mptcp_subflow_push_pending().

It looks like the dfrags have the info needed? In addition, sched_seq_end 
can be used when setting info->limit in __subflow_push_pending() to limit 
how much data mptcp_sendmsg_frag() will attempt to send.
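
Something like this where __subflow_push_pending() sets up each dfrag 
today (again using the made-up msk->sched_seq_end field, and assuming it 
was already validated to lie within this dfrag):

	u64 limit = dfrag->data_len;

	/* clamp the limit to the scheduler-selected window */
	if (before64(msk->sched_seq_end, dfrag->data_seq + dfrag->data_len))
		limit = msk->sched_seq_end - dfrag->data_seq;

	info->sent = dfrag->already_sent;
	info->limit = limit;
	len = limit - dfrag->already_sent;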


I hope that fills in the details you need to move ahead - if you have more 
questions I'm happy to answer or continue revising the design.


- Mat


--
Mat Martineau
Intel

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 2342b9469181..4c07add44b02 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -45,6 +45,7 @@  static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm
 
 static void __mptcp_destroy_sock(struct sock *sk);
 static void __mptcp_check_send_data_fin(struct sock *sk);
+static void __mptcp_retrans(struct sock *sk);
 
 DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
 static struct net_device mptcp_napi_dev;
@@ -997,7 +998,7 @@  static void __mptcp_clean_una(struct sock *sk)
 
 		if (unlikely(dfrag == msk->first_pending)) {
 			/* in recovery mode can see ack after the current snd head */
-			if (WARN_ON_ONCE(!msk->recovery))
+			if (!msk->recovery)
 				break;
 
 			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
@@ -1012,7 +1013,7 @@  static void __mptcp_clean_una(struct sock *sk)
 
 		/* prevent wrap around in recovery mode */
 		if (unlikely(delta > dfrag->already_sent)) {
-			if (WARN_ON_ONCE(!msk->recovery))
+			if (!msk->recovery)
 				goto out;
 			if (WARN_ON_ONCE(delta > dfrag->data_len))
 				goto out;
@@ -1111,6 +1112,7 @@  struct mptcp_sendmsg_info {
 	u16 sent;
 	unsigned int flags;
 	bool data_lock_held;
+	struct mptcp_data_frag *last;
 };
 
 static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
@@ -1526,6 +1528,7 @@  static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
 		info->sent = dfrag->already_sent;
 		info->limit = dfrag->data_len;
 		len = dfrag->data_len - dfrag->already_sent;
+		info->last = dfrag;
 		while (len > 0) {
 			int ret = 0;
 
@@ -1562,14 +1565,19 @@  void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 	struct sock *prev_ssk = NULL, *ssk = NULL;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_subflow_context *subflow;
+	struct mptcp_data_frag *head, *dfrag;
 	struct mptcp_sendmsg_info info = {
 				.flags = flags,
 	};
 	bool do_check_data_fin = false;
 	int push_count = 1;
 
+	head = mptcp_send_head(sk);
+	if (!head)
+		goto out;
+
 	while (mptcp_send_head(sk) && (push_count > 0)) {
-		int ret = 0;
+		int ret = 0, i = 0;
 
 		if (mptcp_sched_get_send(msk))
 			break;
@@ -1578,6 +1586,19 @@  void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 
 		mptcp_for_each_subflow(msk, subflow) {
 			if (READ_ONCE(subflow->scheduled)) {
+				if (i > 0) {
+					WRITE_ONCE(msk->first_pending, head);
+					mptcp_push_release(ssk, &info, do_check_data_fin);
+
+					while ((dfrag = mptcp_send_head(sk))) {
+						__mptcp_retrans(sk);
+						if (dfrag == info.last)
+							break;
+						WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+					}
+					goto out;
+				}
+
 				mptcp_subflow_set_scheduled(subflow, false);
 
 				prev_ssk = ssk;
@@ -1605,6 +1626,7 @@  void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 						push_count--;
 					continue;
 				}
+				i++;
 				do_check_data_fin = true;
 				msk->last_snd = ssk;
 			}
@@ -1614,6 +1636,7 @@  void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 	/* at this point we held the socket lock for the last subflow we used */
 	mptcp_push_release(ssk, &info, do_check_data_fin);
 
+out:
 	/* ensure the rtx timer is running */
 	if (!mptcp_timer_pending(sk))
 		mptcp_reset_timer(sk);
@@ -1628,13 +1651,18 @@  static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
 	struct mptcp_sendmsg_info info = {
 		.data_lock_held = true,
 	};
+	struct mptcp_data_frag *head;
 	bool keep_pushing = true;
 	struct sock *xmit_ssk;
 	int copied = 0;
 
+	head = mptcp_send_head(sk);
+	if (!head)
+		goto out;
+
 	info.flags = 0;
 	while (mptcp_send_head(sk) && keep_pushing) {
-		int ret = 0;
+		int ret = 0, i = 0;
 
 		/* check for a different subflow usage only after
 		 * spooling the first chunk of data
@@ -1659,18 +1687,27 @@  static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
 			ret = __subflow_push_pending(sk, ssk, &info);
 			if (ret <= 0)
 				keep_pushing = false;
+			i++;
 			copied += ret;
 			msk->last_snd = ssk;
 		}
 
 		mptcp_for_each_subflow(msk, subflow) {
 			if (READ_ONCE(subflow->scheduled)) {
+				if (i > 0) {
+					WRITE_ONCE(msk->first_pending, head);
+					if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags))
+						mptcp_schedule_work(sk);
+					goto out;
+				}
+
 				mptcp_subflow_set_scheduled(subflow, false);
 
 				xmit_ssk = mptcp_subflow_tcp_sock(subflow);
 				if (xmit_ssk != ssk) {
 					mptcp_subflow_delegate(subflow,
 							       MPTCP_DELEGATE_SEND);
+					i++;
 					msk->last_snd = xmit_ssk;
 					keep_pushing = false;
 				}