Message ID | 93432e2a93c4ae6124fb607346ecfac45d40b2b3.1623835767.git.pabeni@redhat.com (mailing list archive) |
---|---|
State | Accepted, archived |
Commit | 7c5442f3e2bfaab26077f51199716a63fd74e9f4 |
Delegated to: | Matthieu Baerts |
Headers | show |
Series | [v3,mptcp-next] mptcp: refine mptcp_cleanup_rbuf | expand |
On Wed, 16 Jun 2021, Paolo Abeni wrote: > The current cleanup rbuf tries a bit too hard to avoid acquiring > the subflow socket lock. We may end-up delaying the needed ack, > or skip acking a blocked subflow. > > Address the above extending the conditions used to trigger the cleanup > to reflect more closely what TCP does and invoking tcp_cleanup_rbuf() > on all the active subflows. > > Note that we can't replicate the exact tests implemented in > tcp_cleanup_rbuf(), as MPTCP lacks some of the required info - e.g. > ping-pong mode. > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > --- > v2 -> v3: > - mptcp_subflow_cleanup_rbuf() returns void Thanks for the v3, this looks ready to apply. Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com> > v1 -> v2: > - access ssk icsk/tp state instead of msk (Mat) > --- > net/mptcp/protocol.c | 56 ++++++++++++++++++-------------------------- > net/mptcp/protocol.h | 1 - > 2 files changed, 23 insertions(+), 34 deletions(-) > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > index 0a220862f62d..05c8382aafef 100644 > --- a/net/mptcp/protocol.c > +++ b/net/mptcp/protocol.c > @@ -442,49 +442,46 @@ static void mptcp_send_ack(struct mptcp_sock *msk) > } > } > > -static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) > +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) > { > bool slow; > - int ret; > > slow = lock_sock_fast(ssk); > - ret = tcp_can_send_ack(ssk); > - if (ret) > + if (tcp_can_send_ack(ssk)) > tcp_cleanup_rbuf(ssk, 1); > unlock_sock_fast(ssk, slow); > - return ret; > +} > + > +static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) > +{ > + const struct inet_connection_sock *icsk = inet_csk(ssk); > + bool ack_pending = READ_ONCE(icsk->icsk_ack.pending); > + const struct tcp_sock *tp = tcp_sk(ssk); > + > + return (ack_pending & ICSK_ACK_SCHED) && > + ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > > + READ_ONCE(icsk->icsk_ack.rcv_mss)) || > + (rx_empty && ack_pending & > + (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); > } > > static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) > { > - struct sock *ack_hint = READ_ONCE(msk->ack_hint); > int old_space = READ_ONCE(msk->old_wspace); > struct mptcp_subflow_context *subflow; > struct sock *sk = (struct sock *)msk; > - bool cleanup; > + int space = __mptcp_space(sk); > + bool cleanup, rx_empty; > > - /* this is a simple superset of what tcp_cleanup_rbuf() implements > - * so that we don't have to acquire the ssk socket lock most of the time > - * to do actually nothing > - */ > - cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); > - if (!cleanup) > - return; > + cleanup = (space > 0) && (space >= (old_space << 1)); > + rx_empty = !atomic_read(&sk->sk_rmem_alloc); > > - /* if the hinted ssk is still active, try to use it */ > - if (likely(ack_hint)) { > - mptcp_for_each_subflow(msk, subflow) { > - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); > + mptcp_for_each_subflow(msk, subflow) { > + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); > > - if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) > - return; > - } > + if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) > + mptcp_subflow_cleanup_rbuf(ssk); > } > - > - /* otherwise pick the first active subflow */ > - mptcp_for_each_subflow(msk, subflow) > - if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) > - return; > } > > static bool mptcp_check_data_fin(struct sock *sk) > @@ -629,7 +626,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, > break; > } > } while (more_data_avail); > - WRITE_ONCE(msk->ack_hint, ssk); > > *bytes += moved; > return done; > @@ -1910,7 +1906,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) > __mptcp_update_rmem(sk); > done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); > mptcp_data_unlock(sk); > - tcp_cleanup_rbuf(ssk, moved); > > if (unlikely(ssk->sk_err)) > __mptcp_error_report(sk); > @@ -1926,7 +1921,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) > ret |= __mptcp_ofo_queue(msk); > __mptcp_splice_receive_queue(sk); > mptcp_data_unlock(sk); > - mptcp_cleanup_rbuf(msk); > } > if (ret) > mptcp_check_data_fin((struct sock *)msk); > @@ -2175,9 +2169,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, > if (ssk == msk->last_snd) > msk->last_snd = NULL; > > - if (ssk == msk->ack_hint) > - msk->ack_hint = NULL; > - > if (ssk == msk->first) > msk->first = NULL; > > @@ -2392,7 +2383,6 @@ static int __mptcp_init_sock(struct sock *sk) > msk->rmem_released = 0; > msk->tx_pending_data = 0; > > - msk->ack_hint = NULL; > msk->first = NULL; > inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; > WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > index 41f2957bf50c..2480db50cbd2 100644 > --- a/net/mptcp/protocol.h > +++ b/net/mptcp/protocol.h > @@ -243,7 +243,6 @@ struct mptcp_sock { > bool use_64bit_ack; /* Set when we received a 64-bit DSN */ > bool csum_enabled; > spinlock_t join_list_lock; > - struct sock *ack_hint; > struct work_struct work; > struct sk_buff *ooo_last_skb; > struct rb_root out_of_order_queue; > -- > 2.26.3 > > > -- Mat Martineau Intel
Hi Paolo, Mat, On 16/06/2021 11:29, Paolo Abeni wrote: > The current cleanup rbuf tries a bit too hard to avoid acquiring > the subflow socket lock. We may end-up delaying the needed ack, > or skip acking a blocked subflow. > > Address the above extending the conditions used to trigger the cleanup > to reflect more closely what TCP does and invoking tcp_cleanup_rbuf() > on all the active subflows. > > Note that we can't replicate the exact tests implemented in > tcp_cleanup_rbuf(), as MPTCP lacks some of the required info - e.g. > ping-pong mode. > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> Thank you for the patch and the review! Now in our tree (feature for net-next) with Mat's RvB tag: - 7c5442f3e2bf: mptcp: refine mptcp_cleanup_rbuf - Results: 8db480094258..f40f6ff5a355 Builds and tests are now in progress: https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20210617T132720 https://github.com/multipath-tcp/mptcp_net-next/actions/workflows/build-validation.yml?query=branch:export/20210617T132720 Cheers, Matt
On Wed, 2021-06-16 at 11:29 +0200, Paolo Abeni wrote: > The current cleanup rbuf tries a bit too hard to avoid acquiring > the subflow socket lock. We may end-up delaying the needed ack, > or skip acking a blocked subflow. > > Address the above extending the conditions used to trigger the cleanup > to reflect more closely what TCP does and invoking tcp_cleanup_rbuf() > on all the active subflows. > > Note that we can't replicate the exact tests implemented in > tcp_cleanup_rbuf(), as MPTCP lacks some of the required info - e.g. > ping-pong mode. > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > --- > v2 -> v3: > - mptcp_subflow_cleanup_rbuf() returns void > v1 -> v2: > - access ssk icsk/tp state instead of msk (Mat) Darn me! it looks like this patch has some issues, that are almost hidden by mptcp-level retrans. Investigating. I'll try to provide a squash-to patch. Meanwhile we should not send this one to netdev. Thanks! Paolo
On Fri, 2021-06-18 at 19:35 +0200, Paolo Abeni wrote: > On Wed, 2021-06-16 at 11:29 +0200, Paolo Abeni wrote: > > The current cleanup rbuf tries a bit too hard to avoid acquiring > > the subflow socket lock. We may end-up delaying the needed ack, > > or skip acking a blocked subflow. > > > > Address the above extending the conditions used to trigger the cleanup > > to reflect more closely what TCP does and invoking tcp_cleanup_rbuf() > > on all the active subflows. > > > > Note that we can't replicate the exact tests implemented in > > tcp_cleanup_rbuf(), as MPTCP lacks some of the required info - e.g. > > ping-pong mode. > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > > --- > > v2 -> v3: > > - mptcp_subflow_cleanup_rbuf() returns void > > v1 -> v2: > > - access ssk icsk/tp state instead of msk (Mat) > > Darn me! it looks like this patch has some issues, that are almost > hidden by mptcp-level retrans. Investigating. I'll try to provide a > squash-to patch. > > Meanwhile we should not send this one to netdev. Well, it looks like the issue I observer predates this change. The point is: if we disable mptcp-level retrans, mptcp_connect.sh self-test in a loop hang on some lossy test-case (which is unexpected). I *think* the solution is still in a better mptcp_cleanup_rbuf() - which is eluding me since several iteration :( /P
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0a220862f62d..05c8382aafef 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -442,49 +442,46 @@ static void mptcp_send_ack(struct mptcp_sock *msk) } } -static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) { bool slow; - int ret; slow = lock_sock_fast(ssk); - ret = tcp_can_send_ack(ssk); - if (ret) + if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); - return ret; +} + +static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) +{ + const struct inet_connection_sock *icsk = inet_csk(ssk); + bool ack_pending = READ_ONCE(icsk->icsk_ack.pending); + const struct tcp_sock *tp = tcp_sk(ssk); + + return (ack_pending & ICSK_ACK_SCHED) && + ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > + READ_ONCE(icsk->icsk_ack.rcv_mss)) || + (rx_empty && ack_pending & + (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { - struct sock *ack_hint = READ_ONCE(msk->ack_hint); int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; - bool cleanup; + int space = __mptcp_space(sk); + bool cleanup, rx_empty; - /* this is a simple superset of what tcp_cleanup_rbuf() implements - * so that we don't have to acquire the ssk socket lock most of the time - * to do actually nothing - */ - cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); - if (!cleanup) - return; + cleanup = (space > 0) && (space >= (old_space << 1)); + rx_empty = !atomic_read(&sk->sk_rmem_alloc); - /* if the hinted ssk is still active, try to use it */ - if (likely(ack_hint)) { - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) - return; - } + if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) + mptcp_subflow_cleanup_rbuf(ssk); } - - /* otherwise pick the first active subflow */ - mptcp_for_each_subflow(msk, subflow) - if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) - return; } static bool mptcp_check_data_fin(struct sock *sk) @@ -629,7 +626,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, break; } } while (more_data_avail); - WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; return done; @@ -1910,7 +1906,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - tcp_cleanup_rbuf(ssk, moved); if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); @@ -1926,7 +1921,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); - mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@ -2175,9 +2169,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk == msk->last_snd) msk->last_snd = NULL; - if (ssk == msk->ack_hint) - msk->ack_hint = NULL; - if (ssk == msk->first) msk->first = NULL; @@ -2392,7 +2383,6 @@ static int __mptcp_init_sock(struct sock *sk) msk->rmem_released = 0; msk->tx_pending_data = 0; - msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 41f2957bf50c..2480db50cbd2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -243,7 +243,6 @@ struct mptcp_sock { bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; spinlock_t join_list_lock; - struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue;
The current cleanup rbuf tries a bit too hard to avoid acquiring the subflow socket lock. We may end-up delaying the needed ack, or skip acking a blocked subflow. Address the above extending the conditions used to trigger the cleanup to reflect more closely what TCP does and invoking tcp_cleanup_rbuf() on all the active subflows. Note that we can't replicate the exact tests implemented in tcp_cleanup_rbuf(), as MPTCP lacks some of the required info - e.g. ping-pong mode. Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- v2 -> v3: - mptcp_subflow_cleanup_rbuf() returns void v1 -> v2: - access ssk icsk/tp state instead of msk (Mat) --- net/mptcp/protocol.c | 56 ++++++++++++++++++-------------------------- net/mptcp/protocol.h | 1 - 2 files changed, 23 insertions(+), 34 deletions(-)