diff mbox series

[mptcp-next,2/4] mptcp: more accurate receive buffer updates

Message ID 63def511912bf79d6af065dccb155e9304932640.1659107989.git.pabeni@redhat.com (mailing list archive)
State Superseded, archived
Commit 02da150692878eef7f8dd8740a198e9dfc102d11
Delegated to: Matthieu Baerts
Headers show
Series mptcp: just another receive path refactor | expand

Checks

Context Check Description
matttbe/checkpatch success total: 0 errors, 0 warnings, 0 checks, 78 lines checked
matttbe/build fail Build error with: -Werror

Commit Message

Paolo Abeni July 29, 2022, 3:33 p.m. UTC
Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
acquiring the subflow socket lock, estimating if the tcp level
cleanup could actually send an ack.

Such an estimate is a bit rough when accounting for receive window
changes, as it considers the msk available buffer space instead
of the announced mptcp-level window.

Let's consider the announced window instead, closely mirroring
the plain TCP implementation. We need lockless access to a bunch
of additional TCP fields.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mptcp.h  |  2 ++
 net/ipv4/tcp.c       |  3 +++
 net/mptcp/options.c  |  1 -
 net/mptcp/protocol.c | 14 +++++++++++---
 net/mptcp/protocol.h |  6 +++++-
 5 files changed, 21 insertions(+), 5 deletions(-)

Comments

kernel test robot July 29, 2022, 6:24 p.m. UTC | #1
Hi Paolo,

I love your patch! Yet something to improve:

[auto build test ERROR on mptcp/export]
[also build test ERROR on linus/master v5.19-rc8 next-20220728]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
config: um-i386_defconfig (https://download.01.org/0day-ci/archive/20220730/202207300227.kYan1H1K-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-3) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/c330a583c6d306ab637d187f0f981bfe4408caba
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
        git checkout c330a583c6d306ab637d187f0f981bfe4408caba
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/ipv4/tcp.c: In function 'tcp_cleanup_rbuf':
>> net/ipv4/tcp.c:1609:21: error: implicit declaration of function 'is_tcpsk' [-Werror=implicit-function-declaration]
    1609 |                 if (is_tcpsk(sk))
         |                     ^~~~~~~~
   cc1: some warnings being treated as errors


vim +/is_tcpsk +1609 net/ipv4/tcp.c

  1563	
  1564	/* Clean up the receive buffer for full frames taken by the user,
  1565	 * then send an ACK if necessary.  COPIED is the number of bytes
  1566	 * tcp_recvmsg has given to the user so far, it speeds up the
  1567	 * calculation of whether or not we must ACK for the sake of
  1568	 * a window update.
  1569	 */
  1570	void tcp_cleanup_rbuf(struct sock *sk, int copied)
  1571	{
  1572		struct tcp_sock *tp = tcp_sk(sk);
  1573		bool time_to_ack = false;
  1574	
  1575		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
  1576	
  1577		WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
  1578		     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
  1579		     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
  1580	
  1581		if (inet_csk_ack_scheduled(sk)) {
  1582			const struct inet_connection_sock *icsk = inet_csk(sk);
  1583	
  1584			if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
  1585			    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
  1586			    /*
  1587			     * If this read emptied read buffer, we send ACK, if
  1588			     * connection is not bidirectional, user drained
  1589			     * receive buffer and there was a small segment
  1590			     * in queue.
  1591			     */
  1592			    (copied > 0 &&
  1593			     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
  1594			      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
  1595			       !inet_csk_in_pingpong_mode(sk))) &&
  1596			      !atomic_read(&sk->sk_rmem_alloc)))
  1597				time_to_ack = true;
  1598		}
  1599	
  1600		/* We send an ACK if we can now advertise a non-zero window
  1601		 * which has been raised "significantly".
  1602		 *
  1603		 * Even if window raised up to infinity, do not send window open ACK
  1604		 * in states, where we will not receive more. It is useless.
  1605		 */
  1606		if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
  1607			__u32 rcv_window_now = tcp_receive_window(tp);
  1608	
> 1609			if (is_tcpsk(sk))
  1610				mptcp_receive_window(sk, &rcv_window_now);
  1611	
  1612			/* Optimize, __tcp_select_window() is not cheap. */
  1613			if (2*rcv_window_now <= tp->window_clamp) {
  1614				__u32 new_window = __tcp_select_window(sk);
  1615	
  1616				/* Send ACK now, if this read freed lots of space
  1617				 * in our buffer. Certainly, new_window is new window.
  1618				 * We can advertise it now, if it is not less than current one.
  1619				 * "Lots" means "at least twice" here.
  1620				 */
  1621				if (new_window && new_window >= 2 * rcv_window_now)
  1622					time_to_ack = true;
  1623			}
  1624		}
  1625		if (time_to_ack)
  1626			tcp_send_ack(sk);
  1627	}
  1628
kernel test robot July 29, 2022, 7:05 p.m. UTC | #2
Hi Paolo,

I love your patch! Yet something to improve:

[auto build test ERROR on mptcp/export]
[also build test ERROR on linus/master v5.19-rc8 next-20220728]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
config: hexagon-randconfig-r045-20220729 (https://download.01.org/0day-ci/archive/20220730/202207300238.2ZcAhQUi-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 8dfaecc4c24494337933aff9d9166486ca0949f1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/c330a583c6d306ab637d187f0f981bfe4408caba
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
        git checkout c330a583c6d306ab637d187f0f981bfe4408caba
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash net/ipv4/ net/mptcp/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/ipv4/tcp.c:1609:7: error: call to undeclared function 'is_tcpsk'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   if (is_tcpsk(sk))
                       ^
   1 error generated.


vim +/is_tcpsk +1609 net/ipv4/tcp.c

  1563	
  1564	/* Clean up the receive buffer for full frames taken by the user,
  1565	 * then send an ACK if necessary.  COPIED is the number of bytes
  1566	 * tcp_recvmsg has given to the user so far, it speeds up the
  1567	 * calculation of whether or not we must ACK for the sake of
  1568	 * a window update.
  1569	 */
  1570	void tcp_cleanup_rbuf(struct sock *sk, int copied)
  1571	{
  1572		struct tcp_sock *tp = tcp_sk(sk);
  1573		bool time_to_ack = false;
  1574	
  1575		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
  1576	
  1577		WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
  1578		     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
  1579		     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
  1580	
  1581		if (inet_csk_ack_scheduled(sk)) {
  1582			const struct inet_connection_sock *icsk = inet_csk(sk);
  1583	
  1584			if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
  1585			    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
  1586			    /*
  1587			     * If this read emptied read buffer, we send ACK, if
  1588			     * connection is not bidirectional, user drained
  1589			     * receive buffer and there was a small segment
  1590			     * in queue.
  1591			     */
  1592			    (copied > 0 &&
  1593			     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
  1594			      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
  1595			       !inet_csk_in_pingpong_mode(sk))) &&
  1596			      !atomic_read(&sk->sk_rmem_alloc)))
  1597				time_to_ack = true;
  1598		}
  1599	
  1600		/* We send an ACK if we can now advertise a non-zero window
  1601		 * which has been raised "significantly".
  1602		 *
  1603		 * Even if window raised up to infinity, do not send window open ACK
  1604		 * in states, where we will not receive more. It is useless.
  1605		 */
  1606		if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
  1607			__u32 rcv_window_now = tcp_receive_window(tp);
  1608	
> 1609			if (is_tcpsk(sk))
  1610				mptcp_receive_window(sk, &rcv_window_now);
  1611	
  1612			/* Optimize, __tcp_select_window() is not cheap. */
  1613			if (2*rcv_window_now <= tp->window_clamp) {
  1614				__u32 new_window = __tcp_select_window(sk);
  1615	
  1616				/* Send ACK now, if this read freed lots of space
  1617				 * in our buffer. Certainly, new_window is new window.
  1618				 * We can advertise it now, if it is not less than current one.
  1619				 * "Lots" means "at least twice" here.
  1620				 */
  1621				if (new_window && new_window >= 2 * rcv_window_now)
  1622					time_to_ack = true;
  1623			}
  1624		}
  1625		if (time_to_ack)
  1626			tcp_send_ack(sk);
  1627	}
  1628
diff mbox series

Patch

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 7af7fd48acc7..4b6c66b73bf4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -136,6 +136,7 @@  static inline bool rsk_drop_req(const struct request_sock *req)
 	return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
 }
 
+void mptcp_receive_window(const struct sock *ssk, int *rwnd);
 void mptcp_space(const struct sock *ssk, int *space, int *full_space);
 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
 		       unsigned int *size, struct mptcp_out_options *opts);
@@ -284,6 +285,7 @@  static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
 	return true;
 }
 
+static inline void mptcp_receive_window(const struct sock *ssk, int *rwnd) { }
 static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
 static inline void mptcp_seq_show(struct seq_file *seq) { }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 970e9a2cca4a..9e2df51c0558 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1606,6 +1606,9 @@  void tcp_cleanup_rbuf(struct sock *sk, int copied)
 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
 		__u32 rcv_window_now = tcp_receive_window(tp);
 
+		if (is_tcpsk(sk))
+			mptcp_receive_window(sk, &rcv_window_now);
+
 		/* Optimize, __tcp_select_window() is not cheap. */
 		if (2*rcv_window_now <= tp->window_clamp) {
 			__u32 new_window = __tcp_select_window(sk);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 30d289044e71..563ef8fe5a85 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -604,7 +604,6 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 	}
 	opts->ext_copy.use_ack = 1;
 	opts->suboptions = OPTION_MPTCP_DSS;
-	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
 
 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
 	if (dss_size == 0)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5af3d591a20b..17e2dbe43639 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -535,6 +535,14 @@  static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
 	unlock_sock_fast(ssk, slow);
 }
 
+void mptcp_receive_window(const struct sock *ssk, int *rwnd)
+{
+	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	const struct sock *sk = subflow->conn;
+
+	*rwnd = __mptcp_receive_window(mptcp_sk(sk));
+}
+
 static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
 {
 	const struct inet_connection_sock *icsk = inet_csk(ssk);
@@ -550,13 +558,13 @@  static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
 
 static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
 {
-	int old_space = READ_ONCE(msk->old_wspace);
+	int cur_window = __mptcp_receive_window(msk);
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
-	int space =  __mptcp_space(sk);
+	int new_window =  __mptcp_space(sk);
 	bool cleanup, rx_empty;
 
-	cleanup = (space > 0) && (space >= (old_space << 1));
+	cleanup = new_window > 0 && new_window >= (cur_window << 1);
 	rx_empty = !__mptcp_rmem(sk);
 
 	mptcp_for_each_subflow(msk, subflow) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 1bc9f6e77ddd..a54f42462a71 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -260,7 +260,6 @@  struct mptcp_sock {
 	int		rmem_fwd_alloc;
 	struct sock	*last_snd;
 	int		snd_burst;
-	int		old_wspace;
 	u64		recovery_snd_nxt;	/* in recovery mode accept up to this seq;
 						 * recovery related fields are under data_lock
 						 * protection
@@ -336,6 +335,11 @@  static inline int __mptcp_rmem(const struct sock *sk)
 	return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
 }
 
+static inline int __mptcp_receive_window(const struct mptcp_sock *msk)
+{
+	return atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
+}
+
 static inline int __mptcp_space(const struct sock *sk)
 {
 	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));