diff mbox series

[mptcp-next,1/3] mptcp: add TCP_INQ cmsg support

Message ID 20211108105711.16200-2-fw@strlen.de (mailing list archive)
State Superseded, archived
Delegated to: Mat Martineau
Headers show
Series support for setsockopt(TCP_INQ) | expand

Checks

Context Check Description
matttbe/build success Build and static analysis OK
matttbe/checkpatch success total: 0 errors, 0 warnings, 0 checks, 126 lines checked
matttbe/KVM_Validation__normal warning Unstable: 2 failed test(s): packetdrill_sockopts selftest_mptcp_join
matttbe/KVM_Validation__debug warning Unstable: 1 failed test(s): packetdrill_sockopts

Commit Message

Florian Westphal Nov. 8, 2021, 10:57 a.m. UTC
Support the TCP_INQ setsockopt.
This is a boolean that tells recvmsg path to include the remaining
in-sequence bytes into a cmsg data.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++
 net/mptcp/protocol.h |  1 +
 net/mptcp/sockopt.c  | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)

Comments

Mat Martineau Nov. 10, 2021, 1:11 a.m. UTC | #1
On Mon, 8 Nov 2021, Florian Westphal wrote:

> Support the TCP_INQ setsockopt.
> This is a boolean that tells recvmsg path to include the remaining
> in-sequence bytes into a cmsg data.
>
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++
> net/mptcp/protocol.h |  1 +
> net/mptcp/sockopt.c  | 37 +++++++++++++++++++++++++++++++++++++
> 3 files changed, 71 insertions(+)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index b0bfe20d6bb0..b4263bf821ac 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -46,6 +46,7 @@ struct mptcp_skb_cb {
>
> enum {
> 	MPTCP_CMSG_TS = BIT(0),
> +	MPTCP_CMSG_INQ = BIT(1),
> };
>
> static struct percpu_counter mptcp_sockets_allocated;
> @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> 	return !skb_queue_empty(&msk->receive_queue);
> }
>
> +static unsigned int mptcp_inq_hint(const struct sock *sk)
> +{
> +	const struct mptcp_sock *msk = mptcp_sk(sk);
> +	const struct sk_buff *skb;
> +	u64 acked = msk->ack_seq;
> +	u64 hint_val = 0;
> +
> +	skb = skb_peek(&msk->receive_queue);
> +	if (skb) {
> +		u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset;

MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related 
to the map_seq.

I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for 
hint_val?

I did run in to a selftest failure, but could not reproduce:

$ sudo ./mptcp_sockopt.sh
Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client
Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server
tcp_inq is larger than one mbyte
copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4)
  client exit code 1, server 2

netns ns1-0-IRJkco socket stat for 12001:
Netid  State     Recv-Q  Send-Q     Local Address:Port      Peer Address:Port   Process 
tcp    ESTAB     0       0               10.0.2.1:12001         10.0.2.2:52141   ino:0 sk:1 fwmark:0x1 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:0.427/0.261 ato:40 mss:1448 pmtu:1500 rcvmss:1420 advmss:1448 cwnd:10 ssthresh:56 bytes_sent:163620 bytes_acked:163620 bytes_received:65332 segs_out:131 segs_in:68 data_segs_out:117 data_segs_in:50 send 271288056bps lastsnd:30052 lastrcv:30057 lastack:30051 pacing_rate 1821988992bps delivery_rate 1488222216bps delivered:118 busy:3ms rcv_rtt:1 rcv_space:14600 rcv_ssthresh:148470 minrtt:0.122 tcp-ulp-mptcp flags:Jec token:0000(id:0)/5a7e8206(id:3) seq:a590dda83727410f sfseq:e0cd ssnoff:db8b95cc maplen:1e68
tcp    ESTAB     0       0               10.0.1.1:12001         10.0.1.2:48520   ino:0 sk:2 fwmark:0x1 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:0.279/0.037 ato:40 mss:1448 pmtu:1500 rcvmss:1420 advmss:1448 cwnd:10 ssthresh:20 bytes_sent:426076 bytes_acked:426076 bytes_received:114892 segs_out:368 segs_in:278 data_segs_out:336 data_segs_in:86 send 415197133bps lastsnd:105 lastrcv:30055 lastack:105 pacing_rate 496678872bps delivery_rate 41078008bps delivered:337 busy:9ms rwnd_limited:1ms(11.1%) sndbuf_limited:1ms(11.1%) rcv_rtt:1 rcv_space:14600 rcv_ssthresh:189558 minrtt:0.082 tcp-ulp-mptcp flags:Mec token:0000(id:0)/5a7e8206(id:0) seq:a590dda83728010f sfseq:1a0cd ssnoff:511e5d9a maplen:2000
tcp    ESTAB     0       0               10.0.3.1:12001         10.0.3.2:49725   ino:0 sk:3 fwmark:0x1 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:1/0.5 mss:1448 pmtu:1500 rcvmss:536 advmss:1448 cwnd:10 segs_out:2 segs_in:3 send 115840000bps lastsnd:30055 lastrcv:30055 lastack:30053 pacing_rate 231680000bps delivered:1 rcv_space:14600 rcv_ssthresh:64076 minrtt:1 tcp-ulp-mptcp flags:Jec token:0000(id:0)/5a7e8206(id:5) seq:a590dda83728210f sfseq:0 ssnoff:1bb4388 maplen:0
mptcp  LAST-ACK  0       0               10.0.1.1:12001         10.0.1.2:48520   timer:(keepalive,59sec,0) ino:0 sk:4 fwmark:0x1 ---
 	 subflows:7 add_addr_signal:8 add_addr_accepted:5 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:5a7e8206 write_seq:cf6523e6d02cad72 snd_una:cf6523e6d027a6cd rcv_nxt:a590dda837282110

netns ns2-0-IRJkco socket stat for 12001:
Netid  State       Recv-Q  Send-Q   Local Address:Port      Peer Address:Port   Process 
tcp    ESTAB       230756  0             10.0.1.2:48520         10.0.1.1:12001   ino:0 sk:5 fwmark:0x2 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:0.414/0.368 ato:40 mss:1448 pmtu:1500 rcvmss:1168 advmss:1448 cwnd:27 ssthresh:27 bytes_sent:114892 bytes_acked:114893 bytes_received:426076 segs_out:278 segs_in:369 data_segs_out:86 data_segs_in:336 send 755478261bps lastsnd:30074 lastrcv:123 lastack:123 pacing_rate 906573912bps delivery_rate 8849496bps delivered:87 busy:5ms rcv_rtt:0.888 rcv_space:14480 rcv_ssthresh:163912 minrtt:0.082 tcp-ulp-mptcp flags:Mmec token:0000(id:0)/af0717b9(id:0) seq:cf6523e6d028ec4d sfseq:2e479 ssnoff:3f01671c maplen:1680
tcp    ESTAB       0       0             10.0.3.2:49725         10.0.3.1:12001   ino:0 sk:6 fwmark:0x2 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:0.48/0.24 mss:1448 pmtu:1500 rcvmss:536 advmss:1448 cwnd:10 bytes_acked:1 segs_out:3 segs_in:3 send 241333333bps lastsnd:30073 lastrcv:30073 lastack:30071 pacing_rate 482666664bps delivered:1 rcv_space:14480 rcv_ssthresh:64088 minrtt:0.48 tcp-ulp-mptcp flags:Jjec token:5a7e8206(id:5)/af0717b9(id:0) seq:0 sfseq:0 ssnoff:a5831809 maplen:0
tcp    ESTAB       52444   0             10.0.2.2:52141         10.0.2.1:12001   ino:0 sk:7 fwmark:0x2 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:0.167/0.105 ato:40 mss:1448 pmtu:1500 rcvmss:1420 advmss:1448 cwnd:15 bytes_sent:65332 bytes_acked:65333 bytes_received:163620 segs_out:68 segs_in:132 data_segs_out:50 data_segs_in:117 send 1040479042bps lastsnd:30075 lastrcv:30070 lastack:30070 pacing_rate 2079401640bps delivery_rate 868800000bps delivered:51 busy:3ms rcv_rtt:0.184 rcv_space:14480 rcv_ssthresh:152552 minrtt:0.077 tcp-ulp-mptcp flags:Jjec token:5a7e8206(id:3)/af0717b9(id:0) seq:cf6523e6d027e4ed sfseq:13841 ssnoff:21898d30 maplen:7a08
mptcp  FIN-WAIT-2  108184  0             10.0.1.2:48520         10.0.1.1:12001   timer:(keepalive,29sec,0) ino:0 sk:8 fwmark:0x2 ---
 	 subflows:7 add_addr_signal:8 add_addr_accepted:7 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:af0717b9 write_seq:a590dda837282110 snd_una:a590dda837282110 rcv_nxt:cf6523e6d027a6cd

> +
> +		hint_val = acked - map_seq;
> +
> +		if (hint_val >= INT_MAX)
> +			hint_val = INT_MAX - 1;
> +	}
> +
> +	if (hint_val == 0 && sock_flag(sk, SOCK_DONE))
> +		hint_val = 1;
> +
> +	return (unsigned int)hint_val;
> +}
> +
> static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> 			 int nonblock, int flags, int *addr_len)
> {
> @@ -2030,6 +2054,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> 	len = min_t(size_t, len, INT_MAX);
> 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
>
> +	if (unlikely(msk->recvmsg_inq))
> +		cmsg_flags = MPTCP_CMSG_INQ;
> +
> 	while (copied < len) {
> 		int bytes_read;
>
> @@ -2103,6 +2130,12 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> 	if (cmsg_flags && copied >= 0) {
> 		if (cmsg_flags & MPTCP_CMSG_TS)
> 			tcp_recv_timestamp(msg, sk, &tss);
> +
> +		if (cmsg_flags & MPTCP_CMSG_INQ) {
> +			unsigned int inq = mptcp_inq_hint(sk);
> +
> +			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
> +		}
> 	}
>
> 	pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 906509c6cde5..e77de7662df0 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -250,6 +250,7 @@ struct mptcp_sock {
> 	bool		use_64bit_ack; /* Set when we received a 64-bit DSN */
> 	bool		csum_enabled;
> 	bool		allow_infinite_fallback;
> +	u8		recvmsg_inq:1;
> 	spinlock_t	join_list_lock;
> 	struct work_struct work;
> 	struct sk_buff  *ooo_last_skb;
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index b818e91f2e09..7405152691e0 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -557,6 +557,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
> 		case TCP_TIMESTAMP:
> 		case TCP_NOTSENT_LOWAT:
> 		case TCP_TX_DELAY:
> +		case TCP_INQ:
> 			return true;
> 		}
>
> @@ -698,7 +699,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
> static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
> 				    sockptr_t optval, unsigned int optlen)
> {
> +	struct sock *sk = (void *)msk;
> +	int ret, val;
> +
> 	switch (optname) {
> +	case TCP_INQ:
> +		ret = mptcp_get_int_option(msk, optval, optlen, &val);
> +		if (ret)
> +			return ret;
> +		if (val < 0 || val > 1)
> +			return -EINVAL;
> +
> +		lock_sock(sk);
> +		msk->recvmsg_inq = !!val;
> +		release_sock(sk);
> +		return 0;
> 	case TCP_ULP:
> 		return -EOPNOTSUPP;
> 	case TCP_CONGESTION:
> @@ -1032,6 +1047,26 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o
> 	return 0;
> }
>
> +static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
> +				int __user *optlen, int val)
> +{
> +	int len;
> +
> +	if (get_user(len, optlen))
> +		return -EFAULT;
> +
> +	len = min_t(unsigned int, len, sizeof(int));
> +	if (len < 0)
> +		return -EINVAL;
> +
> +	if (put_user(len, optlen))
> +		return -EFAULT;
> +	if (copy_to_user(optval, &val, len))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
> 				    char __user *optval, int __user *optlen)
> {
> @@ -1042,6 +1077,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
> 	case TCP_CC_INFO:
> 		return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
> 						      optval, optlen);
> +	case TCP_INQ:
> +		return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq);
> 	}
> 	return -EOPNOTSUPP;
> }
> -- 
> 2.32.0
>
>
>

--
Mat Martineau
Intel
Florian Westphal Nov. 10, 2021, 10:17 a.m. UTC | #2
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
> On Mon, 8 Nov 2021, Florian Westphal wrote:
> 
> > Support the TCP_INQ setsockopt.
> > This is a boolean that tells recvmsg path to include the remaining
> > in-sequence bytes into a cmsg data.
> > 
> > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224
> > Signed-off-by: Florian Westphal <fw@strlen.de>
> > ---
> > net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++
> > net/mptcp/protocol.h |  1 +
> > net/mptcp/sockopt.c  | 37 +++++++++++++++++++++++++++++++++++++
> > 3 files changed, 71 insertions(+)
> > 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index b0bfe20d6bb0..b4263bf821ac 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -46,6 +46,7 @@ struct mptcp_skb_cb {
> > 
> > enum {
> > 	MPTCP_CMSG_TS = BIT(0),
> > +	MPTCP_CMSG_INQ = BIT(1),
> > };
> > 
> > static struct percpu_counter mptcp_sockets_allocated;
> > @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> > 	return !skb_queue_empty(&msk->receive_queue);
> > }
> > 
> > +static unsigned int mptcp_inq_hint(const struct sock *sk)
> > +{
> > +	const struct mptcp_sock *msk = mptcp_sk(sk);
> > +	const struct sk_buff *skb;
> > +	u64 acked = msk->ack_seq;
> > +	u64 hint_val = 0;
> > +
> > +	skb = skb_peek(&msk->receive_queue);
> > +	if (skb) {
> > +		u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset;
> 
> MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related
> to the map_seq.
> 
> I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for
> hint_val?

I don't think so.

INQ should return all bytes that are still pending for read().

ack_seq is the largest in-sequence number we've seen, so ack_seq - map_seq yields
the in-sequence byte count of the skbs that have been queued for reading.

I could probably use skb_peek_tail() instead of skb_peek() and then grab
->end_seq instead of using msk->ack_seq, but I fail to see the advantage.

Because userspace might have partially read some data, we need
to add the offset that userspace might have consumed already.

e.g., one skb queued, with 10 bytes of data, userspace calls read(, ..., 1);
so the inq hint should return 9, not 10.

(read() increments MPTCP_SKB_CB(skb)->offset if skb wasn't consumed).

> I did run in to a selftest failure, but could not reproduce:
> 
> $ sudo ./mptcp_sockopt.sh
> Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client
> Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server
> tcp_inq is larger than one mbyte
> copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4)
>  client exit code 1, server 2

Hmm, could not repro either so far.
Mat Martineau Nov. 10, 2021, 7:13 p.m. UTC | #3
On Wed, 10 Nov 2021, Florian Westphal wrote:

> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
>> On Mon, 8 Nov 2021, Florian Westphal wrote:
>>
>>> Support the TCP_INQ setsockopt.
>>> This is a boolean that tells recvmsg path to include the remaining
>>> in-sequence bytes into a cmsg data.
>>>
>>> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224
>>> Signed-off-by: Florian Westphal <fw@strlen.de>
>>> ---
>>> net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++
>>> net/mptcp/protocol.h |  1 +
>>> net/mptcp/sockopt.c  | 37 +++++++++++++++++++++++++++++++++++++
>>> 3 files changed, 71 insertions(+)
>>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index b0bfe20d6bb0..b4263bf821ac 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -46,6 +46,7 @@ struct mptcp_skb_cb {
>>>
>>> enum {
>>> 	MPTCP_CMSG_TS = BIT(0),
>>> +	MPTCP_CMSG_INQ = BIT(1),
>>> };
>>>
>>> static struct percpu_counter mptcp_sockets_allocated;
>>> @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>>> 	return !skb_queue_empty(&msk->receive_queue);
>>> }
>>>
>>> +static unsigned int mptcp_inq_hint(const struct sock *sk)
>>> +{
>>> +	const struct mptcp_sock *msk = mptcp_sk(sk);
>>> +	const struct sk_buff *skb;
>>> +	u64 acked = msk->ack_seq;
>>> +	u64 hint_val = 0;
>>> +
>>> +	skb = skb_peek(&msk->receive_queue);
>>> +	if (skb) {
>>> +		u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset;
>>
>> MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related
>> to the map_seq.
>>
>> I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for
>> hint_val?
>
> I don't think so.
>
> INQ should return all bytes that are still pending for read().
>
> ack_seq is the largest in-sequence number we've seen, so ack_seq - map_seq yields
> the in-sequence byte count of the skbs that have been queued for reading.
>
> I could probably use skb_peek_tail() instead of skb_peek() and then grab
> ->end_seq instead of using msk->ack_seq, but I fail to see the advantage.

Yeah, you're right. I was thinking of end_seq for the tail skb, but that's 
not the skb this code is looking at. Also, my mistake in thinking the 
mptcp_skb_cb map_seq was a copy of the "map_seq" from the headers, but 
it's really a mapped DSN. So it does seem like the hint_val calculation is 
ok.

>
> Because userspace might have partially read some data, we need
> to add the offset that userspace might have consumed already.
>
> e.g., one skb queued, with 10 bytes of data, userspace calls read(, ..., 1);
> so the inq hint should return 9, not 10.
>
> (read() increments MPTCP_SKB_CB(skb)->offset if skb wasn't consumed).
>
>> I did run in to a selftest failure, but could not reproduce:
>>
>> $ sudo ./mptcp_sockopt.sh
>> Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client
>> Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server
>> tcp_inq is larger than one mbyte
>> copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4)
>>  client exit code 1, server 2
>
> Hmm, could not repro either so far.
>

Still trying here, modified the test to show the inq value in case that 
helps debug.

--
Mat Martineau
Intel
Mat Martineau Nov. 10, 2021, 7:35 p.m. UTC | #4
On Wed, 10 Nov 2021, Mat Martineau wrote:

> On Wed, 10 Nov 2021, Florian Westphal wrote:
>
>> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
>>> On Mon, 8 Nov 2021, Florian Westphal wrote:
>>> 
>>> I did run in to a selftest failure, but could not reproduce:
>>> 
>>> $ sudo ./mptcp_sockopt.sh
>>> Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client
>>> Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server
>>> tcp_inq is larger than one mbyte
>>> copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4)
>>>  client exit code 1, server 2
>> 
>> Hmm, could not repro either so far.
>> 
>
> Still trying here, modified the test to show the inq value in case that helps 
> debug.

Well, that was quick:

"tcp_inq=7ffffffe is larger than one mbyte"

At least it's the expected INT_MAX - 1 value... I'll instrument the kernel 
some more and try again.



The rest of the failure dump:

copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4)
  client exit code 1, server 2

netns ns1-0-ExcWL9 socket stat for 12001:
Netid                        State                            Recv-Q                        Send-Q                                                  Local Address:Port                                                    Peer Address:Port                         Process 
tcp                          TIME-WAIT                        0                             0                                                            10.0.3.1:12001                                                       10.0.3.2:45689                         timer:(timewait,28sec,0) ino:0 sk:1 fwmark:0x1

tcp                          TIME-WAIT                        0                             0                                                            10.0.1.1:12001                                                       10.0.1.2:55212                         timer:(timewait,28sec,0) ino:0 sk:2 fwmark:0x1

tcp                          TIME-WAIT                        0                             0                                                            10.0.2.1:12001                                                       10.0.2.2:41595                         timer:(timewait,28sec,0) ino:0 sk:3 fwmark:0x1

tcp                          TIME-WAIT                        0                             0                                                            10.0.4.1:12001                                                       10.0.4.2:54849                         timer:(timewait,28sec,0) ino:0 sk:4 fwmark:0x1

tcp                          ESTAB                            0                             0                                                    [dead:beef:1::1]:12001                                               [dead:beef:1::2]:52336                         ino:0 sk:5 fwmark:0x1 <->
 	 ts sack cubic wscale:7,7 rto:203 rtt:2.136/2.521 ato:40 mss:1428 pmtu:1500 rcvmss:1428 advmss:1428 cwnd:27 ssthresh:27 bytes_sent:5020656 bytes_acked:5020656 bytes_received:81527 segs_out:3748 segs_in:746 data_segs_out:3718 data_segs_in:62 send 144404494bps lastsnd:137 lastrcv:30088 lastack:137 pacing_rate 173285392bps delivery_rate 215547168bps delivered:3719 busy:1372ms rwnd_limited:1ms(0.1%) sndbuf_limited:49ms(3.6%) rcv_rtt:1.095 rcv_space:14400 rcv_ssthresh:64096 minrtt:0.081 tcp-ulp-mptcp flags:Mec token:0000(id:0)/587fabd7(id:0) seq:1c219faa3cd01b83 sfseq:e001 ssnoff:f8380bd4 maplen:5e77
mptcp                        LAST-ACK                         0                             0                                                    [dead:beef:1::1]:12001                                               [dead:beef:1::2]:52336                         timer:(keepalive,59sec,0) ino:0 sk:6 fwmark:0x1 ---
 	 subflows:7 add_addr_signal:8 add_addr_accepted:7 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:587fabd7 write_seq:674f10efa988edc3 snd_una:674f10efa985917a rcv_nxt:1c219faa3cd079fb

netns ns2-0-ExcWL9 socket stat for 12001:
Netid                       State                             Recv-Q                         Send-Q                                                  Local Address:Port                                                    Peer Address:Port                        Process 
tcp                         ESTAB                             4843536                        0                                                    [dead:beef:1::2]:52336                                               [dead:beef:1::1]:12001                        ino:0 sk:1001 fwmark:0x2 <->
 	 ts sack cubic wscale:7,7 rto:201 rtt:0.895/0.65 ato:40 mss:1428 pmtu:1500 rcvmss:1400 advmss:1428 cwnd:22 ssthresh:20 bytes_sent:81527 bytes_acked:81528 bytes_received:5020656 segs_out:746 segs_in:3749 data_segs_out:62 data_segs_in:3718 send 280813408bps lastsnd:30109 lastrcv:158 lastack:66 pacing_rate 336834952bps delivery_rate 232090288bps delivered:63 busy:5ms rcv_rtt:103.546 rcv_space:14280 rcv_ssthresh:351074 minrtt:0.176 tcp-ulp-mptcp flags:Mmec token:0000(id:0)/ec432c53(id:0) seq:674f10efa985727a sfseq:294e1 ssnoff:cc6bcb15 maplen:1f00
mptcp                       FIN-WAIT-2                        62616                          0                                                    [dead:beef:1::2]:52336                                               [dead:beef:1::1]:12001                        timer:(keepalive,29sec,0) ino:0 sk:1002 fwmark:0x2 ---
 	 subflows:7 add_addr_signal:8 add_addr_accepted:7 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:ec432c53 write_seq:1c219faa3cd079fb snd_una:1c219faa3cd079fb rcv_nxt:674f10efa985917a



--
Mat Martineau
Intel
Florian Westphal Nov. 10, 2021, 11:09 p.m. UTC | #5
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
> > Still trying here, modified the test to show the inq value in case that
> > helps debug.
> 
> Well, that was quick:
> 
> "tcp_inq=7ffffffe is larger than one mbyte"
> 
> At least it's the expected INT_MAX - 1 value... I'll instrument the kernel
> some more and try again.

Great, thanks.  I'll look at this tomorrow as well.

I will keep this test case as-is, but will add an extra test with
out-of-band channel that will inform receiver about exact number of sent
bytes so we can check reported inq hint vs. the (known) unread data value.
Mat Martineau Nov. 11, 2021, 12:05 a.m. UTC | #6
On Thu, 11 Nov 2021, Florian Westphal wrote:

> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
>>> Still trying here, modified the test to show the inq value in case that
>>> helps debug.
>>
>> Well, that was quick:
>>
>> "tcp_inq=7ffffffe is larger than one mbyte"
>>
>> At least it's the expected INT_MAX - 1 value... I'll instrument the kernel
>> some more and try again.
>
> Great, thanks.  I'll look at this tomorrow as well.
>
> I will keep this test case as-is, but will add an extra test with
> out-of-band channel that will inform receiver about exact number of sent
> bytes so we can check reported inq hint vs. the (known) unread data value.
>

More data:

    mptcp_connect-3837    [003] .....   229.043932: mptcp_inq_hint: msk->ack_seq=14236111453981801913 cb->map_seq=14236111453981665217 cb->offset=59862, msk->sk_receive_queue_tail=0000000000000000 hint_val_raw=76834
    mptcp_connect-3837    [003] .....   229.044556: mptcp_inq_hint: msk->ack_seq=14236111453981801913 cb->map_seq=14236111453981665217 cb->offset=68054, msk->sk_receive_queue_tail=0000000000000000 hint_val_raw=68642
    mptcp_connect-3837    [003] .....   229.045875: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=17814, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=64930
    mptcp_connect-3837    [003] .....   229.046829: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=26006, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=56738
    mptcp_connect-3837    [003] .....   229.047619: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=34198, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=48546
    mptcp_connect-3837    [003] .....   229.048245: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=42390, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=40354
    mptcp_connect-3837    [003] .....   229.048761: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=16368, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=20504
    mptcp_connect-3837    [003] .....   229.049461: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=24560, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=12312
    mptcp_connect-3837    [003] .....   229.050651: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=32752, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=4120
    mptcp_connect-3837    [003] .....   229.051074: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=40944, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=18446744073709547544

So, it does seem to be related to acked data in msk->sk_receive_queue


Mat
Paolo Abeni Nov. 11, 2021, 10:48 a.m. UTC | #7
On Wed, 2021-11-10 at 11:17 +0100, Florian Westphal wrote:
> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
> > On Mon, 8 Nov 2021, Florian Westphal wrote:
> > 
> > > Support the TCP_INQ setsockopt.
> > > This is a boolean that tells recvmsg path to include the remaining
> > > in-sequence bytes into a cmsg data.
> > > 
> > > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224
> > > Signed-off-by: Florian Westphal <fw@strlen.de>
> > > ---
> > > net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++
> > > net/mptcp/protocol.h |  1 +
> > > net/mptcp/sockopt.c  | 37 +++++++++++++++++++++++++++++++++++++
> > > 3 files changed, 71 insertions(+)
> > > 
> > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > index b0bfe20d6bb0..b4263bf821ac 100644
> > > --- a/net/mptcp/protocol.c
> > > +++ b/net/mptcp/protocol.c
> > > @@ -46,6 +46,7 @@ struct mptcp_skb_cb {
> > > 
> > > enum {
> > > 	MPTCP_CMSG_TS = BIT(0),
> > > +	MPTCP_CMSG_INQ = BIT(1),
> > > };
> > > 
> > > static struct percpu_counter mptcp_sockets_allocated;
> > > @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> > > 	return !skb_queue_empty(&msk->receive_queue);
> > > }
> > > 
> > > +static unsigned int mptcp_inq_hint(const struct sock *sk)
> > > +{
> > > +	const struct mptcp_sock *msk = mptcp_sk(sk);
> > > +	const struct sk_buff *skb;
> > > +	u64 acked = msk->ack_seq;
> > > +	u64 hint_val = 0;
> > > +
> > > +	skb = skb_peek(&msk->receive_queue);
> > > +	if (skb) {
> > > +		u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset;
> > 
> > MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related
> > to the map_seq.
> > 
> > I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for
> > hint_val?
> 
> I don't think so.
> 
> INQ should return all bytes that are still pending for read().
> 
> ack_seq is the largest in-sequence number we've seen, so ack_seq - map_seq yields
> the in-sequence byte count of the skbs that have been queued for reading.
> 
> I could probably use skb_peek_tail() instead of skb_peek() and then grab
> ->end_seq instead of using msk->ack_seq, but I fail to see the advantage.
> 
> Because userspace might have partially read some data, we need
> to add the offset that userspace might have consumed already.
> 
> e.g., one skb queued, with 10 bytes of data, userspace calls read(, ..., 1);
> so the inq hint should return 9, not 10.
> 
> (read() increments MPTCP_SKB_CB(skb)->offset if skb wasn't consumed).

I think the problem is that at CB->offset initialization time we have:

	MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
        MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
        MPTCP_SKB_CB(skb)->offset = offset

	// ...
	WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);

'offest' can be > 0, and 'map_seq' is the sequence mapping for the
first byte carried this skb. MPTCP_SKB_CB(skb)->map_seq + offset is in
general different from ack_seq (is greater then...)

A possible solution would be updating 'MPTCP_SKB_CB(skb)->map_seq'
every time 'MPTCP_SKB_CB(skb)->offset' is updated, and then using
directly 'MPTCP_SKB_CB(skb)->map_seq' to compute 'hint_val'

WDYT?

Cheers,

Paolo
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b0bfe20d6bb0..b4263bf821ac 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -46,6 +46,7 @@  struct mptcp_skb_cb {
 
 enum {
 	MPTCP_CMSG_TS = BIT(0),
+	MPTCP_CMSG_INQ = BIT(1),
 };
 
 static struct percpu_counter mptcp_sockets_allocated;
@@ -2006,6 +2007,29 @@  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	return !skb_queue_empty(&msk->receive_queue);
 }
 
+static unsigned int mptcp_inq_hint(const struct sock *sk)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	const struct sk_buff *skb;
+	u64 acked = msk->ack_seq;
+	u64 hint_val = 0;
+
+	skb = skb_peek(&msk->receive_queue);
+	if (skb) {
+		u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset;
+
+		hint_val = acked - map_seq;
+
+		if (hint_val >= INT_MAX)
+			hint_val = INT_MAX - 1;
+	}
+
+	if (hint_val == 0 && sock_flag(sk, SOCK_DONE))
+		hint_val = 1;
+
+	return (unsigned int)hint_val;
+}
+
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
@@ -2030,6 +2054,9 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	len = min_t(size_t, len, INT_MAX);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+	if (unlikely(msk->recvmsg_inq))
+		cmsg_flags = MPTCP_CMSG_INQ;
+
 	while (copied < len) {
 		int bytes_read;
 
@@ -2103,6 +2130,12 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (cmsg_flags && copied >= 0) {
 		if (cmsg_flags & MPTCP_CMSG_TS)
 			tcp_recv_timestamp(msg, sk, &tss);
+
+		if (cmsg_flags & MPTCP_CMSG_INQ) {
+			unsigned int inq = mptcp_inq_hint(sk);
+
+			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+		}
 	}
 
 	pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 906509c6cde5..e77de7662df0 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -250,6 +250,7 @@  struct mptcp_sock {
 	bool		use_64bit_ack; /* Set when we received a 64-bit DSN */
 	bool		csum_enabled;
 	bool		allow_infinite_fallback;
+	u8		recvmsg_inq:1;
 	spinlock_t	join_list_lock;
 	struct work_struct work;
 	struct sk_buff  *ooo_last_skb;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index b818e91f2e09..7405152691e0 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -557,6 +557,7 @@  static bool mptcp_supported_sockopt(int level, int optname)
 		case TCP_TIMESTAMP:
 		case TCP_NOTSENT_LOWAT:
 		case TCP_TX_DELAY:
+		case TCP_INQ:
 			return true;
 		}
 
@@ -698,7 +699,21 @@  static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
 static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    sockptr_t optval, unsigned int optlen)
 {
+	struct sock *sk = (void *)msk;
+	int ret, val;
+
 	switch (optname) {
+	case TCP_INQ:
+		ret = mptcp_get_int_option(msk, optval, optlen, &val);
+		if (ret)
+			return ret;
+		if (val < 0 || val > 1)
+			return -EINVAL;
+
+		lock_sock(sk);
+		msk->recvmsg_inq = !!val;
+		release_sock(sk);
+		return 0;
 	case TCP_ULP:
 		return -EOPNOTSUPP;
 	case TCP_CONGESTION:
@@ -1032,6 +1047,26 @@  static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o
 	return 0;
 }
 
+static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
+				int __user *optlen, int val)
+{
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+	if (len < 0)
+		return -EINVAL;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    char __user *optval, int __user *optlen)
 {
@@ -1042,6 +1077,8 @@  static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 	case TCP_CC_INFO:
 		return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
 						      optval, optlen);
+	case TCP_INQ:
+		return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq);
 	}
 	return -EOPNOTSUPP;
 }