Message ID | 20211108105711.16200-2-fw@strlen.de (mailing list archive) |
---|---|
State | Superseded, archived |
Delegated to: | Mat Martineau |
Headers | show |
Series | support for setsockopt(TCP_INQ) | expand |
Context | Check | Description |
---|---|---|
matttbe/build | success | Build and static analysis OK |
matttbe/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 126 lines checked |
matttbe/KVM_Validation__normal | warning | Unstable: 2 failed test(s): packetdrill_sockopts selftest_mptcp_join |
matttbe/KVM_Validation__debug | warning | Unstable: 1 failed test(s): packetdrill_sockopts |
On Mon, 8 Nov 2021, Florian Westphal wrote: > Support the TCP_INQ setsockopt. > This is a boolean that tells recvmsg path to include the remaining > in-sequence bytes into a cmsg data. > > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224 > Signed-off-by: Florian Westphal <fw@strlen.de> > --- > net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ > net/mptcp/protocol.h | 1 + > net/mptcp/sockopt.c | 37 +++++++++++++++++++++++++++++++++++++ > 3 files changed, 71 insertions(+) > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > index b0bfe20d6bb0..b4263bf821ac 100644 > --- a/net/mptcp/protocol.c > +++ b/net/mptcp/protocol.c > @@ -46,6 +46,7 @@ struct mptcp_skb_cb { > > enum { > MPTCP_CMSG_TS = BIT(0), > + MPTCP_CMSG_INQ = BIT(1), > }; > > static struct percpu_counter mptcp_sockets_allocated; > @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) > return !skb_queue_empty(&msk->receive_queue); > } > > +static unsigned int mptcp_inq_hint(const struct sock *sk) > +{ > + const struct mptcp_sock *msk = mptcp_sk(sk); > + const struct sk_buff *skb; > + u64 acked = msk->ack_seq; > + u64 hint_val = 0; > + > + skb = skb_peek(&msk->receive_queue); > + if (skb) { > + u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset; MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related to the map_seq. I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for hint_val? I did run in to a selftest failure, but could not reproduce: $ sudo ./mptcp_sockopt.sh Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server tcp_inq is larger than one mbyte copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4) client exit code 1, server 2 netns ns1-0-IRJkco socket stat for 12001: Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port Process tcp ESTAB 0 0 10.0.2.1:12001 10.0.2.2:52141 ino:0 sk:1 fwmark:0x1 <-> ts sack cubic wscale:7,7 rto:201 rtt:0.427/0.261 ato:40 mss:1448 pmtu:1500 rcvmss:1420 advmss:1448 cwnd:10 ssthresh:56 bytes_sent:163620 bytes_acked:163620 bytes_received:65332 segs_out:131 segs_in:68 data_segs_out:117 data_segs_in:50 send 271288056bps lastsnd:30052 lastrcv:30057 lastack:30051 pacing_rate 1821988992bps delivery_rate 1488222216bps delivered:118 busy:3ms rcv_rtt:1 rcv_space:14600 rcv_ssthresh:148470 minrtt:0.122 tcp-ulp-mptcp flags:Jec token:0000(id:0)/5a7e8206(id:3) seq:a590dda83727410f sfseq:e0cd ssnoff:db8b95cc maplen:1e68 tcp ESTAB 0 0 10.0.1.1:12001 10.0.1.2:48520 ino:0 sk:2 fwmark:0x1 <-> ts sack cubic wscale:7,7 rto:201 rtt:0.279/0.037 ato:40 mss:1448 pmtu:1500 rcvmss:1420 advmss:1448 cwnd:10 ssthresh:20 bytes_sent:426076 bytes_acked:426076 bytes_received:114892 segs_out:368 segs_in:278 data_segs_out:336 data_segs_in:86 send 415197133bps lastsnd:105 lastrcv:30055 lastack:105 pacing_rate 496678872bps delivery_rate 41078008bps delivered:337 busy:9ms rwnd_limited:1ms(11.1%) sndbuf_limited:1ms(11.1%) rcv_rtt:1 rcv_space:14600 rcv_ssthresh:189558 minrtt:0.082 tcp-ulp-mptcp flags:Mec token:0000(id:0)/5a7e8206(id:0) seq:a590dda83728010f sfseq:1a0cd ssnoff:511e5d9a maplen:2000 tcp ESTAB 0 0 10.0.3.1:12001 10.0.3.2:49725 ino:0 sk:3 fwmark:0x1 <-> ts sack cubic wscale:7,7 rto:201 rtt:1/0.5 mss:1448 pmtu:1500 rcvmss:536 advmss:1448 cwnd:10 segs_out:2 segs_in:3 send 115840000bps lastsnd:30055 lastrcv:30055 lastack:30053 pacing_rate 231680000bps delivered:1 rcv_space:14600 rcv_ssthresh:64076 minrtt:1 tcp-ulp-mptcp flags:Jec token:0000(id:0)/5a7e8206(id:5) seq:a590dda83728210f sfseq:0 ssnoff:1bb4388 maplen:0 mptcp LAST-ACK 0 0 10.0.1.1:12001 10.0.1.2:48520 timer:(keepalive,59sec,0) ino:0 sk:4 fwmark:0x1 --- subflows:7 add_addr_signal:8 add_addr_accepted:5 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:5a7e8206 write_seq:cf6523e6d02cad72 snd_una:cf6523e6d027a6cd rcv_nxt:a590dda837282110 netns ns2-0-IRJkco socket stat for 12001: Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port Process tcp ESTAB 230756 0 10.0.1.2:48520 10.0.1.1:12001 ino:0 sk:5 fwmark:0x2 <-> ts sack cubic wscale:7,7 rto:201 rtt:0.414/0.368 ato:40 mss:1448 pmtu:1500 rcvmss:1168 advmss:1448 cwnd:27 ssthresh:27 bytes_sent:114892 bytes_acked:114893 bytes_received:426076 segs_out:278 segs_in:369 data_segs_out:86 data_segs_in:336 send 755478261bps lastsnd:30074 lastrcv:123 lastack:123 pacing_rate 906573912bps delivery_rate 8849496bps delivered:87 busy:5ms rcv_rtt:0.888 rcv_space:14480 rcv_ssthresh:163912 minrtt:0.082 tcp-ulp-mptcp flags:Mmec token:0000(id:0)/af0717b9(id:0) seq:cf6523e6d028ec4d sfseq:2e479 ssnoff:3f01671c maplen:1680 tcp ESTAB 0 0 10.0.3.2:49725 10.0.3.1:12001 ino:0 sk:6 fwmark:0x2 <-> ts sack cubic wscale:7,7 rto:201 rtt:0.48/0.24 mss:1448 pmtu:1500 rcvmss:536 advmss:1448 cwnd:10 bytes_acked:1 segs_out:3 segs_in:3 send 241333333bps lastsnd:30073 lastrcv:30073 lastack:30071 pacing_rate 482666664bps delivered:1 rcv_space:14480 rcv_ssthresh:64088 minrtt:0.48 tcp-ulp-mptcp flags:Jjec token:5a7e8206(id:5)/af0717b9(id:0) seq:0 sfseq:0 ssnoff:a5831809 maplen:0 tcp ESTAB 52444 0 10.0.2.2:52141 10.0.2.1:12001 ino:0 sk:7 fwmark:0x2 <-> ts sack cubic wscale:7,7 rto:201 rtt:0.167/0.105 ato:40 mss:1448 pmtu:1500 rcvmss:1420 advmss:1448 cwnd:15 bytes_sent:65332 bytes_acked:65333 bytes_received:163620 segs_out:68 segs_in:132 data_segs_out:50 data_segs_in:117 send 1040479042bps lastsnd:30075 lastrcv:30070 lastack:30070 pacing_rate 2079401640bps delivery_rate 868800000bps delivered:51 busy:3ms rcv_rtt:0.184 rcv_space:14480 rcv_ssthresh:152552 minrtt:0.077 tcp-ulp-mptcp flags:Jjec token:5a7e8206(id:3)/af0717b9(id:0) seq:cf6523e6d027e4ed sfseq:13841 ssnoff:21898d30 maplen:7a08 mptcp FIN-WAIT-2 108184 0 10.0.1.2:48520 10.0.1.1:12001 timer:(keepalive,29sec,0) ino:0 sk:8 fwmark:0x2 --- subflows:7 add_addr_signal:8 add_addr_accepted:7 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:af0717b9 write_seq:a590dda837282110 snd_una:a590dda837282110 rcv_nxt:cf6523e6d027a6cd > + > + hint_val = acked - map_seq; > + > + if (hint_val >= INT_MAX) > + hint_val = INT_MAX - 1; > + } > + > + if (hint_val == 0 && sock_flag(sk, SOCK_DONE)) > + hint_val = 1; > + > + return (unsigned int)hint_val; > +} > + > static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, > int nonblock, int flags, int *addr_len) > { > @@ -2030,6 +2054,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, > len = min_t(size_t, len, INT_MAX); > target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); > > + if (unlikely(msk->recvmsg_inq)) > + cmsg_flags = MPTCP_CMSG_INQ; > + > while (copied < len) { > int bytes_read; > > @@ -2103,6 +2130,12 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, > if (cmsg_flags && copied >= 0) { > if (cmsg_flags & MPTCP_CMSG_TS) > tcp_recv_timestamp(msg, sk, &tss); > + > + if (cmsg_flags & MPTCP_CMSG_INQ) { > + unsigned int inq = mptcp_inq_hint(sk); > + > + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); > + } > } > > pr_debug("msk=%p rx queue empty=%d:%d copied=%d", > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > index 906509c6cde5..e77de7662df0 100644 > --- a/net/mptcp/protocol.h > +++ b/net/mptcp/protocol.h > @@ -250,6 +250,7 @@ struct mptcp_sock { > bool use_64bit_ack; /* Set when we received a 64-bit DSN */ > bool csum_enabled; > bool allow_infinite_fallback; > + u8 recvmsg_inq:1; > spinlock_t join_list_lock; > struct work_struct work; > struct sk_buff *ooo_last_skb; > diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c > index b818e91f2e09..7405152691e0 100644 > --- a/net/mptcp/sockopt.c > +++ b/net/mptcp/sockopt.c > @@ -557,6 +557,7 @@ static bool mptcp_supported_sockopt(int level, int optname) > case TCP_TIMESTAMP: > case TCP_NOTSENT_LOWAT: > case TCP_TX_DELAY: > + case TCP_INQ: > return true; > } > > @@ -698,7 +699,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, > static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, > sockptr_t optval, unsigned int optlen) > { > + struct sock *sk = (void *)msk; > + int ret, val; > + > switch (optname) { > + case TCP_INQ: > + ret = mptcp_get_int_option(msk, optval, optlen, &val); > + if (ret) > + return ret; > + if (val < 0 || val > 1) > + return -EINVAL; > + > + lock_sock(sk); > + msk->recvmsg_inq = !!val; > + release_sock(sk); > + return 0; > case TCP_ULP: > return -EOPNOTSUPP; > case TCP_CONGESTION: > @@ -1032,6 +1047,26 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o > return 0; > } > > +static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, > + int __user *optlen, int val) > +{ > + int len; > + > + if (get_user(len, optlen)) > + return -EFAULT; > + > + len = min_t(unsigned int, len, sizeof(int)); > + if (len < 0) > + return -EINVAL; > + > + if (put_user(len, optlen)) > + return -EFAULT; > + if (copy_to_user(optval, &val, len)) > + return -EFAULT; > + > + return 0; > +} > + > static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, > char __user *optval, int __user *optlen) > { > @@ -1042,6 +1077,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, > case TCP_CC_INFO: > return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, > optval, optlen); > + case TCP_INQ: > + return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq); > } > return -EOPNOTSUPP; > } > -- > 2.32.0 > > > -- Mat Martineau Intel
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: > On Mon, 8 Nov 2021, Florian Westphal wrote: > > > Support the TCP_INQ setsockopt. > > This is a boolean that tells recvmsg path to include the remaining > > in-sequence bytes into a cmsg data. > > > > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224 > > Signed-off-by: Florian Westphal <fw@strlen.de> > > --- > > net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ > > net/mptcp/protocol.h | 1 + > > net/mptcp/sockopt.c | 37 +++++++++++++++++++++++++++++++++++++ > > 3 files changed, 71 insertions(+) > > > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > > index b0bfe20d6bb0..b4263bf821ac 100644 > > --- a/net/mptcp/protocol.c > > +++ b/net/mptcp/protocol.c > > @@ -46,6 +46,7 @@ struct mptcp_skb_cb { > > > > enum { > > MPTCP_CMSG_TS = BIT(0), > > + MPTCP_CMSG_INQ = BIT(1), > > }; > > > > static struct percpu_counter mptcp_sockets_allocated; > > @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) > > return !skb_queue_empty(&msk->receive_queue); > > } > > > > +static unsigned int mptcp_inq_hint(const struct sock *sk) > > +{ > > + const struct mptcp_sock *msk = mptcp_sk(sk); > > + const struct sk_buff *skb; > > + u64 acked = msk->ack_seq; > > + u64 hint_val = 0; > > + > > + skb = skb_peek(&msk->receive_queue); > > + if (skb) { > > + u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset; > > MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related > to the map_seq. > > I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for > hint_val? I don't think so. INQ should return all bytes that are still pending for read(). ack_seq is the largest in-sequence number we've seen, so ack_seq - map_seq yields the in-sequence byte count of the skbs that have been queued for reading. I could probably use skb_peek_tail() instead of skb_peek() and then grab ->end_seq instead of using msk->ack_seq, but I fail to see the advantage. Because userspace might have partially read some data, we need to add the offset that userspace might have consumed already. e.g., one skb queued, with 10 bytes of data, userspace calls read(, ..., 1); so the inq hint should return 9, not 10. (read() increments MPTCP_SKB_CB(skb)->offset if skb wasn't consumed). > I did run in to a selftest failure, but could not reproduce: > > $ sudo ./mptcp_sockopt.sh > Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client > Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server > tcp_inq is larger than one mbyte > copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4) > client exit code 1, server 2 Hmm, could not repro either so far.
On Wed, 10 Nov 2021, Florian Westphal wrote: > Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: >> On Mon, 8 Nov 2021, Florian Westphal wrote: >> >>> Support the TCP_INQ setsockopt. >>> This is a boolean that tells recvmsg path to include the remaining >>> in-sequence bytes into a cmsg data. >>> >>> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224 >>> Signed-off-by: Florian Westphal <fw@strlen.de> >>> --- >>> net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ >>> net/mptcp/protocol.h | 1 + >>> net/mptcp/sockopt.c | 37 +++++++++++++++++++++++++++++++++++++ >>> 3 files changed, 71 insertions(+) >>> >>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c >>> index b0bfe20d6bb0..b4263bf821ac 100644 >>> --- a/net/mptcp/protocol.c >>> +++ b/net/mptcp/protocol.c >>> @@ -46,6 +46,7 @@ struct mptcp_skb_cb { >>> >>> enum { >>> MPTCP_CMSG_TS = BIT(0), >>> + MPTCP_CMSG_INQ = BIT(1), >>> }; >>> >>> static struct percpu_counter mptcp_sockets_allocated; >>> @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) >>> return !skb_queue_empty(&msk->receive_queue); >>> } >>> >>> +static unsigned int mptcp_inq_hint(const struct sock *sk) >>> +{ >>> + const struct mptcp_sock *msk = mptcp_sk(sk); >>> + const struct sk_buff *skb; >>> + u64 acked = msk->ack_seq; >>> + u64 hint_val = 0; >>> + >>> + skb = skb_peek(&msk->receive_queue); >>> + if (skb) { >>> + u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset; >> >> MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related >> to the map_seq. >> >> I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for >> hint_val? > > I don't think so. > > INQ should return all bytes that are still pending for read(). > > ack_seq is the largest in-sequence number we've seen, so ack_seq - map_seq yields > the in-sequence byte count of the skbs that have been queued for reading. > > I could probably use skb_peek_tail() instead of skb_peek() and then grab > ->end_seq instead of using msk->ack_seq, but I fail to see the advantage. Yeah, you're right. I was thinking of end_seq for the tail skb, but that's not the skb this code is looking at. Also, my mistake in thinking the mptcp_skb_cb map_seq was a copy of the "map_seq" from the headers, but it's really a mapped DSN. So it does seem like the hint_val calculation is ok. > > Because userspace might have partially read some data, we need > to add the offset that userspace might have consumed already. > > e.g., one skb queued, with 10 bytes of data, userspace calls read(, ..., 1); > so the inq hint should return 9, not 10. > > (read() increments MPTCP_SKB_CB(skb)->offset if skb wasn't consumed). > >> I did run in to a selftest failure, but could not reproduce: >> >> $ sudo ./mptcp_sockopt.sh >> Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client >> Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server >> tcp_inq is larger than one mbyte >> copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4) >> client exit code 1, server 2 > > Hmm, could not repro either so far. > Still trying here, modified the test to show the inq value in case that helps debug. -- Mat Martineau Intel
On Wed, 10 Nov 2021, Mat Martineau wrote: > On Wed, 10 Nov 2021, Florian Westphal wrote: > >> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: >>> On Mon, 8 Nov 2021, Florian Westphal wrote: >>> >>> I did run in to a selftest failure, but could not reproduce: >>> >>> $ sudo ./mptcp_sockopt.sh >>> Created /tmp/tmp.Kj9eQSvhn3 (size 1024 KB) containing data sent by client >>> Created /tmp/tmp.qOpE4X43Jp (size 1024 KB) containing data sent by server >>> tcp_inq is larger than one mbyte >>> copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4) >>> client exit code 1, server 2 >> >> Hmm, could not repro either so far. >> > > Still trying here, modified the test to show the inq value in case that helps > debug. Well, that was quick: "tcp_inq=7ffffffe is larger than one mbyte" At least it's the expected INT_MAX - 1 value... I'll instrument the kernel some more and try again. The rest of the failure dump: copyfd_io_poll: poll timed out (events: POLLIN 0, POLLOUT 4) client exit code 1, server 2 netns ns1-0-ExcWL9 socket stat for 12001: Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port Process tcp TIME-WAIT 0 0 10.0.3.1:12001 10.0.3.2:45689 timer:(timewait,28sec,0) ino:0 sk:1 fwmark:0x1 tcp TIME-WAIT 0 0 10.0.1.1:12001 10.0.1.2:55212 timer:(timewait,28sec,0) ino:0 sk:2 fwmark:0x1 tcp TIME-WAIT 0 0 10.0.2.1:12001 10.0.2.2:41595 timer:(timewait,28sec,0) ino:0 sk:3 fwmark:0x1 tcp TIME-WAIT 0 0 10.0.4.1:12001 10.0.4.2:54849 timer:(timewait,28sec,0) ino:0 sk:4 fwmark:0x1 tcp ESTAB 0 0 [dead:beef:1::1]:12001 [dead:beef:1::2]:52336 ino:0 sk:5 fwmark:0x1 <-> ts sack cubic wscale:7,7 rto:203 rtt:2.136/2.521 ato:40 mss:1428 pmtu:1500 rcvmss:1428 advmss:1428 cwnd:27 ssthresh:27 bytes_sent:5020656 bytes_acked:5020656 bytes_received:81527 segs_out:3748 segs_in:746 data_segs_out:3718 data_segs_in:62 send 144404494bps lastsnd:137 lastrcv:30088 lastack:137 pacing_rate 173285392bps delivery_rate 215547168bps delivered:3719 busy:1372ms rwnd_limited:1ms(0.1%) sndbuf_limited:49ms(3.6%) rcv_rtt:1.095 rcv_space:14400 rcv_ssthresh:64096 minrtt:0.081 tcp-ulp-mptcp flags:Mec token:0000(id:0)/587fabd7(id:0) seq:1c219faa3cd01b83 sfseq:e001 ssnoff:f8380bd4 maplen:5e77 mptcp LAST-ACK 0 0 [dead:beef:1::1]:12001 [dead:beef:1::2]:52336 timer:(keepalive,59sec,0) ino:0 sk:6 fwmark:0x1 --- subflows:7 add_addr_signal:8 add_addr_accepted:7 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:587fabd7 write_seq:674f10efa988edc3 snd_una:674f10efa985917a rcv_nxt:1c219faa3cd079fb netns ns2-0-ExcWL9 socket stat for 12001: Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port Process tcp ESTAB 4843536 0 [dead:beef:1::2]:52336 [dead:beef:1::1]:12001 ino:0 sk:1001 fwmark:0x2 <-> ts sack cubic wscale:7,7 rto:201 rtt:0.895/0.65 ato:40 mss:1428 pmtu:1500 rcvmss:1400 advmss:1428 cwnd:22 ssthresh:20 bytes_sent:81527 bytes_acked:81528 bytes_received:5020656 segs_out:746 segs_in:3749 data_segs_out:62 data_segs_in:3718 send 280813408bps lastsnd:30109 lastrcv:158 lastack:66 pacing_rate 336834952bps delivery_rate 232090288bps delivered:63 busy:5ms rcv_rtt:103.546 rcv_space:14280 rcv_ssthresh:351074 minrtt:0.176 tcp-ulp-mptcp flags:Mmec token:0000(id:0)/ec432c53(id:0) seq:674f10efa985727a sfseq:294e1 ssnoff:cc6bcb15 maplen:1f00 mptcp FIN-WAIT-2 62616 0 [dead:beef:1::2]:52336 [dead:beef:1::1]:12001 timer:(keepalive,29sec,0) ino:0 sk:1002 fwmark:0x2 --- subflows:7 add_addr_signal:8 add_addr_accepted:7 subflows_max:8 add_addr_signal_max:8 add_addr_accepted_max:8 remote_key token:ec432c53 write_seq:1c219faa3cd079fb snd_una:1c219faa3cd079fb rcv_nxt:674f10efa985917a -- Mat Martineau Intel
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: > > Still trying here, modified the test to show the inq value in case that > > helps debug. > > Well, that was quick: > > "tcp_inq=7ffffffe is larger than one mbyte" > > At least it's the expected INT_MAX - 1 value... I'll instrument the kernel > some more and try again. Great, thanks. I'll look at this tomorrow as well. I will keep this test case as-is, but will add an extra test with out-of-band channel that will inform receiver about exact number of sent bytes so we can check reported inq hint vs. the (known) unread data value.
On Thu, 11 Nov 2021, Florian Westphal wrote: > Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: >>> Still trying here, modified the test to show the inq value in case that >>> helps debug. >> >> Well, that was quick: >> >> "tcp_inq=7ffffffe is larger than one mbyte" >> >> At least it's the expected INT_MAX - 1 value... I'll instrument the kernel >> some more and try again. > > Great, thanks. I'll look at this tomorrow as well. > > I will keep this test case as-is, but will add an extra test with > out-of-band channel that will inform receiver about exact number of sent > bytes so we can check reported inq hint vs. the (known) unread data value. > More data: mptcp_connect-3837 [003] ..... 229.043932: mptcp_inq_hint: msk->ack_seq=14236111453981801913 cb->map_seq=14236111453981665217 cb->offset=59862, msk->sk_receive_queue_tail=0000000000000000 hint_val_raw=76834 mptcp_connect-3837 [003] ..... 229.044556: mptcp_inq_hint: msk->ack_seq=14236111453981801913 cb->map_seq=14236111453981665217 cb->offset=68054, msk->sk_receive_queue_tail=0000000000000000 hint_val_raw=68642 mptcp_connect-3837 [003] ..... 229.045875: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=17814, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=64930 mptcp_connect-3837 [003] ..... 229.046829: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=26006, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=56738 mptcp_connect-3837 [003] ..... 229.047619: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=34198, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=48546 mptcp_connect-3837 [003] ..... 229.048245: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981723649 cb->offset=42390, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=40354 mptcp_connect-3837 [003] ..... 229.048761: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=16368, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=20504 mptcp_connect-3837 [003] ..... 229.049461: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=24560, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=12312 mptcp_connect-3837 [003] ..... 229.050651: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=32752, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=4120 mptcp_connect-3837 [003] ..... 229.051074: mptcp_inq_hint: msk->ack_seq=14236111453981806393 cb->map_seq=14236111453981769521 cb->offset=40944, msk->sk_receive_queue_tail=000000007850e9d3 hint_val_raw=18446744073709547544 So, it does seem to be related to acked data in msk->sk_receive_queue Mat
On Wed, 2021-11-10 at 11:17 +0100, Florian Westphal wrote: > Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: > > On Mon, 8 Nov 2021, Florian Westphal wrote: > > > > > Support the TCP_INQ setsockopt. > > > This is a boolean that tells recvmsg path to include the remaining > > > in-sequence bytes into a cmsg data. > > > > > > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224 > > > Signed-off-by: Florian Westphal <fw@strlen.de> > > > --- > > > net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ > > > net/mptcp/protocol.h | 1 + > > > net/mptcp/sockopt.c | 37 +++++++++++++++++++++++++++++++++++++ > > > 3 files changed, 71 insertions(+) > > > > > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > > > index b0bfe20d6bb0..b4263bf821ac 100644 > > > --- a/net/mptcp/protocol.c > > > +++ b/net/mptcp/protocol.c > > > @@ -46,6 +46,7 @@ struct mptcp_skb_cb { > > > > > > enum { > > > MPTCP_CMSG_TS = BIT(0), > > > + MPTCP_CMSG_INQ = BIT(1), > > > }; > > > > > > static struct percpu_counter mptcp_sockets_allocated; > > > @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) > > > return !skb_queue_empty(&msk->receive_queue); > > > } > > > > > > +static unsigned int mptcp_inq_hint(const struct sock *sk) > > > +{ > > > + const struct mptcp_sock *msk = mptcp_sk(sk); > > > + const struct sk_buff *skb; > > > + u64 acked = msk->ack_seq; > > > + u64 hint_val = 0; > > > + > > > + skb = skb_peek(&msk->receive_queue); > > > + if (skb) { > > > + u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset; > > > > MPTCP_SKB_CB(skb)->offset is an offset within the skb payload, not related > > to the map_seq. > > > > I think MPTCP_SKB_CB(skb)->end_seq has the sequence number you need for > > hint_val? > > I don't think so. > > INQ should return all bytes that are still pending for read(). > > ack_seq is the largest in-sequence number we've seen, so ack_seq - map_seq yields > the in-sequence byte count of the skbs that have been queued for reading. > > I could probably use skb_peek_tail() instead of skb_peek() and then grab > ->end_seq instead of using msk->ack_seq, but I fail to see the advantage. > > Because userspace might have partially read some data, we need > to add the offset that userspace might have consumed already. > > e.g., one skb queued, with 10 bytes of data, userspace calls read(, ..., 1); > so the inq hint should return 9, not 10. > > (read() increments MPTCP_SKB_CB(skb)->offset if skb wasn't consumed). I think the problem is that at CB->offset initialization time we have: MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset // ... WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); 'offest' can be > 0, and 'map_seq' is the sequence mapping for the first byte carried this skb. MPTCP_SKB_CB(skb)->map_seq + offset is in general different from ack_seq (is greater then...) A possible solution would be updating 'MPTCP_SKB_CB(skb)->map_seq' every time 'MPTCP_SKB_CB(skb)->offset' is updated, and then using directly 'MPTCP_SKB_CB(skb)->map_seq' to compute 'hint_val' WDYT? Cheers, Paolo
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b0bfe20d6bb0..b4263bf821ac 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -46,6 +46,7 @@ struct mptcp_skb_cb { enum { MPTCP_CMSG_TS = BIT(0), + MPTCP_CMSG_INQ = BIT(1), }; static struct percpu_counter mptcp_sockets_allocated; @@ -2006,6 +2007,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) return !skb_queue_empty(&msk->receive_queue); } +static unsigned int mptcp_inq_hint(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + const struct sk_buff *skb; + u64 acked = msk->ack_seq; + u64 hint_val = 0; + + skb = skb_peek(&msk->receive_queue); + if (skb) { + u64 map_seq = MPTCP_SKB_CB(skb)->map_seq + MPTCP_SKB_CB(skb)->offset; + + hint_val = acked - map_seq; + + if (hint_val >= INT_MAX) + hint_val = INT_MAX - 1; + } + + if (hint_val == 0 && sock_flag(sk, SOCK_DONE)) + hint_val = 1; + + return (unsigned int)hint_val; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -2030,6 +2054,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + if (unlikely(msk->recvmsg_inq)) + cmsg_flags = MPTCP_CMSG_INQ; + while (copied < len) { int bytes_read; @@ -2103,6 +2130,12 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); + + if (cmsg_flags & MPTCP_CMSG_INQ) { + unsigned int inq = mptcp_inq_hint(sk); + + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } } pr_debug("msk=%p rx queue empty=%d:%d copied=%d", diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 906509c6cde5..e77de7662df0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -250,6 +250,7 @@ struct mptcp_sock { bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; bool allow_infinite_fallback; + u8 recvmsg_inq:1; spinlock_t join_list_lock; struct work_struct work; struct sk_buff *ooo_last_skb; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index b818e91f2e09..7405152691e0 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -557,6 +557,7 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_TIMESTAMP: case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: + case TCP_INQ: return true; } @@ -698,7 +699,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (void *)msk; + int ret, val; + switch (optname) { + case TCP_INQ: + ret = mptcp_get_int_option(msk, optval, optlen, &val); + if (ret) + return ret; + if (val < 0 || val > 1) + return -EINVAL; + + lock_sock(sk); + msk->recvmsg_inq = !!val; + release_sock(sk); + return 0; case TCP_ULP: return -EOPNOTSUPP; case TCP_CONGESTION: @@ -1032,6 +1047,26 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o return 0; } +static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, + int __user *optlen, int val) +{ + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + if (len < 0) + return -EINVAL; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -1042,6 +1077,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_CC_INFO: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); + case TCP_INQ: + return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq); } return -EOPNOTSUPP; }
Support the TCP_INQ setsockopt. This is a boolean that tells recvmsg path to include the remaining in-sequence bytes into a cmsg data. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/224 Signed-off-by: Florian Westphal <fw@strlen.de> --- net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/sockopt.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+)