diff mbox series

SUNRPC: Use TCP_CORK to optimise send performance on the server

Message ID 20210213202532.23146-1-trondmy@kernel.org (mailing list archive)
State New
Headers show
Series SUNRPC: Use TCP_CORK to optimise send performance on the server | expand

Commit Message

trondmy@kernel.org Feb. 13, 2021, 8:25 p.m. UTC
From: Trond Myklebust <trond.myklebust@hammerspace.com>

Use a counter to keep track of how many requests are queued behind the
xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/svcsock.h | 2 ++
 net/sunrpc/svcsock.c           | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

Comments

Chuck Lever Feb. 13, 2021, 9:53 p.m. UTC | #1
Hi Trond-

> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> 
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> Use a counter to keep track of how many requests are queued behind the
> xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.

I'm intrigued, but IMO, the patch description needs to explain
why this change should be made. Why abandon Nagle?

If you expect a performance impact, the description should
provide metrics to show it.

(We should have Daire try this change with his multi-client
setup).


> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> ---
> include/linux/sunrpc/svcsock.h | 2 ++
> net/sunrpc/svcsock.c           | 8 +++++++-
> 2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
> index b7ac7fe68306..bcc555c7ae9c 100644
> --- a/include/linux/sunrpc/svcsock.h
> +++ b/include/linux/sunrpc/svcsock.h
> @@ -35,6 +35,8 @@ struct svc_sock {
> 	/* Total length of the data (not including fragment headers)
> 	 * received so far in the fragments making up this rpc: */
> 	u32			sk_datalen;
> +	/* Number of queued send requests */
> +	atomic_t		sk_sendqlen;

Can you take advantage of xpt_mutex: update this field
only in the critical section, and make it a simple
integer type?


> 	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
> };
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 5a809c64dc7b..231f510a4830 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
> 
> 	svc_tcp_release_rqst(rqstp);
> 
> +	atomic_inc(&svsk->sk_sendqlen);
> 	mutex_lock(&xprt->xpt_mutex);
> 	if (svc_xprt_is_dead(xprt))
> 		goto out_notconn;
> +	tcp_sock_set_cork(svsk->sk_sk, true);
> 	err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
> 	xdr_free_bvec(xdr);
> 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> 	if (err < 0 || sent != (xdr->len + sizeof(marker)))
> 		goto out_close;
> +	if (atomic_dec_and_test(&svsk->sk_sendqlen))
> +		tcp_sock_set_cork(svsk->sk_sk, false);
> 	mutex_unlock(&xprt->xpt_mutex);
> 	return sent;
> 
> out_notconn:
> +	atomic_dec(&svsk->sk_sendqlen);
> 	mutex_unlock(&xprt->xpt_mutex);
> 	return -ENOTCONN;
> out_close:
> @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
> 		  (err < 0) ? err : sent, xdr->len);
> 	set_bit(XPT_CLOSE, &xprt->xpt_flags);
> 	svc_xprt_enqueue(xprt);
> +	atomic_dec(&svsk->sk_sendqlen);
> 	mutex_unlock(&xprt->xpt_mutex);
> 	return -EAGAIN;
> }
> @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
> 		svsk->sk_datalen = 0;
> 		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
> 
> -		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
> +		tcp_sock_set_nodelay(sk);
> 
> 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> 		switch (sk->sk_state) {
> -- 
> 2.29.2
> 

--
Chuck Lever
Trond Myklebust Feb. 13, 2021, 10:10 p.m. UTC | #2
On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
> Hi Trond-
> 
> > On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> > 
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > Use a counter to keep track of how many requests are queued behind
> > the
> > xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.
> 
> I'm intrigued, but IMO, the patch description needs to explain
> why this change should be made. Why abandon Nagle?
> 

This doesn't change the Nagle/TCP_NODELAY settings. It just switches to
using the new documented kernel interface.

The only change is to use TCP_CORK so that we don't send out partially
filled TCP frames, when we can see that there are other RPC replies
that are queued up for transmission.

Note the combination TCP_CORK+TCP_NODELAY is common, and the main
effect of the latter is that when we turn off the TCP_CORK, then there
is an immediate forced push of the TCP queue.

> If you expect a performance impact, the description should
> provide metrics to show it.

I don't have a performance system to measure the improvement
accurately. However I am seeing an notable improvement with the
equivalent client change. Specifically, xfstests generic/127 shows a
~20% improvement on my VM based test rig.

> (We should have Daire try this change with his multi-client
> setup).
> 
> 
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> > include/linux/sunrpc/svcsock.h | 2 ++
> > net/sunrpc/svcsock.c           | 8 +++++++-
> > 2 files changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/sunrpc/svcsock.h
> > b/include/linux/sunrpc/svcsock.h
> > index b7ac7fe68306..bcc555c7ae9c 100644
> > --- a/include/linux/sunrpc/svcsock.h
> > +++ b/include/linux/sunrpc/svcsock.h
> > @@ -35,6 +35,8 @@ struct svc_sock {
> >         /* Total length of the data (not including fragment
> > headers)
> >          * received so far in the fragments making up this rpc: */
> >         u32                     sk_datalen;
> > +       /* Number of queued send requests */
> > +       atomic_t                sk_sendqlen;
> 
> Can you take advantage of xpt_mutex: update this field
> only in the critical section, and make it a simple
> integer type?
> 
> 
> >         struct page *           sk_pages[RPCSVC_MAXPAGES];      /*
> > received data */
> > };
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 5a809c64dc7b..231f510a4830 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst
> > *rqstp)
> > 
> >         svc_tcp_release_rqst(rqstp);
> > 
> > +       atomic_inc(&svsk->sk_sendqlen);
> >         mutex_lock(&xprt->xpt_mutex);
> >         if (svc_xprt_is_dead(xprt))
> >                 goto out_notconn;
> > +       tcp_sock_set_cork(svsk->sk_sk, true);
> >         err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker,
> > &sent);
> >         xdr_free_bvec(xdr);
> >         trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> >         if (err < 0 || sent != (xdr->len + sizeof(marker)))
> >                 goto out_close;
> > +       if (atomic_dec_and_test(&svsk->sk_sendqlen))
> > +               tcp_sock_set_cork(svsk->sk_sk, false);
> >         mutex_unlock(&xprt->xpt_mutex);
> >         return sent;
> > 
> > out_notconn:
> > +       atomic_dec(&svsk->sk_sendqlen);
> >         mutex_unlock(&xprt->xpt_mutex);
> >         return -ENOTCONN;
> > out_close:
> > @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst
> > *rqstp)
> >                   (err < 0) ? err : sent, xdr->len);
> >         set_bit(XPT_CLOSE, &xprt->xpt_flags);
> >         svc_xprt_enqueue(xprt);
> > +       atomic_dec(&svsk->sk_sendqlen);
> >         mutex_unlock(&xprt->xpt_mutex);
> >         return -EAGAIN;
> > }
> > @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock
> > *svsk, struct svc_serv *serv)
> >                 svsk->sk_datalen = 0;
> >                 memset(&svsk->sk_pages[0], 0, sizeof(svsk-
> > >sk_pages));
> > 
> > -               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
> > +               tcp_sock_set_nodelay(sk);
> > 
> >                 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> >                 switch (sk->sk_state) {
> > -- 
> > 2.29.2
> > 
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever Feb. 13, 2021, 11:30 p.m. UTC | #3
> On Feb 13, 2021, at 5:10 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
>> Hi Trond-
>> 
>>> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
>>> 
>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> 
>>> Use a counter to keep track of how many requests are queued behind
>>> the
>>> xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.
>> 
>> I'm intrigued, but IMO, the patch description needs to explain
>> why this change should be made. Why abandon Nagle?
>> 
> 
> This doesn't change the Nagle/TCP_NODELAY settings. It just switches to
> using the new documented kernel interface.
> 
> The only change is to use TCP_CORK so that we don't send out partially
> filled TCP frames, when we can see that there are other RPC replies
> that are queued up for transmission.
> 
> Note the combination TCP_CORK+TCP_NODELAY is common, and the main
> effect of the latter is that when we turn off the TCP_CORK, then there
> is an immediate forced push of the TCP queue.

The description above suggests the patch is just a
clean-up, but a forced push has potential to change
the server's behavior.

I'm trying to assess what kind of additional testing
would be valuable.


>> If you expect a performance impact, the description should
>> provide metrics to show it.
> 
> I don't have a performance system to measure the improvement
> accurately.

Then let's have Daire try it out, if possible.


> However I am seeing an notable improvement with the
> equivalent client change. Specifically, xfstests generic/127 shows a
> ~20% improvement on my VM based test rig.

>> (We should have Daire try this change with his multi-client
>> setup).
>> 
>> 
>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> ---
>>> include/linux/sunrpc/svcsock.h | 2 ++
>>> net/sunrpc/svcsock.c           | 8 +++++++-
>>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>> 
>>> diff --git a/include/linux/sunrpc/svcsock.h
>>> b/include/linux/sunrpc/svcsock.h
>>> index b7ac7fe68306..bcc555c7ae9c 100644
>>> --- a/include/linux/sunrpc/svcsock.h
>>> +++ b/include/linux/sunrpc/svcsock.h
>>> @@ -35,6 +35,8 @@ struct svc_sock {
>>>         /* Total length of the data (not including fragment
>>> headers)
>>>          * received so far in the fragments making up this rpc: */
>>>         u32                     sk_datalen;
>>> +       /* Number of queued send requests */
>>> +       atomic_t                sk_sendqlen;
>> 
>> Can you take advantage of xpt_mutex: update this field
>> only in the critical section, and make it a simple
>> integer type?

I see why atomic_inc(&sk_sendqlen); has to happen outside
the mutex. However, it seems a little crazy/costly to have
both of these serialization primitives.

I haven't found a mutex_unlock() that will report that there
are no waiters to wake up, though.


>>>         struct page *           sk_pages[RPCSVC_MAXPAGES];      /*
>>> received data */
>>> };
>>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>>> index 5a809c64dc7b..231f510a4830 100644
>>> --- a/net/sunrpc/svcsock.c
>>> +++ b/net/sunrpc/svcsock.c
>>> @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst
>>> *rqstp)
>>> 
>>>         svc_tcp_release_rqst(rqstp);
>>> 
>>> +       atomic_inc(&svsk->sk_sendqlen);
>>>         mutex_lock(&xprt->xpt_mutex);
>>>         if (svc_xprt_is_dead(xprt))
>>>                 goto out_notconn;
>>> +       tcp_sock_set_cork(svsk->sk_sk, true);
>>>         err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker,
>>> &sent);
>>>         xdr_free_bvec(xdr);
>>>         trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
>>>         if (err < 0 || sent != (xdr->len + sizeof(marker)))
>>>                 goto out_close;
>>> +       if (atomic_dec_and_test(&svsk->sk_sendqlen))
>>> +               tcp_sock_set_cork(svsk->sk_sk, false);
>>>         mutex_unlock(&xprt->xpt_mutex);
>>>         return sent;
>>> 
>>> out_notconn:
>>> +       atomic_dec(&svsk->sk_sendqlen);
>>>         mutex_unlock(&xprt->xpt_mutex);
>>>         return -ENOTCONN;
>>> out_close:
>>> @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst
>>> *rqstp)
>>>                   (err < 0) ? err : sent, xdr->len);
>>>         set_bit(XPT_CLOSE, &xprt->xpt_flags);
>>>         svc_xprt_enqueue(xprt);
>>> +       atomic_dec(&svsk->sk_sendqlen);
>>>         mutex_unlock(&xprt->xpt_mutex);
>>>         return -EAGAIN;
>>> }
>>> @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock
>>> *svsk, struct svc_serv *serv)
>>>                 svsk->sk_datalen = 0;
>>>                 memset(&svsk->sk_pages[0], 0, sizeof(svsk-
>>>> sk_pages));
>>> 
>>> -               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
>>> +               tcp_sock_set_nodelay(sk);
>>> 
>>>                 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
>>>                 switch (sk->sk_state) {
>>> -- 
>>> 2.29.2
>>> 
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com

--
Chuck Lever
Daire Byrne Feb. 14, 2021, 2:13 p.m. UTC | #4
----- On 13 Feb, 2021, at 23:30, Chuck Lever chuck.lever@oracle.com wrote:

>> I don't have a performance system to measure the improvement
>> accurately.
> 
> Then let's have Daire try it out, if possible.

I'm happy to test it out on one of our 2 x 40G NFS servers with 100 x 1G clients (but it's trickier to patch the clients too atm).

Just so I'm clear, this is in addition to Chuck's "Handle TCP socket sends with kernel_sendpage() again" patch from bz #209439 (which I think is now in 5.11 rc)? Or you want to see what this patch looks like on it's own without that (e.g. v5.10)?

Daire
Trond Myklebust Feb. 14, 2021, 4:21 p.m. UTC | #5
On Sat, 2021-02-13 at 23:30 +0000, Chuck Lever wrote:
> 
> 
> > On Feb 13, 2021, at 5:10 PM, Trond Myklebust <
> > trondmy@hammerspace.com> wrote:
> > 
> > On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
> > > Hi Trond-
> > > 
> > > > On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> > > > 
> > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > 
> > > > Use a counter to keep track of how many requests are queued
> > > > behind
> > > > the
> > > > xprt->xpt_mutex, and keep TCP_CORK set until the queue is
> > > > empty.
> > > 
> > > I'm intrigued, but IMO, the patch description needs to explain
> > > why this change should be made. Why abandon Nagle?
> > > 
> > 
> > This doesn't change the Nagle/TCP_NODELAY settings. It just
> > switches to
> > using the new documented kernel interface.
> > 
> > The only change is to use TCP_CORK so that we don't send out
> > partially
> > filled TCP frames, when we can see that there are other RPC replies
> > that are queued up for transmission.
> > 
> > Note the combination TCP_CORK+TCP_NODELAY is common, and the main
> > effect of the latter is that when we turn off the TCP_CORK, then
> > there
> > is an immediate forced push of the TCP queue.
> 
> The description above suggests the patch is just a
> clean-up, but a forced push has potential to change
> the server's behavior.

Well, yes. That's very much the point.

Right now, the TCP_NODELAY/Nagle setting means that we're doing that
forced push at the end of _every_ RPC reply, whether or not there is
more stuff that can be queued up in the socket. The MSG_MORE is the
only thing that keeps us from doing the forced push on every sendpage()
call.
So the TCP_CORK is there to _further delay_ that forced push until we
think the queue is empty.

IOW: it attempts to optimise the scheduling of that push until we're
actually done pushing more stuff into the socket.

> I'm trying to assess what kind of additional testing
> would be valuable.
> 
> 
> > > If you expect a performance impact, the description should
> > > provide metrics to show it.
> > 
> > I don't have a performance system to measure the improvement
> > accurately.
> 
> Then let's have Daire try it out, if possible.
> 
> 
> > However I am seeing an notable improvement with the
> > equivalent client change. Specifically, xfstests generic/127 shows
> > a
> > ~20% improvement on my VM based test rig.
> 
> > > (We should have Daire try this change with his multi-client
> > > setup).
> > > 
> > > 
> > > > Signed-off-by: Trond Myklebust
> > > > <trond.myklebust@hammerspace.com>
> > > > ---
> > > > include/linux/sunrpc/svcsock.h | 2 ++
> > > > net/sunrpc/svcsock.c           | 8 +++++++-
> > > > 2 files changed, 9 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/include/linux/sunrpc/svcsock.h
> > > > b/include/linux/sunrpc/svcsock.h
> > > > index b7ac7fe68306..bcc555c7ae9c 100644
> > > > --- a/include/linux/sunrpc/svcsock.h
> > > > +++ b/include/linux/sunrpc/svcsock.h
> > > > @@ -35,6 +35,8 @@ struct svc_sock {
> > > >         /* Total length of the data (not including fragment
> > > > headers)
> > > >          * received so far in the fragments making up this rpc:
> > > > */
> > > >         u32                     sk_datalen;
> > > > +       /* Number of queued send requests */
> > > > +       atomic_t                sk_sendqlen;
> > > 
> > > Can you take advantage of xpt_mutex: update this field
> > > only in the critical section, and make it a simple
> > > integer type?
> 
> I see why atomic_inc(&sk_sendqlen); has to happen outside
> the mutex. However, it seems a little crazy/costly to have
> both of these serialization primitives.
> 
> I haven't found a mutex_unlock() that will report that there
> are no waiters to wake up, though.
> 
> 
> > > >         struct page *           sk_pages[RPCSVC_MAXPAGES];     
> > > > /*
> > > > received data */
> > > > };
> > > > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > > > index 5a809c64dc7b..231f510a4830 100644
> > > > --- a/net/sunrpc/svcsock.c
> > > > +++ b/net/sunrpc/svcsock.c
> > > > @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct
> > > > svc_rqst
> > > > *rqstp)
> > > > 
> > > >         svc_tcp_release_rqst(rqstp);
> > > > 
> > > > +       atomic_inc(&svsk->sk_sendqlen);
> > > >         mutex_lock(&xprt->xpt_mutex);
> > > >         if (svc_xprt_is_dead(xprt))
> > > >                 goto out_notconn;
> > > > +       tcp_sock_set_cork(svsk->sk_sk, true);
> > > >         err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker,
> > > > &sent);
> > > >         xdr_free_bvec(xdr);
> > > >         trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> > > >         if (err < 0 || sent != (xdr->len + sizeof(marker)))
> > > >                 goto out_close;
> > > > +       if (atomic_dec_and_test(&svsk->sk_sendqlen))
> > > > +               tcp_sock_set_cork(svsk->sk_sk, false);
> > > >         mutex_unlock(&xprt->xpt_mutex);
> > > >         return sent;
> > > > 
> > > > out_notconn:
> > > > +       atomic_dec(&svsk->sk_sendqlen);
> > > >         mutex_unlock(&xprt->xpt_mutex);
> > > >         return -ENOTCONN;
> > > > out_close:
> > > > @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst
> > > > *rqstp)
> > > >                   (err < 0) ? err : sent, xdr->len);
> > > >         set_bit(XPT_CLOSE, &xprt->xpt_flags);
> > > >         svc_xprt_enqueue(xprt);
> > > > +       atomic_dec(&svsk->sk_sendqlen);
> > > >         mutex_unlock(&xprt->xpt_mutex);
> > > >         return -EAGAIN;
> > > > }
> > > > @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock
> > > > *svsk, struct svc_serv *serv)
> > > >                 svsk->sk_datalen = 0;
> > > >                 memset(&svsk->sk_pages[0], 0, sizeof(svsk-
> > > > > sk_pages));
> > > > 
> > > > -               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
> > > > +               tcp_sock_set_nodelay(sk);
> > > > 
> > > >                 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> > > >                 switch (sk->sk_state) {
> > > > -- 
> > > > 2.29.2
> > > > 
> > > 
> > > --
> > > Chuck Lever
> > > 
> > > 
> > > 
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> 
> --
> Chuck Lever
> 
> 
>
Trond Myklebust Feb. 14, 2021, 4:24 p.m. UTC | #6
On Sun, 2021-02-14 at 14:13 +0000, Daire Byrne wrote:
> 
> ----- On 13 Feb, 2021, at 23:30, Chuck Lever
> chuck.lever@oracle.com wrote:
> 
> > > I don't have a performance system to measure the improvement
> > > accurately.
> > 
> > Then let's have Daire try it out, if possible.
> 
> I'm happy to test it out on one of our 2 x 40G NFS servers with 100 x
> 1G clients (but it's trickier to patch the clients too atm).
> 
> Just so I'm clear, this is in addition to Chuck's "Handle TCP socket
> sends with kernel_sendpage() again" patch from bz #209439 (which I
> think is now in 5.11 rc)? Or you want to see what this patch looks
> like on it's own without that (e.g. v5.10)?
> 

This patch is independent of which socket mechanism we're using: it
should work with sendmsg(), sendpage(), and even writev(). So feel free
to test whichever combination makes most sense to you.
Chuck Lever Feb. 14, 2021, 4:59 p.m. UTC | #7
> On Feb 14, 2021, at 9:13 AM, Daire Byrne <daire@dneg.com> wrote:
> 
> 
> ----- On 13 Feb, 2021, at 23:30, Chuck Lever chuck.lever@oracle.com wrote:
> 
>>> I don't have a performance system to measure the improvement
>>> accurately.
>> 
>> Then let's have Daire try it out, if possible.
> 
> I'm happy to test it out on one of our 2 x 40G NFS servers with 100 x 1G clients (but it's trickier to patch the clients too atm).

Yes, that's exactly what we need. Thank you!


> Just so I'm clear, this is in addition to Chuck's "Handle TCP socket sends with kernel_sendpage() again" patch from bz #209439 (which I think is now in 5.11 rc)? Or you want to see what this patch looks like on it's own without that (e.g. v5.10)?

Please include the "Handle TCP socket sends with kernel_sendpage() again" fix.
Or, you can pull a recent stable kernel, I think that fix is already in there.

--
Chuck Lever
Chuck Lever Feb. 14, 2021, 5:27 p.m. UTC | #8
> On Feb 14, 2021, at 11:21 AM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Sat, 2021-02-13 at 23:30 +0000, Chuck Lever wrote:
>> 
>> 
>>> On Feb 13, 2021, at 5:10 PM, Trond Myklebust <
>>> trondmy@hammerspace.com> wrote:
>>> 
>>> On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
>>>> Hi Trond-
>>>> 
>>>>> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
>>>>> 
>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>> 
>>>>> Use a counter to keep track of how many requests are queued
>>>>> behind
>>>>> the
>>>>> xprt->xpt_mutex, and keep TCP_CORK set until the queue is
>>>>> empty.
>>>> 
>>>> I'm intrigued, but IMO, the patch description needs to explain
>>>> why this change should be made. Why abandon Nagle?
>>>> 
>>> 
>>> This doesn't change the Nagle/TCP_NODELAY settings. It just
>>> switches to
>>> using the new documented kernel interface.
>>> 
>>> The only change is to use TCP_CORK so that we don't send out
>>> partially
>>> filled TCP frames, when we can see that there are other RPC replies
>>> that are queued up for transmission.
>>> 
>>> Note the combination TCP_CORK+TCP_NODELAY is common, and the main
>>> effect of the latter is that when we turn off the TCP_CORK, then
>>> there
>>> is an immediate forced push of the TCP queue.
>> 
>> The description above suggests the patch is just a
>> clean-up, but a forced push has potential to change
>> the server's behavior.
> 
> Well, yes. That's very much the point.
> 
> Right now, the TCP_NODELAY/Nagle setting means that we're doing that
> forced push at the end of _every_ RPC reply, whether or not there is
> more stuff that can be queued up in the socket. The MSG_MORE is the
> only thing that keeps us from doing the forced push on every sendpage()
> call.
> So the TCP_CORK is there to _further delay_ that forced push until we
> think the queue is empty.

My concern is that waiting for the queue to empty before pushing could
improve throughput at the cost of increased average round-trip latency.
That concern is based on experience I've had attempting to batch sends
in the RDMA transport.


> IOW: it attempts to optimise the scheduling of that push until we're
> actually done pushing more stuff into the socket.

Yep, clear, thanks. It would help a lot if the above were included in
the patch description.

And, I presume that the TCP layer will push anyway if it needs to
reclaim resources to handle more queued sends.

Let's also consider starvation; ie, that the server will continue
queuing replies such that it never uncorks. The logic in the patch
appears to depend on the client stopping at some point to wait for the
server to catch up. There probably should be a trap door that uncorks
after a few requests (say, 8) or certain number of bytes are pending
without a break.


--
Chuck Lever
Trond Myklebust Feb. 14, 2021, 5:41 p.m. UTC | #9
On Sun, 2021-02-14 at 17:27 +0000, Chuck Lever wrote:
> 
> 
> > On Feb 14, 2021, at 11:21 AM, Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > 
> > On Sat, 2021-02-13 at 23:30 +0000, Chuck Lever wrote:
> > > 
> > > 
> > > > On Feb 13, 2021, at 5:10 PM, Trond Myklebust <
> > > > trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
> > > > > Hi Trond-
> > > > > 
> > > > > > On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> > > > > > 
> > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > 
> > > > > > Use a counter to keep track of how many requests are queued
> > > > > > behind
> > > > > > the
> > > > > > xprt->xpt_mutex, and keep TCP_CORK set until the queue is
> > > > > > empty.
> > > > > 
> > > > > I'm intrigued, but IMO, the patch description needs to
> > > > > explain
> > > > > why this change should be made. Why abandon Nagle?
> > > > > 
> > > > 
> > > > This doesn't change the Nagle/TCP_NODELAY settings. It just
> > > > switches to
> > > > using the new documented kernel interface.
> > > > 
> > > > The only change is to use TCP_CORK so that we don't send out
> > > > partially
> > > > filled TCP frames, when we can see that there are other RPC
> > > > replies
> > > > that are queued up for transmission.
> > > > 
> > > > Note the combination TCP_CORK+TCP_NODELAY is common, and the
> > > > main
> > > > effect of the latter is that when we turn off the TCP_CORK,
> > > > then
> > > > there
> > > > is an immediate forced push of the TCP queue.
> > > 
> > > The description above suggests the patch is just a
> > > clean-up, but a forced push has potential to change
> > > the server's behavior.
> > 
> > Well, yes. That's very much the point.
> > 
> > Right now, the TCP_NODELAY/Nagle setting means that we're doing
> > that
> > forced push at the end of _every_ RPC reply, whether or not there
> > is
> > more stuff that can be queued up in the socket. The MSG_MORE is the
> > only thing that keeps us from doing the forced push on every
> > sendpage()
> > call.
> > So the TCP_CORK is there to _further delay_ that forced push until
> > we
> > think the queue is empty.
> 
> My concern is that waiting for the queue to empty before pushing
> could
> improve throughput at the cost of increased average round-trip
> latency.
> That concern is based on experience I've had attempting to batch
> sends
> in the RDMA transport.
> 
> 
> > IOW: it attempts to optimise the scheduling of that push until
> > we're
> > actually done pushing more stuff into the socket.
> 
> Yep, clear, thanks. It would help a lot if the above were included in
> the patch description.
> 
> And, I presume that the TCP layer will push anyway if it needs to
> reclaim resources to handle more queued sends.
> 
> Let's also consider starvation; ie, that the server will continue
> queuing replies such that it never uncorks. The logic in the patch
> appears to depend on the client stopping at some point to wait for
> the
> server to catch up. There probably should be a trap door that uncorks
> after a few requests (say, 8) or certain number of bytes are pending
> without a break.

So, firstly, the TCP layer will still push every time a frame is full,
so traffic does not stop altogether while TCP_CORK is set. Secondly,
TCP will also always push on send errors (e.g. when out of free socket
buffer). Thirdly, it will always push after hitting the 200ms ceiling,
as described in the tcp(7) manpage.

IOW: The TCP_CORK feature is not designed to block the socket forever.
It is there to allow the application to hint to the TCP layer what it
needs to do in exactly the same way that MSG_MORE does.
Chuck Lever Feb. 14, 2021, 5:58 p.m. UTC | #10
> On Feb 14, 2021, at 12:41 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Sun, 2021-02-14 at 17:27 +0000, Chuck Lever wrote:
>> 
>> 
>>> On Feb 14, 2021, at 11:21 AM, Trond Myklebust
>>> <trondmy@hammerspace.com> wrote:
>>> 
>>> On Sat, 2021-02-13 at 23:30 +0000, Chuck Lever wrote:
>>>> 
>>>> 
>>>>> On Feb 13, 2021, at 5:10 PM, Trond Myklebust <
>>>>> trondmy@hammerspace.com> wrote:
>>>>> 
>>>>> On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
>>>>>> Hi Trond-
>>>>>> 
>>>>>>> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
>>>>>>> 
>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>>>> 
>>>>>>> Use a counter to keep track of how many requests are queued
>>>>>>> behind
>>>>>>> the
>>>>>>> xprt->xpt_mutex, and keep TCP_CORK set until the queue is
>>>>>>> empty.
>>>>>> 
>>>>>> I'm intrigued, but IMO, the patch description needs to
>>>>>> explain
>>>>>> why this change should be made. Why abandon Nagle?
>>>>>> 
>>>>> 
>>>>> This doesn't change the Nagle/TCP_NODELAY settings. It just
>>>>> switches to
>>>>> using the new documented kernel interface.
>>>>> 
>>>>> The only change is to use TCP_CORK so that we don't send out
>>>>> partially
>>>>> filled TCP frames, when we can see that there are other RPC
>>>>> replies
>>>>> that are queued up for transmission.
>>>>> 
>>>>> Note the combination TCP_CORK+TCP_NODELAY is common, and the
>>>>> main
>>>>> effect of the latter is that when we turn off the TCP_CORK,
>>>>> then
>>>>> there
>>>>> is an immediate forced push of the TCP queue.
>>>> 
>>>> The description above suggests the patch is just a
>>>> clean-up, but a forced push has potential to change
>>>> the server's behavior.
>>> 
>>> Well, yes. That's very much the point.
>>> 
>>> Right now, the TCP_NODELAY/Nagle setting means that we're doing
>>> that
>>> forced push at the end of _every_ RPC reply, whether or not there
>>> is
>>> more stuff that can be queued up in the socket. The MSG_MORE is the
>>> only thing that keeps us from doing the forced push on every
>>> sendpage()
>>> call.
>>> So the TCP_CORK is there to _further delay_ that forced push until
>>> we
>>> think the queue is empty.
>> 
>> My concern is that waiting for the queue to empty before pushing
>> could
>> improve throughput at the cost of increased average round-trip
>> latency.
>> That concern is based on experience I've had attempting to batch
>> sends
>> in the RDMA transport.
>> 
>> 
>>> IOW: it attempts to optimise the scheduling of that push until
>>> we're
>>> actually done pushing more stuff into the socket.
>> 
>> Yep, clear, thanks. It would help a lot if the above were included in
>> the patch description.
>> 
>> And, I presume that the TCP layer will push anyway if it needs to
>> reclaim resources to handle more queued sends.
>> 
>> Let's also consider starvation; ie, that the server will continue
>> queuing replies such that it never uncorks. The logic in the patch
>> appears to depend on the client stopping at some point to wait for
>> the
>> server to catch up. There probably should be a trap door that uncorks
>> after a few requests (say, 8) or certain number of bytes are pending
>> without a break.
> 
> So, firstly, the TCP layer will still push every time a frame is full,
> so traffic does not stop altogether while TCP_CORK is set.

OK.


> Secondly, TCP will also always push on send errors (e.g. when out of free socket
> buffer).

As I presumed. OK.


> Thirdly, it will always push after hitting the 200ms ceiling,
> as described in the tcp(7) manpage.

That's the trap door we need to ensure no starvation or deadlock,
assuming there are no bugs in the TCP layer's logic.

200ms seems a long time to wait, though, when average round trip
latencies are usually under 1ms on typical Ethernet. It would be
good to know how often sending has to wait this long.


> IOW: The TCP_CORK feature is not designed to block the socket forever.
> It is there to allow the application to hint to the TCP layer what it
> needs to do in exactly the same way that MSG_MORE does.

As long as it is only a hint, then we're good.

Out of interest, why not use only MSG_MORE, or remove the use
of MSG_MORE in favor of only cork? If these are essentially the
same mechanism, seems like we need only one or the other.


--
Chuck Lever
Trond Myklebust Feb. 14, 2021, 6:18 p.m. UTC | #11
On Sun, 2021-02-14 at 17:58 +0000, Chuck Lever wrote:
> 
> 
> > On Feb 14, 2021, at 12:41 PM, Trond Myklebust < 
> > trondmy@hammerspace.com> wrote:
> > 
> > On Sun, 2021-02-14 at 17:27 +0000, Chuck Lever wrote:
> > > 
> > > 
> > > > On Feb 14, 2021, at 11:21 AM, Trond Myklebust
> > > > <trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Sat, 2021-02-13 at 23:30 +0000, Chuck Lever wrote:
> > > > > 
> > > > > 
> > > > > > On Feb 13, 2021, at 5:10 PM, Trond Myklebust <
> > > > > > trondmy@hammerspace.com> wrote:
> > > > > > 
> > > > > > On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
> > > > > > > Hi Trond-
> > > > > > > 
> > > > > > > > On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> > > > > > > > 
> > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > > > 
> > > > > > > > Use a counter to keep track of how many requests are
> > > > > > > > queued
> > > > > > > > behind
> > > > > > > > the
> > > > > > > > xprt->xpt_mutex, and keep TCP_CORK set until the queue
> > > > > > > > is
> > > > > > > > empty.
> > > > > > > 
> > > > > > > I'm intrigued, but IMO, the patch description needs to
> > > > > > > explain
> > > > > > > why this change should be made. Why abandon Nagle?
> > > > > > > 
> > > > > > 
> > > > > > This doesn't change the Nagle/TCP_NODELAY settings. It just
> > > > > > switches to
> > > > > > using the new documented kernel interface.
> > > > > > 
> > > > > > The only change is to use TCP_CORK so that we don't send
> > > > > > out
> > > > > > partially
> > > > > > filled TCP frames, when we can see that there are other RPC
> > > > > > replies
> > > > > > that are queued up for transmission.
> > > > > > 
> > > > > > Note the combination TCP_CORK+TCP_NODELAY is common, and
> > > > > > the
> > > > > > main
> > > > > > effect of the latter is that when we turn off the TCP_CORK,
> > > > > > then
> > > > > > there
> > > > > > is an immediate forced push of the TCP queue.
> > > > > 
> > > > > The description above suggests the patch is just a
> > > > > clean-up, but a forced push has potential to change
> > > > > the server's behavior.
> > > > 
> > > > Well, yes. That's very much the point.
> > > > 
> > > > Right now, the TCP_NODELAY/Nagle setting means that we're doing
> > > > that
> > > > forced push at the end of _every_ RPC reply, whether or not
> > > > there
> > > > is
> > > > more stuff that can be queued up in the socket. The MSG_MORE is
> > > > the
> > > > only thing that keeps us from doing the forced push on every
> > > > sendpage()
> > > > call.
> > > > So the TCP_CORK is there to _further delay_ that forced push
> > > > until
> > > > we
> > > > think the queue is empty.
> > > 
> > > My concern is that waiting for the queue to empty before pushing
> > > could
> > > improve throughput at the cost of increased average round-trip
> > > latency.
> > > That concern is based on experience I've had attempting to batch
> > > sends
> > > in the RDMA transport.
> > > 
> > > 
> > > > IOW: it attempts to optimise the scheduling of that push until
> > > > we're
> > > > actually done pushing more stuff into the socket.
> > > 
> > > Yep, clear, thanks. It would help a lot if the above were
> > > included in
> > > the patch description.
> > > 
> > > And, I presume that the TCP layer will push anyway if it needs to
> > > reclaim resources to handle more queued sends.
> > > 
> > > Let's also consider starvation; ie, that the server will continue
> > > queuing replies such that it never uncorks. The logic in the
> > > patch
> > > appears to depend on the client stopping at some point to wait
> > > for
> > > the
> > > server to catch up. There probably should be a trap door that
> > > uncorks
> > > after a few requests (say, 8) or certain number of bytes are
> > > pending
> > > without a break.
> > 
> > So, firstly, the TCP layer will still push every time a frame is
> > full,
> > so traffic does not stop altogether while TCP_CORK is set.
> 
> OK.
> 
> 
> > Secondly, TCP will also always push on send errors (e.g. when out
> > of free socket
> > buffer).
> 
> As I presumed. OK.
> 
> 
> > Thirdly, it will always push after hitting the 200ms ceiling,
> > as described in the tcp(7) manpage.
> 
> That's the trap door we need to ensure no starvation or deadlock,
> assuming there are no bugs in the TCP layer's logic.
> 
> 200ms seems a long time to wait, though, when average round trip
> latencies are usually under 1ms on typical Ethernet. It would be
> good to know how often sending has to wait this long.

If it does wait that long, then it would be because the system is under
such load that scheduling the next task waiting for the xpt_mutex is
taking more than 200ms. I don't see how this would make things any
worse.

> > IOW: The TCP_CORK feature is not designed to block the socket
> > forever.
> > It is there to allow the application to hint to the TCP layer what
> > it
> > needs to do in exactly the same way that MSG_MORE does.
> 
> As long as it is only a hint, then we're good.
> 
> Out of interest, why not use only MSG_MORE, or remove the use
> of MSG_MORE in favor of only cork? If these are essentially the
> same mechanism, seems like we need only one or the other.
> 

The advantage of TCP_CORK is that you can perform the queue length
evaluation _after_ the sendmsg/sendpage/writev call. Since all of those
can block due to memory allocation, socket buffer shortage, etc, then
it is quite likely under many workloads that other RPC calls may have
time to finish processing and get queued up.

I agree that we probably could remove MSG_MORE for tcp once TCP_CORK is
implemented. However those flags continue to be useful for udp.
Chuck Lever Feb. 14, 2021, 7:43 p.m. UTC | #12
> On Feb 14, 2021, at 1:18 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Sun, 2021-02-14 at 17:58 +0000, Chuck Lever wrote:
>> 
>> 
>>> On Feb 14, 2021, at 12:41 PM, Trond Myklebust < 
>>> trondmy@hammerspace.com> wrote:
>>> 
>>> On Sun, 2021-02-14 at 17:27 +0000, Chuck Lever wrote:
>>>> 
>>>> 
>>>>> On Feb 14, 2021, at 11:21 AM, Trond Myklebust
>>>>> <trondmy@hammerspace.com> wrote:
>>>>> 
>>>>> On Sat, 2021-02-13 at 23:30 +0000, Chuck Lever wrote:
>>>>>> 
>>>>>> 
>>>>>>> On Feb 13, 2021, at 5:10 PM, Trond Myklebust <
>>>>>>> trondmy@hammerspace.com> wrote:
>>>>>>> 
>>>>>>> On Sat, 2021-02-13 at 21:53 +0000, Chuck Lever wrote:
>>>>>>>> Hi Trond-
>>>>>>>> 
>>>>>>>>> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
>>>>>>>>> 
>>>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>>>>>> 
>>>>>>>>> Use a counter to keep track of how many requests are
>>>>>>>>> queued
>>>>>>>>> behind
>>>>>>>>> the
>>>>>>>>> xprt->xpt_mutex, and keep TCP_CORK set until the queue
>>>>>>>>> is
>>>>>>>>> empty.
>>>>>>>> 
>>>>>>>> I'm intrigued, but IMO, the patch description needs to
>>>>>>>> explain
>>>>>>>> why this change should be made. Why abandon Nagle?
>>>>>>>> 
>>>>>>> 
>>>>>>> This doesn't change the Nagle/TCP_NODELAY settings. It just
>>>>>>> switches to
>>>>>>> using the new documented kernel interface.
>>>>>>> 
>>>>>>> The only change is to use TCP_CORK so that we don't send
>>>>>>> out
>>>>>>> partially
>>>>>>> filled TCP frames, when we can see that there are other RPC
>>>>>>> replies
>>>>>>> that are queued up for transmission.
>>>>>>> 
>>>>>>> Note the combination TCP_CORK+TCP_NODELAY is common, and
>>>>>>> the
>>>>>>> main
>>>>>>> effect of the latter is that when we turn off the TCP_CORK,
>>>>>>> then
>>>>>>> there
>>>>>>> is an immediate forced push of the TCP queue.
>>>>>> 
>>>>>> The description above suggests the patch is just a
>>>>>> clean-up, but a forced push has potential to change
>>>>>> the server's behavior.
>>>>> 
>>>>> Well, yes. That's very much the point.
>>>>> 
>>>>> Right now, the TCP_NODELAY/Nagle setting means that we're doing
>>>>> that
>>>>> forced push at the end of _every_ RPC reply, whether or not
>>>>> there
>>>>> is
>>>>> more stuff that can be queued up in the socket. The MSG_MORE is
>>>>> the
>>>>> only thing that keeps us from doing the forced push on every
>>>>> sendpage()
>>>>> call.
>>>>> So the TCP_CORK is there to _further delay_ that forced push
>>>>> until
>>>>> we
>>>>> think the queue is empty.
>>>> 
>>>> My concern is that waiting for the queue to empty before pushing
>>>> could
>>>> improve throughput at the cost of increased average round-trip
>>>> latency.
>>>> That concern is based on experience I've had attempting to batch
>>>> sends
>>>> in the RDMA transport.
>>>> 
>>>> 
>>>>> IOW: it attempts to optimise the scheduling of that push until
>>>>> we're
>>>>> actually done pushing more stuff into the socket.
>>>> 
>>>> Yep, clear, thanks. It would help a lot if the above were
>>>> included in
>>>> the patch description.
>>>> 
>>>> And, I presume that the TCP layer will push anyway if it needs to
>>>> reclaim resources to handle more queued sends.
>>>> 
>>>> Let's also consider starvation; ie, that the server will continue
>>>> queuing replies such that it never uncorks. The logic in the
>>>> patch
>>>> appears to depend on the client stopping at some point to wait
>>>> for
>>>> the
>>>> server to catch up. There probably should be a trap door that
>>>> uncorks
>>>> after a few requests (say, 8) or certain number of bytes are
>>>> pending
>>>> without a break.
>>> 
>>> So, firstly, the TCP layer will still push every time a frame is
>>> full,
>>> so traffic does not stop altogether while TCP_CORK is set.
>> 
>> OK.
>> 
>> 
>>> Secondly, TCP will also always push on send errors (e.g. when out
>>> of free socket
>>> buffer).
>> 
>> As I presumed. OK.
>> 
>> 
>>> Thirdly, it will always push after hitting the 200ms ceiling,
>>> as described in the tcp(7) manpage.
>> 
>> That's the trap door we need to ensure no starvation or deadlock,
>> assuming there are no bugs in the TCP layer's logic.
>> 
>> 200ms seems a long time to wait, though, when average round trip
>> latencies are usually under 1ms on typical Ethernet. It would be
>> good to know how often sending has to wait this long.
> 
> If it does wait that long, then it would be because the system is under
> such load that scheduling the next task waiting for the xpt_mutex is
> taking more than 200ms. I don't see how this would make things any
> worse.

Sure, but I'm speculating that some kind of metric or tracepoint might
provide useful field diagnostic information. Not necessary for this
particular patch, but someday, perhaps.


>>> IOW: The TCP_CORK feature is not designed to block the socket
>>> forever.
>>> It is there to allow the application to hint to the TCP layer what
>>> it
>>> needs to do in exactly the same way that MSG_MORE does.
>> 
>> As long as it is only a hint, then we're good.
>> 
>> Out of interest, why not use only MSG_MORE, or remove the use
>> of MSG_MORE in favor of only cork? If these are essentially the
>> same mechanism, seems like we need only one or the other.
>> 
> 
> The advantage of TCP_CORK is that you can perform the queue length
> evaluation _after_ the sendmsg/sendpage/writev call. Since all of those
> can block due to memory allocation, socket buffer shortage, etc, then
> it is quite likely under many workloads that other RPC calls may have
> time to finish processing and get queued up.
> 
> I agree that we probably could remove MSG_MORE for tcp once TCP_CORK is
> implemented. However those flags continue to be useful for udp.

On the server, the UDP path uses xprt_sock_sendmsg(). The TCP path uses
svc_tcp_sendmsg(). Separate call paths. Removing MSG_MORE only for TCP
should be a straightforward simplification to svc_tcp_sendmsg().

--
Chuck Lever
Daire Byrne Feb. 15, 2021, 2:08 p.m. UTC | #13
----- On 14 Feb, 2021, at 16:59, Chuck Lever chuck.lever@oracle.com wrote:

>>>> I don't have a performance system to measure the improvement
>>>> accurately.
>>> 
>>> Then let's have Daire try it out, if possible.
>> 
>> I'm happy to test it out on one of our 2 x 40G NFS servers with 100 x 1G clients
>> (but it's trickier to patch the clients too atm).
> 
> Yes, that's exactly what we need. Thank you!
> 
>> Just so I'm clear, this is in addition to Chuck's "Handle TCP socket sends with
>> kernel_sendpage() again" patch from bz #209439 (which I think is now in 5.11
>> rc)? Or you want to see what this patch looks like on it's own without that
>> (e.g. v5.10)?
> 
> Please include the "Handle TCP socket sends with kernel_sendpage() again" fix.
> Or, you can pull a recent stable kernel, I think that fix is already in there.

I took v5.10.16 and used a ~100Gbit capable server with ~150 x 1 gbit clients all reading the same file from the server's pagecache as the test. Both with and without the patch, I consistently see around 90gbit worth of sends from the server for sustained periods. Any differences between them are well within margins of error for repeat runs of the benchmark.

The only noticeable difference is in the output of perf top where svc_xprt_do_enqueue goes from ~0.9% without the patch to ~3% with the patch. It now takes up second place (up from 17th place) behind native_queued_spin_lock_slowpath:

   3.57%  [kernel]                     [k] native_queued_spin_lock_slowpath
   3.07%  [kernel]                     [k] svc_xprt_do_enqueue

I also don't really see much difference in the softirq cpu usage.

So there doesn't seem to be any negative impacts of the patch but because I'm already pushing the server to it's network hardware limit (without the patch), it's also not clear if it is improving performance for this benchmark either.

I also tried with 50 clients and sustained the expected 50gbit sends from the server in both with and without the patch.

Daire
Chuck Lever Feb. 15, 2021, 6:22 p.m. UTC | #14
> On Feb 15, 2021, at 9:08 AM, Daire Byrne <daire@dneg.com> wrote:
> 
> 
> ----- On 14 Feb, 2021, at 16:59, Chuck Lever chuck.lever@oracle.com wrote:
> 
>>>>> I don't have a performance system to measure the improvement
>>>>> accurately.
>>>> 
>>>> Then let's have Daire try it out, if possible.
>>> 
>>> I'm happy to test it out on one of our 2 x 40G NFS servers with 100 x 1G clients
>>> (but it's trickier to patch the clients too atm).
>> 
>> Yes, that's exactly what we need. Thank you!
>> 
>>> Just so I'm clear, this is in addition to Chuck's "Handle TCP socket sends with
>>> kernel_sendpage() again" patch from bz #209439 (which I think is now in 5.11
>>> rc)? Or you want to see what this patch looks like on it's own without that
>>> (e.g. v5.10)?
>> 
>> Please include the "Handle TCP socket sends with kernel_sendpage() again" fix.
>> Or, you can pull a recent stable kernel, I think that fix is already in there.
> 
> I took v5.10.16 and used a ~100Gbit capable server with ~150 x 1 gbit clients all reading the same file from the server's pagecache as the test. Both with and without the patch, I consistently see around 90gbit worth of sends from the server for sustained periods. Any differences between them are well within margins of error for repeat runs of the benchmark.
> 
> The only noticeable difference is in the output of perf top where svc_xprt_do_enqueue goes from ~0.9% without the patch to ~3% with the patch. It now takes up second place (up from 17th place) behind native_queued_spin_lock_slowpath:
> 
>   3.57%  [kernel]                     [k] native_queued_spin_lock_slowpath
>   3.07%  [kernel]                     [k] svc_xprt_do_enqueue
> 
> I also don't really see much difference in the softirq cpu usage.
> 
> So there doesn't seem to be any negative impacts of the patch but because I'm already pushing the server to it's network hardware limit (without the patch), it's also not clear if it is improving performance for this benchmark either.
> 
> I also tried with 50 clients and sustained the expected 50gbit sends from the server in both with and without the patch.

Thank you Daire. That is a comforting result. The increase in do_enqueue
is curious, though.

--
Chuck Lever
Chuck Lever Feb. 16, 2021, 4:27 p.m. UTC | #15
Hey Trond-

> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> 
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> Use a counter to keep track of how many requests are queued behind the
> xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.
> 
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>

Let's move this forward a little bit.

I'd like to see the previously discussed MSG_MORE simplification
integrated into this patch.

In addition to Daire's testing, I've done some testing:
- No behavior regressions noted
- No changes in large I/O throughput
- Slightly shorter RTTs on software build

And the current version of the patch is now in the for-rc branch
of https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/
to get broader testing exposure.

This work is a candidate for a second NFSD PR during the 5.12
merge window, along with the other patches currently in the
for-rc branch.


> ---
> include/linux/sunrpc/svcsock.h | 2 ++
> net/sunrpc/svcsock.c           | 8 +++++++-
> 2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
> index b7ac7fe68306..bcc555c7ae9c 100644
> --- a/include/linux/sunrpc/svcsock.h
> +++ b/include/linux/sunrpc/svcsock.h
> @@ -35,6 +35,8 @@ struct svc_sock {
> 	/* Total length of the data (not including fragment headers)
> 	 * received so far in the fragments making up this rpc: */
> 	u32			sk_datalen;
> +	/* Number of queued send requests */
> +	atomic_t		sk_sendqlen;
> 
> 	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
> };
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 5a809c64dc7b..231f510a4830 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
> 
> 	svc_tcp_release_rqst(rqstp);
> 
> +	atomic_inc(&svsk->sk_sendqlen);
> 	mutex_lock(&xprt->xpt_mutex);
> 	if (svc_xprt_is_dead(xprt))
> 		goto out_notconn;
> +	tcp_sock_set_cork(svsk->sk_sk, true);
> 	err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
> 	xdr_free_bvec(xdr);
> 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> 	if (err < 0 || sent != (xdr->len + sizeof(marker)))
> 		goto out_close;
> +	if (atomic_dec_and_test(&svsk->sk_sendqlen))
> +		tcp_sock_set_cork(svsk->sk_sk, false);
> 	mutex_unlock(&xprt->xpt_mutex);
> 	return sent;
> 
> out_notconn:
> +	atomic_dec(&svsk->sk_sendqlen);
> 	mutex_unlock(&xprt->xpt_mutex);
> 	return -ENOTCONN;
> out_close:
> @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
> 		  (err < 0) ? err : sent, xdr->len);
> 	set_bit(XPT_CLOSE, &xprt->xpt_flags);
> 	svc_xprt_enqueue(xprt);
> +	atomic_dec(&svsk->sk_sendqlen);
> 	mutex_unlock(&xprt->xpt_mutex);
> 	return -EAGAIN;
> }
> @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
> 		svsk->sk_datalen = 0;
> 		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
> 
> -		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
> +		tcp_sock_set_nodelay(sk);
> 
> 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> 		switch (sk->sk_state) {
> -- 
> 2.29.2
> 

--
Chuck Lever
Trond Myklebust Feb. 16, 2021, 5:10 p.m. UTC | #16
On Tue, 2021-02-16 at 16:27 +0000, Chuck Lever wrote:
> Hey Trond-
> 
> > On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
> > 
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > Use a counter to keep track of how many requests are queued behind
> > the
> > xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.
> > 
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> Let's move this forward a little bit.
> 
> I'd like to see the previously discussed MSG_MORE simplification
> integrated into this patch.
> 

Hmm... I think I'd prefer to make it incremental to this one. I wasn't
too sure about whether or not removing MSG_SENDPAGE_NOTLAST makes a
difference.
AFAICS, the answer is no, but just in case, let's make it easy to back
this change out.

> In addition to Daire's testing, I've done some testing:
> - No behavior regressions noted
> - No changes in large I/O throughput
> - Slightly shorter RTTs on software build
> 
> And the current version of the patch is now in the for-rc branch
> of https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/
> to get broader testing exposure.
> 
> This work is a candidate for a second NFSD PR during the 5.12
> merge window, along with the other patches currently in the
> for-rc branch.
> 
> 
> > ---
> > include/linux/sunrpc/svcsock.h | 2 ++
> > net/sunrpc/svcsock.c           | 8 +++++++-
> > 2 files changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/sunrpc/svcsock.h
> > b/include/linux/sunrpc/svcsock.h
> > index b7ac7fe68306..bcc555c7ae9c 100644
> > --- a/include/linux/sunrpc/svcsock.h
> > +++ b/include/linux/sunrpc/svcsock.h
> > @@ -35,6 +35,8 @@ struct svc_sock {
> >         /* Total length of the data (not including fragment
> > headers)
> >          * received so far in the fragments making up this rpc: */
> >         u32                     sk_datalen;
> > +       /* Number of queued send requests */
> > +       atomic_t                sk_sendqlen;
> > 
> >         struct page *           sk_pages[RPCSVC_MAXPAGES];      /*
> > received data */
> > };
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 5a809c64dc7b..231f510a4830 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst
> > *rqstp)
> > 
> >         svc_tcp_release_rqst(rqstp);
> > 
> > +       atomic_inc(&svsk->sk_sendqlen);
> >         mutex_lock(&xprt->xpt_mutex);
> >         if (svc_xprt_is_dead(xprt))
> >                 goto out_notconn;
> > +       tcp_sock_set_cork(svsk->sk_sk, true);
> >         err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker,
> > &sent);
> >         xdr_free_bvec(xdr);
> >         trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
> >         if (err < 0 || sent != (xdr->len + sizeof(marker)))
> >                 goto out_close;
> > +       if (atomic_dec_and_test(&svsk->sk_sendqlen))
> > +               tcp_sock_set_cork(svsk->sk_sk, false);
> >         mutex_unlock(&xprt->xpt_mutex);
> >         return sent;
> > 
> > out_notconn:
> > +       atomic_dec(&svsk->sk_sendqlen);
> >         mutex_unlock(&xprt->xpt_mutex);
> >         return -ENOTCONN;
> > out_close:
> > @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst
> > *rqstp)
> >                   (err < 0) ? err : sent, xdr->len);
> >         set_bit(XPT_CLOSE, &xprt->xpt_flags);
> >         svc_xprt_enqueue(xprt);
> > +       atomic_dec(&svsk->sk_sendqlen);
> >         mutex_unlock(&xprt->xpt_mutex);
> >         return -EAGAIN;
> > }
> > @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock
> > *svsk, struct svc_serv *serv)
> >                 svsk->sk_datalen = 0;
> >                 memset(&svsk->sk_pages[0], 0, sizeof(svsk-
> > >sk_pages));
> > 
> > -               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
> > +               tcp_sock_set_nodelay(sk);
> > 
> >                 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> >                 switch (sk->sk_state) {
> > -- 
> > 2.29.2
> > 
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever Feb. 16, 2021, 5:12 p.m. UTC | #17
> On Feb 16, 2021, at 12:10 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Tue, 2021-02-16 at 16:27 +0000, Chuck Lever wrote:
>> Hey Trond-
>> 
>>> On Feb 13, 2021, at 3:25 PM, trondmy@kernel.org wrote:
>>> 
>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> 
>>> Use a counter to keep track of how many requests are queued behind
>>> the
>>> xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.
>>> 
>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>> 
>> Let's move this forward a little bit.
>> 
>> I'd like to see the previously discussed MSG_MORE simplification
>> integrated into this patch.
>> 
> 
> Hmm... I think I'd prefer to make it incremental to this one. I wasn't
> too sure about whether or not removing MSG_SENDPAGE_NOTLAST makes a
> difference.
> AFAICS, the answer is no, but just in case, let's make it easy to back
> this change out.

OK. Please send an incremental patch. Thanks!


>> In addition to Daire's testing, I've done some testing:
>> - No behavior regressions noted
>> - No changes in large I/O throughput
>> - Slightly shorter RTTs on software build
>> 
>> And the current version of the patch is now in the for-rc branch
>> of https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/
>> to get broader testing exposure.
>> 
>> This work is a candidate for a second NFSD PR during the 5.12
>> merge window, along with the other patches currently in the
>> for-rc branch.
>> 
>> 
>>> ---
>>> include/linux/sunrpc/svcsock.h | 2 ++
>>> net/sunrpc/svcsock.c           | 8 +++++++-
>>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>> 
>>> diff --git a/include/linux/sunrpc/svcsock.h
>>> b/include/linux/sunrpc/svcsock.h
>>> index b7ac7fe68306..bcc555c7ae9c 100644
>>> --- a/include/linux/sunrpc/svcsock.h
>>> +++ b/include/linux/sunrpc/svcsock.h
>>> @@ -35,6 +35,8 @@ struct svc_sock {
>>>         /* Total length of the data (not including fragment
>>> headers)
>>>          * received so far in the fragments making up this rpc: */
>>>         u32                     sk_datalen;
>>> +       /* Number of queued send requests */
>>> +       atomic_t                sk_sendqlen;
>>> 
>>>         struct page *           sk_pages[RPCSVC_MAXPAGES];      /*
>>> received data */
>>> };
>>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>>> index 5a809c64dc7b..231f510a4830 100644
>>> --- a/net/sunrpc/svcsock.c
>>> +++ b/net/sunrpc/svcsock.c
>>> @@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst
>>> *rqstp)
>>> 
>>>         svc_tcp_release_rqst(rqstp);
>>> 
>>> +       atomic_inc(&svsk->sk_sendqlen);
>>>         mutex_lock(&xprt->xpt_mutex);
>>>         if (svc_xprt_is_dead(xprt))
>>>                 goto out_notconn;
>>> +       tcp_sock_set_cork(svsk->sk_sk, true);
>>>         err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker,
>>> &sent);
>>>         xdr_free_bvec(xdr);
>>>         trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
>>>         if (err < 0 || sent != (xdr->len + sizeof(marker)))
>>>                 goto out_close;
>>> +       if (atomic_dec_and_test(&svsk->sk_sendqlen))
>>> +               tcp_sock_set_cork(svsk->sk_sk, false);
>>>         mutex_unlock(&xprt->xpt_mutex);
>>>         return sent;
>>> 
>>> out_notconn:
>>> +       atomic_dec(&svsk->sk_sendqlen);
>>>         mutex_unlock(&xprt->xpt_mutex);
>>>         return -ENOTCONN;
>>> out_close:
>>> @@ -1192,6 +1197,7 @@ static int svc_tcp_sendto(struct svc_rqst
>>> *rqstp)
>>>                   (err < 0) ? err : sent, xdr->len);
>>>         set_bit(XPT_CLOSE, &xprt->xpt_flags);
>>>         svc_xprt_enqueue(xprt);
>>> +       atomic_dec(&svsk->sk_sendqlen);
>>>         mutex_unlock(&xprt->xpt_mutex);
>>>         return -EAGAIN;
>>> }
>>> @@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock
>>> *svsk, struct svc_serv *serv)
>>>                 svsk->sk_datalen = 0;
>>>                 memset(&svsk->sk_pages[0], 0, sizeof(svsk-
>>>> sk_pages));
>>> 
>>> -               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
>>> +               tcp_sock_set_nodelay(sk);
>>> 
>>>                 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
>>>                 switch (sk->sk_state) {
>>> -- 
>>> 2.29.2
>>> 
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Trond Myklebust
> CTO, Hammerspace Inc
> 4984 El Camino Real, Suite 208
> Los Altos, CA 94022
> ​
> www.hammer.space

--
Chuck Lever
diff mbox series

Patch

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index b7ac7fe68306..bcc555c7ae9c 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -35,6 +35,8 @@  struct svc_sock {
 	/* Total length of the data (not including fragment headers)
 	 * received so far in the fragments making up this rpc: */
 	u32			sk_datalen;
+	/* Number of queued send requests */
+	atomic_t		sk_sendqlen;
 
 	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
 };
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5a809c64dc7b..231f510a4830 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1171,18 +1171,23 @@  static int svc_tcp_sendto(struct svc_rqst *rqstp)
 
 	svc_tcp_release_rqst(rqstp);
 
+	atomic_inc(&svsk->sk_sendqlen);
 	mutex_lock(&xprt->xpt_mutex);
 	if (svc_xprt_is_dead(xprt))
 		goto out_notconn;
+	tcp_sock_set_cork(svsk->sk_sk, true);
 	err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
 	xdr_free_bvec(xdr);
 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
 	if (err < 0 || sent != (xdr->len + sizeof(marker)))
 		goto out_close;
+	if (atomic_dec_and_test(&svsk->sk_sendqlen))
+		tcp_sock_set_cork(svsk->sk_sk, false);
 	mutex_unlock(&xprt->xpt_mutex);
 	return sent;
 
 out_notconn:
+	atomic_dec(&svsk->sk_sendqlen);
 	mutex_unlock(&xprt->xpt_mutex);
 	return -ENOTCONN;
 out_close:
@@ -1192,6 +1197,7 @@  static int svc_tcp_sendto(struct svc_rqst *rqstp)
 		  (err < 0) ? err : sent, xdr->len);
 	set_bit(XPT_CLOSE, &xprt->xpt_flags);
 	svc_xprt_enqueue(xprt);
+	atomic_dec(&svsk->sk_sendqlen);
 	mutex_unlock(&xprt->xpt_mutex);
 	return -EAGAIN;
 }
@@ -1261,7 +1267,7 @@  static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 		svsk->sk_datalen = 0;
 		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
 
-		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+		tcp_sock_set_nodelay(sk);
 
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		switch (sk->sk_state) {