diff mbox

[1/4] sunrpc: flag transports as using IETF approved congestion control protocols

Message ID 20170223170337.10686-2-jlayton@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jeff Layton Feb. 23, 2017, 5:03 p.m. UTC
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h          | 1 +
 net/sunrpc/svcsock.c                     | 1 +
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
 3 files changed, 4 insertions(+)

Comments

Tom Talpey Feb. 23, 2017, 7:42 p.m. UTC | #1
On 2/23/2017 12:03 PM, Jeff Layton wrote:
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  include/linux/sunrpc/svc_xprt.h          | 1 +
>  net/sunrpc/svcsock.c                     | 1 +
>  net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++

There's a possibly-important detail here. Not all RDMA transports have
"IETF-approved congestion control", for example, RoCEv1. However, iWARP
and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
RoCEv1 may not fall under this restriction.

Net-net, inspecting only the RDMA attribute of the transport may be
insufficient here.

It could be argued however that the xprtrdma layer, with its rpcrdma
crediting, provides such congestion. But that needs to be made
explicit, and perhaps, discussed in IETF. Initially, I think it would
be important to flag this as a point for the future. For now, it may
be best to flag RoCEv1 as not supporting congestion.

Tom.

>  3 files changed, 4 insertions(+)
>
> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
> index 7440290f64ac..f8aa9452b63c 100644
> --- a/include/linux/sunrpc/svc_xprt.h
> +++ b/include/linux/sunrpc/svc_xprt.h
> @@ -67,6 +67,7 @@ struct svc_xprt {
>  #define XPT_CACHE_AUTH	11		/* cache auth info */
>  #define XPT_LOCAL	12		/* connection from loopback interface */
>  #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
> +#define XPT_CONG_CTRL	14		/* IETF approved congestion control protocol */
>
>  	struct svc_serv		*xpt_server;	/* service for transport */
>  	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index de066acdb34e..1956b8b96b2d 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
>  	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
>  		      &svsk->sk_xprt, serv);
>  	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
> +	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
>  	if (sk->sk_state == TCP_LISTEN) {
>  		dprintk("setting up TCP socket for listening\n");
>  		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 39652d390a9c..96b4797c2c54 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -571,6 +571,8 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
>  	spin_lock_init(&cma_xprt->sc_ctxt_lock);
>  	spin_lock_init(&cma_xprt->sc_map_lock);
>
> +	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
> +
>  	if (listener)
>  		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Feb. 23, 2017, 8 p.m. UTC | #2
On Thu, 2017-02-23 at 14:42 -0500, Tom Talpey wrote:
> On 2/23/2017 12:03 PM, Jeff Layton wrote:
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> >  include/linux/sunrpc/svc_xprt.h          | 1 +
> >  net/sunrpc/svcsock.c                     | 1 +
> >  net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
> 
> There's a possibly-important detail here. Not all RDMA transports have
> "IETF-approved congestion control", for example, RoCEv1. However, iWARP
> and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
> RoCEv1 may not fall under this restriction.
> 
> Net-net, inspecting only the RDMA attribute of the transport may be
> insufficient here.
> 
> It could be argued however that the xprtrdma layer, with its rpcrdma
> crediting, provides such congestion. But that needs to be made
> explicit, and perhaps, discussed in IETF. Initially, I think it would
> be important to flag this as a point for the future. For now, it may
> be best to flag RoCEv1 as not supporting congestion.
> 
> Tom.
> 

(cc'ing Chuck and the linux-rdma list)

Thanks Tom, that's very interesting.

Not being well versed in the xprtrdma layer, what condition should we
use here to set the flag? git grep shows that the string "ROCEV1" only
shows up in the bxnt_en driver. Is there some way to determine this
generically for any given RDMA driver?


> >  3 files changed, 4 insertions(+)
> > 
> > diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
> > index 7440290f64ac..f8aa9452b63c 100644
> > --- a/include/linux/sunrpc/svc_xprt.h
> > +++ b/include/linux/sunrpc/svc_xprt.h
> > @@ -67,6 +67,7 @@ struct svc_xprt {
> >  #define XPT_CACHE_AUTH	11		/* cache auth info */
> >  #define XPT_LOCAL	12		/* connection from loopback interface */
> >  #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
> > +#define XPT_CONG_CTRL	14		/* IETF approved congestion control protocol */
> > 
> >  	struct svc_serv		*xpt_server;	/* service for transport */
> >  	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index de066acdb34e..1956b8b96b2d 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
> >  	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
> >  		      &svsk->sk_xprt, serv);
> >  	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
> > +	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
> >  	if (sk->sk_state == TCP_LISTEN) {
> >  		dprintk("setting up TCP socket for listening\n");
> >  		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
> > diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> > index 39652d390a9c..96b4797c2c54 100644
> > --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> > +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> > @@ -571,6 +571,8 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
> >  	spin_lock_init(&cma_xprt->sc_ctxt_lock);
> >  	spin_lock_init(&cma_xprt->sc_map_lock);
> > 
> > +	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
> > +
> >  	if (listener)
> >  		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
> > 
> >
Tom Talpey Feb. 23, 2017, 8:06 p.m. UTC | #3
On 2/23/2017 3:00 PM, Jeff Layton wrote:
> On Thu, 2017-02-23 at 14:42 -0500, Tom Talpey wrote:
>> On 2/23/2017 12:03 PM, Jeff Layton wrote:
>>> Signed-off-by: Jeff Layton <jlayton@redhat.com>
>>> ---
>>>  include/linux/sunrpc/svc_xprt.h          | 1 +
>>>  net/sunrpc/svcsock.c                     | 1 +
>>>  net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
>>
>> There's a possibly-important detail here. Not all RDMA transports have
>> "IETF-approved congestion control", for example, RoCEv1. However, iWARP
>> and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
>> RoCEv1 may not fall under this restriction.
>>
>> Net-net, inspecting only the RDMA attribute of the transport may be
>> insufficient here.
>>
>> It could be argued however that the xprtrdma layer, with its rpcrdma
>> crediting, provides such congestion. But that needs to be made
>> explicit, and perhaps, discussed in IETF. Initially, I think it would
>> be important to flag this as a point for the future. For now, it may
>> be best to flag RoCEv1 as not supporting congestion.
>>
>> Tom.
>>
>
> (cc'ing Chuck and the linux-rdma list)
>
> Thanks Tom, that's very interesting.
>
> Not being well versed in the xprtrdma layer, what condition should we
> use here to set the flag? git grep shows that the string "ROCEV1" only
> shows up in the bxnt_en driver. Is there some way to determine this
> generically for any given RDMA driver?

I would not code RoCEv1 as an exception, I would code iWARP and RoCEv2
as the only eligible ones. There are any number of other possibilities,
none of which should be automatically flagged as congestion-controlled.

I'm also not sure I'm comfortable with hardcoding such a list into RPC.
But it may be the best you can do for now. Chuck, are you aware of a
verbs interface to obtain the RDMA transport type?

Tom.

>
>
>>>  3 files changed, 4 insertions(+)
>>>
>>> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
>>> index 7440290f64ac..f8aa9452b63c 100644
>>> --- a/include/linux/sunrpc/svc_xprt.h
>>> +++ b/include/linux/sunrpc/svc_xprt.h
>>> @@ -67,6 +67,7 @@ struct svc_xprt {
>>>  #define XPT_CACHE_AUTH	11		/* cache auth info */
>>>  #define XPT_LOCAL	12		/* connection from loopback interface */
>>>  #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
>>> +#define XPT_CONG_CTRL	14		/* IETF approved congestion control protocol */
>>>
>>>  	struct svc_serv		*xpt_server;	/* service for transport */
>>>  	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
>>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>>> index de066acdb34e..1956b8b96b2d 100644
>>> --- a/net/sunrpc/svcsock.c
>>> +++ b/net/sunrpc/svcsock.c
>>> @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
>>>  	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
>>>  		      &svsk->sk_xprt, serv);
>>>  	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
>>> +	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
>>>  	if (sk->sk_state == TCP_LISTEN) {
>>>  		dprintk("setting up TCP socket for listening\n");
>>>  		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
>>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>>> index 39652d390a9c..96b4797c2c54 100644
>>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>>> @@ -571,6 +571,8 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
>>>  	spin_lock_init(&cma_xprt->sc_ctxt_lock);
>>>  	spin_lock_init(&cma_xprt->sc_map_lock);
>>>
>>> +	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
>>> +
>>>  	if (listener)
>>>  		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
>>>
>>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Feb. 23, 2017, 8:11 p.m. UTC | #4
On Thu, Feb 23, 2017 at 03:06:25PM -0500, Tom Talpey wrote:
> On 2/23/2017 3:00 PM, Jeff Layton wrote:
> >On Thu, 2017-02-23 at 14:42 -0500, Tom Talpey wrote:
> >>On 2/23/2017 12:03 PM, Jeff Layton wrote:
> >>>Signed-off-by: Jeff Layton <jlayton@redhat.com>
> >>>---
> >>> include/linux/sunrpc/svc_xprt.h          | 1 +
> >>> net/sunrpc/svcsock.c                     | 1 +
> >>> net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
> >>
> >>There's a possibly-important detail here. Not all RDMA transports have
> >>"IETF-approved congestion control", for example, RoCEv1. However, iWARP
> >>and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
> >>RoCEv1 may not fall under this restriction.
> >>
> >>Net-net, inspecting only the RDMA attribute of the transport may be
> >>insufficient here.
> >>
> >>It could be argued however that the xprtrdma layer, with its rpcrdma
> >>crediting, provides such congestion. But that needs to be made
> >>explicit, and perhaps, discussed in IETF. Initially, I think it would
> >>be important to flag this as a point for the future. For now, it may
> >>be best to flag RoCEv1 as not supporting congestion.
> >>
> >>Tom.
> >>
> >
> >(cc'ing Chuck and the linux-rdma list)
> >
> >Thanks Tom, that's very interesting.
> >
> >Not being well versed in the xprtrdma layer, what condition should we
> >use here to set the flag? git grep shows that the string "ROCEV1" only
> >shows up in the bxnt_en driver. Is there some way to determine this
> >generically for any given RDMA driver?
> 
> I would not code RoCEv1 as an exception, I would code iWARP and RoCEv2
> as the only eligible ones. There are any number of other possibilities,
> none of which should be automatically flagged as congestion-controlled.
> 
> I'm also not sure I'm comfortable with hardcoding such a list into RPC.
> But it may be the best you can do for now. Chuck, are you aware of a
> verbs interface to obtain the RDMA transport type?

If this gets too complicated--we've been allowing NFSv4/UDP for years,
letting this one (arguable?) exception through in RDMA a little longer
won't kill us.

(And if we really shouldn't be doing NFSv4 over some RDMA transports--is
it worth supporting them at all, if the only support we can get is
NFSv3-only?)

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Feb. 23, 2017, 8:15 p.m. UTC | #5
--
Chuck Lever

> On Feb 23, 2017, at 2:42 PM, Tom Talpey <tom@talpey.com> wrote:
> 
>> On 2/23/2017 12:03 PM, Jeff Layton wrote:
>> Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> ---
>> include/linux/sunrpc/svc_xprt.h          | 1 +
>> net/sunrpc/svcsock.c                     | 1 +
>> net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
> 
> There's a possibly-important detail here. Not all RDMA transports have
> "IETF-approved congestion control", for example, RoCEv1. However, iWARP
> and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
> RoCEv1 may not fall under this restriction.

Probably a discussion that could be handled in rfc5667bis. But it's
not clear what IETF-approved congestion control is, precisely.

We'd have to say something about InfiniBand (a transport which
is not specified by the IETF) too.


> Net-net, inspecting only the RDMA attribute of the transport may be
> insufficient here.

The transport implementation would have to set it in the svc_xprt
rather than having it be a constant field in the xprt class.


> It could be argued however that the xprtrdma layer, with its rpcrdma
> crediting, provides such congestion. But that needs to be made
> explicit, and perhaps, discussed in IETF. Initially, I think it would
> be important to flag this as a point for the future. For now, it may
> be best to flag RoCEv1 as not supporting congestion.
> 
> Tom.
> 
>> 3 files changed, 4 insertions(+)
>> 
>> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
>> index 7440290f64ac..f8aa9452b63c 100644
>> --- a/include/linux/sunrpc/svc_xprt.h
>> +++ b/include/linux/sunrpc/svc_xprt.h
>> @@ -67,6 +67,7 @@ struct svc_xprt {
>> #define XPT_CACHE_AUTH    11        /* cache auth info */
>> #define XPT_LOCAL    12        /* connection from loopback interface */
>> #define XPT_KILL_TEMP   13        /* call xpo_kill_temp_xprt before closing */
>> +#define XPT_CONG_CTRL    14        /* IETF approved congestion control protocol */
>> 
>>    struct svc_serv        *xpt_server;    /* service for transport */
>>    atomic_t                xpt_reserved;    /* space on outq that is rsvd */
>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>> index de066acdb34e..1956b8b96b2d 100644
>> --- a/net/sunrpc/svcsock.c
>> +++ b/net/sunrpc/svcsock.c
>> @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
>>    svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
>>              &svsk->sk_xprt, serv);
>>    set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
>> +    set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
>>    if (sk->sk_state == TCP_LISTEN) {
>>        dprintk("setting up TCP socket for listening\n");
>>        set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> index 39652d390a9c..96b4797c2c54 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> @@ -571,6 +571,8 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
>>    spin_lock_init(&cma_xprt->sc_ctxt_lock);
>>    spin_lock_init(&cma_xprt->sc_map_lock);
>> 
>> +    set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
>> +
>>    if (listener)
>>        set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
>> 
>> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Feb. 23, 2017, 8:17 p.m. UTC | #6
--
Chuck Lever

> On Feb 23, 2017, at 3:06 PM, Tom Talpey <tom@talpey.com> wrote:
> 
>> On 2/23/2017 3:00 PM, Jeff Layton wrote:
>>> On Thu, 2017-02-23 at 14:42 -0500, Tom Talpey wrote:
>>>> On 2/23/2017 12:03 PM, Jeff Layton wrote:
>>>> Signed-off-by: Jeff Layton <jlayton@redhat.com>
>>>> ---
>>>> include/linux/sunrpc/svc_xprt.h          | 1 +
>>>> net/sunrpc/svcsock.c                     | 1 +
>>>> net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
>>> 
>>> There's a possibly-important detail here. Not all RDMA transports have
>>> "IETF-approved congestion control", for example, RoCEv1. However, iWARP
>>> and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
>>> RoCEv1 may not fall under this restriction.
>>> 
>>> Net-net, inspecting only the RDMA attribute of the transport may be
>>> insufficient here.
>>> 
>>> It could be argued however that the xprtrdma layer, with its rpcrdma
>>> crediting, provides such congestion. But that needs to be made
>>> explicit, and perhaps, discussed in IETF. Initially, I think it would
>>> be important to flag this as a point for the future. For now, it may
>>> be best to flag RoCEv1 as not supporting congestion.
>>> 
>>> Tom.
>>> 
>> 
>> (cc'ing Chuck and the linux-rdma list)
>> 
>> Thanks Tom, that's very interesting.
>> 
>> Not being well versed in the xprtrdma layer, what condition should we
>> use here to set the flag? git grep shows that the string "ROCEV1" only
>> shows up in the bxnt_en driver. Is there some way to determine this
>> generically for any given RDMA driver?
> 
> I would not code RoCEv1 as an exception, I would code iWARP and RoCEv2
> as the only eligible ones. There are any number of other possibilities,
> none of which should be automatically flagged as congestion-controlled.
> 
> I'm also not sure I'm comfortable with hardcoding such a list into RPC.
> But it may be the best you can do for now. Chuck, are you aware of a
> verbs interface to obtain the RDMA transport type?

Yes, I can have a look when I get to Connectathon.



> 
> Tom.
> 
>> 
>> 
>>>> 3 files changed, 4 insertions(+)
>>>> 
>>>> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
>>>> index 7440290f64ac..f8aa9452b63c 100644
>>>> --- a/include/linux/sunrpc/svc_xprt.h
>>>> +++ b/include/linux/sunrpc/svc_xprt.h
>>>> @@ -67,6 +67,7 @@ struct svc_xprt {
>>>> #define XPT_CACHE_AUTH    11        /* cache auth info */
>>>> #define XPT_LOCAL    12        /* connection from loopback interface */
>>>> #define XPT_KILL_TEMP   13        /* call xpo_kill_temp_xprt before closing */
>>>> +#define XPT_CONG_CTRL    14        /* IETF approved congestion control protocol */
>>>> 
>>>>    struct svc_serv        *xpt_server;    /* service for transport */
>>>>    atomic_t                xpt_reserved;    /* space on outq that is rsvd */
>>>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>>>> index de066acdb34e..1956b8b96b2d 100644
>>>> --- a/net/sunrpc/svcsock.c
>>>> +++ b/net/sunrpc/svcsock.c
>>>> @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
>>>>    svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
>>>>              &svsk->sk_xprt, serv);
>>>>    set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
>>>> +    set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
>>>>    if (sk->sk_state == TCP_LISTEN) {
>>>>        dprintk("setting up TCP socket for listening\n");
>>>>        set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
>>>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>>>> index 39652d390a9c..96b4797c2c54 100644
>>>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>>>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>>>> @@ -571,6 +571,8 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
>>>>    spin_lock_init(&cma_xprt->sc_ctxt_lock);
>>>>    spin_lock_init(&cma_xprt->sc_map_lock);
>>>> 
>>>> +    set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
>>>> +
>>>>    if (listener)
>>>>        set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
>>>> 
>>>> 
>> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Feb. 23, 2017, 8:26 p.m. UTC | #7
On Thu, Feb 23, 2017 at 03:11:09PM -0500, J. Bruce Fields wrote:

> (And if we really shouldn't be doing NFSv4 over some RDMA transports--is
> it worth supporting them at all, if the only support we can get is
> NFSv3-only?)

This seems like a strange comment - NFSv4 should be supported on all
RDMA transports, surely?

Largely RDMA lives in its own congestion management world. If a site
is running RDMA they have done something to mitigate interactions with
TCP style congestion control on the same wire.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Feb. 23, 2017, 8:32 p.m. UTC | #8
On Thu, 2017-02-23 at 15:11 -0500, J. Bruce Fields wrote:
> On Thu, Feb 23, 2017 at 03:06:25PM -0500, Tom Talpey wrote:
> > On 2/23/2017 3:00 PM, Jeff Layton wrote:
> > > On Thu, 2017-02-23 at 14:42 -0500, Tom Talpey wrote:
> > > > On 2/23/2017 12:03 PM, Jeff Layton wrote:
> > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > ---
> > > > > include/linux/sunrpc/svc_xprt.h          | 1 +
> > > > > net/sunrpc/svcsock.c                     | 1 +
> > > > > net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++
> > > > 
> > > > There's a possibly-important detail here. Not all RDMA transports have
> > > > "IETF-approved congestion control", for example, RoCEv1. However, iWARP
> > > > and (arguably) RoCEv2 do. On the other hand, as a nonroutable protocol,
> > > > RoCEv1 may not fall under this restriction.
> > > > 
> > > > Net-net, inspecting only the RDMA attribute of the transport may be
> > > > insufficient here.
> > > > 
> > > > It could be argued however that the xprtrdma layer, with its rpcrdma
> > > > crediting, provides such congestion. But that needs to be made
> > > > explicit, and perhaps, discussed in IETF. Initially, I think it would
> > > > be important to flag this as a point for the future. For now, it may
> > > > be best to flag RoCEv1 as not supporting congestion.
> > > > 
> > > > Tom.
> > > > 
> > > 
> > > (cc'ing Chuck and the linux-rdma list)
> > > 
> > > Thanks Tom, that's very interesting.
> > > 
> > > Not being well versed in the xprtrdma layer, what condition should we
> > > use here to set the flag? git grep shows that the string "ROCEV1" only
> > > shows up in the bxnt_en driver. Is there some way to determine this
> > > generically for any given RDMA driver?
> > 
> > I would not code RoCEv1 as an exception, I would code iWARP and RoCEv2
> > as the only eligible ones. There are any number of other possibilities,
> > none of which should be automatically flagged as congestion-controlled.
> > 
> > I'm also not sure I'm comfortable with hardcoding such a list into RPC.
> > But it may be the best you can do for now. Chuck, are you aware of a
> > verbs interface to obtain the RDMA transport type?
> 
> If this gets too complicated--we've been allowing NFSv4/UDP for years,
> letting this one (arguable?) exception through in RDMA a little longer
> won't kill us.
> 

That's my feeling too. This is still an improvement over the status
quo, and hopefully anyone with RDMA hardware will have a bit more clue
as to whether it can properly support v4.

We can always further restrict when rdma_create_xprt sets the flag in a
later patch if we figure out some way to determine this generically. I
will plan to add a comment that we're setting this RDMA svc_xprt
universally even though it may not always be true.

> (And if we really shouldn't be doing NFSv4 over some RDMA transports--is
> it worth supporting them at all, if the only support we can get is
> NFSv3-only?)
> 

I'd be inclined to leave them working and just deny the use of v4 on
such transports.
Tom Talpey Feb. 23, 2017, 8:33 p.m. UTC | #9
On 2/23/2017 3:26 PM, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2017 at 03:11:09PM -0500, J. Bruce Fields wrote:
>
>> (And if we really shouldn't be doing NFSv4 over some RDMA transports--is
>> it worth supporting them at all, if the only support we can get is
>> NFSv3-only?)
>
> This seems like a strange comment - NFSv4 should be supported on all
> RDMA transports, surely?
>
> Largely RDMA lives in its own congestion management world. If a site
> is running RDMA they have done something to mitigate interactions with
> TCP style congestion control on the same wire.

The key words are "IETF-approved". Mitigation and Interaction are
operational decisions, not protocol design.

We could argue that the requirement is bogus, or that all RDMA transports
comply, or that the RPCRDMA layer provides it, but none of these arguments
would grant IETF approval. That said, I think there's a lot of
room for interpretation here.

Tom.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Feb. 23, 2017, 8:55 p.m. UTC | #10
On Thu, Feb 23, 2017 at 03:33:53PM -0500, Tom Talpey wrote:

> The key words are "IETF-approved". Mitigation and Interaction are
> operational decisions, not protocol design.

We are talking about this bit from RFC 3530 ?

   Where an NFS version 4 implementation supports operation over the IP
   network protocol, the supported transports between NFS and IP MUST be
   among the IETF-approved congestion control transport protocols, which
   include TCP and SCTP.

This gives most of RDMA an out as it is not over the IP protocol. The
only obvious troubled one is RoCEv2..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey Feb. 24, 2017, 3:08 p.m. UTC | #11
On 2/23/2017 3:55 PM, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2017 at 03:33:53PM -0500, Tom Talpey wrote:
>
>> The key words are "IETF-approved". Mitigation and Interaction are
>> operational decisions, not protocol design.
>
> We are talking about this bit from RFC 3530 ?
>
>    Where an NFS version 4 implementation supports operation over the IP
>    network protocol, the supported transports between NFS and IP MUST be
>    among the IETF-approved congestion control transport protocols, which
>    include TCP and SCTP.
>
> This gives most of RDMA an out as it is not over the IP protocol. The
> only obvious troubled one is RoCEv2..

RFC7530 has updated this text somewhat, but it's similar, yes.

https://tools.ietf.org/html/rfc7530#section-3.1

The specification language specifically calls out IP-based transports,
which is why I mentioned that RoCEv1, being non-IP-based and not even
truly routable, could obtain a bye. But the NFS layer IMO should really
not be digging down to this level. I think it would be much better if
each transport could expose a relevant attribute, which NFS could
optionally inspect.

As you mention, RoCEv2 is a bit of a pickle. It's UDP/IP-based, and it
does have end-to-end congestion control, but technically speaking it
is not "IETF approved". I'm not sure what call to make there.

Tom.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Feb. 24, 2017, 5:17 p.m. UTC | #12
On Fri, 2017-02-24 at 10:08 -0500, Tom Talpey wrote:
> On 2/23/2017 3:55 PM, Jason Gunthorpe wrote:
> > On Thu, Feb 23, 2017 at 03:33:53PM -0500, Tom Talpey wrote:
> > 
> > > The key words are "IETF-approved". Mitigation and Interaction are
> > > operational decisions, not protocol design.
> > 
> > We are talking about this bit from RFC 3530 ?
> > 
> >    Where an NFS version 4 implementation supports operation over the IP
> >    network protocol, the supported transports between NFS and IP MUST be
> >    among the IETF-approved congestion control transport protocols, which
> >    include TCP and SCTP.
> > 
> > This gives most of RDMA an out as it is not over the IP protocol. The
> > only obvious troubled one is RoCEv2..
> 
> RFC7530 has updated this text somewhat, but it's similar, yes.
> 
> https://tools.ietf.org/html/rfc7530#section-3.1
> 
> The specification language specifically calls out IP-based transports,
> which is why I mentioned that RoCEv1, being non-IP-based and not even
> truly routable, could obtain a bye. But the NFS layer IMO should really
> not be digging down to this level. I think it would be much better if
> each transport could expose a relevant attribute, which NFS could
> optionally inspect.
> 
> As you mention, RoCEv2 is a bit of a pickle. It's UDP/IP-based, and it
> does have end-to-end congestion control, but technically speaking it
> is not "IETF approved". I'm not sure what call to make there.
> 
> Tom.

I'm perfectly willing to forgo the "IETF approved" verbiage since it's
ambiguous anyway. We can certainly just use more weasel words in the
comments as well.

RFC5661 has a bit more to say on the matter:

   NFSv4.1 works over Remote Direct Memory Access (RDMA) and non-RDMA-
   based transports with the following attributes:
   o  The transport supports reliable delivery of data, which NFSv4.1
      requires but neither NFSv4.1 nor RPC has facilities for ensuring
      [34].

   o  The transport delivers data in the order it was sent.  Ordered
      delivery simplifies detection of transmit errors, and simplifies
      the sending of arbitrary sized requests and responses via the
      record marking protocol [3].

I'd rather rely on those attributes instead of any sort of IETF
approval anyway. Do all RDMA transports (RoCEv2, in particular) have
those characteristics?

If not, then perhaps there is some way to have different RDMA transport
drivers set flags in some structure as to whether they fulfill those
requirements, and then have the xprtrdma driver check for those flags.

In any case, for now I think we should just give all RDMA transports a
pass, and clean that up later. I'm mostly interested in excluding UDP
over IP for now -- being more strict with RDMA can come later.
Jason Gunthorpe Feb. 24, 2017, 6:03 p.m. UTC | #13
> I'd rather rely on those attributes instead of any sort of IETF
> approval anyway. Do all RDMA transports (RoCEv2, in particular) have
> those characteristics?

The NFS-RDMA driver only works with RDMA RC transports which are
defined to provide all those characteristics.

> In any case, for now I think we should just give all RDMA transports a
> pass, and clean that up later. I'm mostly interested in excluding UDP
> over IP for now -- being more strict with RDMA can come later.

Makes sense to me. At the end of the day I think everything supported
by NFS-RDMA should be permitted to use NFSv4, and we should ignore
sketchy spec language that suggests otherwise.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 7440290f64ac..f8aa9452b63c 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -67,6 +67,7 @@  struct svc_xprt {
 #define XPT_CACHE_AUTH	11		/* cache auth info */
 #define XPT_LOCAL	12		/* connection from loopback interface */
 #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
+#define XPT_CONG_CTRL	14		/* IETF approved congestion control protocol */
 
 	struct svc_serv		*xpt_server;	/* service for transport */
 	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index de066acdb34e..1956b8b96b2d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1306,6 +1306,7 @@  static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
 		      &svsk->sk_xprt, serv);
 	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
 	if (sk->sk_state == TCP_LISTEN) {
 		dprintk("setting up TCP socket for listening\n");
 		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 39652d390a9c..96b4797c2c54 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -571,6 +571,8 @@  static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	spin_lock_init(&cma_xprt->sc_ctxt_lock);
 	spin_lock_init(&cma_xprt->sc_map_lock);
 
+	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
 	if (listener)
 		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);