
[V3,01/17] xprtrdma: mind the device's max fast register page list depth

Message ID 20140430192936.5663.66537.stgit@manet.1015granger.net (mailing list archive)
State Not Applicable, archived

Commit Message

Chuck Lever III April 30, 2014, 7:29 p.m. UTC
From: Steve Wise <swise@opengridcomputing.com>

Some rdma devices don't support a fast register page list depth of
at least RPCRDMA_MAX_DATA_SEGS.  So xprtrdma needs to chunk its fast
register regions according to the minimum of the device max supported
depth or RPCRDMA_MAX_DATA_SEGS.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
---

 net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ---
 net/sunrpc/xprtrdma/verbs.c     |   47 +++++++++++++++++++++++++++++----------
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
 3 files changed, 36 insertions(+), 16 deletions(-)
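
For readers skimming the diffstat, the idea of the patch in one line: remember the smaller of the device's max_fast_reg_page_list_len and RPCRDMA_MAX_DATA_SEGS, and split larger segment lists across several fast-register operations. Below is a minimal standalone sketch of that arithmetic; the RPCRDMA_MAX_DATA_SEGS value and the device limit used here are hypothetical example values, not taken from the kernel tree.

#include <stdio.h>

#define RPCRDMA_MAX_DATA_SEGS 64	/* assumed value, for illustration only */

/* Mirror of the clamp the patch adds in rpcrdma_ia_open():
 * ia->ri_max_frmr_depth = min(RPCRDMA_MAX_DATA_SEGS,
 *                             devattr.max_fast_reg_page_list_len)
 */
static unsigned int frmr_depth(unsigned int max_fast_reg_page_list_len)
{
	return max_fast_reg_page_list_len < RPCRDMA_MAX_DATA_SEGS ?
	       max_fast_reg_page_list_len : RPCRDMA_MAX_DATA_SEGS;
}

int main(void)
{
	unsigned int dev_limit = 24;	/* hypothetical device capability */
	unsigned int depth = frmr_depth(dev_limit);
	unsigned int nsegs = RPCRDMA_MAX_DATA_SEGS;
	unsigned int nchunks = (nsegs + depth - 1) / depth;

	printf("per-FRMR depth %u, so %u segments need %u registrations\n",
	       depth, nsegs, nchunks);
	return 0;
}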



Comments

Devesh Sharma May 16, 2014, 7:08 a.m. UTC | #1
Chuck

This patch is causing a CPU soft-lockup if the underlying vendor reports devattr.max_fast_reg_page_list_len = 0 and ia->ri_memreg_strategy = FRMR (the default option).
I think there is a need to check the device capability flags. If strategy = FRMR is forced and devattr.max_fast_reg_page_list_len = 0, then flag an error and fail the RPC with -EIO.

See inline:

> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Chuck Lever
> Sent: Thursday, May 01, 2014 1:00 AM
> To: linux-nfs@vger.kernel.org; linux-rdma@vger.kernel.org
> Cc: Anna.Schumaker@netapp.com
> Subject: [PATCH V3 01/17] xprtrdma: mind the device's max fast register
> page list depth
>
> From: Steve Wise <swise@opengridcomputing.com>
>
> Some rdma devices don't support a fast register page list depth of at least
> RPCRDMA_MAX_DATA_SEGS.  So xprtrdma needs to chunk its fast register
> regions according to the minimum of the device max supported depth or
> RPCRDMA_MAX_DATA_SEGS.
>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>
>  net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ---
>  net/sunrpc/xprtrdma/verbs.c     |   47 +++++++++++++++++++++++++++++----------
>  net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
>  3 files changed, 36 insertions(+), 16 deletions(-)
>
> [...]
>
> @@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
>  	ep->rep_attr.srq = NULL;
>  	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
>  	switch (ia->ri_memreg_strategy) {
> -	case RPCRDMA_FRMR:
> +	case RPCRDMA_FRMR: {
> +		int depth = 7;
> +
>  		/* Add room for frmr register and invalidate WRs.
>  		 * 1. FRMR reg WR for head
>  		 * 2. FRMR invalidate WR for head
> -		 * 3. FRMR reg WR for pagelist
> -		 * 4. FRMR invalidate WR for pagelist
> +		 * 3. N FRMR reg WRs for pagelist
> +		 * 4. N FRMR invalidate WRs for pagelist
>  		 * 5. FRMR reg WR for tail
>  		 * 6. FRMR invalidate WR for tail
>  		 * 7. The RDMA_SEND WR
>  		 */
> -		ep->rep_attr.cap.max_send_wr *= 7;
> +
> +		/* Calculate N if the device max FRMR depth is smaller than
> +		 * RPCRDMA_MAX_DATA_SEGS.
> +		 */
> +		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
> +			int delta = RPCRDMA_MAX_DATA_SEGS -
> +				    ia->ri_max_frmr_depth;
> +
> +			do {
> +				depth += 2; /* FRMR reg + invalidate */
> +				delta -= ia->ri_max_frmr_depth;

If ia->ri_max_frmr_depth is 0, this loop becomes an infinite loop.

> +			} while (delta > 0);
> +
> +		}
> +		ep->rep_attr.cap.max_send_wr *= depth;
>  		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
> -			cdata->max_requests = devattr.max_qp_wr / 7;
> +			cdata->max_requests = devattr.max_qp_wr / depth;
>  			if (!cdata->max_requests)
>  				return -EINVAL;
> -			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
> +			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
> +						       depth;
>  		}
>  		break;
> +	}
>
> [...]
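
The send-queue sizing loop quoted above can be exercised in isolation to see both the normal result and the failure mode Devesh describes. This is a standalone sketch, not kernel code: the RPCRDMA_MAX_DATA_SEGS value is assumed for the example, and the zero-depth guard is the reviewer's suggested behaviour, not something the posted patch does.

#include <stdio.h>

#define RPCRDMA_MAX_DATA_SEGS 64	/* assumed value, for illustration only */

/* Copy of the depth calculation added to rpcrdma_ep_create(), plus a guard
 * against a zero FRMR page list depth (the case that hangs the posted code). */
static int frmr_send_wr_depth(unsigned int max_frmr_depth)
{
	int depth = 7;	/* head reg/inv, one pagelist reg/inv, tail reg/inv, SEND */

	if (max_frmr_depth == 0)
		return -1;	/* unguarded, the loop below never terminates */

	if (max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
		int delta = RPCRDMA_MAX_DATA_SEGS - max_frmr_depth;

		do {
			depth += 2;	/* one extra FRMR reg + invalidate per extra chunk */
			delta -= max_frmr_depth;
		} while (delta > 0);
	}
	return depth;
}

int main(void)
{
	/* A device limited to 24 pages per FRMR needs 3 chunks for 64 segments,
	 * i.e. two extra reg/invalidate pairs: 7 + 2*2 = 11 WRs per request. */
	printf("depth(24) = %d\n", frmr_send_wr_depth(24));
	printf("depth(0)  = %d (guarded; the original loop would spin forever)\n",
	       frmr_send_wr_depth(0));
	return 0;
}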
Steve Wise May 16, 2014, 2:10 p.m. UTC | #2
I guess the client code doesn't verify that the device supports the 
chosen memreg mode.  That's not good.   Lemme fix this and respin this 
patch.
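
A minimal sketch of the kind of check being suggested, roughly where rpcrdma_ia_open() settles on a registration strategy. This is only an illustration: the fallback to RPCRDMA_ALLPHYSICAL and the exact placement are assumptions, not the contents of the eventual respin.

/* Sketch only -- not the actual fix.  Even when IB_DEVICE_MEM_MGT_EXTENSIONS
 * is advertised, treat a zero page list depth as "no usable FRMR support". */
if (memreg == RPCRDMA_FRMR &&
    devattr.max_fast_reg_page_list_len == 0) {
	dprintk("RPC:       %s: device reports no FRMR page list depth, "
		"falling back\n", __func__);
	memreg = RPCRDMA_ALLPHYSICAL;	/* assumed fallback for the sketch */
}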


On 5/16/2014 2:08 AM, Devesh Sharma wrote:
> Chuck
>
> This patch is causing a CPU soft-lockup if the underlying vendor reports devattr.max_fast_reg_page_list_len = 0 and ia->ri_memreg_strategy = FRMR (the default option).
> I think there is a need to check the device capability flags. If strategy = FRMR is forced and devattr.max_fast_reg_page_list_len = 0, then flag an error and fail the RPC with -EIO.
>
> See inline:
>
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>> owner@vger.kernel.org] On Behalf Of Chuck Lever
>> Sent: Thursday, May 01, 2014 1:00 AM
>> To: linux-nfs@vger.kernel.org; linux-rdma@vger.kernel.org
>> Cc: Anna.Schumaker@netapp.com
>> Subject: [PATCH V3 01/17] xprtrdma: mind the device's max fast register
>> page list depth
>>
>> From: Steve Wise <swise@opengridcomputing.com>
>>
>> Some rdma devices don't support a fast register page list depth of at least
>> RPCRDMA_MAX_DATA_SEGS.  So xprtrdma needs to chunk its fast register
>> regions according to the minimum of the device max supported depth or
>> RPCRDMA_MAX_DATA_SEGS.
>>
>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>>
>>   net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ---
>>   net/sunrpc/xprtrdma/verbs.c     |   47 +++++++++++++++++++++++++++++-
>> ---------
>>   net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
>>   3 files changed, 36 insertions(+), 16 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c
>> b/net/sunrpc/xprtrdma/rpc_rdma.c index 96ead52..400aa1b 100644
>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>> @@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct
>> xdr_buf *target,
>>   	/* success. all failures return above */
>>   	req->rl_nchunks = nchunks;
>>
>> -	BUG_ON(nchunks == 0);
>> -	BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
>> -	       && (nchunks > 3));
>> -
>>   	/*
>>   	 * finish off header. If write, marshal discrim and nchunks.
>>   	 */
>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>> index 9372656..55fb09a 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct
>> sockaddr *addr, int memreg)
>>   				__func__);
>>   			memreg = RPCRDMA_REGISTER;
>>   #endif
>> +		} else {
>> +			/* Mind the ia limit on FRMR page list depth */
>> +			ia->ri_max_frmr_depth = min_t(unsigned int,
>> +				RPCRDMA_MAX_DATA_SEGS,
>> +				devattr.max_fast_reg_page_list_len);
>>   		}
>>   		break;
>>   	}
>> @@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct
>> rpcrdma_ia *ia,
>>   	ep->rep_attr.srq = NULL;
>>   	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
>>   	switch (ia->ri_memreg_strategy) {
>> -	case RPCRDMA_FRMR:
>> +	case RPCRDMA_FRMR: {
>> +		int depth = 7;
>> +
>>   		/* Add room for frmr register and invalidate WRs.
>>   		 * 1. FRMR reg WR for head
>>   		 * 2. FRMR invalidate WR for head
>> -		 * 3. FRMR reg WR for pagelist
>> -		 * 4. FRMR invalidate WR for pagelist
>> +		 * 3. N FRMR reg WRs for pagelist
>> +		 * 4. N FRMR invalidate WRs for pagelist
>>   		 * 5. FRMR reg WR for tail
>>   		 * 6. FRMR invalidate WR for tail
>>   		 * 7. The RDMA_SEND WR
>>   		 */
>> -		ep->rep_attr.cap.max_send_wr *= 7;
>> +
>> +		/* Calculate N if the device max FRMR depth is smaller than
>> +		 * RPCRDMA_MAX_DATA_SEGS.
>> +		 */
>> +		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
>> +			int delta = RPCRDMA_MAX_DATA_SEGS -
>> +				    ia->ri_max_frmr_depth;
>> +
>> +			do {
>> +				depth += 2; /* FRMR reg + invalidate */
>> +				delta -= ia->ri_max_frmr_depth;
> If ia->ri_max_frmr_depth is 0, this loop becomes an infinite loop.
>
>> +			} while (delta > 0);
>> +
>> +		}
>> +		ep->rep_attr.cap.max_send_wr *= depth;
>>   		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
>> -			cdata->max_requests = devattr.max_qp_wr / 7;
>> +			cdata->max_requests = devattr.max_qp_wr / depth;
>>   			if (!cdata->max_requests)
>>   				return -EINVAL;
>> -			ep->rep_attr.cap.max_send_wr = cdata-
>>> max_requests * 7;
>> +			ep->rep_attr.cap.max_send_wr = cdata-
>>> max_requests *
>> +						       depth;
>>   		}
>>   		break;
>> +	}
>>   	case RPCRDMA_MEMWINDOWS_ASYNC:
>>   	case RPCRDMA_MEMWINDOWS:
>>   		/* Add room for mw_binds+unbinds - overkill! */ @@ -
>> 1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf,
>> struct rpcrdma_ep *ep,
>>   	case RPCRDMA_FRMR:
>>   		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--
>> ) {
>>   			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
>> -
>> RPCRDMA_MAX_SEGS);
>> +						ia->ri_max_frmr_depth);
>>   			if (IS_ERR(r->r.frmr.fr_mr)) {
>>   				rc = PTR_ERR(r->r.frmr.fr_mr);
>>   				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
>>   					" failed %i\n", __func__, rc);
>>   				goto out;
>>   			}
>> -			r->r.frmr.fr_pgl =
>> -				ib_alloc_fast_reg_page_list(ia->ri_id-
>>> device,
>> -
>> RPCRDMA_MAX_SEGS);
>> +			r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
>> +						ia->ri_id->device,
>> +						ia->ri_max_frmr_depth);
>>   			if (IS_ERR(r->r.frmr.fr_pgl)) {
>>   				rc = PTR_ERR(r->r.frmr.fr_pgl);
>>   				dprintk("RPC:       %s: "
>> @@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct
>> rpcrdma_mr_seg *seg,
>>   	seg1->mr_offset -= pageoff;	/* start of page */
>>   	seg1->mr_len += pageoff;
>>   	len = -pageoff;
>> -	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
>> -		*nsegs = RPCRDMA_MAX_DATA_SEGS;
>> +	if (*nsegs > ia->ri_max_frmr_depth)
>> +		*nsegs = ia->ri_max_frmr_depth;
>>   	for (page_no = i = 0; i < *nsegs;) {
>>   		rpcrdma_map_one(ia, seg, writing);
>>   		pa = seg->mr_dma;
>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h
>> b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445d..98340a3 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -66,6 +66,7 @@ struct rpcrdma_ia {
>>   	struct completion	ri_done;
>>   	int			ri_async_rc;
>>   	enum rpcrdma_memreg	ri_memreg_strategy;
>> +	unsigned int		ri_max_frmr_depth;
>>   };
>>
>>   /*
>>
Steve Wise May 16, 2014, 2:14 p.m. UTC | #3
By the way, Devesh:  Is the device advertising FRMR support, yet setting 
the max page list len to zero?  That's a driver bug...



On 5/16/2014 9:10 AM, Steve Wise wrote:
> I guess the client code doesn't verify that the device supports the 
> chosen memreg mode.  That's not good.   Lemme fix this and respin this 
> patch.
>
>
> On 5/16/2014 2:08 AM, Devesh Sharma wrote:
>> Chuck
>>
>>> This patch is causing a CPU soft-lockup if the underlying vendor reports
>>> devattr.max_fast_reg_page_list_len = 0 and ia->ri_memreg_strategy =
>>> FRMR (the default option).
>>> I think there is a need to check the device capability flags. If
>>> strategy = FRMR is forced and devattr.max_fast_reg_page_list_len = 0,
>>> then flag an error and fail the RPC with -EIO.
>>
>> See inline:
>>
>>> -----Original Message-----
>>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>>> owner@vger.kernel.org] On Behalf Of Chuck Lever
>>> Sent: Thursday, May 01, 2014 1:00 AM
>>> To: linux-nfs@vger.kernel.org; linux-rdma@vger.kernel.org
>>> Cc: Anna.Schumaker@netapp.com
>>> Subject: [PATCH V3 01/17] xprtrdma: mind the device's max fast register
>>> page list depth
>>>
>>> From: Steve Wise <swise@opengridcomputing.com>
>>>
>>> Some rdma devices don't support a fast register page list depth of 
>>> at least
>>> RPCRDMA_MAX_DATA_SEGS.  So xprtrdma needs to chunk its fast register
>>> regions according to the minimum of the device max supported depth or
>>> RPCRDMA_MAX_DATA_SEGS.
>>>
>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>>
>>>   net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ---
>>>   net/sunrpc/xprtrdma/verbs.c     |   47 +++++++++++++++++++++++++++++-
>>> ---------
>>>   net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
>>>   3 files changed, 36 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c
>>> b/net/sunrpc/xprtrdma/rpc_rdma.c index 96ead52..400aa1b 100644
>>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>>> @@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, 
>>> struct
>>> xdr_buf *target,
>>>       /* success. all failures return above */
>>>       req->rl_nchunks = nchunks;
>>>
>>> -    BUG_ON(nchunks == 0);
>>> -    BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
>>> -           && (nchunks > 3));
>>> -
>>>       /*
>>>        * finish off header. If write, marshal discrim and nchunks.
>>>        */
>>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>>> index 9372656..55fb09a 100644
>>> --- a/net/sunrpc/xprtrdma/verbs.c
>>> +++ b/net/sunrpc/xprtrdma/verbs.c
>>> @@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct
>>> sockaddr *addr, int memreg)
>>>                   __func__);
>>>               memreg = RPCRDMA_REGISTER;
>>>   #endif
>>> +        } else {
>>> +            /* Mind the ia limit on FRMR page list depth */
>>> +            ia->ri_max_frmr_depth = min_t(unsigned int,
>>> +                RPCRDMA_MAX_DATA_SEGS,
>>> +                devattr.max_fast_reg_page_list_len);
>>>           }
>>>           break;
>>>       }
>>> @@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct
>>> rpcrdma_ia *ia,
>>>       ep->rep_attr.srq = NULL;
>>>       ep->rep_attr.cap.max_send_wr = cdata->max_requests;
>>>       switch (ia->ri_memreg_strategy) {
>>> -    case RPCRDMA_FRMR:
>>> +    case RPCRDMA_FRMR: {
>>> +        int depth = 7;
>>> +
>>>           /* Add room for frmr register and invalidate WRs.
>>>            * 1. FRMR reg WR for head
>>>            * 2. FRMR invalidate WR for head
>>> -         * 3. FRMR reg WR for pagelist
>>> -         * 4. FRMR invalidate WR for pagelist
>>> +         * 3. N FRMR reg WRs for pagelist
>>> +         * 4. N FRMR invalidate WRs for pagelist
>>>            * 5. FRMR reg WR for tail
>>>            * 6. FRMR invalidate WR for tail
>>>            * 7. The RDMA_SEND WR
>>>            */
>>> -        ep->rep_attr.cap.max_send_wr *= 7;
>>> +
>>> +        /* Calculate N if the device max FRMR depth is smaller than
>>> +         * RPCRDMA_MAX_DATA_SEGS.
>>> +         */
>>> +        if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
>>> +            int delta = RPCRDMA_MAX_DATA_SEGS -
>>> +                    ia->ri_max_frmr_depth;
>>> +
>>> +            do {
>>> +                depth += 2; /* FRMR reg + invalidate */
>>> +                delta -= ia->ri_max_frmr_depth;
>> If ia->ri_max_frmr_depth is 0, this loop becomes an infinite loop.
>>
>>> +            } while (delta > 0);
>>> +
>>> +        }
>>> +        ep->rep_attr.cap.max_send_wr *= depth;
>>>           if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
>>> -            cdata->max_requests = devattr.max_qp_wr / 7;
>>> +            cdata->max_requests = devattr.max_qp_wr / depth;
>>>               if (!cdata->max_requests)
>>>                   return -EINVAL;
>>> -            ep->rep_attr.cap.max_send_wr = cdata-
>>>> max_requests * 7;
>>> +            ep->rep_attr.cap.max_send_wr = cdata-
>>>> max_requests *
>>> +                               depth;
>>>           }
>>>           break;
>>> +    }
>>>       case RPCRDMA_MEMWINDOWS_ASYNC:
>>>       case RPCRDMA_MEMWINDOWS:
>>>           /* Add room for mw_binds+unbinds - overkill! */ @@ -
>>> 1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf,
>>> struct rpcrdma_ep *ep,
>>>       case RPCRDMA_FRMR:
>>>           for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--
>>> ) {
>>>               r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
>>> -
>>> RPCRDMA_MAX_SEGS);
>>> +                        ia->ri_max_frmr_depth);
>>>               if (IS_ERR(r->r.frmr.fr_mr)) {
>>>                   rc = PTR_ERR(r->r.frmr.fr_mr);
>>>                   dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
>>>                       " failed %i\n", __func__, rc);
>>>                   goto out;
>>>               }
>>> -            r->r.frmr.fr_pgl =
>>> -                ib_alloc_fast_reg_page_list(ia->ri_id-
>>>> device,
>>> -
>>> RPCRDMA_MAX_SEGS);
>>> +            r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
>>> +                        ia->ri_id->device,
>>> +                        ia->ri_max_frmr_depth);
>>>               if (IS_ERR(r->r.frmr.fr_pgl)) {
>>>                   rc = PTR_ERR(r->r.frmr.fr_pgl);
>>>                   dprintk("RPC:       %s: "
>>> @@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct
>>> rpcrdma_mr_seg *seg,
>>>       seg1->mr_offset -= pageoff;    /* start of page */
>>>       seg1->mr_len += pageoff;
>>>       len = -pageoff;
>>> -    if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
>>> -        *nsegs = RPCRDMA_MAX_DATA_SEGS;
>>> +    if (*nsegs > ia->ri_max_frmr_depth)
>>> +        *nsegs = ia->ri_max_frmr_depth;
>>>       for (page_no = i = 0; i < *nsegs;) {
>>>           rpcrdma_map_one(ia, seg, writing);
>>>           pa = seg->mr_dma;
>>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h
>>> b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445d..98340a3 100644
>>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>>> @@ -66,6 +66,7 @@ struct rpcrdma_ia {
>>>       struct completion    ri_done;
>>>       int            ri_async_rc;
>>>       enum rpcrdma_memreg    ri_memreg_strategy;
>>> +    unsigned int        ri_max_frmr_depth;
>>>   };
>>>
>>>   /*
>>>
Steve Wise May 16, 2014, 2:29 p.m. UTC | #4
Looks like ocrdma does this.  See ocrdma_query_device().  It advertises 
IB_DEVICE_MEM_MGT_EXTENSIONS but sets max_fast_reg_page_list_len to 0.  
The Verbs spec sez if you advertise the mem extensions, then you need to 
support all of them.   Is this just a bug in the driver?  Or does it 
really not support FRMRs?

Steve.
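
Until the driver is fixed, a ULP could at least make the inconsistency visible instead of misbehaving. A hedged sketch of such a defensive check follows; it is not part of this patch, and the warning text is invented for the example.

/* Sketch of a defensive check against inconsistent device attributes. */
if ((devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) &&
    devattr.max_fast_reg_page_list_len == 0)
	pr_warn("xprtrdma: device advertises MEM_MGT_EXTENSIONS but "
		"reports a zero fast_reg page list depth\n");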


On 5/16/2014 9:14 AM, Steve Wise wrote:
> By the way, Devesh:  Is the device advertising FRMR support, yet 
> setting the max page list len to zero?  That's a driver bug...
>
>
>
> On 5/16/2014 9:10 AM, Steve Wise wrote:
>> I guess the client code doesn't verify that the device supports the 
>> chosen memreg mode.  That's not good.   Lemme fix this and respin 
>> this patch.
>>
>>
>> On 5/16/2014 2:08 AM, Devesh Sharma wrote:
>>> Chuck
>>>
>>> This patch is causing a CPU soft-lockup if the underlying vendor reports
>>> devattr.max_fast_reg_page_list_len = 0 and ia->ri_memreg_strategy =
>>> FRMR (the default option).
>>> I think there is a need to check the device capability flags. If
>>> strategy = FRMR is forced and devattr.max_fast_reg_page_list_len = 0,
>>> then flag an error and fail the RPC with -EIO.
>>>
>>> See inline:
>>>
>>>> -----Original Message-----
>>>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>>>> owner@vger.kernel.org] On Behalf Of Chuck Lever
>>>> Sent: Thursday, May 01, 2014 1:00 AM
>>>> To: linux-nfs@vger.kernel.org; linux-rdma@vger.kernel.org
>>>> Cc: Anna.Schumaker@netapp.com
>>>> Subject: [PATCH V3 01/17] xprtrdma: mind the device's max fast 
>>>> register
>>>> page list depth
>>>>
>>>> From: Steve Wise <swise@opengridcomputing.com>
>>>>
>>>> Some rdma devices don't support a fast register page list depth of 
>>>> at least
>>>> RPCRDMA_MAX_DATA_SEGS.  So xprtrdma needs to chunk its fast register
>>>> regions according to the minimum of the device max supported depth or
>>>> RPCRDMA_MAX_DATA_SEGS.
>>>>
>>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>>>> ---
>>>>
>>>>   net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ---
>>>>   net/sunrpc/xprtrdma/verbs.c     |   47 
>>>> +++++++++++++++++++++++++++++-
>>>> ---------
>>>>   net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
>>>>   3 files changed, 36 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c
>>>> b/net/sunrpc/xprtrdma/rpc_rdma.c index 96ead52..400aa1b 100644
>>>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c
>>>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c
>>>> @@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, 
>>>> struct
>>>> xdr_buf *target,
>>>>       /* success. all failures return above */
>>>>       req->rl_nchunks = nchunks;
>>>>
>>>> -    BUG_ON(nchunks == 0);
>>>> -    BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
>>>> -           && (nchunks > 3));
>>>> -
>>>>       /*
>>>>        * finish off header. If write, marshal discrim and nchunks.
>>>>        */
>>>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>>>> index 9372656..55fb09a 100644
>>>> --- a/net/sunrpc/xprtrdma/verbs.c
>>>> +++ b/net/sunrpc/xprtrdma/verbs.c
>>>> @@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct
>>>> sockaddr *addr, int memreg)
>>>>                   __func__);
>>>>               memreg = RPCRDMA_REGISTER;
>>>>   #endif
>>>> +        } else {
>>>> +            /* Mind the ia limit on FRMR page list depth */
>>>> +            ia->ri_max_frmr_depth = min_t(unsigned int,
>>>> +                RPCRDMA_MAX_DATA_SEGS,
>>>> +                devattr.max_fast_reg_page_list_len);
>>>>           }
>>>>           break;
>>>>       }
>>>> @@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct
>>>> rpcrdma_ia *ia,
>>>>       ep->rep_attr.srq = NULL;
>>>>       ep->rep_attr.cap.max_send_wr = cdata->max_requests;
>>>>       switch (ia->ri_memreg_strategy) {
>>>> -    case RPCRDMA_FRMR:
>>>> +    case RPCRDMA_FRMR: {
>>>> +        int depth = 7;
>>>> +
>>>>           /* Add room for frmr register and invalidate WRs.
>>>>            * 1. FRMR reg WR for head
>>>>            * 2. FRMR invalidate WR for head
>>>> -         * 3. FRMR reg WR for pagelist
>>>> -         * 4. FRMR invalidate WR for pagelist
>>>> +         * 3. N FRMR reg WRs for pagelist
>>>> +         * 4. N FRMR invalidate WRs for pagelist
>>>>            * 5. FRMR reg WR for tail
>>>>            * 6. FRMR invalidate WR for tail
>>>>            * 7. The RDMA_SEND WR
>>>>            */
>>>> -        ep->rep_attr.cap.max_send_wr *= 7;
>>>> +
>>>> +        /* Calculate N if the device max FRMR depth is smaller than
>>>> +         * RPCRDMA_MAX_DATA_SEGS.
>>>> +         */
>>>> +        if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
>>>> +            int delta = RPCRDMA_MAX_DATA_SEGS -
>>>> +                    ia->ri_max_frmr_depth;
>>>> +
>>>> +            do {
>>>> +                depth += 2; /* FRMR reg + invalidate */
>>>> +                delta -= ia->ri_max_frmr_depth;
>>> If ia->ri_max_frmr_depth is 0, this loop becomes an infinite loop.
>>>
>>>> +            } while (delta > 0);
>>>> +
>>>> +        }
>>>> +        ep->rep_attr.cap.max_send_wr *= depth;
>>>>           if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
>>>> -            cdata->max_requests = devattr.max_qp_wr / 7;
>>>> +            cdata->max_requests = devattr.max_qp_wr / depth;
>>>>               if (!cdata->max_requests)
>>>>                   return -EINVAL;
>>>> -            ep->rep_attr.cap.max_send_wr = cdata-
>>>>> max_requests * 7;
>>>> +            ep->rep_attr.cap.max_send_wr = cdata-
>>>>> max_requests *
>>>> +                               depth;
>>>>           }
>>>>           break;
>>>> +    }
>>>>       case RPCRDMA_MEMWINDOWS_ASYNC:
>>>>       case RPCRDMA_MEMWINDOWS:
>>>>           /* Add room for mw_binds+unbinds - overkill! */ @@ -
>>>> 1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf,
>>>> struct rpcrdma_ep *ep,
>>>>       case RPCRDMA_FRMR:
>>>>           for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--
>>>> ) {
>>>>               r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
>>>> -
>>>> RPCRDMA_MAX_SEGS);
>>>> +                        ia->ri_max_frmr_depth);
>>>>               if (IS_ERR(r->r.frmr.fr_mr)) {
>>>>                   rc = PTR_ERR(r->r.frmr.fr_mr);
>>>>                   dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
>>>>                       " failed %i\n", __func__, rc);
>>>>                   goto out;
>>>>               }
>>>> -            r->r.frmr.fr_pgl =
>>>> -                ib_alloc_fast_reg_page_list(ia->ri_id-
>>>>> device,
>>>> -
>>>> RPCRDMA_MAX_SEGS);
>>>> +            r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
>>>> +                        ia->ri_id->device,
>>>> +                        ia->ri_max_frmr_depth);
>>>>               if (IS_ERR(r->r.frmr.fr_pgl)) {
>>>>                   rc = PTR_ERR(r->r.frmr.fr_pgl);
>>>>                   dprintk("RPC:       %s: "
>>>> @@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct
>>>> rpcrdma_mr_seg *seg,
>>>>       seg1->mr_offset -= pageoff;    /* start of page */
>>>>       seg1->mr_len += pageoff;
>>>>       len = -pageoff;
>>>> -    if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
>>>> -        *nsegs = RPCRDMA_MAX_DATA_SEGS;
>>>> +    if (*nsegs > ia->ri_max_frmr_depth)
>>>> +        *nsegs = ia->ri_max_frmr_depth;
>>>>       for (page_no = i = 0; i < *nsegs;) {
>>>>           rpcrdma_map_one(ia, seg, writing);
>>>>           pa = seg->mr_dma;
>>>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h
>>>> b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445d..98340a3 100644
>>>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>>>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>>>> @@ -66,6 +66,7 @@ struct rpcrdma_ia {
>>>>       struct completion    ri_done;
>>>>       int            ri_async_rc;
>>>>       enum rpcrdma_memreg    ri_memreg_strategy;
>>>> +    unsigned int        ri_max_frmr_depth;
>>>>   };
>>>>
>>>>   /*
>>>>
Devesh Sharma May 17, 2014, 8:23 a.m. UTC | #5
Yes Steve,

This is a bug in ocrdma. We will be supplying a fix for this in the near future. It's because of this bug that I was able to figure out that the infinite loop is causing a soft lockup.

Thanks for pointing it out; you will see a fix for this bug soon.

Regards
Devesh
________________________________________
From: Steve Wise [swise@opengridcomputing.com]
Sent: Friday, May 16, 2014 7:59 PM
To: Devesh Sharma; Chuck Lever; linux-nfs@vger.kernel.org; linux-rdma@vger.kernel.org
Cc: Anna.Schumaker@netapp.com
Subject: Re: [PATCH V3 01/17] xprtrdma: mind the device's max fast register page list depth

Looks like ocrdma does this.  See ocrdma_query_device().  It advertises
IB_DEVICE_MEM_MGT_EXTENSIONS but sets max_fast_reg_page_list_len to 0.
The Verbs spec sez if you advertise the mem extensions, then you need to
support all of them.   Is this just a bug in the driver?  Or does it
really not support FRMRs?

Steve.


On 5/16/2014 9:14 AM, Steve Wise wrote:
> By the way, Devesh:  Is the device advertising FRMR support, yet
> setting the max page list len to zero?  That's a driver bug...
>
> On 5/16/2014 9:10 AM, Steve Wise wrote:
>> I guess the client code doesn't verify that the device supports the
>> chosen memreg mode.  That's not good.  Lemme fix this and respin
>> this patch.
>>
>> On 5/16/2014 2:08 AM, Devesh Sharma wrote:
>>> Chuck
>>>
>>> This patch is causing a CPU soft-lockup if the underlying vendor reports
>>> devattr.max_fast_reg_page_list_len = 0 and ia->ri_memreg_strategy =
>>> FRMR (the default option).
>>> I think there is a need to check the device capability flags. If
>>> strategy = FRMR is forced and devattr.max_fast_reg_page_list_len = 0,
>>> then flag an error and fail the RPC with -EIO.
>>>
>>> See inline:
>>>
>>>> [...]
>>>>
>>>> +			do {
>>>> +				depth += 2; /* FRMR reg + invalidate */
>>>> +				delta -= ia->ri_max_frmr_depth;
>>> If ia->ri_max_frmr_depth is 0, this loop becomes an infinite loop.
>>>
>>>> +			} while (delta > 0);
>>>>
>>>> [...]

Patch

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 96ead52..400aa1b 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -248,10 +248,6 @@  rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	/* success. all failures return above */
 	req->rl_nchunks = nchunks;
 
-	BUG_ON(nchunks == 0);
-	BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
-	       && (nchunks > 3));
-
 	/*
 	 * finish off header. If write, marshal discrim and nchunks.
 	 */
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9372656..55fb09a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -539,6 +539,11 @@  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 				__func__);
 			memreg = RPCRDMA_REGISTER;
 #endif
+		} else {
+			/* Mind the ia limit on FRMR page list depth */
+			ia->ri_max_frmr_depth = min_t(unsigned int,
+				RPCRDMA_MAX_DATA_SEGS,
+				devattr.max_fast_reg_page_list_len);
 		}
 		break;
 	}
@@ -659,24 +664,42 @@  rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.srq = NULL;
 	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
+	case RPCRDMA_FRMR: {
+		int depth = 7;
+
 		/* Add room for frmr register and invalidate WRs.
 		 * 1. FRMR reg WR for head
 		 * 2. FRMR invalidate WR for head
-		 * 3. FRMR reg WR for pagelist
-		 * 4. FRMR invalidate WR for pagelist
+		 * 3. N FRMR reg WRs for pagelist
+		 * 4. N FRMR invalidate WRs for pagelist
 		 * 5. FRMR reg WR for tail
 		 * 6. FRMR invalidate WR for tail
 		 * 7. The RDMA_SEND WR
 		 */
-		ep->rep_attr.cap.max_send_wr *= 7;
+
+		/* Calculate N if the device max FRMR depth is smaller than
+		 * RPCRDMA_MAX_DATA_SEGS.
+		 */
+		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+			int delta = RPCRDMA_MAX_DATA_SEGS -
+				    ia->ri_max_frmr_depth;
+
+			do {
+				depth += 2; /* FRMR reg + invalidate */
+				delta -= ia->ri_max_frmr_depth;
+			} while (delta > 0);
+
+		}
+		ep->rep_attr.cap.max_send_wr *= depth;
 		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
-			cdata->max_requests = devattr.max_qp_wr / 7;
+			cdata->max_requests = devattr.max_qp_wr / depth;
 			if (!cdata->max_requests)
 				return -EINVAL;
-			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+						       depth;
 		}
 		break;
+	}
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
 		/* Add room for mw_binds+unbinds - overkill! */
@@ -1043,16 +1066,16 @@  rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	case RPCRDMA_FRMR:
 		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
 			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-							 RPCRDMA_MAX_SEGS);
+						ia->ri_max_frmr_depth);
 			if (IS_ERR(r->r.frmr.fr_mr)) {
 				rc = PTR_ERR(r->r.frmr.fr_mr);
 				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
 					" failed %i\n", __func__, rc);
 				goto out;
 			}
-			r->r.frmr.fr_pgl =
-				ib_alloc_fast_reg_page_list(ia->ri_id->device,
-							    RPCRDMA_MAX_SEGS);
+			r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+						ia->ri_id->device,
+						ia->ri_max_frmr_depth);
 			if (IS_ERR(r->r.frmr.fr_pgl)) {
 				rc = PTR_ERR(r->r.frmr.fr_pgl);
 				dprintk("RPC:       %s: "
@@ -1498,8 +1521,8 @@  rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 	seg1->mr_offset -= pageoff;	/* start of page */
 	seg1->mr_len += pageoff;
 	len = -pageoff;
-	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	if (*nsegs > ia->ri_max_frmr_depth)
+		*nsegs = ia->ri_max_frmr_depth;
 	for (page_no = i = 0; i < *nsegs;) {
 		rpcrdma_map_one(ia, seg, writing);
 		pa = seg->mr_dma;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cc1445d..98340a3 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -66,6 +66,7 @@  struct rpcrdma_ia {
 	struct completion	ri_done;
 	int			ri_async_rc;
 	enum rpcrdma_memreg	ri_memreg_strategy;
+	unsigned int		ri_max_frmr_depth;
 };
 
 /*