[v1,4/5] xprtrdma: Reduce required number of send SGEs
diff mbox

Message ID 20170113174322.32692.66126.stgit@manet.1015granger.net
State New
Headers show

Commit Message

Chuck Lever Jan. 13, 2017, 5:43 p.m. UTC
The MAX_SEND_SGES check introduced in commit 655fec6987be
("xprtrdma: Use gathered Send for large inline messages") fails
for devices that have a small max_sge.

Instead of checking for a large fixed maximum number of SGEs,
check for a minimum small number. RPC-over-RDMA will switch to
using a Read chunk if an xdr_buf has more pages than can fit in
the device's max_sge limit. This is better than failing all
together to mount the server.

This fix supports devices that have as few as three send SGEs
available.

Reported-By: Selvin Xavier <selvin.xavier@broadcom.com>
Reported-By: Devesh Sharma <devesh.sharma@broadcom.com>
Reported-by: Honggang Li <honli@redhat.com>
Reported-by: Ram Amrani <Ram.Amrani@cavium.com>
Fixes: 655fec6987be ("xprtrdma: Use gathered Send for large ...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  |   23 ++++++++++++++++++++++-
 net/sunrpc/xprtrdma/verbs.c     |   13 +++++++------
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
 3 files changed, 30 insertions(+), 7 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Parav Pandit Jan. 13, 2017, 6:01 p.m. UTC | #1
Hi Chuck,

> -----Original Message-----

> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-

> owner@vger.kernel.org] On Behalf Of Chuck Lever

> Sent: Friday, January 13, 2017 11:43 AM

> To: linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org

> Subject: [PATCH v1 4/5] xprtrdma: Reduce required number of send SGEs

> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c

> b/net/sunrpc/xprtrdma/rpc_rdma.c index 4909758..ab699f9 100644


>  /* The client can't know how large the actual reply will be. Thus it diff --git

> a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index

> 12e8242..5dcdd0b 100644

> --- a/net/sunrpc/xprtrdma/verbs.c

> +++ b/net/sunrpc/xprtrdma/verbs.c

> @@ -488,18 +488,19 @@ static void rpcrdma_destroy_id(struct rdma_cm_id

> *id)

>   */

>  int

>  rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,

> -				struct rpcrdma_create_data_internal *cdata)

> +		  struct rpcrdma_create_data_internal *cdata)

>  {

>  	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;

> +	unsigned int max_qp_wr, max_sge;

>  	struct ib_cq *sendcq, *recvcq;

> -	unsigned int max_qp_wr;

>  	int rc;

> 

> -	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {

> -		dprintk("RPC:       %s: insufficient sge's available\n",

> -			__func__);

> +	max_sge = min(ia->ri_device->attrs.max_sge,

> RPCRDMA_MAX_SEND_SGES);

> +	if (max_sge < 3) {

> +		pr_warn("rpcrdma: HCA provides only %d send SGEs\n",

> max_sge);

>  		return -ENOMEM;

>  	}

> +	ia->ri_max_sgeno = max_sge - 3;

> 


I didn't noticed this new ri_max_sgeno variable being used in this patch set. Did I miss?
You also might want to rename it to ri_max_sge_num.
Regardless for some device with 3 SGEs, ri_max_sgeno will become zero. Is that fine?
You wanted to check for value of 5?
It would be good to have define for this minimum required 3 SGEs header file such as RPCRDMA_MIN_REQ_RECV_SGE.


>  	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {

>  		dprintk("RPC:       %s: insufficient wqe's available\n",

> @@ -524,7 +525,7 @@ static void rpcrdma_destroy_id(struct rdma_cm_id

> *id)

>  	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;

>  	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;

>  	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */

> -	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;

> +	ep->rep_attr.cap.max_send_sge = max_sge;

>  	ep->rep_attr.cap.max_recv_sge = 1;

>  	ep->rep_attr.cap.max_inline_data = 0;

>  	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; diff --git

> a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h

> index f495df0c..c134d0b 100644

> --- a/net/sunrpc/xprtrdma/xprt_rdma.h

> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h

> @@ -74,6 +74,7 @@ struct rpcrdma_ia {

>  	unsigned int		ri_max_frmr_depth;

>  	unsigned int		ri_max_inline_write;

>  	unsigned int		ri_max_inline_read;

> +	unsigned int		ri_max_sgeno;

>  	bool			ri_reminv_expected;

>  	bool			ri_implicit_padding;

>  	enum ib_mr_type		ri_mrtype;

> 

> --

> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the

> body of a message to majordomo@vger.kernel.org More majordomo info at

> http://vger.kernel.org/majordomo-info.html
Chuck Lever Jan. 13, 2017, 6:30 p.m. UTC | #2
> On Jan 13, 2017, at 1:01 PM, Parav Pandit <parav@mellanox.com> wrote:
> 
> Hi Chuck,
> 
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>> owner@vger.kernel.org] On Behalf Of Chuck Lever
>> Sent: Friday, January 13, 2017 11:43 AM
>> To: linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>> Subject: [PATCH v1 4/5] xprtrdma: Reduce required number of send SGEs
>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c
>> b/net/sunrpc/xprtrdma/rpc_rdma.c index 4909758..ab699f9 100644
> 
>> /* The client can't know how large the actual reply will be. Thus it diff --git
>> a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index
>> 12e8242..5dcdd0b 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -488,18 +488,19 @@ static void rpcrdma_destroy_id(struct rdma_cm_id
>> *id)
>>  */
>> int
>> rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
>> -				struct rpcrdma_create_data_internal *cdata)
>> +		  struct rpcrdma_create_data_internal *cdata)
>> {
>> 	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
>> +	unsigned int max_qp_wr, max_sge;
>> 	struct ib_cq *sendcq, *recvcq;
>> -	unsigned int max_qp_wr;
>> 	int rc;
>> 
>> -	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
>> -		dprintk("RPC:       %s: insufficient sge's available\n",
>> -			__func__);
>> +	max_sge = min(ia->ri_device->attrs.max_sge,
>> RPCRDMA_MAX_SEND_SGES);
>> +	if (max_sge < 3) {
>> +		pr_warn("rpcrdma: HCA provides only %d send SGEs\n",
>> max_sge);
>> 		return -ENOMEM;
>> 	}
>> +	ia->ri_max_sgeno = max_sge - 3;
>> 
> 
> I didn't noticed this new ri_max_sgeno variable being used in this patch set. Did I miss?

Yes, you snipped out the other hunk where it is used.


> You also might want to rename it to ri_max_sge_num.
> Regardless for some device with 3 SGEs, ri_max_sgeno will become zero. Is that fine?

Yes, that's OK, and tested. Zero means that NFS WRITE and SYMLINK
payloads will always be sent in a Read chunk.


> You wanted to check for value of 5?
> It would be good to have define for this minimum required 3 SGEs header file such as RPCRDMA_MIN_REQ_RECV_SGE.

OK.


> 
> 
>> 	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
>> 		dprintk("RPC:       %s: insufficient wqe's available\n",
>> @@ -524,7 +525,7 @@ static void rpcrdma_destroy_id(struct rdma_cm_id
>> *id)
>> 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
>> 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
>> 	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
>> -	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
>> +	ep->rep_attr.cap.max_send_sge = max_sge;
>> 	ep->rep_attr.cap.max_recv_sge = 1;
>> 	ep->rep_attr.cap.max_inline_data = 0;
>> 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; diff --git
>> a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>> index f495df0c..c134d0b 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -74,6 +74,7 @@ struct rpcrdma_ia {
>> 	unsigned int		ri_max_frmr_depth;
>> 	unsigned int		ri_max_inline_write;
>> 	unsigned int		ri_max_inline_read;
>> +	unsigned int		ri_max_sgeno;
>> 	bool			ri_reminv_expected;
>> 	bool			ri_implicit_padding;
>> 	enum ib_mr_type		ri_mrtype;
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
>> body of a message to majordomo@vger.kernel.org More majordomo info at
>> http://vger.kernel.org/majordomo-info.html
> N‹§ēæėrļ›yúčšØbēXŽķĮ§vØ^–)Þš{.nĮ+‰·ĨŠ{ą­ŲšŠ{ayšʇڙë,j­ĒfĢĒ·hš‹āzđŪwĨĒļĒ·Ķj:+v‰ĻŠwčjØmķŸĸūŦ‘ęįzZ+ƒųšŽŠÝĒj"ú!

--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit Jan. 13, 2017, 7:14 p.m. UTC | #3
PiA+PiArCWlhLT5yaV9tYXhfc2dlbm8gPSBtYXhfc2dlIC0gMzsNCj4gPj4NCj4gPg0KPiA+IEkg
ZGlkbid0IG5vdGljZWQgdGhpcyBuZXcgcmlfbWF4X3NnZW5vIHZhcmlhYmxlIGJlaW5nIHVzZWQg
aW4gdGhpcyBwYXRjaCBzZXQuDQo+IERpZCBJIG1pc3M/DQo+IA0KPiBZZXMsIHlvdSBzbmlwcGVk
IG91dCB0aGUgb3RoZXIgaHVuayB3aGVyZSBpdCBpcyB1c2VkLg0KDQpSaWdodC4gSSBzZWUgaXQg
bm93Lg0KPiANCj4gPiBZb3UgYWxzbyBtaWdodCB3YW50IHRvIHJlbmFtZSBpdCB0byByaV9tYXhf
c2dlX251bS4NCj4gPiBSZWdhcmRsZXNzIGZvciBzb21lIGRldmljZSB3aXRoIDMgU0dFcywgcmlf
bWF4X3NnZW5vIHdpbGwgYmVjb21lIHplcm8uIElzDQo+IHRoYXQgZmluZT8NCj4gDQo+IFllcywg
dGhhdCdzIE9LLCBhbmQgdGVzdGVkLiBaZXJvIG1lYW5zIHRoYXQgTkZTIFdSSVRFIGFuZCBTWU1M
SU5LDQo+IHBheWxvYWRzIHdpbGwgYWx3YXlzIGJlIHNlbnQgaW4gYSBSZWFkIGNodW5rLg0KPiAN
Ck9rLiBUaGFua3MuDQoNCj4gDQo+ID4gSXQgd291bGQgYmUgZ29vZCB0byBoYXZlIGRlZmluZSBm
b3IgdGhpcyBtaW5pbXVtIHJlcXVpcmVkIDMgU0dFcyBoZWFkZXINCj4gZmlsZSBzdWNoIGFzIFJQ
Q1JETUFfTUlOX1JFUV9SRUNWX1NHRS4NCj4gDQo+IE9LLg0K
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 4909758..ab699f9 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -126,13 +126,34 @@  void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
  * plus the RPC call fit under the transport's inline limit. If the
  * combined call message size exceeds that limit, the client must use
  * the read chunk list for this operation.
+ *
+ * A read chunk is also required if sending the RPC call inline would
+ * exceed this device's max_sge limit.
  */
 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
 				struct rpc_rqst *rqst)
 {
+	struct xdr_buf *xdr = &rqst->rq_snd_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	unsigned int count, remaining, offset;
+
+	if (xdr->len > ia->ri_max_inline_write)
+		return false;
+
+	if (xdr->page_len) {
+		remaining = xdr->page_len;
+		offset = xdr->page_base & ~PAGE_MASK;
+		count = 0;
+		while (remaining) {
+			remaining -= min_t(unsigned int,
+					   PAGE_SIZE - offset, remaining);
+			offset = 0;
+			if (++count > ia->ri_max_sgeno)
+				return false;
+		}
+	}
 
-	return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
+	return true;
 }
 
 /* The client can't know how large the actual reply will be. Thus it
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 12e8242..5dcdd0b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -488,18 +488,19 @@  static void rpcrdma_destroy_id(struct rdma_cm_id *id)
  */
 int
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
-				struct rpcrdma_create_data_internal *cdata)
+		  struct rpcrdma_create_data_internal *cdata)
 {
 	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
+	unsigned int max_qp_wr, max_sge;
 	struct ib_cq *sendcq, *recvcq;
-	unsigned int max_qp_wr;
 	int rc;
 
-	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
-		dprintk("RPC:       %s: insufficient sge's available\n",
-			__func__);
+	max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES);
+	if (max_sge < 3) {
+		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
 		return -ENOMEM;
 	}
+	ia->ri_max_sgeno = max_sge - 3;
 
 	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
 		dprintk("RPC:       %s: insufficient wqe's available\n",
@@ -524,7 +525,7 @@  static void rpcrdma_destroy_id(struct rdma_cm_id *id)
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
-	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
+	ep->rep_attr.cap.max_send_sge = max_sge;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f495df0c..c134d0b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,6 +74,7 @@  struct rpcrdma_ia {
 	unsigned int		ri_max_frmr_depth;
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
+	unsigned int		ri_max_sgeno;
 	bool			ri_reminv_expected;
 	bool			ri_implicit_padding;
 	enum ib_mr_type		ri_mrtype;