
[10/10] svcrdma: Switch CQs from IB_POLL_SOFTIRQ to IB_POLL_WORKQUEUE

Message ID 20160428151550.13068.24199.stgit@klimt.1015granger.net (mailing list archive)
State Not Applicable

Commit Message

Chuck Lever April 28, 2016, 3:15 p.m. UTC
Spread NFSD completion handling across CPUs, and replace
BH-disabling spin locks with plain spin locks.
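
The mechanical change is in the hunks below; for orientation, the CQ
polling context is chosen when the CQ is allocated (illustrative
fragment only, mirroring the svc_rdma_accept() hunks in the diff):

	/* Poll this CQ from an unbound workqueue (process context)
	 * rather than from softirq context.  Completion handlers can
	 * then run on any CPU, and the locks they share with nfsd
	 * threads no longer need the BH-disabling variants.
	 */
	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
					0, IB_POLL_WORKQUEUE);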

iozone -i0 -i1 -s128m -y1k -az -I -N

Microseconds/op Mode. Output is in microseconds per operation.

Before:
              KB  reclen   write rewrite    read    reread
          131072       1      51      51       43       43
          131072       2      53      52       42       43
          131072       4      53      52       43       43
          131072       8      55      54       44       44
          131072      16      62      59       49       47
          131072      32      72      69       53       53
          131072      64      92      87       66       66
          131072     128     144     130       94       93
          131072     256     225     216      146      145
          131072     512     485     474      251      251
          131072    1024     573     540      514      512
          131072    2048    1007     941      624      618
          131072    4096    1672    1699      976      969
          131072    8192    3179    3158     1660     1649
          131072   16384    5836    5659     3062     3041

After:
              KB  reclen   write rewrite    read    reread
          131072       1      54      54       43       43
          131072       2      55      55       43       43
          131072       4      56      57       44       45
          131072       8      59      58       45       45
          131072      16      64      62       47       47
          131072      32      76      74       54       54
          131072      64      96      91       67       66
          131072     128     148     133       97       97
          131072     256     229     227      148      147
          131072     512     488     445      252      255
          131072    1024     582     534      511      540
          131072    2048     998     988      614      620
          131072    4096    1685    1679      946      965
          131072    8192    3113    3048     1650     1644
          131072   16384    6010    5745     3046     3053

NFS READ is roughly the same; NFS WRITE is marginally worse.

Before:
GETATTR:
	242 ops (0%)
	avg bytes sent per op: 127
	avg bytes received per op: 112
	backlog wait: 0.000000
	RTT: 0.041322
	total execute time: 0.049587 (milliseconds)

After:
GETATTR:
	242 ops (0%)
	avg bytes sent per op: 127
	avg bytes received per op: 112
	backlog wait: 0.000000
	RTT: 0.045455
	total execute time: 0.053719 (milliseconds)

Small-op latency increased by about 4 microseconds.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |    6 +++---
 net/sunrpc/xprtrdma/svc_rdma_transport.c |   26 +++++++++++++-------------
 2 files changed, 16 insertions(+), 16 deletions(-)



Comments

Steve Wise April 28, 2016, 3:59 p.m. UTC | #1
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Chuck Lever
> Sent: Thursday, April 28, 2016 10:16 AM
> To: linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: [PATCH 10/10] svcrdma: Switch CQs from IB_POLL_SOFTIRQ to
> IB_POLL_WORKQUEUE
> 
> [ full patch description and benchmark results trimmed ]

Hey Chuck, in what scenario or under what type of load do you expect this change to help performance? I guess it would help as you scale out the number of clients and thus the number of CQs in use? Have you done any measurements along these lines?

Stevo



Chuck Lever April 28, 2016, 4:15 p.m. UTC | #2
> On Apr 28, 2016, at 11:59 AM, Steve Wise <swise@opengridcomputing.com> wrote:
> 
> 
> 
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>> owner@vger.kernel.org] On Behalf Of Chuck Lever
>> Sent: Thursday, April 28, 2016 10:16 AM
>> To: linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>> Subject: [PATCH 10/10] svcrdma: Switch CQs from IB_POLL_SOFTIRQ to
>> IB_POLL_WORKQUEUE
>> 
>> [ full patch description and benchmark results trimmed ]
> 
> 
> Hey Chuck, in what scenario or under what type of load do you expect this change to help performance? I guess it would help as you scale out the number of clients and thus the number of CQs in use?

Allowing completions to run on any CPU should help when
softIRQ processing is confined to a single CPU.

Enabling and disabling bottom halves fewer times for each
incoming RPC _should_ also be beneficial.
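
To make that concrete, here is a minimal before/after fragment that
just mirrors the sc_rq_dto_lock hunk in the patch:

	/* Before: the Receive completion handler could run in softirq
	 * context, so nfsd threads disabled bottom halves while holding
	 * the DTO queue lock.
	 */
	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
	/* ... dequeue a completed receive context ... */
	spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);

	/* After: completions are handled from a workqueue (process
	 * context), so a plain spin lock is sufficient and bottom
	 * halves are never toggled on the nfsd side.
	 */
	spin_lock(&rdma_xprt->sc_rq_dto_lock);
	/* ... dequeue a completed receive context ... */
	spin_unlock(&rdma_xprt->sc_rq_dto_lock);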

We are also interested in posting RDMA Read requests during
Receive completion processing. That would reduce the
latency of any request involving a Read chunk by removing
a heavyweight context switch.

I've also noticed that converting just the Receive CQ to a
workqueue has only a negligible impact on performance (as
measured with the tool above).


> Have you done any measurements along these lines?

I don't have the quantity of hardware needed for that kind
of analysis. You might have a few more clients in your
lab...

My basic question is whether I've missed something, whether
the approach can be improved, whether I'm using the right
metrics, and so on.


--
Chuck Lever




Patch

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 41adbe7..6159ae6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -586,13 +586,13 @@  int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	dprintk("svcrdma: rqstp=%p\n", rqstp);
 
-	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+	spin_lock(&rdma_xprt->sc_rq_dto_lock);
 	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
 		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
 				  struct svc_rdma_op_ctxt,
 				  dto_q);
 		list_del_init(&ctxt->dto_q);
-		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
 		rdma_read_complete(rqstp, ctxt);
 		goto complete;
 	} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
@@ -605,7 +605,7 @@  int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		clear_bit(XPT_DATA, &xprt->xpt_flags);
 		ctxt = NULL;
 	}
-	spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
 	if (!ctxt) {
 		/* This is the EAGAIN path. The svc_recv routine will
 		 * return -EAGAIN, the nfsd thread will go to call into
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index dd94401..8850a5f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -186,7 +186,7 @@  struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
 	struct svc_rdma_op_ctxt *ctxt = NULL;
 
-	spin_lock_bh(&xprt->sc_ctxt_lock);
+	spin_lock(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used++;
 	if (list_empty(&xprt->sc_ctxts))
 		goto out_empty;
@@ -194,7 +194,7 @@  struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 	ctxt = list_first_entry(&xprt->sc_ctxts,
 				struct svc_rdma_op_ctxt, free);
 	list_del_init(&ctxt->free);
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	spin_unlock(&xprt->sc_ctxt_lock);
 
 out:
 	ctxt->count = 0;
@@ -205,15 +205,15 @@  out_empty:
 	/* Either pre-allocation missed the mark, or send
 	 * queue accounting is broken.
 	 */
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	spin_unlock(&xprt->sc_ctxt_lock);
 
 	ctxt = alloc_ctxt(xprt, GFP_NOIO);
 	if (ctxt)
 		goto out;
 
-	spin_lock_bh(&xprt->sc_ctxt_lock);
+	spin_lock(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used--;
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	spin_unlock(&xprt->sc_ctxt_lock);
 	WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
 	return NULL;
 }
@@ -248,10 +248,10 @@  void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 		for (i = 0; i < ctxt->count; i++)
 			put_page(ctxt->pages[i]);
 
-	spin_lock_bh(&xprt->sc_ctxt_lock);
+	spin_lock(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used--;
 	list_add(&ctxt->free, &xprt->sc_ctxts);
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
+	spin_unlock(&xprt->sc_ctxt_lock);
 }
 
 static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -897,14 +897,14 @@  struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_fastreg_mr *frmr = NULL;
 
-	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	spin_lock(&rdma->sc_frmr_q_lock);
 	if (!list_empty(&rdma->sc_frmr_q)) {
 		frmr = list_entry(rdma->sc_frmr_q.next,
 				  struct svc_rdma_fastreg_mr, frmr_list);
 		list_del_init(&frmr->frmr_list);
 		frmr->sg_nents = 0;
 	}
-	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	spin_unlock(&rdma->sc_frmr_q_lock);
 	if (frmr)
 		return frmr;
 
@@ -918,10 +918,10 @@  void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
 		ib_dma_unmap_sg(rdma->sc_cm_id->device,
 				frmr->sg, frmr->sg_nents, frmr->direction);
 		atomic_dec(&rdma->sc_dma_used);
-		spin_lock_bh(&rdma->sc_frmr_q_lock);
+		spin_lock(&rdma->sc_frmr_q_lock);
 		WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
 		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
-		spin_unlock_bh(&rdma->sc_frmr_q_lock);
+		spin_unlock(&rdma->sc_frmr_q_lock);
 	}
 }
 
@@ -999,13 +999,13 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		goto errout;
 	}
 	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-					0, IB_POLL_SOFTIRQ);
+					0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_sq_cq)) {
 		dprintk("svcrdma: error creating SQ CQ for connect request\n");
 		goto errout;
 	}
 	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
-					0, IB_POLL_SOFTIRQ);
+					0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_rq_cq)) {
 		dprintk("svcrdma: error creating RQ CQ for connect request\n");
 		goto errout;