Message ID | 20150709204305.26247.39173.stgit@manet.1015granger.net (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
we need to honor the max limits of device by checking dev_attr.max_sge? a vendor may not support 4 sges. On Fri, Jul 10, 2015 at 2:13 AM, Chuck Lever <chuck.lever@oracle.com> wrote: > Only the RPC/RDMA header is sent when making an RDMA_NOMSG call. > That header resides in the first element of the iovec array > passed to rpcrdma_ep_post(). > > Instead of special casing the iovec element with the pad, just > sync all the elements in the send iovec. Syncing the zero pad is > not strictly necessary, but the pad is rarely if ever used these > days, and the extra cost in that case is small. > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++++ > net/sunrpc/xprtrdma/verbs.c | 27 +++++++++++---------------- > net/sunrpc/xprtrdma/xprt_rdma.h | 18 ++++++++++-------- > 3 files changed, 25 insertions(+), 24 deletions(-) > > diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c > index cb05233..2e721f2 100644 > --- a/net/sunrpc/xprtrdma/rpc_rdma.c > +++ b/net/sunrpc/xprtrdma/rpc_rdma.c > @@ -575,6 +575,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) > req->rl_send_iov[0].length = hdrlen; > req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); > > + req->rl_niovs = 1; > + if (rtype == rpcrdma_areadch) > + return 0; > + > req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); > req->rl_send_iov[1].length = rpclen; > req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); > diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c > index cdf5220..9199436 100644 > --- a/net/sunrpc/xprtrdma/verbs.c > +++ b/net/sunrpc/xprtrdma/verbs.c > @@ -651,7 +651,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, > if (rc) > return rc; > ep->rep_attr.cap.max_recv_wr = cdata->max_requests; > - ep->rep_attr.cap.max_send_sge = (cdata->padding ? 
4 : 2); > + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; > ep->rep_attr.cap.max_recv_sge = 1; > ep->rep_attr.cap.max_inline_data = 0; > ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; > @@ -1303,9 +1303,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, > struct rpcrdma_ep *ep, > struct rpcrdma_req *req) > { > + struct ib_device *device = ia->ri_device; > struct ib_send_wr send_wr, *send_wr_fail; > struct rpcrdma_rep *rep = req->rl_reply; > - int rc; > + struct ib_sge *iov = req->rl_send_iov; > + int i, rc; > > if (rep) { > rc = rpcrdma_ep_post_recv(ia, ep, rep); > @@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, > > send_wr.next = NULL; > send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; > - send_wr.sg_list = req->rl_send_iov; > + send_wr.sg_list = iov; > send_wr.num_sge = req->rl_niovs; > send_wr.opcode = IB_WR_SEND; > - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ > - ib_dma_sync_single_for_device(ia->ri_device, > - req->rl_send_iov[3].addr, > - req->rl_send_iov[3].length, > - DMA_TO_DEVICE); > - ib_dma_sync_single_for_device(ia->ri_device, > - req->rl_send_iov[1].addr, > - req->rl_send_iov[1].length, > - DMA_TO_DEVICE); > - ib_dma_sync_single_for_device(ia->ri_device, > - req->rl_send_iov[0].addr, > - req->rl_send_iov[0].length, > - DMA_TO_DEVICE); > + > + for (i = 0; i < send_wr.num_sge; i++) > + ib_dma_sync_single_for_device(device, iov[i].addr, > + iov[i].length, DMA_TO_DEVICE); > + dprintk("RPC: %s: posting %d s/g entries\n", > + __func__, send_wr.num_sge); > > if (DECR_CQCOUNT(ep) > 0) > send_wr.send_flags = 0; > diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h > index ce4e79e..90da480 100644 > --- a/net/sunrpc/xprtrdma/xprt_rdma.h > +++ b/net/sunrpc/xprtrdma/xprt_rdma.h > @@ -256,16 +256,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ > char *mr_offset; /* kva if no page, else offset */ > }; > > +#define RPCRDMA_MAX_IOVS (4) > + > struct rpcrdma_req { > - unsigned int rl_niovs; /* 0, 2 or 4 */ > 
- unsigned int rl_nchunks; /* non-zero if chunks */ > - unsigned int rl_connect_cookie; /* retry detection */ > - struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ > + unsigned int rl_niovs; > + unsigned int rl_nchunks; > + unsigned int rl_connect_cookie; > + struct rpcrdma_buffer *rl_buffer; > struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ > - struct ib_sge rl_send_iov[4]; /* for active requests */ > - struct rpcrdma_regbuf *rl_rdmabuf; > - struct rpcrdma_regbuf *rl_sendbuf; > - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; > + struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; > + struct rpcrdma_regbuf *rl_rdmabuf; > + struct rpcrdma_regbuf *rl_sendbuf; > + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; > }; > > static inline struct rpcrdma_req * > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/10/2015 7:29 AM, Devesh Sharma wrote: > we need to honor the max limits of device by checking > dev_attr.max_sge? a vendor may not support 4 sges. iWARP requires a minimum of 4 send SGEs (draft-hilland-verbs 8.1.3.2) An RI MUST support at least four Scatter/Gather Elements per Scatter/Gather List when the Scatter/Gather List refers to the Data Source of a Send Operation Type or the Data Sink of a Receive Operation. An RI is NOT REQUIRED to support more than one Scatter/Gather Element per Scatter/Gather List when the Scatter/Gather List refers to the Data Source of an RDMA Write. I'm not certain if IB and RoCE state a similar minimum requirement, but it seems a very bad idea to have fewer. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jul 10, 2015 at 6:28 PM, Tom Talpey <tom@talpey.com> wrote: > On 7/10/2015 7:29 AM, Devesh Sharma wrote: >> >> we need to honor the max limits of device by checking >> dev_attr.max_sge? a vendor may not support 4 sges. > > > iWARP requires a minimum of 4 send SGEs (draft-hilland-verbs 8.1.3.2) > > An RI MUST support at least four Scatter/Gather Elements per > Scatter/Gather List when the Scatter/Gather List refers to the Data > Source of a Send Operation Type or the Data Sink of a Receive > Operation. An RI is NOT REQUIRED to support more than one > Scatter/Gather Element per Scatter/Gather List when the > Scatter/Gather List refers to the Data Source of an RDMA Write. > > I'm not certain if IB and RoCE state a similar minimum requirement, > but it seems a very bad idea to have fewer. To my knowledge, the IBTA spec does not pose any such minimum requirement. RoCE also does not put any minimum requirement. I think it's fine if xprtrdma honors the device limits, thus covering iWARP devices, because all iWARP devices would support a minimum of 4. Chuck would correct me if xprtrdma does have any minimum requirements > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Jul 10, 2015, at 10:11 AM, Devesh Sharma <devesh.sharma@avagotech.com> wrote: > On Fri, Jul 10, 2015 at 6:28 PM, Tom Talpey <tom@talpey.com> wrote: >> On 7/10/2015 7:29 AM, Devesh Sharma wrote: >>> >>> we need to honor the max limits of device by checking >>> dev_attr.max_sge? a vendor may not support 4 sges. >> >> >> iWARP requires a minimum of 4 send SGEs (draft-hilland-verbs 8.1.3.2) >> >> An RI MUST support at least four Scatter/Gather Elements per >> Scatter/Gather List when the Scatter/Gather List refers to the Data >> Source of a Send Operation Type or the Data Sink of a Receive >> Operation. An RI is NOT REQUIRED to support more than one >> Scatter/Gather Element per Scatter/Gather List when the >> Scatter/Gather List refers to the Data Source of an RDMA Write. >> >> I'm not certain if IB and RoCE state a similar minimum requirement, >> but it seems a very bad idea to have fewer. > > To my knowledge IBTA Spec do not pose any such minimum requirement. > RoCE also do not puts any minimum requirement. I think its fine if > xprtrdma honors the device limits, thus covering iWARP devices because > all iWARP devices would support minimum 4. > > Chuck would correct me if xprtrdma do have any minimum requirements At least 2 SGEs are required for normal operation. The code in rpcrdma_marshal_req() sets up an iovec for the RPC/RDMA header and one for the RPC message itself. These are used in rpcrdma_ep_post() to SEND each request. Four SGEs are needed only for RDMA_MSGP type requests, but so far I have never seen one of these used. I wonder if that logic can be removed. It is certainly possible to examine the device’s max_sge field in rpcrdma_ep_create() and fail transport creation if the device’s max_sge is less than RPCRDMA_MAX_IOVS. -- Chuck Lever -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chuck, On 07/09/2015 04:43 PM, Chuck Lever wrote: > Only the RPC/RDMA header is sent when making an RDMA_NOMSG call. > That header resides in the first element of the iovec array > passed to rpcrdma_ep_post(). > > Instead of special casing the iovec element with the pad, just > sync all the elements in the send iovec. Syncing the zero pad is > not strictly necessary, but the pad is rarely if ever used these > days, and the extra cost in that case is small. > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c > index cdf5220..9199436 100644 > --- a/net/sunrpc/xprtrdma/verbs.c > +++ b/net/sunrpc/xprtrdma/verbs.c > @@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, > > send_wr.next = NULL; > send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; > - send_wr.sg_list = req->rl_send_iov; > + send_wr.sg_list = iov; > send_wr.num_sge = req->rl_niovs; > send_wr.opcode = IB_WR_SEND; > - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ > - ib_dma_sync_single_for_device(ia->ri_device, > - req->rl_send_iov[3].addr, > - req->rl_send_iov[3].length, > - DMA_TO_DEVICE); > - ib_dma_sync_single_for_device(ia->ri_device, > - req->rl_send_iov[1].addr, > - req->rl_send_iov[1].length, > - DMA_TO_DEVICE); > - ib_dma_sync_single_for_device(ia->ri_device, > - req->rl_send_iov[0].addr, > - req->rl_send_iov[0].length, > - DMA_TO_DEVICE); > + > + for (i = 0; i < send_wr.num_sge; i++) > + ib_dma_sync_single_for_device(device, iov[i].addr, > + iov[i].length, DMA_TO_DEVICE); Two questions here: 1) Is syncing order important? The original code went 3 - 1 - 0, but now we're going 0 - 1 - 2 - 3. 2) Is iov[2] the zero pad you mentioned in your commit message? If not, do you know why it wasn't already syncing? 
Thanks, Anna > + dprintk("RPC: %s: posting %d s/g entries\n", > + __func__, send_wr.num_sge); > > if (DECR_CQCOUNT(ep) > 0) > send_wr.send_flags = 0; -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Jul 10, 2015, at 4:43 PM, Anna Schumaker <Anna.Schumaker@netapp.com> wrote: > > Hi Chuck, > > On 07/09/2015 04:43 PM, Chuck Lever wrote: >> Only the RPC/RDMA header is sent when making an RDMA_NOMSG call. >> That header resides in the first element of the iovec array >> passed to rpcrdma_ep_post(). >> >> Instead of special casing the iovec element with the pad, just >> sync all the elements in the send iovec. Syncing the zero pad is >> not strictly necessary, but the pad is rarely if ever used these >> days, and the extra cost in that case is small. >> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c >> index cdf5220..9199436 100644 >> --- a/net/sunrpc/xprtrdma/verbs.c >> +++ b/net/sunrpc/xprtrdma/verbs.c >> @@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, >> >> send_wr.next = NULL; >> send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; >> - send_wr.sg_list = req->rl_send_iov; >> + send_wr.sg_list = iov; >> send_wr.num_sge = req->rl_niovs; >> send_wr.opcode = IB_WR_SEND; >> - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ >> - ib_dma_sync_single_for_device(ia->ri_device, >> - req->rl_send_iov[3].addr, >> - req->rl_send_iov[3].length, >> - DMA_TO_DEVICE); >> - ib_dma_sync_single_for_device(ia->ri_device, >> - req->rl_send_iov[1].addr, >> - req->rl_send_iov[1].length, >> - DMA_TO_DEVICE); >> - ib_dma_sync_single_for_device(ia->ri_device, >> - req->rl_send_iov[0].addr, >> - req->rl_send_iov[0].length, >> - DMA_TO_DEVICE); >> + >> + for (i = 0; i < send_wr.num_sge; i++) >> + ib_dma_sync_single_for_device(device, iov[i].addr, >> + iov[i].length, DMA_TO_DEVICE); > > Two questions here: > 1) Is syncing order important? The original code went 3 - 1 - 0, but now we're going 0 - 1 - 2 - 3. No, the syncing order isn’t important. What’s important is that the sge’s be synced _before_ the SEND Work Request is posted so that the HCA can see the freshly marshaled RPC. 
> 2) Is iov[2] the zero pad you mentioned in your commit message? Yes, it is. > If not, do you know why it wasn't already syncing? The pad always contains zeroes. Syncing it is optional, AFAICT. v2 of this series will have some changes in this area, and the zero pad will be gone entirely. > Thanks, > Anna > >> + dprintk("RPC: %s: posting %d s/g entries\n", >> + __func__, send_wr.num_sge); >> >> if (DECR_CQCOUNT(ep) > 0) >> send_wr.send_flags = 0; > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Chuck Lever chucklever@gmail.com -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jul 10, 2015 at 10:53:40AM -0400, Chuck Lever wrote: > It is certainly possible to examine the device’s max_sge field > in rpcrdma_ep_create() and fail transport creation if the > device’s max_sge is less than RPC_MAX_IOVS. I just want to draw Sagi's attention to this problem, when considering my thoughts on an alternate API for posting. This thread is about NFS needing 4 sges for a (rare?) message type. The API direction I suggested addresses this kind of issue as well, because under the covers the driver/core can spin up a temporary MR for this case and thus support an infinite s/g list for all posts. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index cb05233..2e721f2 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -575,6 +575,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + req->rl_niovs = 1; + if (rtype == rpcrdma_areadch) + return 0; + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cdf5220..9199436 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -651,7 +651,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -1303,9 +1303,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { + struct ib_device *device = ia->ri_device; struct ib_send_wr send_wr, *send_wr_fail; struct rpcrdma_rep *rep = req->rl_reply; - int rc; + struct ib_sge *iov = req->rl_send_iov; + int i, rc; if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); @@ -1316,22 +1318,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr.next = NULL; send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; - send_wr.sg_list = req->rl_send_iov; + send_wr.sg_list = iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[3].addr, - req->rl_send_iov[3].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[1].addr, - 
req->rl_send_iov[1].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[0].addr, - req->rl_send_iov[0].length, - DMA_TO_DEVICE); + + for (i = 0; i < send_wr.num_sge; i++) + ib_dma_sync_single_for_device(device, iov[i].addr, + iov[i].length, DMA_TO_DEVICE); + dprintk("RPC: %s: posting %d s/g entries\n", + __func__, send_wr.num_sge); if (DECR_CQCOUNT(ep) > 0) send_wr.send_flags = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ce4e79e..90da480 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -256,16 +256,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; +#define RPCRDMA_MAX_IOVS (4) + struct rpcrdma_req { - unsigned int rl_niovs; /* 0, 2 or 4 */ - unsigned int rl_nchunks; /* non-zero if chunks */ - unsigned int rl_connect_cookie; /* retry detection */ - struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + unsigned int rl_niovs; + unsigned int rl_nchunks; + unsigned int rl_connect_cookie; + struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[4]; /* for active requests */ - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; + struct rpcrdma_regbuf *rl_rdmabuf; + struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req *
Only the RPC/RDMA header is sent when making an RDMA_NOMSG call. That header resides in the first element of the iovec array passed to rpcrdma_ep_post(). Instead of special casing the iovec element with the pad, just sync all the elements in the send iovec. Syncing the zero pad is not strictly necessary, but the pad is rarely if ever used these days, and the extra cost in that case is small. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++++ net/sunrpc/xprtrdma/verbs.c | 27 +++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 18 ++++++++++-------- 3 files changed, 25 insertions(+), 24 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html