Message ID | 20151110120432.GA8230@infradead.org (mailing list archive)
---|---
State | Not Applicable
> On Nov 10, 2015, at 7:04 AM, Christoph Hellwig <hch@infradead.org> wrote:
>
>> On Tue, Nov 10, 2015 at 01:46:40PM +0200, Sagi Grimberg wrote:
>>
>>> On 10/11/2015 13:41, Christoph Hellwig wrote:
>>> Oh, and while we're at it. Can someone explain why we're even
>>> using rdma_read_chunk_frmr for IB? It seems to work around the
>>> fact that iWarp only allows a single RDMA READ SGE, but it's used
>>> whenever the device has IB_DEVICE_MEM_MGT_EXTENSIONS, which seems
>>> wrong.
>>
>> I think Steve can answer it better than I can. I think that it is
>> just to have a single code path for both IB and iWARP. I agree that
>> the condition seems wrong and for small transfers rdma_read_chunk_frmr
>> is really a performance loss.
>
> Well, the code path already exists, but is only used if
> IB_DEVICE_MEM_MGT_EXTENSIONS isn't set. Below is an untested patch
> that demonstrates how I think svcrdma should set up the reads. Note
> that this also allows to entirely remove its allphys MR.

Two comments:

1. RFCs and changes in sunrpc must be copied to linux-nfs.

2. Is there a reason IB is not using the allphys MR for RDMA Read?
   That would be much more efficient. On the NFS server that MR isn't
   exposed, thus it is safe to use.

> Note that as a followon this would also allow to remove the
> non-READ_W_INV code path from rdma_read_chunk_frmr as a future
> step.
>
>
> diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
> index f869807..35fa638 100644
> --- a/include/linux/sunrpc/svc_rdma.h
> +++ b/include/linux/sunrpc/svc_rdma.h
> @@ -147,7 +147,6 @@ struct svcxprt_rdma {
> 	struct ib_qp *sc_qp;
> 	struct ib_cq *sc_rq_cq;
> 	struct ib_cq *sc_sq_cq;
> -	struct ib_mr *sc_phys_mr;	/* MR for server memory */
> 	int (*sc_reader)(struct svcxprt_rdma *,
> 			 struct svc_rqst *,
> 			 struct svc_rdma_op_ctxt *,
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> index ff4f01e..a410cb6 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> @@ -160,7 +160,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
> 			goto err;
> 		atomic_inc(&xprt->sc_dma_used);
>
> -		/* The lkey here is either a local dma lkey or a dma_mr lkey */
> +		/* The lkey here is the local dma lkey */
> 		ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
> 		ctxt->sge[pno].length = len;
> 		ctxt->count++;
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 9f3eb89..2780da4 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -887,8 +887,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
> 	struct ib_cq_init_attr cq_attr = {};
> 	struct ib_qp_init_attr qp_attr;
> 	struct ib_device *dev;
> -	int uninitialized_var(dma_mr_acc);
> -	int need_dma_mr = 0;
> 	int ret = 0;
> 	int i;
>
> @@ -986,68 +984,41 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
> 	}
> 	newxprt->sc_qp = newxprt->sc_cm_id->qp;
>
> -	/*
> -	 * Use the most secure set of MR resources based on the
> -	 * transport type and available memory management features in
> -	 * the device. Here's the table implemented below:
> -	 *
> -	 *		Fast	Global	DMA	Remote WR
> -	 *		Reg	LKEY	MR	Access
> -	 *		Sup'd	Sup'd	Needed	Needed
> -	 *
> -	 * IWARP	N	N	Y	Y
> -	 *		N	Y	Y	Y
> -	 *		Y	N	Y	N
> -	 *		Y	Y	N	-
> -	 *
> -	 * IB		N	N	Y	N
> -	 *		N	Y	N	-
> -	 *		Y	N	Y	N
> -	 *		Y	Y	N	-
> -	 *
> -	 * NB:	iWARP requires remote write access for the data sink
> -	 *	of an RDMA_READ. IB does not.
> -	 */
> -	newxprt->sc_reader = rdma_read_chunk_lcl;
> -	if (dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
> -		newxprt->sc_frmr_pg_list_len =
> -			dev->max_fast_reg_page_list_len;
> -		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
> -		newxprt->sc_reader = rdma_read_chunk_frmr;
> -	}
> +	if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) {
> +		/*
> +		 * iWARP requires remote write access for the data sink, and
> +		 * only supports a single SGE for RDMA_READ requests, so we'll
> +		 * have to use a memory registration for each RDMA_READ.
> +		 */
> +		if (!(dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
> +			/*
> +			 * We're an iWarp device but don't support FRs.
> +			 * Tought luck, better exit early as we have little
> +			 * chance of doing something useful.
> +			 */
> +			goto errout;
> +		}
>
> -	/*
> -	 * Determine if a DMA MR is required and if so, what privs are required
> -	 */
> -	if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
> -	    !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
> +		newxprt->sc_frmr_pg_list_len = dev->max_fast_reg_page_list_len;
> +		newxprt->sc_dev_caps =
> +			SVCRDMA_DEVCAP_FAST_REG |
> +			SVCRDMA_DEVCAP_READ_W_INV;
> +		newxprt->sc_reader = rdma_read_chunk_frmr;
> +	} else if (rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) {
> +		/*
> +		 * For IB or RoCE life is easy, no unsafe write access is
> +		 * required and multiple SGEs are supported, so we don't need
> +		 * to use MRs.
> +		 */
> +		newxprt->sc_reader = rdma_read_chunk_lcl;
> +	} else {
> +		/*
> +		 * Neither iWarp nor IB-ish, we're out of luck.
> +		 */
> 		goto errout;
> -
> -	if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
> -	    !(dev->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
> -		need_dma_mr = 1;
> -		dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
> -		if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
> -		    !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
> -			dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
> 	}
>
> -	if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
> -		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
> -
> -	/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
> -	if (need_dma_mr) {
> -		/* Register all of physical memory */
> -		newxprt->sc_phys_mr =
> -			ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
> -		if (IS_ERR(newxprt->sc_phys_mr)) {
> -			dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
> -				ret);
> -			goto errout;
> -		}
> -		newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
> -	} else
> -		newxprt->sc_dma_lkey = dev->local_dma_lkey;
> +	newxprt->sc_dma_lkey = dev->local_dma_lkey;
>
> 	/* Post receive buffers */
> 	for (i = 0; i < newxprt->sc_max_requests; i++) {
> @@ -1203,9 +1174,6 @@ static void __svc_rdma_free(struct work_struct *work)
> 	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
> 		ib_destroy_cq(rdma->sc_rq_cq);
>
> -	if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
> -		ib_dereg_mr(rdma->sc_phys_mr);
> -
> 	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
> 		ib_dealloc_pd(rdma->sc_pd);
>
On 11/10/2015 9:34 AM, Chuck Lever wrote:
> 2. Is there a reason IB is not using the allphys MR for RDMA Read?
> That would be much more efficient. On the NFS server that MR isn't
> exposed, thus it is safe to use.

True, but only if the layout of the buffer's physical pages does not
exceed the local RDMA Read scatter limit. If the size is large then the
privileged MR will require additional RDMA Read operations, which can
be more expensive.

It's certainly a valid optimization, if the buffer "fits".
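[A minimal sketch of the "fits" test described above; illustrative only, not code from the patch. The helper name is invented; max_sge_rd is the device attribute discussed later in the thread, which caps the scatter list of a single RDMA Read's data sink.]

    /*
     * Illustration only: decide whether an RDMA Read data sink can be
     * expressed as one Read using the local DMA lkey, or whether the
     * buffer should be registered (or split into several Reads).
     */
    static bool read_sink_fits(struct ib_device_attr *attr, int sink_pages)
    {
            return sink_pages <= attr->max_sge_rd;
    }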
On Tue, Nov 10, 2015 at 04:04:32AM -0800, Christoph Hellwig wrote:
> On Tue, Nov 10, 2015 at 01:46:40PM +0200, Sagi Grimberg wrote:
> >
> > On 10/11/2015 13:41, Christoph Hellwig wrote:
> > > Oh, and while we're at it. Can someone explain why we're even
> > > using rdma_read_chunk_frmr for IB? It seems to work around the
> > > fact that iWarp only allows a single RDMA READ SGE, but it's used
> > > whenever the device has IB_DEVICE_MEM_MGT_EXTENSIONS, which seems
> > > wrong.
> >
> > I think Steve can answer it better than I can. I think that it is
> > just to have a single code path for both IB and iWARP. I agree that
> > the condition seems wrong and for small transfers rdma_read_chunk_frmr
> > is really a performance loss.
>
> Well, the code path already exists, but is only used if
> IB_DEVICE_MEM_MGT_EXTENSIONS isn't set. Below is an untested patch
> that demonstrates how I think svcrdma should set up the reads. Note
> that this also allows to entirely remove its allphys MR.
>
> Note that as a followon this would also allow to remove the
> non-READ_W_INV code path from rdma_read_chunk_frmr as a future
> step.

I like this, my only comment is we should have a rdma_cap for this
behavior, rdma_cap_needs_rdma_read_mr(pd) or something?

> +	if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) {

Use here

> +		/*
> +		 * iWARP requires remote write access for the data sink, and
> +		 * only supports a single SGE for RDMA_READ requests, so we'll
> +		 * have to use a memory registration for each RDMA_READ.
> +		 */
> +		if (!(dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {

Lets enforce this in the core, if rdma_cap_needs_rdma_read_mr is set
then the device must also set IB_DEVICE_MEM_MGT_EXTENSIONS, check at
device creation time.

> +	} else if (rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) {
> +		/*
> +		 * For IB or RoCE life is easy, no unsafe write access is
> +		 * required and multiple SGEs are supported, so we don't need
> +		 * to use MRs.
> +		 */
> +		newxprt->sc_reader = rdma_read_chunk_lcl;
> +	} else {
> +		/*
> +		 * Neither iWarp nor IB-ish, we're out of luck.
> +		 */
> 		goto errout;

No need for the else, !rdma_cap_needs_rdma_read_mr means
pd->local_dma_lkey is okay to use.

Jason
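[A minimal sketch of the helper Jason is proposing; no such helper exists in the tree at this point, and the other rdma_cap_* helpers take a device and port number rather than a PD, so that convention is assumed here.]

    static inline bool rdma_cap_needs_rdma_read_mr(const struct ib_device *device,
                                                   u8 port_num)
    {
            /*
             * iWARP is the only transport where the data sink of an RDMA
             * Read must be a remotely-writable registration, so the local
             * DMA lkey cannot be used for Read sinks there.
             */
            return rdma_protocol_iwarp(device, port_num);
    }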
On 11/10/2015 1:25 PM, Jason Gunthorpe wrote:
> On Tue, Nov 10, 2015 at 04:04:32AM -0800, Christoph Hellwig wrote:
>> On Tue, Nov 10, 2015 at 01:46:40PM +0200, Sagi Grimberg wrote:
>>>
>>> On 10/11/2015 13:41, Christoph Hellwig wrote:
>>>> Oh, and while we're at it. Can someone explain why we're even
>>>> using rdma_read_chunk_frmr for IB? It seems to work around the
>>>> fact that iWarp only allows a single RDMA READ SGE, but it's used
>>>> whenever the device has IB_DEVICE_MEM_MGT_EXTENSIONS, which seems
>>>> wrong.
>>>
>>> I think Steve can answer it better than I can. I think that it is
>>> just to have a single code path for both IB and iWARP. I agree that
>>> the condition seems wrong and for small transfers rdma_read_chunk_frmr
>>> is really a performance loss.
>>
>> Well, the code path already exists, but is only used if
>> IB_DEVICE_MEM_MGT_EXTENSIONS isn't set. Below is an untested patch
>> that demonstrates how I think svcrdma should set up the reads. Note
>> that this also allows to entirely remove its allphys MR.
>>
>> Note that as a followon this would also allow to remove the
>> non-READ_W_INV code path from rdma_read_chunk_frmr as a future
>> step.
>
> I like this, my only comment is we should have a rdma_cap for this
> behavior, rdma_cap_needs_rdma_read_mr(pd) or something?

Windows NDKPI has this, it's the oh-so-succinctly-named flag
NDK_ADAPTER_FLAG_RDMA_READ_SINK_NOT_REQUIRED. The ULP is free to ignore
it and pass the NDK_MR_FLAG_RDMA_READ_SINK flag anyway, the provider is
expected to ignore it if not needed.

>> +	if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) {
>
> Use here
>
>> +		/*
>> +		 * iWARP requires remote write access for the data sink, and
>> +		 * only supports a single SGE for RDMA_READ requests, so we'll
>> +		 * have to use a memory registration for each RDMA_READ.
>> +		 */
>> +		if (!(dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
>
> Lets enforce this in the core, if rdma_cap_needs_rdma_read_mr is set
> then the device must also set IB_DEVICE_MEM_MGT_EXTENSIONS, check at
> device creation time.
>
>> +	} else if (rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) {
>> +		/*
>> +		 * For IB or RoCE life is easy, no unsafe write access is
>> +		 * required and multiple SGEs are supported, so we don't need
>> +		 * to use MRs.
>> +		 */
>> +		newxprt->sc_reader = rdma_read_chunk_lcl;
>> +	} else {
>> +		/*
>> +		 * Neither iWarp nor IB-ish, we're out of luck.
>> +		 */
>> 		goto errout;
>
> No need for the else, !rdma_cap_needs_rdma_read_mr means
> pd->local_dma_lkey is okay to use.

Hmm, agreed, but it must still be acceptable to perform a registration
instead of using the local_dma_lkey. As mentioned earlier, there are
scatter limits and other implications for all-physical addressing that
an upper layer may choose to avoid.

Tom.
On Tue, Nov 10, 2015 at 04:00:43PM -0500, Tom Talpey wrote:
> Hmm, agreed, but it must still be acceptable to perform a registration
> instead of using the local_dma_lkey. As mentioned earlier, there are
> scatter limits and other implications for all-physical addressing that
> an upper layer may choose to avoid.

It is always acceptable to use a lkey MR instead of the local dma
lkey, but ULPs should prefer to use the local dma lkey if possible,
for performance reasons.

Jason
On 11/10/2015 4:17 PM, Jason Gunthorpe wrote:
> On Tue, Nov 10, 2015 at 04:00:43PM -0500, Tom Talpey wrote:
>
>> Hmm, agreed, but it must still be acceptable to perform a registration
>> instead of using the local_dma_lkey. As mentioned earlier, there are
>> scatter limits and other implications for all-physical addressing that
>> an upper layer may choose to avoid.
>
> It is always acceptable to use a lkey MR instead of the local dma
> lkey, but ULPs should prefer to use the local dma lkey if possible,
> for performance reasons.

Sure, the key words are "if possible". For example, a single 1MB
RDMA Read is unlikely to be possible with the dma lkey, assuming
worst-case physical page scatter it would need 256 SGEs. But 1MB
can be registered easily.

In any case, my point was that the ULP gets to choose, so, it looks
like we agree.

Tom.
> On Nov 10, 2015, at 4:30 PM, Tom Talpey <tom@talpey.com> wrote:
>
> On 11/10/2015 4:17 PM, Jason Gunthorpe wrote:
>> On Tue, Nov 10, 2015 at 04:00:43PM -0500, Tom Talpey wrote:
>>
>>> Hmm, agreed, but it must still be acceptable to perform a registration
>>> instead of using the local_dma_lkey. As mentioned earlier, there are
>>> scatter limits and other implications for all-physical addressing that
>>> an upper layer may choose to avoid.
>>
>> It is always acceptable to use a lkey MR instead of the local dma
>> lkey, but ULPs should prefer to use the local dma lkey if possible,
>> for performance reasons.
>
> Sure, the key words are "if possible". For example, a single 1MB
> RDMA Read is unlikely to be possible with the dma lkey, assuming
> worst-case physical page scatter it would need 256 SGEs. But 1MB
> can be registered easily.
>
> In any case, my point was that the ULP gets to choose, so, it looks
> like we agree.

I’d like to see our NFS server use the local DMA lkey where it
makes sense, to avoid the cost of registering and invalidating
memory.

I have to agree with Tom that once the device’s s/g limit is
exceeded, the server has to post an RDMA Read WR every few
pages, and appears to get expensive for large NFS requests.

The current mechanism of statically choosing either FRMR or
local DMA depending on the device is probably not adequate.
Maybe we could post all the Read WRs via a single chain? Or
stick with FRMR when the amount of data to read is significant.

I’ve also tested Christoph’s patch. The logic currently in
rdma_read_chunk_lcl does not seem up to the task. Once the
device’s s/g limit is exceeded, the server starts throwing
local length exceptions, and the client workload hangs.

None of this is impossible to fix, but there is some work to
do here.

--
Chuck Lever
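[A rough, untested sketch of the "single chain" idea mentioned above; it is not from any posted patch. Field names follow struct ib_rdma_wr from the ib_send_wr split; on trees without that split the same fields live in the wr.rdma union of ib_send_wr. The caller is assumed to have filled in each WR's sg_list, num_sge, remote_addr and rkey.]

    /*
     * Chain all RDMA Read WRs together and hand them to the HCA in a
     * single ib_post_send() call.  Only the final Read is signaled.
     */
    static int svc_rdma_post_read_chain(struct ib_qp *qp,
                                        struct ib_rdma_wr *reads, int nr)
    {
            struct ib_send_wr *bad_wr;
            int i;

            if (nr <= 0)
                    return 0;
            for (i = 0; i < nr; i++) {
                    reads[i].wr.opcode = IB_WR_RDMA_READ;
                    reads[i].wr.next = (i + 1 < nr) ? &reads[i + 1].wr : NULL;
                    reads[i].wr.send_flags = (i + 1 == nr) ? IB_SEND_SIGNALED : 0;
            }
            return ib_post_send(qp, &reads[0].wr, &bad_wr);
    }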
On Tue, Nov 10, 2015 at 06:01:01PM -0500, Chuck Lever wrote:
> The current mechanism of statically choosing either FRMR or
> local DMA depending on the device is probably not adequate.
> Maybe we could post all the Read WRs via a single chain? Or
> stick with FRMR when the amount of data to read is significant.

Yes, those are the right ways to go.

IMHO, the break point for when it makes sense to switch from a RDMA
READ chain to a MR is going to be a RDMA core tunable. The performance
curve won't have much to do with the ULP.

But certainly, if a request fits in the SG list then it should just use
the local dma lkey directly, and consider allocating an appropriate S/G
list size when creating the QP.

The special thing about iwarp becomes simply defeating the use of the
local dma lkey for RDMA READ.

Jason
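[A purely hypothetical illustration of the decision Jason describes; neither the tunable nor the helper exists, and the default value is made up.]

    /* Hypothetical core tunable: largest transfer, in pages, for which a
     * chain of RDMA Reads with the local DMA lkey beats a registration. */
    static unsigned int rdma_read_chain_max_pages = 32;

    static bool rdma_read_should_use_mr(const struct ib_device *dev,
                                        u8 port_num, unsigned int nr_pages)
    {
            /* iWARP can never use the local DMA lkey as a Read sink. */
            if (rdma_protocol_iwarp(dev, port_num))
                    return true;
            return nr_pages > rdma_read_chain_max_pages;
    }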
On Tue, Nov 10, 2015 at 11:25:46AM -0700, Jason Gunthorpe wrote:
> I like this, my only comment is we should have a rdma_cap for this
> behavior, rdma_cap_needs_rdma_read_mr(pd) or something?

Yes, that's better than checking the protocol.

> > +		if (!(dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
>
> Lets enforce this in the core, if rdma_cap_needs_rdma_read_mr is set
> then the device must also set IB_DEVICE_MEM_MGT_EXTENSIONS, check at
> device creation time.

The iWarp verbs spec requires them to be supported, so that should not
be an issue.

> > +	} else if (rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) {
> > +		/*
> > +		 * For IB or RoCE life is easy, no unsafe write access is
> > +		 * required and multiple SGEs are supported, so we don't need
> > +		 * to use MRs.
> > +		 */
> > +		newxprt->sc_reader = rdma_read_chunk_lcl;
> > +	} else {
> > +		/*
> > +		 * Neither iWarp nor IB-ish, we're out of luck.
> > +		 */
> > 		goto errout;
>
> No need for the else, !rdma_cap_needs_rdma_read_mr means
> pd->local_dma_lkey is okay to use.

What would happen if someone tried to use NFS on usnic without this?
On Tue, Nov 10, 2015 at 04:55:51PM -0700, Jason Gunthorpe wrote:
> IMHO, the break point for when it makes sense to switch from a RDMA
> READ chain to a MR is going to be a RDMA core tunable. The performance
> curve won't have much to do with the ULP.

core/device, a lot of it depends on when we'd exceed the nr_sge limit
as well. But yes, it needs to move out of the ULP and into the core,
the ULP has no business here.

As said a few times before, we need a core API that takes a struct
scatterlist and builds RDMA READ/WRITE requests from it in an efficient
way. I have started talking to Sagi and preparing some building blocks
for it, but getting all corner cases right will take a while.
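[To give a sense of the shape such a core API might take, here is a purely hypothetical declaration; nothing like it exists in the tree at this point, and all names and parameters are invented for illustration. The idea is that the ULP hands the core a mapped scatterlist plus the remote address and rkey, and the core decides internally between the local DMA lkey, a WR chain, or a memory registration.]

    /* Hypothetical: turn a scatterlist into however many RDMA READ (or
     * WRITE) work requests the device needs, hiding the lkey-vs-MR
     * decision from the ULP.  Returns the number of WRs built, with the
     * head of the chain returned through @first_wr for posting. */
    int rdma_build_rw_reqs(struct ib_qp *qp, u8 port_num,
                           struct scatterlist *sg, u32 sg_cnt,
                           u64 remote_addr, u32 rkey,
                           enum dma_data_direction dir,
                           struct ib_send_wr **first_wr);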
Jason,

> It is always acceptable to use a lkey MR instead of the local dma
> lkey, but ULPs should prefer to use the local dma lkey if possible,
> for performance reasons.

I don't necessarily agree with this statement (at least with the second
part of it), the world is not always perfect.

For RDMA READs, a HCA will perform the memory scatters on the RX path,
when receiving the read responses containing the data. This means that
the HCA needs to perform a lookup of the relevant scatter entries upon
each read response. Due to that, modern HCAs keep a dedicated cache for
this type of RX-path lookup (which is limited in size naturally).

So, having *small* sgls for rdma_read is much (much) more friendly to
the HCA caches, which means that for large transfers, even for IB, it
might be better to register memory than having endless sgls. I was able
to see that under some workloads.

This is yet another non-trivial consideration that ULPs need to be
aware of... We really are better off putting this stuff in the core...

Sagi.
> I’d like to see our NFS server use the local DMA lkey where it
> makes sense, to avoid the cost of registering and invalidating
> memory.
>
> I have to agree with Tom that once the device’s s/g limit is
> exceeded, the server has to post an RDMA Read WR every few
> pages, and appears to get expensive for large NFS requests.
>
> The current mechanism of statically choosing either FRMR or
> local DMA depending on the device is probably not adequate.
> Maybe we could post all the Read WRs via a single chain? Or
> stick with FRMR when the amount of data to read is significant.
>
> I’ve also tested Christoph’s patch. The logic currently in
> rdma_read_chunk_lcl does not seem up to the task. Once the
> device’s s/g limit is exceeded, the server starts throwing
> local length exceptions, and the client workload hangs.

This is probably because this code path wasn't reached/tested for
a long time as it's hard to find a device that doesn't support FRWR
for a long time now...
Hi Sagi-

> On Nov 11, 2015, at 4:28 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>
>> I’d like to see our NFS server use the local DMA lkey where it
>> makes sense, to avoid the cost of registering and invalidating
>> memory.
>>
>> I have to agree with Tom that once the device’s s/g limit is
>> exceeded, the server has to post an RDMA Read WR every few
>> pages, and appears to get expensive for large NFS requests.
>>
>> The current mechanism of statically choosing either FRMR or
>> local DMA depending on the device is probably not adequate.
>> Maybe we could post all the Read WRs via a single chain? Or
>> stick with FRMR when the amount of data to read is significant.
>>
>> I’ve also tested Christoph’s patch. The logic currently in
>> rdma_read_chunk_lcl does not seem up to the task. Once the
>> device’s s/g limit is exceeded, the server starts throwing
>> local length exceptions, and the client workload hangs.
>
> This is probably because this code path wasn't reached/tested for
> a long time as it's hard to find a device that doesn't support FRWR
> for a long time now…

An alternate explanation is that the provider is not setting
device->max_sge_rd properly. rdma_read_chunk_lcl() seems to
be the only thing in my copy of the kernel tree that relies on
that value.

I’ve reproduced the local length errors with CX-2 and CX-3
Pro, which both set that field to 32. If I artificially
set that field to 30, I don’t see any issue.

Is commit 18ebd40773bf correct?

--
Chuck Lever
> An alternate explanation is that the provider is not setting
> device->max_sge_rd properly. rdma_read_chunk_lcl() seems to
> be the only thing in my copy of the kernel tree that relies on
> that value.
>
> I’ve reproduced the local length errors with CX-2 and CX-3
> Pro, which both set that field to 32. If I artificially
> set that field to 30, I don’t see any issue.
>
> Is commit 18ebd40773bf correct?

See my latest patchset "Handle mlx4 max_sge_rd correctly" (v2).
It fixes exactly this.

Now you will be able to add your "Tested-by:" tag ;)

It's on Doug to take it...
On Wed, Nov 11, 2015 at 12:02:25AM -0800, Christoph Hellwig wrote:
> > No need for the else, !rdma_cap_needs_rdma_read_mr means
> > pd->local_dma_lkey is okay to use.
>
> What would happen if someone tried to use NFS on usnic without this?

Honestly, I have no idea how usnic fits into the kernel side, it simply
doesn't provide the kapi. It should fail much earlier, like when
creating a PD, IMHO.

Jason
On Wed, Nov 11, 2015 at 11:25:06AM +0200, Sagi Grimberg wrote:
> For RDMA READs, a HCA will perform the memory scatters on the RX
> path, when receiving the read responses containing the data. This means
> that the HCA needs to perform a lookup of the relevant scatter entries
> upon each read response. Due to that, modern HCAs keep a dedicated cache
> for this type of RX-path lookup (which is limited in size naturally).

There is always a memory scatter, and MR setup overheads and table
reads are not zero. There is nothing architectural in the verbs or IBA
that would require a HCA to run slower for SGL than MR...

As you say, another choice the ULP should not be making. Even selecting
the optimal SGL list len is not something the ULP should do, in that
case.

Jason
> On Nov 11, 2015, at 11:29 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>
>> An alternate explanation is that the provider is not setting
>> device->max_sge_rd properly. rdma_read_chunk_lcl() seems to
>> be the only thing in my copy of the kernel tree that relies on
>> that value.
>>
>> I’ve reproduced the local length errors with CX-2 and CX-3
>> Pro, which both set that field to 32. If I artificially
>> set that field to 30, I don’t see any issue.
>>
>> Is commit 18ebd40773bf correct?
>
> See my latest patchset "Handle mlx4 max_sge_rd correctly" (v2).
>
> It fixes exactly this.
> Now you will be able to add your "Tested-by:" tag ;)

Confirmed that the current value (32) does not work, and that the
proposed replacement (30) does work, in an NFS server that uses the
_lcl path with mlx4.

> It's on Doug to take it...

Noted that the NFS server’s _lcl path is the only kernel consumer of
this value. Because the current NFS server does not use the _lcl path
with mlx4 devices, maybe this fix is not needed in stable, unless the
max_sge_rd field is visible to user space.

However, no future changes to the NFS server’s reader selection logic
should be made until this fix, or one like it, is merged. And I
recommend adding a "Fixes: 18ebd40773bf" tag.

--
Chuck Lever