Message ID | 1437548143-24893-39-git-send-email-sagig@mellanox.com (mailing list archive)
---|---
State | RFC
> @@ -2585,11 +2517,9 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
>  	struct isert_device *device = isert_conn->device;
>  	struct ib_device *ib_dev = device->ib_device;
>  	struct ib_mr *mr;
>  	struct ib_send_wr fr_wr, inv_wr;
>  	struct ib_send_wr *bad_wr, *wr = NULL;
> +	int ret;
>
>  	if (mem->dma_nents == 1) {
>  		sge->lkey = device->mr->lkey;
> @@ -2600,40 +2530,32 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
>  		return 0;
>  	}
>
> +	if (ind == ISERT_DATA_KEY_VALID)
>  		/* Registering data buffer */
>  		mr = fr_desc->data_mr;
> +	else
>  		/* Registering protection buffer */
>  		mr = fr_desc->pi_ctx->prot_mr;
>
>  	if (!(fr_desc->ind & ind)) {
>  		isert_inv_rkey(&inv_wr, mr);
>  		wr = &inv_wr;
>  	}
>
> +	ret = ib_map_mr_sg(mr, mem->sg, mem->nents, IB_ACCESS_LOCAL_WRITE);
> +	if (ret) {
> +		isert_err("failed to map sg %p with %d entries\n",
> +			  mem->sg, mem->dma_nents);
> +		return ret;
> +	}
> +
> +	isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
> +		  fr_desc, mem->nents, mem->offset);
> +
>  	/* Prepare FASTREG WR */
>  	memset(&fr_wr, 0, sizeof(fr_wr));
> +	ib_set_fastreg_wr(mr, mr->lkey, ISER_FASTREG_LI_WRID,
> +			  false, &fr_wr);

Shouldn't ib_set_fastreg_wr take care of this memset? Also it seems that instead of passing the signalled flag to it we might just set that or other flags later if we really want to.

>  struct pi_context {
>  	struct ib_mr		*prot_mr;
> -	struct ib_fast_reg_page_list	*prot_frpl;
>  	struct ib_mr		*sig_mr;
>  };
>
>  struct fast_reg_descriptor {
>  	struct list_head	list;
>  	struct ib_mr		*data_mr;
> -	struct ib_fast_reg_page_list	*data_frpl;
>  	u8			ind;
>  	struct pi_context	*pi_ctx;

As a follow on it might be worth to just kill off the separate pi_context structure here.
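One way the helper could absorb that memset, sketched against the call site above (ib_set_fastreg_wr is a work-in-progress API from this very series, so the fastreg-specific fields are left as a placeholder):

static inline void
ib_set_fastreg_wr(struct ib_mr *mr, u32 key, u64 wr_id,
		  bool signaled, struct ib_send_wr *wr)
{
	memset(wr, 0, sizeof(*wr));	/* absorbs every caller's memset */
	wr->wr_id = wr_id;
	wr->send_flags = signaled ? IB_SEND_SIGNALED : 0;
	/* ... fastreg-specific fields filled in from @mr and @key ... */
}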
On 7/22/2015 8:04 PM, Christoph Hellwig wrote:
>> @@ -2585,11 +2517,9 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
>>  	struct isert_device *device = isert_conn->device;
>>  	struct ib_device *ib_dev = device->ib_device;
>>  	struct ib_mr *mr;
>>  	struct ib_send_wr fr_wr, inv_wr;
>>  	struct ib_send_wr *bad_wr, *wr = NULL;
>> +	int ret;
>>
>>  	if (mem->dma_nents == 1) {
>>  		sge->lkey = device->mr->lkey;
>> @@ -2600,40 +2530,32 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
>>  		return 0;
>>  	}
>>
>> +	if (ind == ISERT_DATA_KEY_VALID)
>>  		/* Registering data buffer */
>>  		mr = fr_desc->data_mr;
>> +	else
>>  		/* Registering protection buffer */
>>  		mr = fr_desc->pi_ctx->prot_mr;
>>
>>  	if (!(fr_desc->ind & ind)) {
>>  		isert_inv_rkey(&inv_wr, mr);
>>  		wr = &inv_wr;
>>  	}
>>
>> +	ret = ib_map_mr_sg(mr, mem->sg, mem->nents, IB_ACCESS_LOCAL_WRITE);
>> +	if (ret) {
>> +		isert_err("failed to map sg %p with %d entries\n",
>> +			  mem->sg, mem->dma_nents);
>> +		return ret;
>> +	}
>> +
>> +	isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
>> +		  fr_desc, mem->nents, mem->offset);
>> +
>>  	/* Prepare FASTREG WR */
>>  	memset(&fr_wr, 0, sizeof(fr_wr));
>> +	ib_set_fastreg_wr(mr, mr->lkey, ISER_FASTREG_LI_WRID,
>> +			  false, &fr_wr);
>
> Shouldn't ib_set_fastreg_wr take care of this memset? Also it seems that instead of passing the signalled flag to it we might just set that or other flags later if we really want to.

The reason I didn't put it in was that ib_send_wr is not a small struct (92 bytes IIRC), so I'm a bit reluctant to add an unconditional memset. Maybe it's better that the callers can carefully set it to save some cycles?

>>  struct pi_context {
>>  	struct ib_mr		*prot_mr;
>> -	struct ib_fast_reg_page_list	*prot_frpl;
>>  	struct ib_mr		*sig_mr;
>>  };
>>
>>  struct fast_reg_descriptor {
>>  	struct list_head	list;
>>  	struct ib_mr		*data_mr;
>> -	struct ib_fast_reg_page_list	*data_frpl;
>>  	u8			ind;
>>  	struct pi_context	*pi_ctx;
>
> As a follow on it might be worth to just kill off the separate pi_context structure here.

Yea, we can do that..
On Wed, Jul 22, 2015 at 08:33:16PM +0300, Sagi Grimberg wrote:
> >>  	memset(&fr_wr, 0, sizeof(fr_wr));
> >> +	ib_set_fastreg_wr(mr, mr->lkey, ISER_FASTREG_LI_WRID,
> >> +			  false, &fr_wr);
> >
> > Shouldn't ib_set_fastreg_wr take care of this memset? Also it seems that instead of passing the signalled flag to it we might just set that or other flags later if we really want to.

Seems reasonable.

If you want to micro optimize then just zero the few items that are defined to be accessed for fastreg, no need to zero the whole structure. In fact, you may have already done that, so just drop the memset entirely.

> The reason I didn't put it in was that ib_send_wr is not a small struct (92 bytes IIRC), so I'm a bit reluctant to add an unconditional memset. Maybe it's better that the callers can carefully set it to save some cycles?

If you want to optimize this path, then Sean is right: move the post into the driver and stop pretending that ib_post_send is a performance API.

ib_post_fastreg_wr would be a function that takes 3 register-passed arguments and does a simple copy to the driver's actual sendq. No 96 byte structure memset, no stack traffic, no conditional jumps.

Jason
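To make that concrete, a hypothetical prototype (this verb does not exist in ib_verbs.h; the name and signature are only illustrative):

/*
 * Hypothetical: post a fast registration for @mr on @qp's send queue.
 * Three register-passed arguments; the driver writes the WQE directly
 * into its real send queue, so there is no ib_send_wr on the stack
 * and nothing to memset.
 */
int ib_post_fastreg_wr(struct ib_qp *qp, struct ib_mr *mr, u64 wr_id);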
On 7/22/2015 8:57 PM, Jason Gunthorpe wrote:
> On Wed, Jul 22, 2015 at 08:33:16PM +0300, Sagi Grimberg wrote:
>>>>  	memset(&fr_wr, 0, sizeof(fr_wr));
>>>> +	ib_set_fastreg_wr(mr, mr->lkey, ISER_FASTREG_LI_WRID,
>>>> +			  false, &fr_wr);
>>>
>>> Shouldn't ib_set_fastreg_wr take care of this memset? Also it seems that instead of passing the signalled flag to it we might just set that or other flags later if we really want to.
>
> Seems reasonable.
>
> If you want to micro optimize then just zero the few items that are defined to be accessed for fastreg, no need to zero the whole structure. In fact, you may have already done that, so just drop the memset entirely.

I will.

>> The reason I didn't put it in was that ib_send_wr is not a small struct (92 bytes IIRC), so I'm a bit reluctant to add an unconditional memset. Maybe it's better that the callers can carefully set it to save some cycles?
>
> If you want to optimize this path, then Sean is right: move the post into the driver and stop pretending that ib_post_send is a performance API.
>
> ib_post_fastreg_wr would be a function that takes 3 register-passed arguments and does a simple copy to the driver's actual sendq.

That will require taking the SQ lock and writing a doorbell for each registration and post you want to do. I'm confident that constructing a post chain with a single SQ lock acquire and a single doorbell will be much, much better even with conditional jumps and memsets.

svcrdma, isert (and iser - not upstream yet) are doing it. I think that others should do it too. My tests show that this makes a difference in small IO workloads.
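A minimal sketch of the chained-post pattern described above, assuming the individual WRs have already been built (the helper name is illustrative):

#include <rdma/ib_verbs.h>

/*
 * Chain an optional LOCAL_INV, the fastreg WR and the data-transfer WR
 * together and post them with one ib_post_send(): one SQ lock acquire
 * and one doorbell for the whole chain.
 */
static int post_reg_and_send(struct ib_qp *qp, struct ib_send_wr *inv_wr,
			     struct ib_send_wr *fastreg_wr,
			     struct ib_send_wr *send_wr)
{
	struct ib_send_wr *first = fastreg_wr, *bad_wr;

	fastreg_wr->next = send_wr;
	send_wr->next = NULL;

	if (inv_wr) {			/* stale rkey must be invalidated first */
		inv_wr->next = fastreg_wr;
		first = inv_wr;
	}

	return ib_post_send(qp, first, &bad_wr);
}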
On Jul 23, 2015, at 6:27 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:

> On 7/22/2015 8:57 PM, Jason Gunthorpe wrote:
>> On Wed, Jul 22, 2015 at 08:33:16PM +0300, Sagi Grimberg wrote:
>>>>>  	memset(&fr_wr, 0, sizeof(fr_wr));
>>>>> +	ib_set_fastreg_wr(mr, mr->lkey, ISER_FASTREG_LI_WRID,
>>>>> +			  false, &fr_wr);
>>>>
>>>> Shouldn't ib_set_fastreg_wr take care of this memset? Also it seems that instead of passing the signalled flag to it we might just set that or other flags later if we really want to.
>>
>> Seems reasonable.
>>
>> If you want to micro optimize then just zero the few items that are defined to be accessed for fastreg, no need to zero the whole structure. In fact, you may have already done that, so just drop the memset entirely.
>
> I will.
>
>>> The reason I didn't put it in was that ib_send_wr is not a small struct (92 bytes IIRC), so I'm a bit reluctant to add an unconditional memset. Maybe it's better that the callers can carefully set it to save some cycles?
>>
>> If you want to optimize this path, then Sean is right: move the post into the driver and stop pretending that ib_post_send is a performance API.
>>
>> ib_post_fastreg_wr would be a function that takes 3 register-passed arguments and does a simple copy to the driver's actual sendq.
>
> That will require taking the SQ lock and writing a doorbell for each registration and post you want to do. I'm confident that constructing a post chain with a single SQ lock acquire and a single doorbell will be much, much better even with conditional jumps and memsets.

I agree. xprtrdma uses several MRs per RPC. It would be more efficient to chain together several WRs and post once to deal with these, especially for HCAs/providers that have a shallow page_list depth.

> svcrdma, isert (and iser - not upstream yet) are doing it. I think that others should do it too. My tests show that this makes a difference in small IO workloads.

--
Chuck Lever
On Thu, Jul 23, 2015 at 01:27:23PM +0300, Sagi Grimberg wrote:
>> ib_post_fastreg_wr would be a function that takes 3 register-passed arguments and does a simple copy to the driver's actual sendq.
>
> That will require taking the SQ lock and writing a doorbell for each registration and post you want to do. I'm confident that constructing a post chain with a single SQ lock acquire and a single doorbell will be much, much better even with conditional jumps and memsets.

You are still thinking at a micro level; the ULP should be working at a higher level and requesting the MR(s) and the actual work together, so the driver can run the whole chain of posts without extra stack traffic, locking or doorbells.

Jason
On 7/23/2015 7:31 PM, Jason Gunthorpe wrote:
> On Thu, Jul 23, 2015 at 01:27:23PM +0300, Sagi Grimberg wrote:
>>> ib_post_fastreg_wr would be a function that takes 3 register-passed arguments and does a simple copy to the driver's actual sendq.
>>
>> That will require taking the SQ lock and writing a doorbell for each registration and post you want to do. I'm confident that constructing a post chain with a single SQ lock acquire and a single doorbell will be much, much better even with conditional jumps and memsets.
>
> You are still thinking at a micro level; the ULP should be working at a higher level and requesting the MR(s) and the actual work together, so the driver can run the whole chain of posts without extra stack traffic, locking or doorbells.

But I'd also want to chain the subsequent RDMA(s) or SEND (with the rkey(s)) under the same post. I'm sorry, but the idea of handling memory region mapping (possibly more than one), detecting gaps, and deciding on the strategy of what to do and who knows what else under the send queue lock doesn't seem like a good idea; it's complete overkill IMO.

I don't mean to be negative about your ideas, I just don't think that doing all the work in the drivers is going to get us to a better place.
On Thu, Jul 23, 2015 at 07:59:48PM +0300, Sagi Grimberg wrote:
> I don't mean to be negative about your ideas, I just don't think that doing all the work in the drivers is going to get us to a better place.

No worries, I'm hoping someone can put the pieces together and figure out how to code-share all the duplication we seem to have in the ULPs.

The more I've looked at them, the more it seems like they get basic things wrong, like SQE accounting in NFS, DMA flush ordering in NFS, rkey security in SRP/iSER..

Sharing code means we can fix those problems for good.

Jason
On Jul 23, 2015, at 2:53 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> On Thu, Jul 23, 2015 at 07:59:48PM +0300, Sagi Grimberg wrote:
>> I don't mean to be negative about your ideas, I just don't think that doing all the work in the drivers is going to get us to a better place.
>
> No worries, I'm hoping someone can put the pieces together and figure out how to code-share all the duplication we seem to have in the ULPs.
>
> The more I've looked at them, the more it seems like they get basic things wrong, like SQE accounting in NFS, DMA flush ordering in NFS,

I have a work-in-progress prototype that addresses both of these issues. Unfinished, but operational:

http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=shortlog;h=refs/heads/nfs-rdma-future

Having this should give us time to analyze the performance impact of these changes, and to dial in an approach that aligns with your vision about the unified APIs that you and Sagi have been discussing.

FRWR is seeing a 10-15% throughput reduction with 8-thread dbench, but a 5% improvement with 16-thread fio IOPS. 4K and 8K direct read and write are negatively impacted. I don't see any significant change in client CPU utilization, but have not yet examined changes in interrupt workload, nor have I done any spin lock or CPU bus traffic analysis. But none of this is as bad as I feared it could be. There are plenty of other areas that can recoup some or all of this loss eventually.

I converted the RPC reply handler tasklet to a work queue context to allow sleeping. A new .ro_unmap_sync method is invoked after the RPC/RDMA header is parsed but before xprt_complete_rqst() wakes up the waiting RPC. .ro_unmap_sync is 100% synchronous: it does not return to the reply handler until the MRs are invalid and unmapped.

For FMR, .ro_unmap_sync makes a list of the RPC's MRs and passes that list to a single ib_unmap_fmr() call, then performs DMA unmap and releases the MRs. This is actually much more efficient than the current logic, which serially does an ib_unmap_fmr() for each MR the RPC owns. So FMR overall performs better with this change.

For FRWR, .ro_unmap_sync builds a chain of LOCAL_INV WRs for the RPC's MRs and posts that with a single ib_post_send(). The final WR in the chain is signaled, and a kernel completion is used to wait for the LINV chain to complete. Then DMA unmap and MR release. This lengthens per-RPC latency for FRWR, because the LINVs are now fully accounted for in the RPC round trip rather than being done asynchronously after the RPC completes. So here performance is closer to FMR, but is still better by a substantial margin.

Because the next RPC cannot awaken until the last send completes, send queue accounting is based on RPC/RDMA credit flow control.

I'm sure there are some details here that still need to be addressed, but this fixes the big problem with FRWR send queue accounting, which was that LOCAL_INV WRs would continue to consume SQEs while another RPC was allowed to start.

I think switching to use s/g lists will be straightforward and could simplify the overall approach somewhat.

> rkey security in SRP/iSER..
>
> Sharing code means we can fix those problems for good.

--
Chuck Lever
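The FRWR ->ro_unmap_sync flow described above boils down to something like the following sketch (names and structure are illustrative, not the actual xprtrdma code; the send completion handler is expected to call complete() on linv_done when the signaled WR finishes):

#include <rdma/ib_verbs.h>
#include <linux/completion.h>

struct frwr_sync_state {
	struct completion linv_done;	/* init_completion() at setup time */
};

/* Caller fills in each WR's ex.invalidate_rkey for the MR it targets. */
static int frwr_unmap_sync(struct ib_qp *qp, struct ib_send_wr *linv_wrs,
			   int nr_wrs, struct frwr_sync_state *st)
{
	struct ib_send_wr *bad_wr;
	int i, rc;

	for (i = 0; i < nr_wrs; i++) {
		linv_wrs[i].opcode = IB_WR_LOCAL_INV;
		linv_wrs[i].send_flags = 0;
		linv_wrs[i].next = (i < nr_wrs - 1) ? &linv_wrs[i + 1] : NULL;
	}
	/* only the final LINV generates a completion */
	linv_wrs[nr_wrs - 1].send_flags = IB_SEND_SIGNALED;

	reinit_completion(&st->linv_done);
	rc = ib_post_send(qp, &linv_wrs[0], &bad_wr);
	if (rc)
		return rc;

	wait_for_completion(&st->linv_done);
	return 0;			/* MRs now invalid; safe to DMA unmap */
}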
On Fri, Jul 24, 2015 at 10:36:07AM -0400, Chuck Lever wrote:
> Unfinished, but operational:
>
> http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=shortlog;h=refs/heads/nfs-rdma-future

Nice..

Can you spend some time and reflect on how some of this could be lowered into the core code? The FMR and FRWR side have many similarities now..

> FRWR is seeing a 10-15% throughput reduction with 8-thread dbench, but a 5% improvement with 16-thread fio IOPS. 4K and 8K direct read and write are negatively impacted.

I'm not surprised, since invalidate is sync. I believe you need to incorporate SEND WITH INVALIDATE to substantially recover this overhead.

It would be neat if the RQ could continue to advance while waiting for the invalidate.. That looks almost doable..

> I converted the RPC reply handler tasklet to a work queue context to allow sleeping. A new .ro_unmap_sync method is invoked after the RPC/RDMA header is parsed but before xprt_complete_rqst() wakes up the waiting RPC.

.. so the issue is that the RPC must be substantially parsed to learn which MR it is associated with to schedule the invalidate?

> This is actually much more efficient than the current logic, which serially does an ib_unmap_fmr() for each MR the RPC owns. So FMR overall performs better with this change.

Interesting..

> Because the next RPC cannot awaken until the last send completes, send queue accounting is based on RPC/RDMA credit flow control.

So for FRWR the sync invalidate effectively guarantees all SQEs related to this RPC are flushed. That seems reasonable; if the number of SQEs and CQEs is properly sized in relation to the RPC slot count it should be workable..

How do FMR and PHYS synchronize?

> I'm sure there are some details here that still need to be addressed, but this fixes the big problem with FRWR send queue accounting, which was that LOCAL_INV WRs would continue to consume SQEs while another RPC was allowed to start.

Did you test without that artificial limit you mentioned before?

I'm also wondering about this:

> During some other testing I found that when a completion upcall returns to the provider leaving CQEs still on the completion queue, there is a non-zero probability that a completion will be lost.

What does lost mean?

The CQ is edge triggered, so if you don't drain it you might not get another timely CQ callback (which is bad), but CQEs themselves should not be lost.

Jason
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> Sent: Friday, July 24, 2015 11:27 AM
> To: Chuck Lever
> Cc: Sagi Grimberg; Christoph Hellwig; linux-rdma; Liran Liss; Oren Duer
> Subject: Re: [PATCH WIP 38/43] iser-target: Port to new memory registration API
>
> On Fri, Jul 24, 2015 at 10:36:07AM -0400, Chuck Lever wrote:
>
>> Unfinished, but operational:
>>
>> http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=shortlog;h=refs/heads/nfs-rdma-future
>
> Nice..
>
> Can you spend some time and reflect on how some of this could be lowered into the core code? The FMR and FRWR side have many similarities now..
>
>> FRWR is seeing a 10-15% throughput reduction with 8-thread dbench, but a 5% improvement with 16-thread fio IOPS. 4K and 8K direct read and write are negatively impacted.
>
> I'm not surprised, since invalidate is sync. I believe you need to incorporate SEND WITH INVALIDATE to substantially recover this overhead.
>
> It would be neat if the RQ could continue to advance while waiting for the invalidate.. That looks almost doable..
>
>> I converted the RPC reply handler tasklet to a work queue context to allow sleeping. A new .ro_unmap_sync method is invoked after the RPC/RDMA header is parsed but before xprt_complete_rqst() wakes up the waiting RPC.
>
> .. so the issue is that the RPC must be substantially parsed to learn which MR it is associated with to schedule the invalidate?
>
>> This is actually much more efficient than the current logic, which serially does an ib_unmap_fmr() for each MR the RPC owns. So FMR overall performs better with this change.
>
> Interesting..
>
>> Because the next RPC cannot awaken until the last send completes, send queue accounting is based on RPC/RDMA credit flow control.
>
> So for FRWR the sync invalidate effectively guarantees all SQEs related to this RPC are flushed. That seems reasonable; if the number of SQEs and CQEs is properly sized in relation to the RPC slot count it should be workable..
>
> How do FMR and PHYS synchronize?
>
>> I'm sure there are some details here that still need to be addressed, but this fixes the big problem with FRWR send queue accounting, which was that LOCAL_INV WRs would continue to consume SQEs while another RPC was allowed to start.
>
> Did you test without that artificial limit you mentioned before?
>
> I'm also wondering about this:
>
>> During some other testing I found that when a completion upcall returns to the provider leaving CQEs still on the completion queue, there is a non-zero probability that a completion will be lost.
>
> What does lost mean?
>
> The CQ is edge triggered, so if you don't drain it you might not get another timely CQ callback (which is bad), but CQEs themselves should not be lost.

This condition (not fully draining the CQEs) is due to SQ flow control, yes? If so, then when the SQ resumes, can it wake up the appropriate thread (simulating another CQE insertion)?
On Jul 24, 2015, at 12:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> On Fri, Jul 24, 2015 at 10:36:07AM -0400, Chuck Lever wrote:
>
>> Unfinished, but operational:
>>
>> http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=shortlog;h=refs/heads/nfs-rdma-future
>
> Nice..
>
> Can you spend some time and reflect on how some of this could be lowered into the core code?

The point of the prototype is to start thinking about this with actual data. :-) So I'm with you.

> The FMR and FRWR side have many similarities now..

>> FRWR is seeing a 10-15% throughput reduction with 8-thread dbench, but a 5% improvement with 16-thread fio IOPS. 4K and 8K direct read and write are negatively impacted.
>
> I'm not surprised, since invalidate is sync. I believe you need to incorporate SEND WITH INVALIDATE to substantially recover this overhead.

I tried to find another kernel ULP using SEND WITH INVALIDATE, but I didn't see one. I assume you mean the NFS server would use this WR when replying, to knock down the RPC's client MRs remotely?

> It would be neat if the RQ could continue to advance while waiting for the invalidate.. That looks almost doable..

The new reply handling work queue is not restricted to serial reply processing. Unlike the tasklet model, multiple RPC replies can be processed at once, and can run across all CPUs. The tasklet was global, shared across all RPC/RDMA receive queues on that client. AFAICT there is very little else that is shared between RPC replies.

I think using a work queue instead may be a tiny bit slower for each RPC (perhaps due to additional context switching), but will allow much better scaling with the number of transports and mount points the client creates.

I may not have understood your comment.

>> I converted the RPC reply handler tasklet to a work queue context to allow sleeping. A new .ro_unmap_sync method is invoked after the RPC/RDMA header is parsed but before xprt_complete_rqst() wakes up the waiting RPC.
>
> .. so the issue is that the RPC must be substantially parsed to learn which MR it is associated with to schedule the invalidate?

Only the RPC/RDMA header has to be parsed, but yes. The needed parsing is handled in rpcrdma_reply_handler right before the .ro_unmap_sync call. Parsing the RPC reply results is then done by the upper layer once xprt_complete_rqst() has run.

>> This is actually much more efficient than the current logic, which serially does an ib_unmap_fmr() for each MR the RPC owns. So FMR overall performs better with this change.
>
> Interesting..
>
>> Because the next RPC cannot awaken until the last send completes, send queue accounting is based on RPC/RDMA credit flow control.
>
> So for FRWR the sync invalidate effectively guarantees all SQEs related to this RPC are flushed. That seems reasonable; if the number of SQEs and CQEs is properly sized in relation to the RPC slot count it should be workable..

Yes, both queues are sized in rpcrdma_ep_create() according to the slot count / credit limit.

> How do FMR and PHYS synchronize?

We still rely on timing there. The RPC's send buffer may be re-allocated by the next RPC if that RPC wants to send a bigger request than this one. Thus there is still a tiny but non-zero risk the HCA may not be done with that send buffer. Closing that hole is still on my to-do list.
>> I'm sure there are some details here that still need to be addressed, but this fixes the big problem with FRWR send queue accounting, which was that LOCAL_INV WRs would continue to consume SQEs while another RPC was allowed to start.
>
> Did you test without that artificial limit you mentioned before?

Yes. No problems now; the limit is removed in the last patch in that series.

> I'm also wondering about this:
>
>> During some other testing I found that when a completion upcall returns to the provider leaving CQEs still on the completion queue, there is a non-zero probability that a completion will be lost.
>
> What does lost mean?

Lost means a WC in the CQ is skipped by ib_poll_cq().

In other words, I expected that during the next upcall, ib_poll_cq() would return WCs that were not processed, starting with the last one on the CQ when my upcall handler returned.

I found this by intentionally having the completion handler process only one or two WCs and then return.

> The CQ is edge triggered, so if you don't drain it you might not get another timely CQ callback (which is bad), but CQEs themselves should not be lost.

I'm not sure I fully understand this problem; it might even be my misunderstanding about ib_poll_cq(). But forcing the completion upcall handler to completely drain the CQ during each upcall prevents the issue.

(Note, I don't think fixing this is a pre-requisite for the synchronous invalidate work; it just happened to be in the patch queue.)

--
Chuck Lever
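For reference, fully draining the CQ on each upcall is the standard re-arm/re-poll idiom, roughly as follows (the handler name is illustrative):

#include <rdma/ib_verbs.h>

static void handle_wc(void *ctx, struct ib_wc *wc);	/* ULP-specific */

static void example_cq_upcall(struct ib_cq *cq, void *ctx)
{
	struct ib_wc wc;

	do {
		while (ib_poll_cq(cq, 1, &wc) > 0)
			handle_wc(ctx, &wc);
		/* re-arm, then re-poll in case a WC slipped in meanwhile */
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				      IB_CQ_REPORT_MISSED_EVENTS) > 0);
}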
On Fri, Jul 24, 2015 at 01:46:05PM -0400, Chuck Lever wrote:
>> I'm not surprised, since invalidate is sync. I believe you need to incorporate SEND WITH INVALIDATE to substantially recover this overhead.
>
> I tried to find another kernel ULP using SEND WITH INVALIDATE, but I didn't see one. I assume you mean the NFS server would use this WR when replying, to knock down the RPC's client MRs remotely?

Yes. I think the issue with it not being used in the kernel is mainly to do with lack of standardization. The verb cannot be used unless both sides negotiate it, and perhaps the older RDMA protocols have not been revised to include that.

For simple testing purposes it shouldn't be too hard to force it, to get an idea if it is worth pursuing. On the RECV work completion check if the right rkey was invalidated and skip the invalidation step. Presumably the HCA does all this internally very quickly..

> I may not have understood your comment.

Okay, I didn't look closely at the entire series together..

> Only the RPC/RDMA header has to be parsed, but yes. The needed parsing is handled in rpcrdma_reply_handler right before the .ro_unmap_sync call.

Right, okay. If this could be done in the rq callback itself rather than bouncing to a wq, immediately turning around the needed invalidate posts, you'd win back a little more overhead by reducing the turnaround time... Then bounce to the wq to complete from the SQ callback?

>> Did you test without that artificial limit you mentioned before?
>
> Yes. No problems now; the limit is removed in the last patch in that series.

Okay, so that was just overflowing the SQ due to not accounting..

>>> During some other testing I found that when a completion upcall returns to the provider leaving CQEs still on the completion queue, there is a non-zero probability that a completion will be lost.
>>
>> What does lost mean?
>
> Lost means a WC in the CQ is skipped by ib_poll_cq().
>
> In other words, I expected that during the next upcall, ib_poll_cq() would return WCs that were not processed, starting with the last one on the CQ when my upcall handler returned.

Yes, this is what it should do. I wouldn't expect a timely upcall, but none should be lost.

> I found this by intentionally having the completion handler process only one or two WCs and then return.
>
>> The CQ is edge triggered, so if you don't drain it you might not get another timely CQ callback (which is bad), but CQEs themselves should not be lost.
>
> I'm not sure I fully understand this problem; it might even be my misunderstanding about ib_poll_cq(). But forcing the completion upcall handler to completely drain the CQ during each upcall prevents the issue.

CQEs should never be lost. The idea that you can completely drain the CQ during the upcall is inherently racy, so this cannot be the answer to whatever the problem is..

Is there any chance this is still an artifact of the lazy SQE flow control? The RDMA buffer SQE recycling is solved by the sync invalidate, but workloads that don't use RDMA buffers (ie SEND only) will still run without proper flow control...

If you are totally certain a CQE was dropped by ib_poll_cq, and that the SQ is not overflowing by strict accounting, then I'd say driver problem, but the odds of having an undetected driver problem like that at this point seem somehow small...
Jason
On Jul 24, 2015, at 3:10 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> On Fri, Jul 24, 2015 at 01:46:05PM -0400, Chuck Lever wrote:
>>> I'm not surprised, since invalidate is sync. I believe you need to incorporate SEND WITH INVALIDATE to substantially recover this overhead.
>>
>> I tried to find another kernel ULP using SEND WITH INVALIDATE, but I didn't see one. I assume you mean the NFS server would use this WR when replying, to knock down the RPC's client MRs remotely?
>
> Yes. I think the issue with it not being used in the kernel is mainly to do with lack of standardization. The verb cannot be used unless both sides negotiate it, and perhaps the older RDMA protocols have not been revised to include that.

And RPC-over-RDMA version 1 does not have any way to signal that the server has invalidated the MRs. Such signaling would be a pre-requisite to allow the Linux NFS/RDMA client to interoperate with non-Linux NFS/RDMA servers that do not have such support.

>> Only the RPC/RDMA header has to be parsed, but yes. The needed parsing is handled in rpcrdma_reply_handler right before the .ro_unmap_sync call.
>
> Right, okay. If this could be done in the rq callback itself rather than bouncing to a wq, immediately turning around the needed invalidate posts, you'd win back a little more overhead by reducing the turnaround time... Then bounce to the wq to complete from the SQ callback?

For FRWR, you could post LINV from the receive completion upcall handler, and handle the rest of the invalidation from the send completion upcall, then poke the RPC reply handler.

But this wouldn't work at all for FMR, whose unmap verb is synchronous, would it?

I'm not sure we'd buy more than a few microseconds here, and the receive upcall is single-threaded.

I'll move the "lost WC" discussion to another thread.

--
Chuck Lever
On Fri, Jul 24, 2015 at 03:59:06PM -0400, Chuck Lever wrote:
> And RPC-over-RDMA version 1 does not have any way to signal that the server has invalidated the MRs. Such signaling would be a pre-requisite to allow the Linux NFS/RDMA client to interoperate with non-Linux NFS/RDMA servers that do not have such support.

You can implement client support immediately; nothing special is required.

When processing the recv WC for an incoming SEND, check ex.invalidate_rkey and IB_WC_WITH_INVALIDATE. If that rkey matches the MR associated with that RPC slot then skip the invalidate.

No protocol negotiation is required at that point.

I am unclear what happens server side if the server starts issuing SEND_WITH_INVALIDATE to a client that doesn't expect it. The net result is a MR would be invalidated twice. I don't know if this is OK or not.

If it is OK, then the server can probably just start using it as well without negotiation. Otherwise the client has to signal the server it supports it once at connection setup.

> For FRWR, you could post LINV from the receive completion upcall handler, and handle the rest of the invalidation from the send completion upcall, then poke the RPC reply handler.

Yes

> But this wouldn't work at all for FMR, whose unmap verb is synchronous, would it?

It could run the FMR unmap in a thread/workqueue/tasklet and then complete the RPC side from that context. Same basic idea, using your tasklet, not the driver's sendq context.

> I'm not sure we'd buy more than a few microseconds here, and the receive upcall is single-threaded.

Not sure how that matches your performance goals; just remarking that launching the invalidate in the recv upcall and completing processing from the sendq upcall is the very best performance you can expect from this API.

Jason
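The client-side check is small; a sketch, with the per-RPC rkey lookup left to the caller:

#include <rdma/ib_verbs.h>

/*
 * On the receive completion for the incoming SEND, report whether the
 * peer's SEND_WITH_INVALIDATE already invalidated this RPC's MR, in
 * which case the local LOCAL_INV can be skipped.
 */
static bool remote_invalidate_done(const struct ib_wc *wc, u32 mr_rkey)
{
	return (wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
	       wc->ex.invalidate_rkey == mr_rkey;
}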
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> Sent: Friday, July 24, 2015 3:25 PM
> To: Chuck Lever
> Cc: Sagi Grimberg; Christoph Hellwig; linux-rdma; Liran Liss; Oren Duer
> Subject: Re: [PATCH WIP 38/43] iser-target: Port to new memory registration API
>
> On Fri, Jul 24, 2015 at 03:59:06PM -0400, Chuck Lever wrote:
>> And RPC-over-RDMA version 1 does not have any way to signal that the server has invalidated the MRs. Such signaling would be a pre-requisite to allow the Linux NFS/RDMA client to interoperate with non-Linux NFS/RDMA servers that do not have such support.
>
> You can implement client support immediately; nothing special is required.
>
> When processing the recv WC for an incoming SEND, check ex.invalidate_rkey and IB_WC_WITH_INVALIDATE. If that rkey matches the MR associated with that RPC slot then skip the invalidate.
>
> No protocol negotiation is required at that point.
>
> I am unclear what happens server side if the server starts issuing SEND_WITH_INVALIDATE to a client that doesn't expect it. The net result is a MR would be invalidated twice. I don't know if this is OK or not.

It is OK to invalidate an already-invalid MR.

> If it is OK, then the server can probably just start using it as well without negotiation. Otherwise the client has to signal the server it supports it once at connection setup.
>
>> For FRWR, you could post LINV from the receive completion upcall handler, and handle the rest of the invalidation from the send completion upcall, then poke the RPC reply handler.
>
> Yes
>
>> But this wouldn't work at all for FMR, whose unmap verb is synchronous, would it?
>
> It could run the FMR unmap in a thread/workqueue/tasklet and then complete the RPC side from that context. Same basic idea, using your tasklet, not the driver's sendq context.
>
>> I'm not sure we'd buy more than a few microseconds here, and the receive upcall is single-threaded.
>
> Not sure how that matches your performance goals; just remarking that launching the invalidate in the recv upcall and completing processing from the sendq upcall is the very best performance you can expect from this API.
>
> Jason
> > I am unclear what happens server side if the server starts issuing SEND_WITH_INVALIDATE to a client that doesn't expect it. The net result is a MR would be invalidated twice. I don't know if this is OK or not.
>
> It is OK to invalidate an already-invalid MR.

Nice, ah but I forgot about the last issue..

A server must not send the SEND_WITH_INVALIDATE OP to a client HCA that does not support it in HW. At least on IB the operation code is different, so it will break..

So negotiation is needed..

Jason
On Jul 24, 2015, at 1:46 PM, Chuck Lever <chuck.lever@oracle.com> wrote:

> On Jul 24, 2015, at 12:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>
>> On Fri, Jul 24, 2015 at 10:36:07AM -0400, Chuck Lever wrote:
>>
>>> Unfinished, but operational:
>>>
>>> http://git.linux-nfs.org/?p=cel/cel-2.6.git;a=shortlog;h=refs/heads/nfs-rdma-future
>>
>> Nice..
>>
>> Can you spend some time and reflect on how some of this could be lowered into the core code?
>
> The point of the prototype is to start thinking about this with actual data. :-) So I'm with you.
>
>> The FMR and FRWR side have many similarities now..

IMO ib_unmap_fmr is a very different animal from a LOCAL_INV WR.

ib_unmap_fmr is synchronous, provides no ordering guarantees with send queue operations, and does not depend on a connected QP to be available. You could emulate asynchronicity with a work queue, but that still does not provide SQ ordering. There are few if any failure modes for ib_unmap_fmr.

A LOCAL_INV WR is asynchronous, provides strong ordering with other send queue operations, but does require a non-NULL QP in RTS to work. The failure modes are complex: without a QP in RTS, the post_send fails. If the QP leaves RTS while a LOCAL_INV is in flight, the LINV flushes. MRs can be left in a state where the MR's rkey is not in sync with the HW, in which case a synchronous operation may be required to recover the MR.

These are the reasons I elected to employ a synchronous invalidation model in the RPC reply handler. This model can be made to work adequately for both FMR and FRWR, provides proper DMA unmap ordering guarantees for both, and hides wonky transport disconnect recovery mechanics. The only downside is the performance cost.

A generic MR invalidation API that buries underlying verb activity and guarantees proper DMA unmap ordering would, I think, have to be synchronous.

In the long run, two things will change: first, FMR will eventually be deprecated; and second, ULPs will likely adopt SEND_WITH_INV. The complexion of MR invalidation could be vastly different in a few years: handled entirely by the target side, and only verified by the initiator. Verification doesn't need to sleep, and the slow path (the target failed to invalidate) can be deferred. All that would be necessary at that point would be a synchronous invalidation API (synchronous in the sense that the invalidate is complete if the API returns without error).

--
Chuck Lever
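The end state sketched above would need little more than a synchronous invalidation entry point, hypothetically something like this (not an existing verb; name and signature are illustrative):

/*
 * Hypothetical: returns only once @mr is invalid and safe to DMA
 * unmap, hiding whether that took an ib_unmap_fmr(), a signaled
 * LOCAL_INV, or just verification that the target already
 * invalidated the rkey remotely.
 */
int ib_invalidate_mr_sync(struct ib_mr *mr);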
On Mon, Jul 27, 2015 at 11:57:46AM -0400, Chuck Lever wrote:
> IMO ib_unmap_fmr is a very different animal from a LOCAL_INV WR.

Sure, but how many of these properties does NFS actually care about, now that it is running the API properly?

> ib_unmap_fmr is synchronous, provides no ordering guarantees with send queue operations, and does not depend on a connected QP to be available. You could emulate asynchronicity with a work queue, but that still does not provide SQ ordering. There are few if any failure modes for ib_unmap_fmr.

I'm having a hard time seeing how SQ ordering is important when the API is used properly. Once you explicitly order the DMA unmap after the invalidate completion, you no longer need implicit SQ ordering.

Is there a way to combine SQ implicit ordering and the Linux DMA API together correctly?

> flight, the LINV flushes. MRs can be left in a state where the MR's rkey is not in sync with the HW, in which case a synchronous operation may be required to recover the MR.

The error handling seems like a trivial difference; an ib_recover_failed_qp_mr(mr) sort of call could resync everything after a QP blows up..

> The complexion of MR invalidation could be vastly different in a few years: handled entirely by the target side, and only verified by the initiator. Verification doesn't need to sleep, and the slow path (the target failed to invalidate) can be deferred.

The initiator still needs to have the ability to issue the invalidate if the target doesn't do it, so all the code still exists..

Even ignoring those issues, should we be talking about putting FMR under the new ib_alloc_mr and ib_map_mr interfaces? Would that help much even if the post and unmap flows are totally different?

Jason
On Jul 27, 2015, at 1:25 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> On Mon, Jul 27, 2015 at 11:57:46AM -0400, Chuck Lever wrote:
>> IMO ib_unmap_fmr is a very different animal from a LOCAL_INV WR.
>
> Sure, but how many of these properties does NFS actually care about, now that it is running the API properly?
>
>> ib_unmap_fmr is synchronous, provides no ordering guarantees with send queue operations, and does not depend on a connected QP to be available. You could emulate asynchronicity with a work queue, but that still does not provide SQ ordering. There are few if any failure modes for ib_unmap_fmr.
>
> I'm having a hard time seeing how SQ ordering is important when the API is used properly. Once you explicitly order the DMA unmap after the invalidate completion, you no longer need implicit SQ ordering.
>
> Is there a way to combine SQ implicit ordering and the Linux DMA API together correctly?
>
>> flight, the LINV flushes. MRs can be left in a state where the MR's rkey is not in sync with the HW, in which case a synchronous operation may be required to recover the MR.
>
> The error handling seems like a trivial difference; an ib_recover_failed_qp_mr(mr) sort of call could resync everything after a QP blows up..

Out of interest, why does this need to be exposed to ULPs? I don't feel a ULP should have to deal with broken MRs following a transport disconnect. It falls in that category of things every ULP that supports FRWR has to do, and each has plenty of opportunity to get it wrong.

>> The complexion of MR invalidation could be vastly different in a few years: handled entirely by the target side, and only verified by the initiator. Verification doesn't need to sleep, and the slow path (the target failed to invalidate) can be deferred.
>
> The initiator still needs to have the ability to issue the invalidate if the target doesn't do it, so all the code still exists..
>
> Even ignoring those issues, should we be talking about putting FMR under the new ib_alloc_mr and ib_map_mr interfaces? Would that help much even if the post and unmap flows are totally different?

My opinion is FMR should be separate from the new API. Some have expressed an interest in combining all kernel registration mechanisms under a single API, but they seem too different from each other to do that successfully.

--
Chuck Lever
On Tue, Jul 28, 2015 at 04:06:23PM -0400, Chuck Lever wrote:
> My opinion is FMR should be separate from the new API. Some have expressed an interest in combining all kernel registration mechanisms under a single API, but they seem too different from each other to do that successfully.

Hi Chuck,

I think we can fit FMR partially under this API, e.g. alloc and map_sg fit in very well, but then instead of post and invalidate we'll need to call into slightly modified existing FMR pool APIs.

I'd suggest postponing the issue for now; I'll prepare a prototype once we've finished the FR-side API.
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 94395ce..af1c01d 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -486,10 +486,8 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
 	list_for_each_entry_safe(fr_desc, tmp,
 				 &isert_conn->fr_pool, list) {
 		list_del(&fr_desc->list);
-		ib_free_fast_reg_page_list(fr_desc->data_frpl);
 		ib_dereg_mr(fr_desc->data_mr);
 		if (fr_desc->pi_ctx) {
-			ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
 			ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
 			ib_dereg_mr(fr_desc->pi_ctx->sig_mr);
 			kfree(fr_desc->pi_ctx);
@@ -517,22 +515,13 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
 		return -ENOMEM;
 	}
 
-	pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(device,
-					    ISCSI_ISER_SG_TABLESIZE);
-	if (IS_ERR(pi_ctx->prot_frpl)) {
-		isert_err("Failed to allocate prot frpl err=%ld\n",
-			  PTR_ERR(pi_ctx->prot_frpl));
-		ret = PTR_ERR(pi_ctx->prot_frpl);
-		goto err_pi_ctx;
-	}
-
 	pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG,
 				      ISCSI_ISER_SG_TABLESIZE, 0);
 	if (IS_ERR(pi_ctx->prot_mr)) {
 		isert_err("Failed to allocate prot frmr err=%ld\n",
 			  PTR_ERR(pi_ctx->prot_mr));
 		ret = PTR_ERR(pi_ctx->prot_mr);
-		goto err_prot_frpl;
+		goto err_pi_ctx;
 	}
 	desc->ind |= ISERT_PROT_KEY_VALID;
 
@@ -552,8 +541,6 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
 
 err_prot_mr:
 	ib_dereg_mr(pi_ctx->prot_mr);
-err_prot_frpl:
-	ib_free_fast_reg_page_list(pi_ctx->prot_frpl);
 err_pi_ctx:
 	kfree(pi_ctx);
 
@@ -564,34 +551,18 @@ static int
 isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
 		     struct fast_reg_descriptor *fr_desc)
 {
-	int ret;
-
-	fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
-							 ISCSI_ISER_SG_TABLESIZE);
-	if (IS_ERR(fr_desc->data_frpl)) {
-		isert_err("Failed to allocate data frpl err=%ld\n",
-			  PTR_ERR(fr_desc->data_frpl));
-		return PTR_ERR(fr_desc->data_frpl);
-	}
-
 	fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG,
 				       ISCSI_ISER_SG_TABLESIZE, 0);
 	if (IS_ERR(fr_desc->data_mr)) {
 		isert_err("Failed to allocate data frmr err=%ld\n",
 			  PTR_ERR(fr_desc->data_mr));
-		ret = PTR_ERR(fr_desc->data_mr);
-		goto err_data_frpl;
+		return PTR_ERR(fr_desc->data_mr);
 	}
 	fr_desc->ind |= ISERT_DATA_KEY_VALID;
 
 	isert_dbg("Created fr_desc %p\n", fr_desc);
 
 	return 0;
-
-err_data_frpl:
-	ib_free_fast_reg_page_list(fr_desc->data_frpl);
-
-	return ret;
 }
 
 static int
@@ -2521,45 +2492,6 @@ unmap_cmd:
 	return ret;
 }
 
-static int
-isert_map_fr_pagelist(struct ib_device *ib_dev,
-		      struct scatterlist *sg_start, int sg_nents, u64 *fr_pl)
-{
-	u64 start_addr, end_addr, page, chunk_start = 0;
-	struct scatterlist *tmp_sg;
-	int i = 0, new_chunk, last_ent, n_pages;
-
-	n_pages = 0;
-	new_chunk = 1;
-	last_ent = sg_nents - 1;
-	for_each_sg(sg_start, tmp_sg, sg_nents, i) {
-		start_addr = ib_sg_dma_address(ib_dev, tmp_sg);
-		if (new_chunk)
-			chunk_start = start_addr;
-		end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg);
-
-		isert_dbg("SGL[%d] dma_addr: 0x%llx len: %u\n",
-			  i, (unsigned long long)tmp_sg->dma_address,
-			  tmp_sg->length);
-
-		if ((end_addr & ~PAGE_MASK) && i < last_ent) {
-			new_chunk = 0;
-			continue;
-		}
-		new_chunk = 1;
-
-		page = chunk_start & PAGE_MASK;
-		do {
-			fr_pl[n_pages++] = page;
-			isert_dbg("Mapped page_list[%d] page_addr: 0x%llx\n",
-				  n_pages - 1, page);
-			page += PAGE_SIZE;
-		} while (page < end_addr);
-	}
-
-	return n_pages;
-}
-
 static inline void
 isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
 {
@@ -2585,11 +2517,9 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
 	struct isert_device *device = isert_conn->device;
 	struct ib_device *ib_dev = device->ib_device;
 	struct ib_mr *mr;
-	struct ib_fast_reg_page_list *frpl;
 	struct ib_send_wr fr_wr, inv_wr;
 	struct ib_send_wr *bad_wr, *wr = NULL;
-	int ret, pagelist_len;
-	u32 page_off;
+	int ret;
 
 	if (mem->dma_nents == 1) {
 		sge->lkey = device->mr->lkey;
@@ -2600,40 +2530,32 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
 		return 0;
 	}
 
-	if (ind == ISERT_DATA_KEY_VALID) {
+	if (ind == ISERT_DATA_KEY_VALID)
 		/* Registering data buffer */
 		mr = fr_desc->data_mr;
-		frpl = fr_desc->data_frpl;
-	} else {
+	else
 		/* Registering protection buffer */
 		mr = fr_desc->pi_ctx->prot_mr;
-		frpl = fr_desc->pi_ctx->prot_frpl;
-	}
-
-	page_off = mem->offset % PAGE_SIZE;
-
-	isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
-		  fr_desc, mem->nents, mem->offset);
-
-	pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents,
-					     &frpl->page_list[0]);
 
 	if (!(fr_desc->ind & ind)) {
 		isert_inv_rkey(&inv_wr, mr);
 		wr = &inv_wr;
 	}
 
+	ret = ib_map_mr_sg(mr, mem->sg, mem->nents, IB_ACCESS_LOCAL_WRITE);
+	if (ret) {
+		isert_err("failed to map sg %p with %d entries\n",
+			  mem->sg, mem->dma_nents);
+		return ret;
+	}
+
+	isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
+		  fr_desc, mem->nents, mem->offset);
+
 	/* Prepare FASTREG WR */
 	memset(&fr_wr, 0, sizeof(fr_wr));
-	fr_wr.wr_id = ISER_FASTREG_LI_WRID;
-	fr_wr.opcode = IB_WR_FAST_REG_MR;
-	fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off;
-	fr_wr.wr.fast_reg.page_list = frpl;
-	fr_wr.wr.fast_reg.page_list_len = pagelist_len;
-	fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-	fr_wr.wr.fast_reg.length = mem->len;
-	fr_wr.wr.fast_reg.rkey = mr->rkey;
-	fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE;
+	ib_set_fastreg_wr(mr, mr->lkey, ISER_FASTREG_LI_WRID,
+			  false, &fr_wr);
 
 	if (!wr)
 		wr = &fr_wr;
@@ -2648,8 +2570,8 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
 	fr_desc->ind &= ~ind;
 
 	sge->lkey = mr->lkey;
-	sge->addr = frpl->page_list[0] + page_off;
-	sge->length = mem->len;
+	sge->addr = mr->iova;
+	sge->length = mr->length;
 
 	isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n",
 		  sge->addr, sge->length, sge->lkey);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 9ec23a78..a63fc6a 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -84,14 +84,12 @@ enum isert_indicator {
 
 struct pi_context {
 	struct ib_mr		*prot_mr;
-	struct ib_fast_reg_page_list	*prot_frpl;
 	struct ib_mr		*sig_mr;
 };
 
 struct fast_reg_descriptor {
 	struct list_head	list;
 	struct ib_mr		*data_mr;
-	struct ib_fast_reg_page_list	*data_frpl;
 	u8			ind;
 	struct pi_context	*pi_ctx;
 };
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c | 116 ++++++--------------------------
 drivers/infiniband/ulp/isert/ib_isert.h |   2 -
 2 files changed, 19 insertions(+), 99 deletions(-)