Message ID | 1437548143-24893-38-git-send-email-sagig@mellanox.com (mailing list archive) |
---|---|
State | RFC |
Headers | show |
On Jul 22, 2015, at 2:55 AM, Sagi Grimberg <sagig@mellanox.com> wrote: > Signed-off-by: Sagi Grimberg <sagig@mellanox.com> > --- > net/sunrpc/xprtrdma/frwr_ops.c | 80 ++++++++++++++++++++++------------------- > net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++- > 2 files changed, 47 insertions(+), 37 deletions(-) > > diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c > index 517efed..e28246b 100644 > --- a/net/sunrpc/xprtrdma/frwr_ops.c > +++ b/net/sunrpc/xprtrdma/frwr_ops.c > @@ -151,9 +151,13 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, > f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG, depth, 0); > if (IS_ERR(f->fr_mr)) > goto out_mr_err; > - f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); > - if (IS_ERR(f->fr_pgl)) > + > + f->sg = kcalloc(sizeof(*f->sg), depth, GFP_KERNEL); > + if (IS_ERR(f->sg)) > goto out_list_err; > + > + sg_init_table(f->sg, depth); > + > return 0; > > out_mr_err: > @@ -163,7 +167,7 @@ out_mr_err: > return rc; > > out_list_err: > - rc = PTR_ERR(f->fr_pgl); > + rc = -ENOMEM; > dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", > __func__, rc); > ib_dereg_mr(f->fr_mr); > @@ -179,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r) > if (rc) > dprintk("RPC: %s: ib_dereg_mr status %i\n", > __func__, rc); > - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); > + kfree(r->r.frmr.sg); > } > > static int > @@ -320,10 +324,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, > struct ib_send_wr fastreg_wr, *bad_wr; > u8 key; > int len, pageoff; > - int i, rc; > - int seg_len; > - u64 pa; > - int page_no; > + int i, rc, access; > > mw = seg1->rl_mw; > seg1->rl_mw = NULL; > @@ -344,39 +345,46 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, > if (nsegs > ia->ri_max_frmr_depth) > nsegs = ia->ri_max_frmr_depth; > > - for (page_no = i = 0; i < nsegs;) { > - rpcrdma_map_one(device, seg, direction); > - pa = seg->mr_dma; > - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { > - frmr->fr_pgl->page_list[page_no++] = pa; > - pa += PAGE_SIZE; > - } > + for (i = 0; i < nsegs;) { > + sg_set_page(&frmr->sg[i], seg->mr_page, > + seg->mr_len, offset_in_page(seg->mr_offset)); Cautionary note: here we’re dealing with both the “contiguous set of pages” case and the “small region of bytes in a single page” case. See rpcrdma_convert_iovs(): sometimes RPC send or receive buffers can be registered (RDMA_NOMSG). > len += seg->mr_len; > - ++seg; > ++i; > - /* Check for holes */ > + ++seg; > + > + /* Check for holes - needed?? */ > if ((i < nsegs && offset_in_page(seg->mr_offset)) || > offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) > break; > } > + > + frmr->sg_nents = i; > + frmr->dma_nents = ib_dma_map_sg(device, frmr->sg, > + frmr->sg_nents, direction); > + if (!frmr->dma_nents) { > + pr_err("RPC: %s: failed to dma map sg %p sg_nents %d\n", > + __func__, frmr->sg, frmr->sg_nents); > + return -ENOMEM; > + } > + > dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", > __func__, mw, i, len); > > - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); > - fastreg_wr.wr_id = (unsigned long)(void *)mw; > - fastreg_wr.opcode = IB_WR_FAST_REG_MR; > - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; > - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; > - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; > - fastreg_wr.wr.fast_reg.page_list_len = page_no; > - fastreg_wr.wr.fast_reg.length = len; > - fastreg_wr.wr.fast_reg.access_flags = writing ? > - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : > - IB_ACCESS_REMOTE_READ; > mr = frmr->fr_mr; > + access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : > + IB_ACCESS_REMOTE_READ; > + rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access); I like this (and the matching ib_dma_unmap_sg). But why wouldn’t this function be called ib_dma_map_sg() ? The name ib_map_mr_sg() had me thinking for a moment that this API actually posted the FASTREG WR, but I see that it doesn’t. > + if (rc) { > + pr_err("RPC: %s: failed to map mr %p rc %d\n", > + __func__, frmr->fr_mr, rc); > + return rc; > + } > + > key = (u8)(mr->rkey & 0x000000FF); > ib_update_fast_reg_key(mr, ++key); > - fastreg_wr.wr.fast_reg.rkey = mr->rkey; > + > + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); > + ib_set_fastreg_wr(mr, mr->rkey, (uintptr_t)mw, false, &fastreg_wr); > > DECR_CQCOUNT(&r_xprt->rx_ep); > rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); > @@ -385,15 +393,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, > > seg1->rl_mw = mw; > seg1->mr_rkey = mr->rkey; > - seg1->mr_base = seg1->mr_dma + pageoff; > + seg1->mr_base = mr->iova; > seg1->mr_nsegs = i; > seg1->mr_len = len; > return i; > > out_senderr: > dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); > - while (i--) > - rpcrdma_unmap_one(device, --seg); > + ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, direction); > __frwr_queue_recovery(mw); > return rc; > } > @@ -407,22 +414,23 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) > struct rpcrdma_mr_seg *seg1 = seg; > struct rpcrdma_ia *ia = &r_xprt->rx_ia; > struct rpcrdma_mw *mw = seg1->rl_mw; > + struct rpcrdma_frmr *frmr = &mw->r.frmr; > struct ib_send_wr invalidate_wr, *bad_wr; > int rc, nsegs = seg->mr_nsegs; > > dprintk("RPC: %s: FRMR %p\n", __func__, mw); > > seg1->rl_mw = NULL; > - mw->r.frmr.fr_state = FRMR_IS_INVALID; > + frmr->fr_state = FRMR_IS_INVALID; > > memset(&invalidate_wr, 0, sizeof(invalidate_wr)); > invalidate_wr.wr_id = (unsigned long)(void *)mw; > invalidate_wr.opcode = IB_WR_LOCAL_INV; > - invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey; > + invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; > DECR_CQCOUNT(&r_xprt->rx_ep); > > - while (seg1->mr_nsegs--) > - rpcrdma_unmap_one(ia->ri_device, seg++); > + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction” in the rpcrdma_frmr. > + > read_lock(&ia->ri_qplock); > rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); > read_unlock(&ia->ri_qplock); > diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h > index 886f8c8..a1c3ab2b 100644 > --- a/net/sunrpc/xprtrdma/xprt_rdma.h > +++ b/net/sunrpc/xprtrdma/xprt_rdma.h > @@ -195,7 +195,9 @@ enum rpcrdma_frmr_state { > }; > > struct rpcrdma_frmr { > - struct ib_fast_reg_page_list *fr_pgl; > + struct scatterlist *sg; > + unsigned int sg_nents; > + unsigned int dma_nents; > struct ib_mr *fr_mr; > enum rpcrdma_frmr_state fr_state; > struct work_struct fr_work; -- Chuck Lever -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
>> + for (i = 0; i < nsegs;) { >> + sg_set_page(&frmr->sg[i], seg->mr_page, >> + seg->mr_len, offset_in_page(seg->mr_offset)); > > Cautionary note: here we’re dealing with both the “contiguous > set of pages” case and the “small region of bytes in a single page” > case. See rpcrdma_convert_iovs(): sometimes RPC send or receive > buffers can be registered (RDMA_NOMSG). I noticed that (I think). I think this is handled correctly. What exactly is the caution note here? >> mr = frmr->fr_mr; >> + access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : >> + IB_ACCESS_REMOTE_READ; >> + rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access); > > I like this (and the matching ib_dma_unmap_sg). But why wouldn’t > this function be called ib_dma_map_sg() ? The name ib_map_mr_sg() > had me thinking for a moment that this API actually posted the > FASTREG WR, but I see that it doesn’t. Umm, ib_dma_map_sg is already taken :) This is what I came up with, it maps the SG elements to the MR private context. I'd like to keep the post API for now. It will be possible to to add a wrapper function that would do: - dma_map_sg - ib_map_mr_sg - init fastreg send_wr - post_send (maybe) >> - while (seg1->mr_nsegs--) >> - rpcrdma_unmap_one(ia->ri_device, seg++); >> + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); > > ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced > with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction” > in the rpcrdma_frmr. Yep, that's correct, if I had turned on dma mapping debug it would shout at me here... Note, I added in the git repo a patch to allow arbitrary sg lists in frwr_op_map() which would allow you to skip the holes check... seems to work with mlx5... I did noticed the mlx4 gives a protection error with after the conversion... I'll look into that... -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Jul 22, 2015, at 11:41 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote: > >>> + for (i = 0; i < nsegs;) { >>> + sg_set_page(&frmr->sg[i], seg->mr_page, >>> + seg->mr_len, offset_in_page(seg->mr_offset)); >> >> Cautionary note: here we’re dealing with both the “contiguous >> set of pages” case and the “small region of bytes in a single page” >> case. See rpcrdma_convert_iovs(): sometimes RPC send or receive >> buffers can be registered (RDMA_NOMSG). > > I noticed that (I think). I think this is handled correctly. > What exactly is the caution note here? Well the sg is turned into a page list below your API. Just want to make sure that we have tested your xprtrdma alterations with all the ULP possibilities. When you are further along I can pull this and run my functional tests. >>> mr = frmr->fr_mr; >>> + access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : >>> + IB_ACCESS_REMOTE_READ; >>> + rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access); >> >> I like this (and the matching ib_dma_unmap_sg). But why wouldn’t >> this function be called ib_dma_map_sg() ? The name ib_map_mr_sg() >> had me thinking for a moment that this API actually posted the >> FASTREG WR, but I see that it doesn’t. > > Umm, ib_dma_map_sg is already taken :) > > This is what I came up with, it maps the SG elements to the MR > private context. > > I'd like to keep the post API for now. It will be possible to > to add a wrapper function that would do: > - dma_map_sg > - ib_map_mr_sg > - init fastreg send_wr > - post_send (maybe) Where xprtrdma might improve is by setting up all the FASTREG WRs for one RPC with a single chain and post_send. We could do that with your INDIR_MR concept, for example. >>> - while (seg1->mr_nsegs--) >>> - rpcrdma_unmap_one(ia->ri_device, seg++); >>> + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); >> >> ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced >> with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction” >> in the rpcrdma_frmr. > > Yep, that's correct, if I had turned on dma mapping debug it would shout > at me here... > > Note, I added in the git repo a patch to allow arbitrary sg lists in > frwr_op_map() which would allow you to skip the holes check... seems to > work with mlx5... > > I did noticed the mlx4 gives a protection error with after the conversion... I'll look into that... Should also get Steve and Devesh to try this with their adapters. -- Chuck Lever -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jul 22, 2015 at 11:03:49AM -0400, Chuck Lever wrote: > I like this (and the matching ib_dma_unmap_sg). But why wouldn?t > this function be called ib_dma_map_sg() ? The name ib_map_mr_sg() > had me thinking for a moment that this API actually posted the > FASTREG WR, but I see that it doesn?t. We already have a ib_dma_map_sg, which is a wrapper around dma_map_sg that allows ehc ipath amd qib to do naughty things instead of the regular dma mapping. But it seems maybe the dma_map_sg calls or the magic for those other drivers should be folded into Sagi's new API as those HCA apparently don't need physical addresses and thus the S/G list. God knows what's they're doing with a list of virtual addresses, but removing the struct scatterlist abuse there would be highly welcome. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/22/2015 1:55 AM, Sagi Grimberg wrote: > Signed-off-by: Sagi Grimberg <sagig@mellanox.com> > --- > net/sunrpc/xprtrdma/frwr_ops.c | 80 ++++++++++++++++++++++------------------- > net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++- > 2 files changed, 47 insertions(+), 37 deletions(-) Did you intend to change svcrdma as well? > diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c > index 517efed..e28246b 100644 > --- a/net/sunrpc/xprtrdma/frwr_ops.c > +++ b/net/sunrpc/xprtrdma/frwr_ops.c > @@ -151,9 +151,13 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, > f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG, depth, 0); > if (IS_ERR(f->fr_mr)) > goto out_mr_err; > - f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); > - if (IS_ERR(f->fr_pgl)) > + > + f->sg = kcalloc(sizeof(*f->sg), depth, GFP_KERNEL); > + if (IS_ERR(f->sg)) > goto out_list_err; > + > + sg_init_table(f->sg, depth); > + > return 0; > > out_mr_err: > @@ -163,7 +167,7 @@ out_mr_err: > return rc; > > out_list_err: > - rc = PTR_ERR(f->fr_pgl); > + rc = -ENOMEM; > dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", > __func__, rc); > ib_dereg_mr(f->fr_mr); > @@ -179,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r) > if (rc) > dprintk("RPC: %s: ib_dereg_mr status %i\n", > __func__, rc); > - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); > + kfree(r->r.frmr.sg); > } > > static int > @@ -320,10 +324,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, > struct ib_send_wr fastreg_wr, *bad_wr; > u8 key; > int len, pageoff; > - int i, rc; > - int seg_len; > - u64 pa; > - int page_no; > + int i, rc, access; > > mw = seg1->rl_mw; > seg1->rl_mw = NULL; > @@ -344,39 +345,46 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, > if (nsegs > ia->ri_max_frmr_depth) > nsegs = ia->ri_max_frmr_depth; > > - for (page_no = i = 0; i < nsegs;) { > - rpcrdma_map_one(device, seg, direction); > - pa = seg->mr_dma; > - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { > - frmr->fr_pgl->page_list[page_no++] = pa; > - pa += PAGE_SIZE; > - } > + for (i = 0; i < nsegs;) { > + sg_set_page(&frmr->sg[i], seg->mr_page, > + seg->mr_len, offset_in_page(seg->mr_offset)); > len += seg->mr_len; > - ++seg; > ++i; > - /* Check for holes */ > + ++seg; > + > + /* Check for holes - needed?? */ > if ((i < nsegs && offset_in_page(seg->mr_offset)) || > offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) > break; > } > + > + frmr->sg_nents = i; > + frmr->dma_nents = ib_dma_map_sg(device, frmr->sg, > + frmr->sg_nents, direction); > + if (!frmr->dma_nents) { > + pr_err("RPC: %s: failed to dma map sg %p sg_nents %d\n", > + __func__, frmr->sg, frmr->sg_nents); > + return -ENOMEM; > + } > + > dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", > __func__, mw, i, len); > > - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); > - fastreg_wr.wr_id = (unsigned long)(void *)mw; > - fastreg_wr.opcode = IB_WR_FAST_REG_MR; > - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; > - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; > - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; > - fastreg_wr.wr.fast_reg.page_list_len = page_no; > - fastreg_wr.wr.fast_reg.length = len; > - fastreg_wr.wr.fast_reg.access_flags = writing ? > - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : > - IB_ACCESS_REMOTE_READ; > mr = frmr->fr_mr; > + access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : > + IB_ACCESS_REMOTE_READ; > + rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access); > + if (rc) { > + pr_err("RPC: %s: failed to map mr %p rc %d\n", > + __func__, frmr->fr_mr, rc); > + return rc; > + } > + > key = (u8)(mr->rkey & 0x000000FF); > ib_update_fast_reg_key(mr, ++key); > - fastreg_wr.wr.fast_reg.rkey = mr->rkey; > + > + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); > + ib_set_fastreg_wr(mr, mr->rkey, (uintptr_t)mw, false, &fastreg_wr); > > DECR_CQCOUNT(&r_xprt->rx_ep); > rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); > @@ -385,15 +393,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, > > seg1->rl_mw = mw; > seg1->mr_rkey = mr->rkey; > - seg1->mr_base = seg1->mr_dma + pageoff; > + seg1->mr_base = mr->iova; > seg1->mr_nsegs = i; > seg1->mr_len = len; > return i; > > out_senderr: > dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); > - while (i--) > - rpcrdma_unmap_one(device, --seg); > + ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, direction); > __frwr_queue_recovery(mw); > return rc; > } > @@ -407,22 +414,23 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) > struct rpcrdma_mr_seg *seg1 = seg; > struct rpcrdma_ia *ia = &r_xprt->rx_ia; > struct rpcrdma_mw *mw = seg1->rl_mw; > + struct rpcrdma_frmr *frmr = &mw->r.frmr; > struct ib_send_wr invalidate_wr, *bad_wr; > int rc, nsegs = seg->mr_nsegs; > > dprintk("RPC: %s: FRMR %p\n", __func__, mw); > > seg1->rl_mw = NULL; > - mw->r.frmr.fr_state = FRMR_IS_INVALID; > + frmr->fr_state = FRMR_IS_INVALID; > > memset(&invalidate_wr, 0, sizeof(invalidate_wr)); > invalidate_wr.wr_id = (unsigned long)(void *)mw; > invalidate_wr.opcode = IB_WR_LOCAL_INV; > - invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey; > + invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; > DECR_CQCOUNT(&r_xprt->rx_ep); > > - while (seg1->mr_nsegs--) > - rpcrdma_unmap_one(ia->ri_device, seg++); > + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); > + > read_lock(&ia->ri_qplock); > rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); > read_unlock(&ia->ri_qplock); > diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h > index 886f8c8..a1c3ab2b 100644 > --- a/net/sunrpc/xprtrdma/xprt_rdma.h > +++ b/net/sunrpc/xprtrdma/xprt_rdma.h > @@ -195,7 +195,9 @@ enum rpcrdma_frmr_state { > }; > > struct rpcrdma_frmr { > - struct ib_fast_reg_page_list *fr_pgl; > + struct scatterlist *sg; > + unsigned int sg_nents; > + unsigned int dma_nents; > struct ib_mr *fr_mr; > enum rpcrdma_frmr_state fr_state; > struct work_struct fr_work; -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/22/2015 10:21 PM, Steve Wise wrote: > > On 7/22/2015 1:55 AM, Sagi Grimberg wrote: >> Signed-off-by: Sagi Grimberg <sagig@mellanox.com> >> --- >> net/sunrpc/xprtrdma/frwr_ops.c | 80 >> ++++++++++++++++++++++------------------- >> net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++- >> 2 files changed, 47 insertions(+), 37 deletions(-) > > Did you intend to change svcrdma as well? All the ULPs need to convert. I didn't have a chance to convert svcrdma yet. Want to take it? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/22/2015 7:04 PM, Chuck Lever wrote: > > On Jul 22, 2015, at 11:41 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote: > >> >>>> + for (i = 0; i < nsegs;) { >>>> + sg_set_page(&frmr->sg[i], seg->mr_page, >>>> + seg->mr_len, offset_in_page(seg->mr_offset)); >>> >>> Cautionary note: here we’re dealing with both the “contiguous >>> set of pages” case and the “small region of bytes in a single page” >>> case. See rpcrdma_convert_iovs(): sometimes RPC send or receive >>> buffers can be registered (RDMA_NOMSG). >> >> I noticed that (I think). I think this is handled correctly. >> What exactly is the caution note here? > > Well the sg is turned into a page list below your API. Just > want to make sure that we have tested your xprtrdma alterations > with all the ULP possibilities. When you are further along I > can pull this and run my functional tests. > > >>>> mr = frmr->fr_mr; >>>> + access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : >>>> + IB_ACCESS_REMOTE_READ; >>>> + rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access); >>> >>> I like this (and the matching ib_dma_unmap_sg). But why wouldn’t >>> this function be called ib_dma_map_sg() ? The name ib_map_mr_sg() >>> had me thinking for a moment that this API actually posted the >>> FASTREG WR, but I see that it doesn’t. >> >> Umm, ib_dma_map_sg is already taken :) >> >> This is what I came up with, it maps the SG elements to the MR >> private context. >> >> I'd like to keep the post API for now. It will be possible to >> to add a wrapper function that would do: >> - dma_map_sg >> - ib_map_mr_sg >> - init fastreg send_wr >> - post_send (maybe) > > Where xprtrdma might improve is by setting up all the FASTREG > WRs for one RPC with a single chain and post_send. We could do > that with your INDIR_MR concept, for example. BTW, it would be great if you can play with it a little bit. I'm more confident with the iSER part... I added two small fixes when I tested with mlx4. It seems to work... > > >>>> - while (seg1->mr_nsegs--) >>>> - rpcrdma_unmap_one(ia->ri_device, seg++); >>>> + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); >>> >>> ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced >>> with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction” >>> in the rpcrdma_frmr. >> >> Yep, that's correct, if I had turned on dma mapping debug it would shout >> at me here... >> >> Note, I added in the git repo a patch to allow arbitrary sg lists in >> frwr_op_map() which would allow you to skip the holes check... seems to >> work with mlx5... >> >> I did noticed the mlx4 gives a protection error with after the conversion... I'll look into that... > > Should also get Steve and Devesh to try this with their adapters. Ah, yes please. I've only compiled tested drivers other than mlx4, mlx5 which means there is a 99.9% (probably 100%) that it doesn't work. It would be great to get help on porting the rest of the ULPs as well, but that can wait until we converge on the API... -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Sagi Grimberg [mailto:sagig@dev.mellanox.co.il] > Sent: Thursday, July 23, 2015 5:21 AM > To: Steve Wise; Sagi Grimberg; linux-rdma@vger.kernel.org > Cc: Liran Liss; Oren Duer > Subject: Re: [PATCH WIP 37/43] xprtrdma: Port to new memory registration API > > On 7/22/2015 10:21 PM, Steve Wise wrote: > > > > On 7/22/2015 1:55 AM, Sagi Grimberg wrote: > >> Signed-off-by: Sagi Grimberg <sagig@mellanox.com> > >> --- > >> net/sunrpc/xprtrdma/frwr_ops.c | 80 > >> ++++++++++++++++++++++------------------- > >> net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++- > >> 2 files changed, 47 insertions(+), 37 deletions(-) > > > > Did you intend to change svcrdma as well? > > All the ULPs need to convert. I didn't have a chance to convert > svcrdma yet. Want to take it? Not right now. My focus is still on enabling iSER. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 517efed..e28246b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -151,9 +151,13 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG, depth, 0); if (IS_ERR(f->fr_mr)) goto out_mr_err; - f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); - if (IS_ERR(f->fr_pgl)) + + f->sg = kcalloc(sizeof(*f->sg), depth, GFP_KERNEL); + if (IS_ERR(f->sg)) goto out_list_err; + + sg_init_table(f->sg, depth); + return 0; out_mr_err: @@ -163,7 +167,7 @@ out_mr_err: return rc; out_list_err: - rc = PTR_ERR(f->fr_pgl); + rc = -ENOMEM; dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", __func__, rc); ib_dereg_mr(f->fr_mr); @@ -179,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r) if (rc) dprintk("RPC: %s: ib_dereg_mr status %i\n", __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + kfree(r->r.frmr.sg); } static int @@ -320,10 +324,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct ib_send_wr fastreg_wr, *bad_wr; u8 key; int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; + int i, rc, access; mw = seg1->rl_mw; seg1->rl_mw = NULL; @@ -344,39 +345,46 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < nsegs;) { - rpcrdma_map_one(device, seg, direction); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } + for (i = 0; i < nsegs;) { + sg_set_page(&frmr->sg[i], seg->mr_page, + seg->mr_len, offset_in_page(seg->mr_offset)); len += seg->mr_len; - ++seg; ++i; - /* Check for holes */ + ++seg; + + /* Check for holes - needed?? */ if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } + + frmr->sg_nents = i; + frmr->dma_nents = ib_dma_map_sg(device, frmr->sg, + frmr->sg_nents, direction); + if (!frmr->dma_nents) { + pr_err("RPC: %s: failed to dma map sg %p sg_nents %d\n", + __func__, frmr->sg, frmr->sg_nents); + return -ENOMEM; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", __func__, mw, i, len); - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.length = len; - fastreg_wr.wr.fast_reg.access_flags = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; mr = frmr->fr_mr; + access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access); + if (rc) { + pr_err("RPC: %s: failed to map mr %p rc %d\n", + __func__, frmr->fr_mr, rc); + return rc; + } + key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + ib_set_fastreg_wr(mr, mr->rkey, (uintptr_t)mw, false, &fastreg_wr); DECR_CQCOUNT(&r_xprt->rx_ep); rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); @@ -385,15 +393,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, seg1->rl_mw = mw; seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_base = mr->iova; seg1->mr_nsegs = i; seg1->mr_len = len; return i; out_senderr: dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - while (i--) - rpcrdma_unmap_one(device, --seg); + ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, direction); __frwr_queue_recovery(mw); return rc; } @@ -407,22 +414,23 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; struct ib_send_wr invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; dprintk("RPC: %s: FRMR %p\n", __func__, mw); seg1->rl_mw = NULL; - mw->r.frmr.fr_state = FRMR_IS_INVALID; + frmr->fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, sizeof(invalidate_wr)); invalidate_wr.wr_id = (unsigned long)(void *)mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey; + invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia->ri_device, seg++); + ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); + read_lock(&ia->ri_qplock); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 886f8c8..a1c3ab2b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -195,7 +195,9 @@ enum rpcrdma_frmr_state { }; struct rpcrdma_frmr { - struct ib_fast_reg_page_list *fr_pgl; + struct scatterlist *sg; + unsigned int sg_nents; + unsigned int dma_nents; struct ib_mr *fr_mr; enum rpcrdma_frmr_state fr_state; struct work_struct fr_work;
Signed-off-by: Sagi Grimberg <sagig@mellanox.com> --- net/sunrpc/xprtrdma/frwr_ops.c | 80 ++++++++++++++++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++- 2 files changed, 47 insertions(+), 37 deletions(-)