diff mbox

[WIP,37/43] xprtrdma: Port to new memory registration API

Message ID 1437548143-24893-38-git-send-email-sagig@mellanox.com (mailing list archive)
State RFC
Headers show

Commit Message

Sagi Grimberg July 22, 2015, 6:55 a.m. UTC
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  | 80 ++++++++++++++++++++++-------------------
 net/sunrpc/xprtrdma/xprt_rdma.h |  4 ++-
 2 files changed, 47 insertions(+), 37 deletions(-)

Comments

Chuck Lever III July 22, 2015, 3:03 p.m. UTC | #1
On Jul 22, 2015, at 2:55 AM, Sagi Grimberg <sagig@mellanox.com> wrote:

> Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
> ---
> net/sunrpc/xprtrdma/frwr_ops.c  | 80 ++++++++++++++++++++++-------------------
> net/sunrpc/xprtrdma/xprt_rdma.h |  4 ++-
> 2 files changed, 47 insertions(+), 37 deletions(-)
> 
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index 517efed..e28246b 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -151,9 +151,13 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
> 	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG, depth, 0);
> 	if (IS_ERR(f->fr_mr))
> 		goto out_mr_err;
> -	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
> -	if (IS_ERR(f->fr_pgl))
> +
> +	f->sg = kcalloc(sizeof(*f->sg), depth, GFP_KERNEL);
> +	if (IS_ERR(f->sg))
> 		goto out_list_err;
> +
> +	sg_init_table(f->sg, depth);
> +
> 	return 0;
> 
> out_mr_err:
> @@ -163,7 +167,7 @@ out_mr_err:
> 	return rc;
> 
> out_list_err:
> -	rc = PTR_ERR(f->fr_pgl);
> +	rc = -ENOMEM;
> 	dprintk("RPC:       %s: ib_alloc_fast_reg_page_list status %i\n",
> 		__func__, rc);
> 	ib_dereg_mr(f->fr_mr);
> @@ -179,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r)
> 	if (rc)
> 		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
> 			__func__, rc);
> -	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
> +	kfree(r->r.frmr.sg);
> }
> 
> static int
> @@ -320,10 +324,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
> 	struct ib_send_wr fastreg_wr, *bad_wr;
> 	u8 key;
> 	int len, pageoff;
> -	int i, rc;
> -	int seg_len;
> -	u64 pa;
> -	int page_no;
> +	int i, rc, access;
> 
> 	mw = seg1->rl_mw;
> 	seg1->rl_mw = NULL;
> @@ -344,39 +345,46 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
> 	if (nsegs > ia->ri_max_frmr_depth)
> 		nsegs = ia->ri_max_frmr_depth;
> 
> -	for (page_no = i = 0; i < nsegs;) {
> -		rpcrdma_map_one(device, seg, direction);
> -		pa = seg->mr_dma;
> -		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
> -			frmr->fr_pgl->page_list[page_no++] = pa;
> -			pa += PAGE_SIZE;
> -		}
> +	for (i = 0; i < nsegs;) {
> +		sg_set_page(&frmr->sg[i], seg->mr_page,
> +			    seg->mr_len, offset_in_page(seg->mr_offset));

Cautionary note: here we’re dealing with both the “contiguous
set of pages” case and the “small region of bytes in a single page”
case. See rpcrdma_convert_iovs(): sometimes RPC send or receive
buffers can be registered (RDMA_NOMSG).


> 		len += seg->mr_len;
> -		++seg;
> 		++i;
> -		/* Check for holes */
> +		++seg;
> +
> +		/* Check for holes - needed?? */
> 		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
> 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
> 			break;
> 	}
> +
> +	frmr->sg_nents = i;
> +	frmr->dma_nents = ib_dma_map_sg(device, frmr->sg,
> +					frmr->sg_nents, direction);
> +	if (!frmr->dma_nents) {
> +		pr_err("RPC:       %s: failed to dma map sg %p sg_nents %d\n",
> +			__func__, frmr->sg, frmr->sg_nents);
> +		return -ENOMEM;
> +	}
> +
> 	dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
> 		__func__, mw, i, len);
> 
> -	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
> -	fastreg_wr.wr_id = (unsigned long)(void *)mw;
> -	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
> -	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
> -	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
> -	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
> -	fastreg_wr.wr.fast_reg.page_list_len = page_no;
> -	fastreg_wr.wr.fast_reg.length = len;
> -	fastreg_wr.wr.fast_reg.access_flags = writing ?
> -				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
> -				IB_ACCESS_REMOTE_READ;
> 	mr = frmr->fr_mr;
> +	access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
> +			   IB_ACCESS_REMOTE_READ;
> +	rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access);

I like this (and the matching ib_dma_unmap_sg). But why wouldn’t
this function be called ib_dma_map_sg() ? The name ib_map_mr_sg()
had me thinking for a moment that this API actually posted the
FASTREG WR, but I see that it doesn’t.


> +	if (rc) {
> +		pr_err("RPC:       %s: failed to map mr %p rc %d\n",
> +			__func__, frmr->fr_mr, rc);
> +		return rc;
> +	}
> +
> 	key = (u8)(mr->rkey & 0x000000FF);
> 	ib_update_fast_reg_key(mr, ++key);
> -	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
> +
> +	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
> +	ib_set_fastreg_wr(mr, mr->rkey, (uintptr_t)mw, false, &fastreg_wr);
> 
> 	DECR_CQCOUNT(&r_xprt->rx_ep);
> 	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
> @@ -385,15 +393,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
> 
> 	seg1->rl_mw = mw;
> 	seg1->mr_rkey = mr->rkey;
> -	seg1->mr_base = seg1->mr_dma + pageoff;
> +	seg1->mr_base = mr->iova;
> 	seg1->mr_nsegs = i;
> 	seg1->mr_len = len;
> 	return i;
> 
> out_senderr:
> 	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
> -	while (i--)
> -		rpcrdma_unmap_one(device, --seg);
> +	ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, direction);
> 	__frwr_queue_recovery(mw);
> 	return rc;
> }
> @@ -407,22 +414,23 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
> 	struct rpcrdma_mr_seg *seg1 = seg;
> 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
> 	struct rpcrdma_mw *mw = seg1->rl_mw;
> +	struct rpcrdma_frmr *frmr = &mw->r.frmr;
> 	struct ib_send_wr invalidate_wr, *bad_wr;
> 	int rc, nsegs = seg->mr_nsegs;
> 
> 	dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
> 
> 	seg1->rl_mw = NULL;
> -	mw->r.frmr.fr_state = FRMR_IS_INVALID;
> +	frmr->fr_state = FRMR_IS_INVALID;
> 
> 	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
> 	invalidate_wr.wr_id = (unsigned long)(void *)mw;
> 	invalidate_wr.opcode = IB_WR_LOCAL_INV;
> -	invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
> +	invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
> 	DECR_CQCOUNT(&r_xprt->rx_ep);
> 
> -	while (seg1->mr_nsegs--)
> -		rpcrdma_unmap_one(ia->ri_device, seg++);
> +	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);

->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced
with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction”
in the rpcrdma_frmr.


> +
> 	read_lock(&ia->ri_qplock);
> 	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
> 	read_unlock(&ia->ri_qplock);
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index 886f8c8..a1c3ab2b 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -195,7 +195,9 @@ enum rpcrdma_frmr_state {
> };
> 
> struct rpcrdma_frmr {
> -	struct ib_fast_reg_page_list	*fr_pgl;
> +	struct scatterlist		*sg;
> +	unsigned int			sg_nents;
> +	unsigned int			dma_nents;
> 	struct ib_mr			*fr_mr;
> 	enum rpcrdma_frmr_state		fr_state;
> 	struct work_struct		fr_work;

--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg July 22, 2015, 3:41 p.m. UTC | #2
>> +	for (i = 0; i < nsegs;) {
>> +		sg_set_page(&frmr->sg[i], seg->mr_page,
>> +			    seg->mr_len, offset_in_page(seg->mr_offset));
>
> Cautionary note: here we’re dealing with both the “contiguous
> set of pages” case and the “small region of bytes in a single page”
> case. See rpcrdma_convert_iovs(): sometimes RPC send or receive
> buffers can be registered (RDMA_NOMSG).

I noticed that (I think). I think this is handled correctly.
What exactly is the caution note here?

>> 	mr = frmr->fr_mr;
>> +	access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
>> +			   IB_ACCESS_REMOTE_READ;
>> +	rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access);
>
> I like this (and the matching ib_dma_unmap_sg). But why wouldn’t
> this function be called ib_dma_map_sg() ? The name ib_map_mr_sg()
> had me thinking for a moment that this API actually posted the
> FASTREG WR, but I see that it doesn’t.

Umm, ib_dma_map_sg is already taken :)

This is what I came up with, it maps the SG elements to the MR
private context.

I'd like to keep the post API for now. It will be possible to
to add a wrapper function that would do:
- dma_map_sg
- ib_map_mr_sg
- init fastreg send_wr
- post_send (maybe)


>> -	while (seg1->mr_nsegs--)
>> -		rpcrdma_unmap_one(ia->ri_device, seg++);
>> +	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
>
> ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced
> with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction”
> in the rpcrdma_frmr.

Yep, that's correct, if I had turned on dma mapping debug it would shout
at me here...

Note, I added in the git repo a patch to allow arbitrary sg lists in
frwr_op_map() which would allow you to skip the holes check... seems to
work with mlx5...

I did noticed the mlx4 gives a protection error with after the 
conversion... I'll look into that...
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III July 22, 2015, 4:04 p.m. UTC | #3
On Jul 22, 2015, at 11:41 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:

> 
>>> +	for (i = 0; i < nsegs;) {
>>> +		sg_set_page(&frmr->sg[i], seg->mr_page,
>>> +			    seg->mr_len, offset_in_page(seg->mr_offset));
>> 
>> Cautionary note: here we’re dealing with both the “contiguous
>> set of pages” case and the “small region of bytes in a single page”
>> case. See rpcrdma_convert_iovs(): sometimes RPC send or receive
>> buffers can be registered (RDMA_NOMSG).
> 
> I noticed that (I think). I think this is handled correctly.
> What exactly is the caution note here?

Well the sg is turned into a page list below your API. Just
want to make sure that we have tested your xprtrdma alterations
with all the ULP possibilities. When you are further along I
can pull this and run my functional tests.


>>> 	mr = frmr->fr_mr;
>>> +	access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
>>> +			   IB_ACCESS_REMOTE_READ;
>>> +	rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access);
>> 
>> I like this (and the matching ib_dma_unmap_sg). But why wouldn’t
>> this function be called ib_dma_map_sg() ? The name ib_map_mr_sg()
>> had me thinking for a moment that this API actually posted the
>> FASTREG WR, but I see that it doesn’t.
> 
> Umm, ib_dma_map_sg is already taken :)
> 
> This is what I came up with, it maps the SG elements to the MR
> private context.
> 
> I'd like to keep the post API for now. It will be possible to
> to add a wrapper function that would do:
> - dma_map_sg
> - ib_map_mr_sg
> - init fastreg send_wr
> - post_send (maybe)

Where xprtrdma might improve is by setting up all the FASTREG
WRs for one RPC with a single chain and post_send. We could do
that with your INDIR_MR concept, for example.


>>> -	while (seg1->mr_nsegs--)
>>> -		rpcrdma_unmap_one(ia->ri_device, seg++);
>>> +	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
>> 
>> ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced
>> with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction”
>> in the rpcrdma_frmr.
> 
> Yep, that's correct, if I had turned on dma mapping debug it would shout
> at me here...
> 
> Note, I added in the git repo a patch to allow arbitrary sg lists in
> frwr_op_map() which would allow you to skip the holes check... seems to
> work with mlx5...
> 
> I did noticed the mlx4 gives a protection error with after the conversion... I'll look into that...

Should also get Steve and Devesh to try this with their adapters.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig July 22, 2015, 4:59 p.m. UTC | #4
On Wed, Jul 22, 2015 at 11:03:49AM -0400, Chuck Lever wrote:
> I like this (and the matching ib_dma_unmap_sg). But why wouldn?t
> this function be called ib_dma_map_sg() ? The name ib_map_mr_sg()
> had me thinking for a moment that this API actually posted the
> FASTREG WR, but I see that it doesn?t.

We already have a ib_dma_map_sg, which is a wrapper around dma_map_sg
that allows ehc ipath amd qib to do naughty things instead of the
regular dma mapping.

But it seems maybe the dma_map_sg calls or the magic for those other
drivers should be folded into Sagi's new API as those HCA apparently
don't need physical addresses and thus the S/G list.

God knows what's they're doing with a list of virtual addresses, but
removing the struct scatterlist abuse there would be highly welcome.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 22, 2015, 7:21 p.m. UTC | #5
On 7/22/2015 1:55 AM, Sagi Grimberg wrote:
> Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
> ---
>   net/sunrpc/xprtrdma/frwr_ops.c  | 80 ++++++++++++++++++++++-------------------
>   net/sunrpc/xprtrdma/xprt_rdma.h |  4 ++-
>   2 files changed, 47 insertions(+), 37 deletions(-)

Did you intend to change svcrdma as well?

> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index 517efed..e28246b 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -151,9 +151,13 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
>   	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG, depth, 0);
>   	if (IS_ERR(f->fr_mr))
>   		goto out_mr_err;
> -	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
> -	if (IS_ERR(f->fr_pgl))
> +
> +	f->sg = kcalloc(sizeof(*f->sg), depth, GFP_KERNEL);
> +	if (IS_ERR(f->sg))
>   		goto out_list_err;
> +
> +	sg_init_table(f->sg, depth);
> +
>   	return 0;
>   
>   out_mr_err:
> @@ -163,7 +167,7 @@ out_mr_err:
>   	return rc;
>   
>   out_list_err:
> -	rc = PTR_ERR(f->fr_pgl);
> +	rc = -ENOMEM;
>   	dprintk("RPC:       %s: ib_alloc_fast_reg_page_list status %i\n",
>   		__func__, rc);
>   	ib_dereg_mr(f->fr_mr);
> @@ -179,7 +183,7 @@ __frwr_release(struct rpcrdma_mw *r)
>   	if (rc)
>   		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
>   			__func__, rc);
> -	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
> +	kfree(r->r.frmr.sg);
>   }
>   
>   static int
> @@ -320,10 +324,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>   	struct ib_send_wr fastreg_wr, *bad_wr;
>   	u8 key;
>   	int len, pageoff;
> -	int i, rc;
> -	int seg_len;
> -	u64 pa;
> -	int page_no;
> +	int i, rc, access;
>   
>   	mw = seg1->rl_mw;
>   	seg1->rl_mw = NULL;
> @@ -344,39 +345,46 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>   	if (nsegs > ia->ri_max_frmr_depth)
>   		nsegs = ia->ri_max_frmr_depth;
>   
> -	for (page_no = i = 0; i < nsegs;) {
> -		rpcrdma_map_one(device, seg, direction);
> -		pa = seg->mr_dma;
> -		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
> -			frmr->fr_pgl->page_list[page_no++] = pa;
> -			pa += PAGE_SIZE;
> -		}
> +	for (i = 0; i < nsegs;) {
> +		sg_set_page(&frmr->sg[i], seg->mr_page,
> +			    seg->mr_len, offset_in_page(seg->mr_offset));
>   		len += seg->mr_len;
> -		++seg;
>   		++i;
> -		/* Check for holes */
> +		++seg;
> +
> +		/* Check for holes - needed?? */
>   		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
>   		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
>   			break;
>   	}
> +
> +	frmr->sg_nents = i;
> +	frmr->dma_nents = ib_dma_map_sg(device, frmr->sg,
> +					frmr->sg_nents, direction);
> +	if (!frmr->dma_nents) {
> +		pr_err("RPC:       %s: failed to dma map sg %p sg_nents %d\n",
> +			__func__, frmr->sg, frmr->sg_nents);
> +		return -ENOMEM;
> +	}
> +
>   	dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
>   		__func__, mw, i, len);
>   
> -	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
> -	fastreg_wr.wr_id = (unsigned long)(void *)mw;
> -	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
> -	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
> -	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
> -	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
> -	fastreg_wr.wr.fast_reg.page_list_len = page_no;
> -	fastreg_wr.wr.fast_reg.length = len;
> -	fastreg_wr.wr.fast_reg.access_flags = writing ?
> -				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
> -				IB_ACCESS_REMOTE_READ;
>   	mr = frmr->fr_mr;
> +	access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
> +			   IB_ACCESS_REMOTE_READ;
> +	rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access);
> +	if (rc) {
> +		pr_err("RPC:       %s: failed to map mr %p rc %d\n",
> +			__func__, frmr->fr_mr, rc);
> +		return rc;
> +	}
> +
>   	key = (u8)(mr->rkey & 0x000000FF);
>   	ib_update_fast_reg_key(mr, ++key);
> -	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
> +
> +	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
> +	ib_set_fastreg_wr(mr, mr->rkey, (uintptr_t)mw, false, &fastreg_wr);
>   
>   	DECR_CQCOUNT(&r_xprt->rx_ep);
>   	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
> @@ -385,15 +393,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>   
>   	seg1->rl_mw = mw;
>   	seg1->mr_rkey = mr->rkey;
> -	seg1->mr_base = seg1->mr_dma + pageoff;
> +	seg1->mr_base = mr->iova;
>   	seg1->mr_nsegs = i;
>   	seg1->mr_len = len;
>   	return i;
>   
>   out_senderr:
>   	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
> -	while (i--)
> -		rpcrdma_unmap_one(device, --seg);
> +	ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, direction);
>   	__frwr_queue_recovery(mw);
>   	return rc;
>   }
> @@ -407,22 +414,23 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>   	struct rpcrdma_mr_seg *seg1 = seg;
>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>   	struct rpcrdma_mw *mw = seg1->rl_mw;
> +	struct rpcrdma_frmr *frmr = &mw->r.frmr;
>   	struct ib_send_wr invalidate_wr, *bad_wr;
>   	int rc, nsegs = seg->mr_nsegs;
>   
>   	dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
>   
>   	seg1->rl_mw = NULL;
> -	mw->r.frmr.fr_state = FRMR_IS_INVALID;
> +	frmr->fr_state = FRMR_IS_INVALID;
>   
>   	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
>   	invalidate_wr.wr_id = (unsigned long)(void *)mw;
>   	invalidate_wr.opcode = IB_WR_LOCAL_INV;
> -	invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
> +	invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
>   	DECR_CQCOUNT(&r_xprt->rx_ep);
>   
> -	while (seg1->mr_nsegs--)
> -		rpcrdma_unmap_one(ia->ri_device, seg++);
> +	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
> +
>   	read_lock(&ia->ri_qplock);
>   	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
>   	read_unlock(&ia->ri_qplock);
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index 886f8c8..a1c3ab2b 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -195,7 +195,9 @@ enum rpcrdma_frmr_state {
>   };
>   
>   struct rpcrdma_frmr {
> -	struct ib_fast_reg_page_list	*fr_pgl;
> +	struct scatterlist		*sg;
> +	unsigned int			sg_nents;
> +	unsigned int			dma_nents;
>   	struct ib_mr			*fr_mr;
>   	enum rpcrdma_frmr_state		fr_state;
>   	struct work_struct		fr_work;

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg July 23, 2015, 10:20 a.m. UTC | #6
On 7/22/2015 10:21 PM, Steve Wise wrote:
>
> On 7/22/2015 1:55 AM, Sagi Grimberg wrote:
>> Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
>> ---
>>   net/sunrpc/xprtrdma/frwr_ops.c  | 80
>> ++++++++++++++++++++++-------------------
>>   net/sunrpc/xprtrdma/xprt_rdma.h |  4 ++-
>>   2 files changed, 47 insertions(+), 37 deletions(-)
>
> Did you intend to change svcrdma as well?

All the ULPs need to convert. I didn't have a chance to convert
svcrdma yet. Want to take it?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg July 23, 2015, 10:42 a.m. UTC | #7
On 7/22/2015 7:04 PM, Chuck Lever wrote:
>
> On Jul 22, 2015, at 11:41 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>
>>
>>>> +	for (i = 0; i < nsegs;) {
>>>> +		sg_set_page(&frmr->sg[i], seg->mr_page,
>>>> +			    seg->mr_len, offset_in_page(seg->mr_offset));
>>>
>>> Cautionary note: here we’re dealing with both the “contiguous
>>> set of pages” case and the “small region of bytes in a single page”
>>> case. See rpcrdma_convert_iovs(): sometimes RPC send or receive
>>> buffers can be registered (RDMA_NOMSG).
>>
>> I noticed that (I think). I think this is handled correctly.
>> What exactly is the caution note here?
>
> Well the sg is turned into a page list below your API. Just
> want to make sure that we have tested your xprtrdma alterations
> with all the ULP possibilities. When you are further along I
> can pull this and run my functional tests.
>
>
>>>> 	mr = frmr->fr_mr;
>>>> +	access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
>>>> +			   IB_ACCESS_REMOTE_READ;
>>>> +	rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access);
>>>
>>> I like this (and the matching ib_dma_unmap_sg). But why wouldn’t
>>> this function be called ib_dma_map_sg() ? The name ib_map_mr_sg()
>>> had me thinking for a moment that this API actually posted the
>>> FASTREG WR, but I see that it doesn’t.
>>
>> Umm, ib_dma_map_sg is already taken :)
>>
>> This is what I came up with, it maps the SG elements to the MR
>> private context.
>>
>> I'd like to keep the post API for now. It will be possible to
>> to add a wrapper function that would do:
>> - dma_map_sg
>> - ib_map_mr_sg
>> - init fastreg send_wr
>> - post_send (maybe)
>
> Where xprtrdma might improve is by setting up all the FASTREG
> WRs for one RPC with a single chain and post_send. We could do
> that with your INDIR_MR concept, for example.

BTW, it would be great if you can play with it a little bit. I'm more
confident with the iSER part... I added two small fixes when I tested
with mlx4. It seems to work...

>
>
>>>> -	while (seg1->mr_nsegs--)
>>>> -		rpcrdma_unmap_one(ia->ri_device, seg++);
>>>> +	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
>>>
>>> ->mr_dir was previously set by rpcrdma_map_one(), which you’ve replaced
>>> with ib_map_mr_sg(). So maybe frwr_op_map() needs to save “direction”
>>> in the rpcrdma_frmr.
>>
>> Yep, that's correct, if I had turned on dma mapping debug it would shout
>> at me here...
>>
>> Note, I added in the git repo a patch to allow arbitrary sg lists in
>> frwr_op_map() which would allow you to skip the holes check... seems to
>> work with mlx5...
>>
>> I did noticed the mlx4 gives a protection error with after the conversion... I'll look into that...
>
> Should also get Steve and Devesh to try this with their adapters.

Ah, yes please. I've only compiled tested drivers other than mlx4, mlx5
which means there is a 99.9% (probably 100%) that it doesn't work.

It would be great to get help on porting the rest of the ULPs as well,
but that can wait until we converge on the API...
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 23, 2015, 1:46 p.m. UTC | #8
> -----Original Message-----
> From: Sagi Grimberg [mailto:sagig@dev.mellanox.co.il]
> Sent: Thursday, July 23, 2015 5:21 AM
> To: Steve Wise; Sagi Grimberg; linux-rdma@vger.kernel.org
> Cc: Liran Liss; Oren Duer
> Subject: Re: [PATCH WIP 37/43] xprtrdma: Port to new memory registration API
> 
> On 7/22/2015 10:21 PM, Steve Wise wrote:
> >
> > On 7/22/2015 1:55 AM, Sagi Grimberg wrote:
> >> Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
> >> ---
> >>   net/sunrpc/xprtrdma/frwr_ops.c  | 80
> >> ++++++++++++++++++++++-------------------
> >>   net/sunrpc/xprtrdma/xprt_rdma.h |  4 ++-
> >>   2 files changed, 47 insertions(+), 37 deletions(-)
> >
> > Did you intend to change svcrdma as well?
> 
> All the ULPs need to convert. I didn't have a chance to convert
> svcrdma yet. Want to take it?

Not right now.  My focus is still on enabling iSER.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 517efed..e28246b 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -151,9 +151,13 @@  __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_FAST_REG, depth, 0);
 	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
-	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
-	if (IS_ERR(f->fr_pgl))
+
+	f->sg = kcalloc(sizeof(*f->sg), depth, GFP_KERNEL);
+	if (IS_ERR(f->sg))
 		goto out_list_err;
+
+	sg_init_table(f->sg, depth);
+
 	return 0;
 
 out_mr_err:
@@ -163,7 +167,7 @@  out_mr_err:
 	return rc;
 
 out_list_err:
-	rc = PTR_ERR(f->fr_pgl);
+	rc = -ENOMEM;
 	dprintk("RPC:       %s: ib_alloc_fast_reg_page_list status %i\n",
 		__func__, rc);
 	ib_dereg_mr(f->fr_mr);
@@ -179,7 +183,7 @@  __frwr_release(struct rpcrdma_mw *r)
 	if (rc)
 		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
 			__func__, rc);
-	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+	kfree(r->r.frmr.sg);
 }
 
 static int
@@ -320,10 +324,7 @@  frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	struct ib_send_wr fastreg_wr, *bad_wr;
 	u8 key;
 	int len, pageoff;
-	int i, rc;
-	int seg_len;
-	u64 pa;
-	int page_no;
+	int i, rc, access;
 
 	mw = seg1->rl_mw;
 	seg1->rl_mw = NULL;
@@ -344,39 +345,46 @@  frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (nsegs > ia->ri_max_frmr_depth)
 		nsegs = ia->ri_max_frmr_depth;
 
-	for (page_no = i = 0; i < nsegs;) {
-		rpcrdma_map_one(device, seg, direction);
-		pa = seg->mr_dma;
-		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-			frmr->fr_pgl->page_list[page_no++] = pa;
-			pa += PAGE_SIZE;
-		}
+	for (i = 0; i < nsegs;) {
+		sg_set_page(&frmr->sg[i], seg->mr_page,
+			    seg->mr_len, offset_in_page(seg->mr_offset));
 		len += seg->mr_len;
-		++seg;
 		++i;
-		/* Check for holes */
+		++seg;
+
+		/* Check for holes - needed?? */
 		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
+
+	frmr->sg_nents = i;
+	frmr->dma_nents = ib_dma_map_sg(device, frmr->sg,
+					frmr->sg_nents, direction);
+	if (!frmr->dma_nents) {
+		pr_err("RPC:       %s: failed to dma map sg %p sg_nents %d\n",
+			__func__, frmr->sg, frmr->sg_nents);
+		return -ENOMEM;
+	}
+
 	dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
 		__func__, mw, i, len);
 
-	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
-	fastreg_wr.wr_id = (unsigned long)(void *)mw;
-	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
-	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
-	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-	fastreg_wr.wr.fast_reg.page_list_len = page_no;
-	fastreg_wr.wr.fast_reg.length = len;
-	fastreg_wr.wr.fast_reg.access_flags = writing ?
-				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
-				IB_ACCESS_REMOTE_READ;
 	mr = frmr->fr_mr;
+	access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+			   IB_ACCESS_REMOTE_READ;
+	rc = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, access);
+	if (rc) {
+		pr_err("RPC:       %s: failed to map mr %p rc %d\n",
+			__func__, frmr->fr_mr, rc);
+		return rc;
+	}
+
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
-	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	ib_set_fastreg_wr(mr, mr->rkey, (uintptr_t)mw, false, &fastreg_wr);
 
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
@@ -385,15 +393,14 @@  frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
 	seg1->rl_mw = mw;
 	seg1->mr_rkey = mr->rkey;
-	seg1->mr_base = seg1->mr_dma + pageoff;
+	seg1->mr_base = mr->iova;
 	seg1->mr_nsegs = i;
 	seg1->mr_len = len;
 	return i;
 
 out_senderr:
 	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-	while (i--)
-		rpcrdma_unmap_one(device, --seg);
+	ib_dma_unmap_sg(device, frmr->sg, frmr->sg_nents, direction);
 	__frwr_queue_recovery(mw);
 	return rc;
 }
@@ -407,22 +414,23 @@  frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw = seg1->rl_mw;
+	struct rpcrdma_frmr *frmr = &mw->r.frmr;
 	struct ib_send_wr invalidate_wr, *bad_wr;
 	int rc, nsegs = seg->mr_nsegs;
 
 	dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
 
 	seg1->rl_mw = NULL;
-	mw->r.frmr.fr_state = FRMR_IS_INVALID;
+	frmr->fr_state = FRMR_IS_INVALID;
 
 	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
 	invalidate_wr.wr_id = (unsigned long)(void *)mw;
 	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
+	invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia->ri_device, seg++);
+	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
+
 	read_lock(&ia->ri_qplock);
 	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
 	read_unlock(&ia->ri_qplock);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 886f8c8..a1c3ab2b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -195,7 +195,9 @@  enum rpcrdma_frmr_state {
 };
 
 struct rpcrdma_frmr {
-	struct ib_fast_reg_page_list	*fr_pgl;
+	struct scatterlist		*sg;
+	unsigned int			sg_nents;
+	unsigned int			dma_nents;
 	struct ib_mr			*fr_mr;
 	enum rpcrdma_frmr_state		fr_state;
 	struct work_struct		fr_work;