diff mbox

[v3,05/15] xprtrdma: Remove last ib_reg_phys_mr() call site

Message ID 20150720190311.10997.12636.stgit@manet.1015granger.net (mailing list archive)
State Not Applicable
Headers show

Commit Message

Chuck Lever III July 20, 2015, 7:03 p.m. UTC
All HCA providers have an ib_get_dma_mr() verb. Thus
rpcrdma_ia_open() will either grab the device's local_dma_key if one
is available, or it will call ib_get_dma_mr() which is a 100%
guaranteed fallback.

There is never any need to use the ib_reg_phys_mr() code path in
rpcrdma_register_internal(), so it can be removed. The remaining
logic in rpcrdma_{de}register_internal() is folded into
rpcrdma_{alloc,free}_regbuf().

This is clean up only. No behavior change is expected.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Devesh Sharma <devesh.sharma@avagotech.com>
Reviewed-By: Sagi Grimberg <sagig@mellanox.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
---
 net/sunrpc/xprtrdma/verbs.c     |  102 ++++++++-------------------------------
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 
 2 files changed, 21 insertions(+), 82 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Tom Talpey July 20, 2015, 8:34 p.m. UTC | #1
On 7/20/2015 12:03 PM, Chuck Lever wrote:
> All HCA providers have an ib_get_dma_mr() verb. Thus
> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> is available, or it will call ib_get_dma_mr() which is a 100%
> guaranteed fallback.

I recall that in the past, some providers did not support mapping
all of the machine's potential physical memory with a single dma_mr.
If an rnic did/does not support 44-ish bits of length per region,
for example.

Have you verified that all current providers do in fact support the
necessary physical address range for this, and that the requirement
is stated in the verbs going forward?


>
> There is never any need to use the ib_reg_phys_mr() code path in
> rpcrdma_register_internal(), so it can be removed. The remaining
> logic in rpcrdma_{de}register_internal() is folded into
> rpcrdma_{alloc,free}_regbuf().
>
> This is clean up only. No behavior change is expected.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> Reviewed-by: Devesh Sharma <devesh.sharma@avagotech.com>
> Reviewed-By: Sagi Grimberg <sagig@mellanox.com>
> Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
> ---
>   net/sunrpc/xprtrdma/verbs.c     |  102 ++++++++-------------------------------
>   net/sunrpc/xprtrdma/xprt_rdma.h |    1
>   2 files changed, 21 insertions(+), 82 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 1065808..da184f9 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -1229,75 +1229,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
>   		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
>   }
>
> -static int
> -rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
> -				struct ib_mr **mrp, struct ib_sge *iov)
> -{
> -	struct ib_phys_buf ipb;
> -	struct ib_mr *mr;
> -	int rc;
> -
> -	/*
> -	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
> -	 */
> -	iov->addr = ib_dma_map_single(ia->ri_device,
> -			va, len, DMA_BIDIRECTIONAL);
> -	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
> -		return -ENOMEM;
> -
> -	iov->length = len;
> -
> -	if (ia->ri_have_dma_lkey) {
> -		*mrp = NULL;
> -		iov->lkey = ia->ri_dma_lkey;
> -		return 0;
> -	} else if (ia->ri_bind_mem != NULL) {
> -		*mrp = NULL;
> -		iov->lkey = ia->ri_bind_mem->lkey;
> -		return 0;
> -	}
> -
> -	ipb.addr = iov->addr;
> -	ipb.size = iov->length;
> -	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
> -			IB_ACCESS_LOCAL_WRITE, &iov->addr);
> -
> -	dprintk("RPC:       %s: phys convert: 0x%llx "
> -			"registered 0x%llx length %d\n",
> -			__func__, (unsigned long long)ipb.addr,
> -			(unsigned long long)iov->addr, len);
> -
> -	if (IS_ERR(mr)) {
> -		*mrp = NULL;
> -		rc = PTR_ERR(mr);
> -		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
> -	} else {
> -		*mrp = mr;
> -		iov->lkey = mr->lkey;
> -		rc = 0;
> -	}
> -
> -	return rc;
> -}
> -
> -static int
> -rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
> -				struct ib_mr *mr, struct ib_sge *iov)
> -{
> -	int rc;
> -
> -	ib_dma_unmap_single(ia->ri_device,
> -			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
> -
> -	if (NULL == mr)
> -		return 0;
> -
> -	rc = ib_dereg_mr(mr);
> -	if (rc)
> -		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
> -	return rc;
> -}
> -
>   /**
>    * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
>    * @ia: controlling rpcrdma_ia
> @@ -1317,26 +1248,30 @@ struct rpcrdma_regbuf *
>   rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
>   {
>   	struct rpcrdma_regbuf *rb;
> -	int rc;
> +	struct ib_sge *iov;
>
> -	rc = -ENOMEM;
>   	rb = kmalloc(sizeof(*rb) + size, flags);
>   	if (rb == NULL)
>   		goto out;
>
> -	rb->rg_size = size;
> -	rb->rg_owner = NULL;
> -	rc = rpcrdma_register_internal(ia, rb->rg_base, size,
> -				       &rb->rg_mr, &rb->rg_iov);
> -	if (rc)
> +	iov = &rb->rg_iov;
> +	iov->addr = ib_dma_map_single(ia->ri_device,
> +				      (void *)rb->rg_base, size,
> +				      DMA_BIDIRECTIONAL);
> +	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
>   		goto out_free;
>
> +	iov->length = size;
> +	iov->lkey = ia->ri_have_dma_lkey ?
> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> +	rb->rg_size = size;
> +	rb->rg_owner = NULL;
>   	return rb;
>
>   out_free:
>   	kfree(rb);
>   out:
> -	return ERR_PTR(rc);
> +	return ERR_PTR(-ENOMEM);
>   }
>
>   /**
> @@ -1347,10 +1282,15 @@ out:
>   void
>   rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
>   {
> -	if (rb) {
> -		rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
> -		kfree(rb);
> -	}
> +	struct ib_sge *iov;
> +
> +	if (!rb)
> +		return;
> +
> +	iov = &rb->rg_iov;
> +	ib_dma_unmap_single(ia->ri_device,
> +			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
> +	kfree(rb);
>   }
>
>   /*
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index abee472..ce4e79e 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -119,7 +119,6 @@ struct rpcrdma_ep {
>   struct rpcrdma_regbuf {
>   	size_t			rg_size;
>   	struct rpcrdma_req	*rg_owner;
> -	struct ib_mr		*rg_mr;
>   	struct ib_sge		rg_iov;
>   	__be32			rg_base[0] __attribute__ ((aligned(256)));
>   };
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III July 20, 2015, 8:55 p.m. UTC | #2
Hi Tom-


On Jul 20, 2015, at 4:34 PM, Tom Talpey <tom@talpey.com> wrote:

> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>> All HCA providers have an ib_get_dma_mr() verb. Thus
>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>> is available, or it will call ib_get_dma_mr() which is a 100%
>> guaranteed fallback.
> 
> I recall that in the past, some providers did not support mapping
> all of the machine's potential physical memory with a single dma_mr.
> If an rnic did/does not support 44-ish bits of length per region,
> for example.

The buffers affected by this change are small, so I’m confident that
restriction would not be a problem here.

What might break with such restricted hardware is ALLPHYSICAL on
large-memory systems. ALLPHYSICAL does rely on a single DMA MR that
covers all of the NFS client’s memory.

That would be a problem both before and after this patch, as far as
I can tell.


> Have you verified that all current providers do in fact support the
> necessary physical address range for this, and that the requirement
> is stated in the verbs going forward?


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 9:05 p.m. UTC | #3
On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >All HCA providers have an ib_get_dma_mr() verb. Thus
> >rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >is available, or it will call ib_get_dma_mr() which is a 100%
> >guaranteed fallback.
> 
> I recall that in the past, some providers did not support mapping
> all of the machine's potential physical memory with a single dma_mr.
> If an rnic did/does not support 44-ish bits of length per region,
> for example.

Looks like you are right, but the standard in kernel is to require
ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
big memory machine with kernel ULPs.

Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
physical memory, and silently break all kernel ULPs if they are used
on a modern machine with > 4G.

Is that right Steve?

Based on that, should we remove the cxgb3 driver as well? Or at least
can you fix it up to at least fail get_dma_mr if there is too much
ram?

Everything else looked Ok.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 20, 2015, 9:16 p.m. UTC | #4
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> Sent: Monday, July 20, 2015 4:06 PM
> To: Tom Talpey; Steve Wise
> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> > On 7/20/2015 12:03 PM, Chuck Lever wrote:
> > >All HCA providers have an ib_get_dma_mr() verb. Thus
> > >rpcrdma_ia_open() will either grab the device's local_dma_key if one
> > >is available, or it will call ib_get_dma_mr() which is a 100%
> > >guaranteed fallback.
> >
> > I recall that in the past, some providers did not support mapping
> > all of the machine's potential physical memory with a single dma_mr.
> > If an rnic did/does not support 44-ish bits of length per region,
> > for example.
> 
> Looks like you are right, but the standard in kernel is to require
> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> big memory machine with kernel ULPs.
> 
> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> physical memory, and silently break all kernel ULPs if they are used
> on a modern machine with > 4G.
> 
> Is that right Steve?
>

Yes.
 
> Based on that, should we remove the cxgb3 driver as well? Or at least
> can you fix it up to at least fail get_dma_mr if there is too much
> ram?
> 

I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of available
ram?

What is the process again to remove a driver?  I can remove amso...


 
> Everything else looked Ok.
> 
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 20, 2015, 9:34 p.m. UTC | #5
> 
> > Based on that, should we remove the cxgb3 driver as well? Or at least
> > can you fix it up to at least fail get_dma_mr if there is too much
> > ram?
> >
> 
> I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of available
ram?
>

Something like this?

@@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
        /*
         * T3 only supports 32 bits of size.
         */
+       if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
+               return -ENOTSUPP;
+
        bl.size = 0xffffffff;
        bl.addr = 0;
        kva = 0;
 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 20, 2015, 9:37 p.m. UTC | #6
> -----Original Message-----
> From: Steve Wise [mailto:swise@opengridcomputing.com]
> Sent: Monday, July 20, 2015 4:34 PM
> To: 'Jason Gunthorpe'; 'Tom Talpey'
> Cc: 'Chuck Lever'; 'linux-rdma@vger.kernel.org'; 'linux-nfs@vger.kernel.org'
> Subject: RE: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> 
> >
> > > Based on that, should we remove the cxgb3 driver as well? Or at least
> > > can you fix it up to at least fail get_dma_mr if there is too much
> > > ram?
> > >
> >
> > I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
available ram?
> >
> 
> Something like this?
> 
> @@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
>         /*
>          * T3 only supports 32 bits of size.
>          */
> +       if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
> +               return -ENOTSUPP;

oops: ERR_PTR(-ENOTSUPP);

> +
>         bl.size = 0xffffffff;
>         bl.addr = 0;
>         kva = 0;
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 20, 2015, 9:55 p.m. UTC | #7
On 7/20/2015 1:55 PM, Chuck Lever wrote:
> On Jul 20, 2015, at 4:34 PM, Tom Talpey <tom@talpey.com> wrote:
>
>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>> guaranteed fallback.
>>
>> I recall that in the past, some providers did not support mapping
>> all of the machine's potential physical memory with a single dma_mr.
>> If an rnic did/does not support 44-ish bits of length per region,
>> for example.
>
> The buffers affected by this change are small, so I’m confident that
> restriction would not be a problem here.

It's not about the buffer size, it's about the region. Because the
get_dma_mr does not specify a base address and length, the rnic must
basically attempt to map a base of zero and a length of the largest
physical offset.

This is not the case with the previous phys_reg_mr, which specified
the exact phys page range.

> What might break with such restricted hardware is ALLPHYSICAL on
> large-memory systems. ALLPHYSICAL does rely on a single DMA MR that
> covers all of the NFS client’s memory.

Yes, indeed it always did, but that mode was not intended for general
use.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 20, 2015, 10:04 p.m. UTC | #8
On 7/20/2015 2:16 PM, Steve Wise wrote:
>
>
>> -----Original Message-----
>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
>> Sent: Monday, July 20, 2015 4:06 PM
>> To: Tom Talpey; Steve Wise
>> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>
>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>> guaranteed fallback.
>>>
>>> I recall that in the past, some providers did not support mapping
>>> all of the machine's potential physical memory with a single dma_mr.
>>> If an rnic did/does not support 44-ish bits of length per region,
>>> for example.
>>
>> Looks like you are right, but the standard in kernel is to require
>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
>> big memory machine with kernel ULPs.
>>
>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
>> physical memory, and silently break all kernel ULPs if they are used
>> on a modern machine with > 4G.
>>
>> Is that right Steve?
>>
>
> Yes.
>
>> Based on that, should we remove the cxgb3 driver as well? Or at least
>> can you fix it up to at least fail get_dma_mr if there is too much
>> ram?
>>
>
> I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of available
> ram?

A) are you sure it's an unsigned length, i.e. is it really 31 bits?

B) why bother to check? Are machines with <4GB interesting, and worth
supporting a special optimization?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 10:13 p.m. UTC | #9
On Mon, Jul 20, 2015 at 04:37:15PM -0500, Steve Wise wrote:
> 
> 
> > From: Steve Wise [mailto:swise@opengridcomputing.com]
> > Sent: Monday, July 20, 2015 4:34 PM
> > To: 'Jason Gunthorpe'; 'Tom Talpey'
> > Cc: 'Chuck Lever'; 'linux-rdma@vger.kernel.org'; 'linux-nfs@vger.kernel.org'
> > Subject: RE: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> > 
> > 
> > >
> > > > Based on that, should we remove the cxgb3 driver as well? Or at least
> > > > can you fix it up to at least fail get_dma_mr if there is too much
> > > > ram?
> > > >
> > >
> > > I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
> available ram?
> > >
> > 
> > Something like this?
> > 
> > @@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
> >         /*
> >          * T3 only supports 32 bits of size.
> >          */
> > +       if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
> > +               return -ENOTSUPP;
> 
> oops: ERR_PTR(-ENOTSUPP);
> 
> > +
> >         bl.size = 0xffffffff;
> >         bl.addr = 0;
> >         kva = 0;

Yah, that seems much better.. With the patch set I am working on this
will mean all ULPs will fail to create a kernel PD on cxgb3 if the
above triggers. If our error handling works that should just make it
unuable from kernel space which is what we need to see..

To remove the amasso driver, move it to Staging. I don't know the
exact details, sorry. Hopefully Doug will be able to help you do that.

Maybe add some kind of one time print 'cxgb3 is not usuable on
machines with >4G of RAM' as well?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 10:17 p.m. UTC | #10
On Mon, Jul 20, 2015 at 03:04:21PM -0700, Tom Talpey wrote:
> B) why bother to check? Are machines with <4GB interesting, and worth
> supporting a special optimization?

mainline drivers shouldn't silently malfunction.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III July 20, 2015, 10:21 p.m. UTC | #11
On Jul 20, 2015, at 5:55 PM, Tom Talpey <tom@talpey.com> wrote:

> On 7/20/2015 1:55 PM, Chuck Lever wrote:
>> On Jul 20, 2015, at 4:34 PM, Tom Talpey <tom@talpey.com> wrote:
>> 
>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>> guaranteed fallback.
>>> 
>>> I recall that in the past, some providers did not support mapping
>>> all of the machine's potential physical memory with a single dma_mr.
>>> If an rnic did/does not support 44-ish bits of length per region,
>>> for example.
>> 
>> The buffers affected by this change are small, so I’m confident that
>> restriction would not be a problem here.
> 
> It's not about the buffer size, it's about the region. Because the
> get_dma_mr does not specify a base address and length, the rnic must
> basically attempt to map a base of zero and a length of the largest
> physical offset.

Understood now, but:


> This is not the case with the previous phys_reg_mr, which specified
> the exact phys page range.

rpcrdma_ia_open() fails immediately if IB_DEVICE_LOCAL_DMA_LKEY
is not asserted _and_ ib_get_dma_mr() fails. We never get to the
logic in rpcrdma_regbuf_alloc() in that case, so ib_reg_phys_mr()
would still never be invoked. That really is dead code.

If you prefer, I can adjust the patch description to remove the
“100% guaranteed fallback” line.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 10:26 p.m. UTC | #12
On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
> +	iov->length = size;
> +	iov->lkey = ia->ri_have_dma_lkey ?
> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> +	rb->rg_size = size;
> +	rb->rg_owner = NULL;
>  	return rb;

There is something odd looking about this..

ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
RPCRDMA_MTHCAFMR cases.

RPCRDMA_FRMR doesn't set it up.

So this code in rpcrdma_alloc_regbuf is never called for the FRMR
case?

If yes, then, how is FRMR working? There is absolutely no reason to
use FRMR to register local send buffers, just use the global all
memory lkey...

If no, then that is an oops?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 20, 2015, 10:26 p.m. UTC | #13
On 7/20/2015 3:17 PM, Jason Gunthorpe wrote:
> On Mon, Jul 20, 2015 at 03:04:21PM -0700, Tom Talpey wrote:
>> B) why bother to check? Are machines with <4GB interesting, and worth
>> supporting a special optimization?
>
> mainline drivers shouldn't silently malfunction.

I meant why bother to check, just always return failure. It's more
honest, and robust. Someone might test with 2GB, conclude it worked,
then deploy with 8GB and boom.

Also, what about hotplug RAM? Same situation.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 20, 2015, 10:30 p.m. UTC | #14
On 7/20/2015 3:21 PM, Chuck Lever wrote:
>
> On Jul 20, 2015, at 5:55 PM, Tom Talpey <tom@talpey.com> wrote:
>
>> On 7/20/2015 1:55 PM, Chuck Lever wrote:
>>> On Jul 20, 2015, at 4:34 PM, Tom Talpey <tom@talpey.com> wrote:
>>>
>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>>> guaranteed fallback.
>>>>
>>>> I recall that in the past, some providers did not support mapping
>>>> all of the machine's potential physical memory with a single dma_mr.
>>>> If an rnic did/does not support 44-ish bits of length per region,
>>>> for example.
>>>
>>> The buffers affected by this change are small, so I’m confident that
>>> restriction would not be a problem here.
>>
>> It's not about the buffer size, it's about the region. Because the
>> get_dma_mr does not specify a base address and length, the rnic must
>> basically attempt to map a base of zero and a length of the largest
>> physical offset.
>
> Understood now, but:
>
>
>> This is not the case with the previous phys_reg_mr, which specified
>> the exact phys page range.
>
> rpcrdma_ia_open() fails immediately if IB_DEVICE_LOCAL_DMA_LKEY
> is not asserted _and_ ib_get_dma_mr() fails. We never get to the
> logic in rpcrdma_regbuf_alloc() in that case, so ib_reg_phys_mr()
> would still never be invoked. That really is dead code.
>
> If you prefer, I can adjust the patch description to remove the
> “100% guaranteed fallback” line.


Ok, good and yes 100% sounds like a risky statement.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III July 20, 2015, 10:31 p.m. UTC | #15
On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>> +	iov->length = size;
>> +	iov->lkey = ia->ri_have_dma_lkey ?
>> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>> +	rb->rg_size = size;
>> +	rb->rg_owner = NULL;
>> 	return rb;
> 
> There is something odd looking about this..
> 
> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
> RPCRDMA_MTHCAFMR cases.
> 
> RPCRDMA_FRMR doesn't set it up.
> 
> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
> case?
> 
> If yes, then, how is FRMR working? There is absolutely no reason to
> use FRMR to register local send buffers, just use the global all
> memory lkey...
> 
> If no, then that is an oops?

I’ve tested this code, no oops.

FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
IB_DEVICE_LOCAL_DMA_LKEY is not asserted.

FMR and PHYSICAL use the DMA lkey if it exists, otherwise they
use an lkey from ib_get_dma_mr.

Look in 06/15, this is cleaned up.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 20, 2015, 10:41 p.m. UTC | #16
> -----Original Message-----
> From: Tom Talpey [mailto:tom@talpey.com]
> Sent: Monday, July 20, 2015 5:04 PM
> To: Steve Wise; 'Jason Gunthorpe'
> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On 7/20/2015 2:16 PM, Steve Wise wrote:
> >
> >
> >> -----Original Message-----
> >> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> >> Sent: Monday, July 20, 2015 4:06 PM
> >> To: Tom Talpey; Steve Wise
> >> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> >> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>
> >> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> >>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >>>> All HCA providers have an ib_get_dma_mr() verb. Thus
> >>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >>>> is available, or it will call ib_get_dma_mr() which is a 100%
> >>>> guaranteed fallback.
> >>>
> >>> I recall that in the past, some providers did not support mapping
> >>> all of the machine's potential physical memory with a single dma_mr.
> >>> If an rnic did/does not support 44-ish bits of length per region,
> >>> for example.
> >>
> >> Looks like you are right, but the standard in kernel is to require
> >> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> >> big memory machine with kernel ULPs.
> >>
> >> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> >> physical memory, and silently break all kernel ULPs if they are used
> >> on a modern machine with > 4G.
> >>
> >> Is that right Steve?
> >>
> >
> > Yes.
> >
> >> Based on that, should we remove the cxgb3 driver as well? Or at least
> >> can you fix it up to at least fail get_dma_mr if there is too much
> >> ram?
> >>
> >
> > I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
available
> > ram?
> 
> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
> 

yes.

> B) why bother to check? Are machines with <4GB interesting, and worth
> supporting a special optimization?

No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 10:41 p.m. UTC | #17
On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
> 
> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
> 
> > On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
> >> +	iov->length = size;
> >> +	iov->lkey = ia->ri_have_dma_lkey ?
> >> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
> >> +	rb->rg_size = size;
> >> +	rb->rg_owner = NULL;
> >> 	return rb;
> > 
> > There is something odd looking about this..
> > 
> > ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
> > RPCRDMA_MTHCAFMR cases.
> > 
> > RPCRDMA_FRMR doesn't set it up.
> > 
> > So this code in rpcrdma_alloc_regbuf is never called for the FRMR
> > case?
> > 
> > If yes, then, how is FRMR working? There is absolutely no reason to
> > use FRMR to register local send buffers, just use the global all
> > memory lkey...
> > 
> > If no, then that is an oops?
> 
> I’ve tested this code, no oops.
> 
> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.

Ah, I see. Ok.

Is there a reason to link FRWR and LOCAL_DMA_LKEY together? Just use
the code you have for fmr with frwr to get the lkey, probably move it
to rpcrdma_ia_open .. Physical MR should create a 2nd MR dedicated for
rkey use.

That will work really well with the series I'm working on:

https://github.com/jgunthorpe/linux/tree/remove-ib_get_dma_mr

To just drop ib_get_dma_mr entirely.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 10:42 p.m. UTC | #18
On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
> > B) why bother to check? Are machines with <4GB interesting, and worth
> > supporting a special optimization?
> 
> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.

Doesn't look like the NFS client will work. It requires an all
physical memory lkey for SEND and RECV buffers..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 20, 2015, 10:43 p.m. UTC | #19
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgunthorpe@obsidianresearch.com]
> Sent: Monday, July 20, 2015 5:14 PM
> To: Steve Wise
> Cc: 'Tom Talpey'; 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On Mon, Jul 20, 2015 at 04:37:15PM -0500, Steve Wise wrote:
> >
> >
> > > From: Steve Wise [mailto:swise@opengridcomputing.com]
> > > Sent: Monday, July 20, 2015 4:34 PM
> > > To: 'Jason Gunthorpe'; 'Tom Talpey'
> > > Cc: 'Chuck Lever'; 'linux-rdma@vger.kernel.org'; 'linux-nfs@vger.kernel.org'
> > > Subject: RE: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> > >
> > >
> > > >
> > > > > Based on that, should we remove the cxgb3 driver as well? Or at least
> > > > > can you fix it up to at least fail get_dma_mr if there is too much
> > > > > ram?
> > > > >
> > > >
> > > > I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
> > available ram?
> > > >
> > >
> > > Something like this?
> > >
> > > @@ -736,6 +736,9 @@ static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
> > >         /*
> > >          * T3 only supports 32 bits of size.
> > >          */
> > > +       if ((totalram_pages << PAGE_SHIFT) > 0xffffffff)
> > > +               return -ENOTSUPP;
> >
> > oops: ERR_PTR(-ENOTSUPP);
> >
> > > +
> > >         bl.size = 0xffffffff;
> > >         bl.addr = 0;
> > >         kva = 0;
> 
> Yah, that seems much better.. With the patch set I am working on this
> will mean all ULPs will fail to create a kernel PD on cxgb3 if the
> above triggers. If our error handling works that should just make it
> unuable from kernel space which is what we need to see..
> 

What about user PDs?  We need that to work for cxgb3.


> To remove the amasso driver, move it to Staging. I don't know the
> exact details, sorry. Hopefully Doug will be able to help you do that.
>

ok thanks

 
> Maybe add some kind of one time print 'cxgb3 is not usuable on
> machines with >4G of RAM' as well?
>

Could do.

> Jason

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2015, 10:54 p.m. UTC | #20
On Mon, Jul 20, 2015 at 05:43:34PM -0500, Steve Wise wrote:
> > Yah, that seems much better.. With the patch set I am working on this
> > will mean all ULPs will fail to create a kernel PD on cxgb3 if the
> > above triggers. If our error handling works that should just make it
> > unuable from kernel space which is what we need to see..

> What about user PDs?  We need that to work for cxgb3.

Only in-kernel uses call ib_alloc_pd

The user flow calls ib_uverbs_alloc_pd which calls the driver
directly.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 20, 2015, 10:54 p.m. UTC | #21
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgunthorpe@obsidianresearch.com]
> Sent: Monday, July 20, 2015 5:54 PM
> To: Steve Wise
> Cc: 'Tom Talpey'; 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On Mon, Jul 20, 2015 at 05:43:34PM -0500, Steve Wise wrote:
> > > Yah, that seems much better.. With the patch set I am working on this
> > > will mean all ULPs will fail to create a kernel PD on cxgb3 if the
> > > above triggers. If our error handling works that should just make it
> > > unuable from kernel space which is what we need to see..
> 
> > What about user PDs?  We need that to work for cxgb3.
> 
> Only in-kernel uses call ib_alloc_pd
> 
> The user flow calls ib_uverbs_alloc_pd which calls the driver
> directly.
>

Ok good.  Thanks!


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III July 20, 2015, 11:36 p.m. UTC | #22
On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>> 
>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>> 
>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>> +	iov->length = size;
>>>> +	iov->lkey = ia->ri_have_dma_lkey ?
>>>> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>> +	rb->rg_size = size;
>>>> +	rb->rg_owner = NULL;
>>>> 	return rb;
>>> 
>>> There is something odd looking about this..
>>> 
>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>> RPCRDMA_MTHCAFMR cases.
>>> 
>>> RPCRDMA_FRMR doesn't set it up.
>>> 
>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>> case?
>>> 
>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>> use FRMR to register local send buffers, just use the global all
>>> memory lkey...
>>> 
>>> If no, then that is an oops?
>> 
>> I’ve tested this code, no oops.
>> 
>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
> 
> Ah, I see. Ok.
> 
> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?

Tom would know.

> Just use
> the code you have for fmr with frwr to get the lkey, probably move it
> to rpcrdma_ia_open .. Physical MR should create a 2nd MR dedicated for
> rkey use.
> 
> That will work really well with the series I'm working on:
> 
> https://github.com/jgunthorpe/linux/tree/remove-ib_get_dma_mr
> 
> To just drop ib_get_dma_mr entirely.

Yes, I sorted this out so it would be obvious how to deal with
xprtrdma when the time comes. All three registration modes can
just use pd->local_dma_lkey for the SEND/RECV buffers.

PHYSICAL would still need ib_get_dma_mr() (or something equivalent)
for MRs needing remote access, agreed.

--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 21, 2015, 12:11 a.m. UTC | #23
On 7/20/2015 4:36 PM, Chuck Lever wrote:
>
> On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>
>> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>>
>>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>>>
>>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>>> +	iov->length = size;
>>>>> +	iov->lkey = ia->ri_have_dma_lkey ?
>>>>> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>>> +	rb->rg_size = size;
>>>>> +	rb->rg_owner = NULL;
>>>>> 	return rb;
>>>>
>>>> There is something odd looking about this..
>>>>
>>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>>> RPCRDMA_MTHCAFMR cases.
>>>>
>>>> RPCRDMA_FRMR doesn't set it up.
>>>>
>>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>>> case?
>>>>
>>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>>> use FRMR to register local send buffers, just use the global all
>>>> memory lkey...
>>>>
>>>> If no, then that is an oops?
>>>
>>> I’ve tested this code, no oops.
>>>
>>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>>
>> Ah, I see. Ok.
>>
>> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
>
> Tom would know.

Tom Talpey or Tom Tucker? I don't think I used a dma_lkey at all
in the original work, but there were some changes later.

I thought not all rnic's supported a dma_lkey. So requiring it
seems like a bad idea, to me.

>> Just use
>> the code you have for fmr with frwr to get the lkey, probably move it
>> to rpcrdma_ia_open .. Physical MR should create a 2nd MR dedicated for
>> rkey use.

But FMR is not supported by modern mlx5 and cxgb4?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 21, 2015, 12:15 a.m. UTC | #24
On 7/20/2015 3:41 PM, Steve Wise wrote:
>
>
>> -----Original Message-----
>> From: Tom Talpey [mailto:tom@talpey.com]
>> Sent: Monday, July 20, 2015 5:04 PM
>> To: Steve Wise; 'Jason Gunthorpe'
>> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>
>> On 7/20/2015 2:16 PM, Steve Wise wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
>>>> Sent: Monday, July 20, 2015 4:06 PM
>>>> To: Tom Talpey; Steve Wise
>>>> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>>>
>>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
>>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>>>> guaranteed fallback.
>>>>>
>>>>> I recall that in the past, some providers did not support mapping
>>>>> all of the machine's potential physical memory with a single dma_mr.
>>>>> If an rnic did/does not support 44-ish bits of length per region,
>>>>> for example.
>>>>
>>>> Looks like you are right, but the standard in kernel is to require
>>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
>>>> big memory machine with kernel ULPs.
>>>>
>>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
>>>> physical memory, and silently break all kernel ULPs if they are used
>>>> on a modern machine with > 4G.
>>>>
>>>> Is that right Steve?
>>>>
>>>
>>> Yes.
>>>
>>>> Based on that, should we remove the cxgb3 driver as well? Or at least
>>>> can you fix it up to at least fail get_dma_mr if there is too much
>>>> ram?
>>>>
>>>
>>> I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
> available
>>> ram?
>>
>> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
>>
>
> yes.
>
>> B) why bother to check? Are machines with <4GB interesting, and worth
>> supporting a special optimization?
>
> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.

I'm obviously not making myself clear. I am suggesting that cxgb3 fail
the ib_get_dma_mr() verb, regardless of installed memory.

I am not suggesting it fail to load, or fail other memreg requests. It
should work normally in all other respects.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III July 21, 2015, 12:34 a.m. UTC | #25
On Jul 20, 2015, at 8:11 PM, Tom Talpey <tom@talpey.com> wrote:

> On 7/20/2015 4:36 PM, Chuck Lever wrote:
>> 
>> On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>> 
>>> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>>> 
>>>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>>>> 
>>>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>>>> +	iov->length = size;
>>>>>> +	iov->lkey = ia->ri_have_dma_lkey ?
>>>>>> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>>>> +	rb->rg_size = size;
>>>>>> +	rb->rg_owner = NULL;
>>>>>> 	return rb;
>>>>> 
>>>>> There is something odd looking about this..
>>>>> 
>>>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>>>> RPCRDMA_MTHCAFMR cases.
>>>>> 
>>>>> RPCRDMA_FRMR doesn't set it up.
>>>>> 
>>>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>>>> case?
>>>>> 
>>>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>>>> use FRMR to register local send buffers, just use the global all
>>>>> memory lkey...
>>>>> 
>>>>> If no, then that is an oops?
>>>> 
>>>> I’ve tested this code, no oops.
>>>> 
>>>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>>>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>>> 
>>> Ah, I see. Ok.
>>> 
>>> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
>> 
>> Tom would know.
> 
> Tom Talpey or Tom Tucker? I don't think I used a dma_lkey at all
> in the original work, but there were some changes later.

The commit which adds FRWR support to xprtrdma introduced a check in
rpcrdma_ia_open:

+       case RPCRDMA_FRMR:
+               /* Requires both frmr reg and local dma lkey */
+               if ((devattr.device_cap_flags &
+                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {

This seems to be the only place the local DMA lkey requirement is
expressed or enforced. But yeah, the local DMA lkey doesn’t seem
to be used anywhere in the FRWR-specific parts of the code.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 21, 2015, 1:03 a.m. UTC | #26
On 7/20/2015 5:34 PM, Chuck Lever wrote:
>
> On Jul 20, 2015, at 8:11 PM, Tom Talpey <tom@talpey.com> wrote:
>
>> On 7/20/2015 4:36 PM, Chuck Lever wrote:
>>>
>>> On Jul 20, 2015, at 6:41 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>>>
>>>> On Mon, Jul 20, 2015 at 06:31:11PM -0400, Chuck Lever wrote:
>>>>>
>>>>> On Jul 20, 2015, at 6:26 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>>>>>
>>>>>> On Mon, Jul 20, 2015 at 03:03:11PM -0400, Chuck Lever wrote:
>>>>>>> +	iov->length = size;
>>>>>>> +	iov->lkey = ia->ri_have_dma_lkey ?
>>>>>>> +				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
>>>>>>> +	rb->rg_size = size;
>>>>>>> +	rb->rg_owner = NULL;
>>>>>>> 	return rb;
>>>>>>
>>>>>> There is something odd looking about this..
>>>>>>
>>>>>> ri_bind_mem is only setup in the RPCRDMA_ALLPHYSICAL and
>>>>>> RPCRDMA_MTHCAFMR cases.
>>>>>>
>>>>>> RPCRDMA_FRMR doesn't set it up.
>>>>>>
>>>>>> So this code in rpcrdma_alloc_regbuf is never called for the FRMR
>>>>>> case?
>>>>>>
>>>>>> If yes, then, how is FRMR working? There is absolutely no reason to
>>>>>> use FRMR to register local send buffers, just use the global all
>>>>>> memory lkey...
>>>>>>
>>>>>> If no, then that is an oops?
>>>>>
>>>>> I’ve tested this code, no oops.
>>>>>
>>>>> FRWR always uses the DMA lkey. xprtrdma does not use FRWR if
>>>>> IB_DEVICE_LOCAL_DMA_LKEY is not asserted.
>>>>
>>>> Ah, I see. Ok.
>>>>
>>>> Is there a reason to link FRWR and LOCAL_DMA_LKEY together?
>>>
>>> Tom would know.
>>
>> Tom Talpey or Tom Tucker? I don't think I used a dma_lkey at all
>> in the original work, but there were some changes later.
>
> The commit which adds FRWR support to xprtrdma introduced a check in
> rpcrdma_ia_open:
>
> +       case RPCRDMA_FRMR:
> +               /* Requires both frmr reg and local dma lkey */
> +               if ((devattr.device_cap_flags &
> +                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
> +                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
>
> This seems to be the only place the local DMA lkey requirement is
> expressed or enforced. But yeah, the local DMA lkey doesn’t seem
> to be used anywhere in the FRWR-specific parts of the code.

I now vaguely remember someone telling me that both attributes were
required, but a) I may have misunderstood and b) I'm sure something
has changed. FRMR was brand new at the time.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 21, 2015, 2:33 p.m. UTC | #27
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Tom Talpey
> Sent: Monday, July 20, 2015 7:15 PM
> To: Steve Wise; 'Jason Gunthorpe'
> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On 7/20/2015 3:41 PM, Steve Wise wrote:
> >
> >
> >> -----Original Message-----
> >> From: Tom Talpey [mailto:tom@talpey.com]
> >> Sent: Monday, July 20, 2015 5:04 PM
> >> To: Steve Wise; 'Jason Gunthorpe'
> >> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> >> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>
> >> On 7/20/2015 2:16 PM, Steve Wise wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> >>>> Sent: Monday, July 20, 2015 4:06 PM
> >>>> To: Tom Talpey; Steve Wise
> >>>> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> >>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>>>
> >>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> >>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
> >>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
> >>>>>> guaranteed fallback.
> >>>>>
> >>>>> I recall that in the past, some providers did not support mapping
> >>>>> all of the machine's potential physical memory with a single dma_mr.
> >>>>> If an rnic did/does not support 44-ish bits of length per region,
> >>>>> for example.
> >>>>
> >>>> Looks like you are right, but the standard in kernel is to require
> >>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> >>>> big memory machine with kernel ULPs.
> >>>>
> >>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> >>>> physical memory, and silently break all kernel ULPs if they are used
> >>>> on a modern machine with > 4G.
> >>>>
> >>>> Is that right Steve?
> >>>>
> >>>
> >>> Yes.
> >>>
> >>>> Based on that, should we remove the cxgb3 driver as well? Or at least
> >>>> can you fix it up to at least fail get_dma_mr if there is too much
> >>>> ram?
> >>>>
> >>>
> >>> I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
> > available
> >>> ram?
> >>
> >> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
> >>
> >
> > yes.
> >
> >> B) why bother to check? Are machines with <4GB interesting, and worth
> >> supporting a special optimization?
> >
> > No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> 
> I'm obviously not making myself clear. I am suggesting that cxgb3 fail
> the ib_get_dma_mr() verb, regardless of installed memory.
> 
> I am not suggesting it fail to load, or fail other memreg requests. It
> should work normally in all other respects.

Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment? 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey July 21, 2015, 8:47 p.m. UTC | #28
On 7/21/2015 7:33 AM, Steve Wise wrote:
>
>
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Tom Talpey
>> Sent: Monday, July 20, 2015 7:15 PM
>> To: Steve Wise; 'Jason Gunthorpe'
>> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>
>> On 7/20/2015 3:41 PM, Steve Wise wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Tom Talpey [mailto:tom@talpey.com]
>>>> Sent: Monday, July 20, 2015 5:04 PM
>>>> To: Steve Wise; 'Jason Gunthorpe'
>>>> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>>>
>>>> On 7/20/2015 2:16 PM, Steve Wise wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
>>>>>> Sent: Monday, July 20, 2015 4:06 PM
>>>>>> To: Tom Talpey; Steve Wise
>>>>>> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
>>>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
>>>>>>
>>>>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
>>>>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
>>>>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
>>>>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
>>>>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
>>>>>>>> guaranteed fallback.
>>>>>>>
>>>>>>> I recall that in the past, some providers did not support mapping
>>>>>>> all of the machine's potential physical memory with a single dma_mr.
>>>>>>> If an rnic did/does not support 44-ish bits of length per region,
>>>>>>> for example.
>>>>>>
>>>>>> Looks like you are right, but the standard in kernel is to require
>>>>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
>>>>>> big memory machine with kernel ULPs.
>>>>>>
>>>>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
>>>>>> physical memory, and silently break all kernel ULPs if they are used
>>>>>> on a modern machine with > 4G.
>>>>>>
>>>>>> Is that right Steve?
>>>>>>
>>>>>
>>>>> Yes.
>>>>>
>>>>>> Based on that, should we remove the cxgb3 driver as well? Or at least
>>>>>> can you fix it up to at least fail get_dma_mr if there is too much
>>>>>> ram?
>>>>>>
>>>>>
>>>>> I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
>>> available
>>>>> ram?
>>>>
>>>> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
>>>>
>>>
>>> yes.
>>>
>>>> B) why bother to check? Are machines with <4GB interesting, and worth
>>>> supporting a special optimization?
>>>
>>> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
>>
>> I'm obviously not making myself clear. I am suggesting that cxgb3 fail
>> the ib_get_dma_mr() verb, regardless of installed memory.
>>
>> I am not suggesting it fail to load, or fail other memreg requests. It
>> should work normally in all other respects.
>
> Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?

Sure, do you mean making it conditional on #if sizeof(physaddr) <= 32?
That would make sense I guess.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 21, 2015, 8:55 p.m. UTC | #29
> -----Original Message-----
> From: Tom Talpey [mailto:tom@talpey.com]
> Sent: Tuesday, July 21, 2015 3:47 PM
> To: Steve Wise; 'Jason Gunthorpe'
> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On 7/21/2015 7:33 AM, Steve Wise wrote:
> >
> >
> >> -----Original Message-----
> >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Tom Talpey
> >> Sent: Monday, July 20, 2015 7:15 PM
> >> To: Steve Wise; 'Jason Gunthorpe'
> >> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> >> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>
> >> On 7/20/2015 3:41 PM, Steve Wise wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Tom Talpey [mailto:tom@talpey.com]
> >>>> Sent: Monday, July 20, 2015 5:04 PM
> >>>> To: Steve Wise; 'Jason Gunthorpe'
> >>>> Cc: 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> >>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>>>
> >>>> On 7/20/2015 2:16 PM, Steve Wise wrote:
> >>>>>
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> >>>>>> Sent: Monday, July 20, 2015 4:06 PM
> >>>>>> To: Tom Talpey; Steve Wise
> >>>>>> Cc: Chuck Lever; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> >>>>>> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> >>>>>>
> >>>>>> On Mon, Jul 20, 2015 at 01:34:16PM -0700, Tom Talpey wrote:
> >>>>>>> On 7/20/2015 12:03 PM, Chuck Lever wrote:
> >>>>>>>> All HCA providers have an ib_get_dma_mr() verb. Thus
> >>>>>>>> rpcrdma_ia_open() will either grab the device's local_dma_key if one
> >>>>>>>> is available, or it will call ib_get_dma_mr() which is a 100%
> >>>>>>>> guaranteed fallback.
> >>>>>>>
> >>>>>>> I recall that in the past, some providers did not support mapping
> >>>>>>> all of the machine's potential physical memory with a single dma_mr.
> >>>>>>> If an rnic did/does not support 44-ish bits of length per region,
> >>>>>>> for example.
> >>>>>>
> >>>>>> Looks like you are right, but the standard in kernel is to require
> >>>>>> ib_get_dma_mr, if the HCA can't do that, then it cannot be used on a
> >>>>>> big memory machine with kernel ULPs.
> >>>>>>
> >>>>>> Looking deeper, both amso1100 and cxgb3 seem limited to 32 bits of
> >>>>>> physical memory, and silently break all kernel ULPs if they are used
> >>>>>> on a modern machine with > 4G.
> >>>>>>
> >>>>>> Is that right Steve?
> >>>>>>
> >>>>>
> >>>>> Yes.
> >>>>>
> >>>>>> Based on that, should we remove the cxgb3 driver as well? Or at least
> >>>>>> can you fix it up to at least fail get_dma_mr if there is too much
> >>>>>> ram?
> >>>>>>
> >>>>>
> >>>>> I would like to keep cxgb3 around.  I can add code to fail if the memory is > 32b.  Do you know how I get the amount of
> >>> available
> >>>>> ram?
> >>>>
> >>>> A) are you sure it's an unsigned length, i.e. is it really 31 bits?
> >>>>
> >>>
> >>> yes.
> >>>
> >>>> B) why bother to check? Are machines with <4GB interesting, and worth
> >>>> supporting a special optimization?
> >>>
> >>> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> >>
> >> I'm obviously not making myself clear. I am suggesting that cxgb3 fail
> >> the ib_get_dma_mr() verb, regardless of installed memory.
> >>
> >> I am not suggesting it fail to load, or fail other memreg requests. It
> >> should work normally in all other respects.
> >
> > Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?
> 
> Sure, do you mean making it conditional on #if sizeof(physaddr) <= 32?
> That would make sense I guess.

No, a runtime check.  x64 platforms will work too if the mem size takes <= 32b to describe. 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 21, 2015, 9:22 p.m. UTC | #30
> > >>>> B) why bother to check? Are machines with <4GB interesting, and worth
> > >>>> supporting a special optimization?
> > >>>
> > >>> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> > >>
> > >> I'm obviously not making myself clear. I am suggesting that cxgb3 fail
> > >> the ib_get_dma_mr() verb, regardless of installed memory.
> > >>
> > >> I am not suggesting it fail to load, or fail other memreg requests. It
> > >> should work normally in all other respects.
> > >
> > > Even with its limitation, doesn't it have utility for someone using cxgb3 in an embedded 32b environment?
> >
> > Sure, do you mean making it conditional on #if sizeof(physaddr) <= 32?
> > That would make sense I guess.
> 
> No, a runtime check.  x64 platforms will work too if the mem size takes <= 32b to describe.

Jason/Doug, do you think it should allow dma mr allocaton iff totalmem_pages < 4GB?  Or do a check on the sizeof totalmem_pages and
only allow dma mrs if it is <= 4?


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 21, 2015, 10:41 p.m. UTC | #31
On 7/20/2015 5:42 PM, Jason Gunthorpe wrote:
> On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
>>> B) why bother to check? Are machines with <4GB interesting, and worth
>>> supporting a special optimization?
>> No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> Doesn't look like the NFS client will work. It requires an all
> physical memory lkey for SEND and RECV buffers..
>
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Looks like cxgb3 supports LOCAL_DMA_LKEY and MEM_MGT_EXTENSIONS so dma 
mrs aren't required for NFSRDMA:

t4:~/linux-2.6/drivers/infiniband/hw/cxgb3 # grep IB_DEVICE_ iwch_provider.c
         strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
         dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
                                 IB_DEVICE_MEM_WINDOW |
                                 IB_DEVICE_MEM_MGT_EXTENSIONS;

So cxgb3 can still support NFSRDMA and user verbs w/o get_dma_mr(). I'll 
submit a patch soon to only support get_dma_mr() if unsigned long is 4 
bytes...

Steve.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 21, 2015, 10:54 p.m. UTC | #32
On Tue, Jul 21, 2015 at 05:41:22PM -0500, Steve Wise wrote:
> On 7/20/2015 5:42 PM, Jason Gunthorpe wrote:
> >On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
> >>>B) why bother to check? Are machines with <4GB interesting, and worth
> >>>supporting a special optimization?
> >>No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> >Doesn't look like the NFS client will work. It requires an all
> >physical memory lkey for SEND and RECV buffers..
> >
> >Jason
> 
> Looks like cxgb3 supports LOCAL_DMA_LKEY and MEM_MGT_EXTENSIONS so dma mrs
> aren't required for NFSRDMA:
> 
> t4:~/linux-2.6/drivers/infiniband/hw/cxgb3 # grep IB_DEVICE_ iwch_provider.c
>         strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
>         dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
>                                 IB_DEVICE_MEM_WINDOW |
>                                 IB_DEVICE_MEM_MGT_EXTENSIONS;

Neat. Is the dma_mask set properly (I don't see any set at all)?

> So cxgb3 can still support NFSRDMA and user verbs w/o get_dma_mr(). I'll
> submit a patch soon to only support get_dma_mr() if unsigned long is 4
> bytes...

So, NFS and RDS seem to be the only iWarp compatible ULPs?

NFS has worked, and will continue to work with the global lkey.

RDS looks like it relies on an insecure all physical rkey, so it won't
work until that is fixed.

So, I'd just use sizeof(physaddr_t) > 4 as the test. The only people
that could be impacted are RDS users using distro kernels on machines
with less than 4G of ram. I somehow doubt there are any of those...

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise July 22, 2015, 1:58 p.m. UTC | #33
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> Sent: Tuesday, July 21, 2015 5:55 PM
> To: Steve Wise
> Cc: 'Tom Talpey'; 'Chuck Lever'; linux-rdma@vger.kernel.org; linux-nfs@vger.kernel.org
> Subject: Re: [PATCH v3 05/15] xprtrdma: Remove last ib_reg_phys_mr() call site
> 
> On Tue, Jul 21, 2015 at 05:41:22PM -0500, Steve Wise wrote:
> > On 7/20/2015 5:42 PM, Jason Gunthorpe wrote:
> > >On Mon, Jul 20, 2015 at 05:41:27PM -0500, Steve Wise wrote:
> > >>>B) why bother to check? Are machines with <4GB interesting, and worth
> > >>>supporting a special optimization?
> > >>No, but cxgb3 is still interesting to user applications, and perhaps NFSRDMA using FRMRs.
> > >Doesn't look like the NFS client will work. It requires an all
> > >physical memory lkey for SEND and RECV buffers..
> > >
> > >Jason
> >
> > Looks like cxgb3 supports LOCAL_DMA_LKEY and MEM_MGT_EXTENSIONS so dma mrs
> > aren't required for NFSRDMA:
> >
> > t4:~/linux-2.6/drivers/infiniband/hw/cxgb3 # grep IB_DEVICE_ iwch_provider.c
> >         strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
> >         dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
> >                                 IB_DEVICE_MEM_WINDOW |
> >                                 IB_DEVICE_MEM_MGT_EXTENSIONS;
> 
> Neat. Is the dma_mask set properly (I don't see any set at all)?
> 

iw_cxgb3 isn't a PCI driver.  It sits on top of cxgb3 which is the pci device driver and calls pci_set_dma_mask().

> > So cxgb3 can still support NFSRDMA and user verbs w/o get_dma_mr(). I'll
> > submit a patch soon to only support get_dma_mr() if unsigned long is 4
> > bytes...
> 
> So, NFS and RDS seem to be the only iWarp compatible ULPs?
> 
> NFS has worked, and will continue to work with the global lkey.
> 
> RDS looks like it relies on an insecure all physical rkey, so it won't
> work until that is fixed.
> 
> So, I'd just use sizeof(physaddr_t) > 4 as the test. The only people
> that could be impacted are RDS users using distro kernels on machines
> with less than 4G of ram. I somehow doubt there are any of those...
> 
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 1065808..da184f9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1229,75 +1229,6 @@  rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
 		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
 }
 
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
-				struct ib_mr **mrp, struct ib_sge *iov)
-{
-	struct ib_phys_buf ipb;
-	struct ib_mr *mr;
-	int rc;
-
-	/*
-	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
-	 */
-	iov->addr = ib_dma_map_single(ia->ri_device,
-			va, len, DMA_BIDIRECTIONAL);
-	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-		return -ENOMEM;
-
-	iov->length = len;
-
-	if (ia->ri_have_dma_lkey) {
-		*mrp = NULL;
-		iov->lkey = ia->ri_dma_lkey;
-		return 0;
-	} else if (ia->ri_bind_mem != NULL) {
-		*mrp = NULL;
-		iov->lkey = ia->ri_bind_mem->lkey;
-		return 0;
-	}
-
-	ipb.addr = iov->addr;
-	ipb.size = iov->length;
-	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
-			IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
-	dprintk("RPC:       %s: phys convert: 0x%llx "
-			"registered 0x%llx length %d\n",
-			__func__, (unsigned long long)ipb.addr,
-			(unsigned long long)iov->addr, len);
-
-	if (IS_ERR(mr)) {
-		*mrp = NULL;
-		rc = PTR_ERR(mr);
-		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
-	} else {
-		*mrp = mr;
-		iov->lkey = mr->lkey;
-		rc = 0;
-	}
-
-	return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
-				struct ib_mr *mr, struct ib_sge *iov)
-{
-	int rc;
-
-	ib_dma_unmap_single(ia->ri_device,
-			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
-	if (NULL == mr)
-		return 0;
-
-	rc = ib_dereg_mr(mr);
-	if (rc)
-		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
-	return rc;
-}
-
 /**
  * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
  * @ia: controlling rpcrdma_ia
@@ -1317,26 +1248,30 @@  struct rpcrdma_regbuf *
 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
 {
 	struct rpcrdma_regbuf *rb;
-	int rc;
+	struct ib_sge *iov;
 
-	rc = -ENOMEM;
 	rb = kmalloc(sizeof(*rb) + size, flags);
 	if (rb == NULL)
 		goto out;
 
-	rb->rg_size = size;
-	rb->rg_owner = NULL;
-	rc = rpcrdma_register_internal(ia, rb->rg_base, size,
-				       &rb->rg_mr, &rb->rg_iov);
-	if (rc)
+	iov = &rb->rg_iov;
+	iov->addr = ib_dma_map_single(ia->ri_device,
+				      (void *)rb->rg_base, size,
+				      DMA_BIDIRECTIONAL);
+	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
 		goto out_free;
 
+	iov->length = size;
+	iov->lkey = ia->ri_have_dma_lkey ?
+				ia->ri_dma_lkey : ia->ri_bind_mem->lkey;
+	rb->rg_size = size;
+	rb->rg_owner = NULL;
 	return rb;
 
 out_free:
 	kfree(rb);
 out:
-	return ERR_PTR(rc);
+	return ERR_PTR(-ENOMEM);
 }
 
 /**
@@ -1347,10 +1282,15 @@  out:
 void
 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 {
-	if (rb) {
-		rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
-		kfree(rb);
-	}
+	struct ib_sge *iov;
+
+	if (!rb)
+		return;
+
+	iov = &rb->rg_iov;
+	ib_dma_unmap_single(ia->ri_device,
+			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
+	kfree(rb);
 }
 
 /*
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index abee472..ce4e79e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -119,7 +119,6 @@  struct rpcrdma_ep {
 struct rpcrdma_regbuf {
 	size_t			rg_size;
 	struct rpcrdma_req	*rg_owner;
-	struct ib_mr		*rg_mr;
 	struct ib_sge		rg_iov;
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 };