diff mbox

[v2,09/17] IB/Verbs: Use helper cap_read_multi_sge() and reform svc_rdma_accept()

Message ID 5523CEE4.5060901@profitbricks.com (mailing list archive)
State Rejected
Headers show

Commit Message

Michael Wang April 7, 2015, 12:34 p.m. UTC
Introduce helper cap_read_multi_sge() to help us check if the port of an
IB device supports RDMA Read Multiple Scatter-Gather Entries.

Reform svc_rdma_accept() to adopt management helpers.

Cc: Tom Talpey <tom@talpey.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Michael Wang <yun.wang@profitbricks.com>
---
 include/rdma/ib_verbs.h                  | 15 +++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |  4 ++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 12 +++++-------
 3 files changed, 22 insertions(+), 9 deletions(-)

Comments

Tom Talpey April 7, 2015, 3:46 p.m. UTC | #1
On 4/7/2015 8:34 AM, Michael Wang wrote:
>   /**
> + * cap_read_multi_sge - Check if the port of device has the capability
> + * RDMA Read Multiple Scatter-Gather Entries.
> + *
> + * @device: Device to be checked
> + * @port_num: Port number of the device
> + *
> + * Return 0 when port of the device don't support
> + * RDMA Read Multiple Scatter-Gather Entries.
> + */
> +static inline int cap_read_multi_sge(struct ib_device *device, u8 port_num)
> +{
> +	return !rdma_transport_iwarp(device, port_num);
> +}

This just papers over the issue we discussed earlier. How *many*
entries does the device support? If a device supports one, or two,
is that enough? How does the upper layer know the limit?

This needs an explicit device attribute, to be fixed properly.

> +
> +/**
>    * cap_ipoib - Check if the port of device has the capability
>    * IP over Infiniband.
>    *
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> index e011027..604d035 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
> @@ -118,8 +118,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
>
>   static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
>   {
> -	if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
> -	     RDMA_TRANSPORT_IWARP)
> +	if (!cap_read_multi_sge(xprt->sc_cm_id->device,
> +				xprt->sc_cm_id->port_num))
>   		return 1;
>   	else
>   		return min_t(int, sge_count, xprt->sc_max_sge);

This is incorrect. The RDMA Read max is not at all the same as the
max_sge. It is a different operation, with a different set of work
request parameters.

In other words, the above same comment applies.


> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 4e61880..e75175d 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -979,8 +979,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>   	/*
>   	 * Determine if a DMA MR is required and if so, what privs are required
>   	 */
> -	switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
> -	case RDMA_TRANSPORT_IWARP:
> +	if (rdma_transport_iwarp(newxprt->sc_cm_id->device,
> +				 newxprt->sc_cm_id->port_num)) {
>   		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;

Do I read this correctly that it is forcing the "read with invalidate"
capability to "on" for all iWARP devices? I don't think that is correct,
for the legacy devices you're also supporting.


> @@ -992,8 +992,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>   			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
>   		} else
>   			need_dma_mr = 0;
> -		break;
> -	case RDMA_TRANSPORT_IB:
> +	} else if (rdma_ib_mgmt(newxprt->sc_cm_id->device,
> +				newxprt->sc_cm_id->port_num)) {
>   		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
>   			need_dma_mr = 1;
>   			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;

Now I'm even more confused. How is the presence of IB management
related to needing a privileged lmr?


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 7, 2015, 4:05 p.m. UTC | #2
Hi, Tom

Thanks for the comments :-)

On 04/07/2015 05:46 PM, Tom Talpey wrote:
> On 4/7/2015 8:34 AM, Michael Wang wrote:
>>   /**
>> + * cap_read_multi_sge - Check if the port of device has the capability
>> + * RDMA Read Multiple Scatter-Gather Entries.
>> + *
>> + * @device: Device to be checked
>> + * @port_num: Port number of the device
>> + *
>> + * Return 0 when port of the device don't support
>> + * RDMA Read Multiple Scatter-Gather Entries.
>> + */
>> +static inline int cap_read_multi_sge(struct ib_device *device, u8 port_num)
>> +{
>> +    return !rdma_transport_iwarp(device, port_num);
>> +}
> 
> This just papers over the issue we discussed earlier. How *many*
> entries does the device support? If a device supports one, or two,
> is that enough? How does the upper layer know the limit?
> 
> This needs an explicit device attribute, to be fixed properly.

This is a prototype to expose the problem we have here; I would
prefer that someone good at this part extend the API in the
future, based on the right logic.

Currently this just inherits the legacy behavior; it is implemented
this way in order to be compatible with the current code.

> 
>> +
>> +/**
>>    * cap_ipoib - Check if the port of device has the capability
>>    * IP over Infiniband.
>>    *
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> index e011027..604d035 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> @@ -118,8 +118,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
>>
>>   static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
>>   {
>> -    if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
>> -         RDMA_TRANSPORT_IWARP)
>> +    if (!cap_read_multi_sge(xprt->sc_cm_id->device,
>> +                xprt->sc_cm_id->port_num))
>>           return 1;
>>       else
>>           return min_t(int, sge_count, xprt->sc_max_sge);
> 
> This is incorrect. The RDMA Read max is not at all the same as the
> max_sge. It is a different operation, with a different set of work
> request parameters.
> 
> In other words, the above same comment applies.

Any idea on how to improve this part?

Again, all these helpers just inherit the old logic; if
it's wrong, let's correct it ;-)

And if we don't know how to correct it, we can leave this as a
signpost and wait for someone good at this particular part
to fix it.

> 
> 
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> index 4e61880..e75175d 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> @@ -979,8 +979,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>>       /*
>>        * Determine if a DMA MR is required and if so, what privs are required
>>        */
>> -    switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
>> -    case RDMA_TRANSPORT_IWARP:
>> +    if (rdma_transport_iwarp(newxprt->sc_cm_id->device,
>> +                 newxprt->sc_cm_id->port_num)) {
>>           newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
> 
> Do I read this correctly that it is forcing the "read with invalidate"
> capability to "on" for all iWARP devices? I don't think that is correct,
> for the legacy devices you're also supporting.

Hmm... but that's exactly the same as the old logic, correct?
Or do you mean the old logic is wrong?

> 
> 
>> @@ -992,8 +992,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>>               dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
>>           } else
>>               need_dma_mr = 0;
>> -        break;
>> -    case RDMA_TRANSPORT_IB:
>> +    } else if (rdma_ib_mgmt(newxprt->sc_cm_id->device,
>> +                newxprt->sc_cm_id->port_num)) {
>>           if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
>>               need_dma_mr = 1;
>>               dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
> 
> Now I'm even more confused. How is the presence of IB management
> related to needing a privileged lmr?

I think you actually mean we need another wrapper here
with the right name, correct?

I'm not good at this part; any suggestions?

Regards,
Michael Wang

> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe April 7, 2015, 5:42 p.m. UTC | #3
On Tue, Apr 07, 2015 at 11:46:57AM -0400, Tom Talpey wrote:
> On 4/7/2015 8:34 AM, Michael Wang wrote:
> >  /**
> >+ * cap_read_multi_sge - Check if the port of device has the capability
> >+ * RDMA Read Multiple Scatter-Gather Entries.
> >+ *
> >+ * @device: Device to be checked
> >+ * @port_num: Port number of the device
> >+ *
> >+ * Return 0 when port of the device don't support
> >+ * RDMA Read Multiple Scatter-Gather Entries.
> >+ */
> >+static inline int cap_read_multi_sge(struct ib_device *device, u8 port_num)
> >+{
> >+	return !rdma_transport_iwarp(device, port_num);
> >+}
> 
> This just papers over the issue we discussed earlier. How *many*
> entries does the device support? If a device supports one, or two,
> is that enough? How does the upper layer know the limit?

I think Michael is fine to just make this one mechanical change.

The kernel only supports two kinds of devices today, ones with 1 read
SGE and ones where READ SGE == WRITE SGE == SEND SGE.

If someone makes another variation then it is up to them to propose a
better fix.


> >  static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
> >  {
> >-	if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
> >-	     RDMA_TRANSPORT_IWARP)
> >+	if (!cap_read_multi_sge(xprt->sc_cm_id->device,
> >+				xprt->sc_cm_id->port_num))
> >  		return 1;
> >  	else
> >  		return min_t(int, sge_count, xprt->sc_max_sge);
> 
> This is incorrect. The RDMA Read max is not at all the same as the
> max_sge. It is a different operation, with a different set of work
> request parameters.

The algorithm looks OK to me,

        newxprt->sc_max_sge = min((size_t)devattr.max_sge,
                                  (size_t)RPCSVC_MAXPAGES);

So it returns 1 or the number of sge entries per WR, and max_sge is
for READ/WRITE/SEND in every case except when cap_read_multi_sge == 0
(i.e. iWARP, where an RDMA Read is limited to a single SGE).

> >  	/*
> >  	 * Determine if a DMA MR is required and if so, what privs are required
> >  	 */
> >-	switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
> >-	case RDMA_TRANSPORT_IWARP:
> >+	if (rdma_transport_iwarp(newxprt->sc_cm_id->device,
> >+				 newxprt->sc_cm_id->port_num)) {
> >  		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
> 
> Do I read this correctly that it is forcing the "read with invalidate"
> capability to "on" for all iWARP devices? I don't think that is correct,
> for the legacy devices you're also supporting.

No idea here, this logic was added in:

commit 3a5c63803d0552a3ad93b85c262f12cd86471443
Author: Tom Tucker <tom@opengridcomputing.com>
Date:   Tue Sep 30 13:46:13 2008 -0500

    svcrdma: Query device for Fast Reg support during connection setup
    
    Query the device capabilities in the svc_rdma_accept function to determine
    what advanced memory management capabilities are supported by the device.
    Based on the query, select the most secure model available given the
    requirements of the transport and capabilities of the adapter.
    
    Signed-off-by: Tom Tucker <tom@opengridcomputing.com>

> >@@ -992,8 +992,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
> >  			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
> >  		} else
> >  			need_dma_mr = 0;
> >-		break;
> >-	case RDMA_TRANSPORT_IB:
> >+	} else if (rdma_ib_mgmt(newxprt->sc_cm_id->device,
> >+				newxprt->sc_cm_id->port_num)) {
> >  		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
> >  			need_dma_mr = 1;
> >  			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
> 
> Now I'm even more confused. How is the presence of IB management
> related to needing a privileged lmr?

Agree, this needs to be something else.

I think the test is probably based on this comment:

        * NB:  iWARP requires remote write access for the data sink
        *      of an RDMA_READ. IB does not.

So the if should be:

          if (cap_rdma_read_needs_write(..) && 
	  !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
                       need_dma_mr = 1;
                       dma_mr_acc =
                               (IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_WRITE);

And the identical if blocks merged.

Plus the
	if (rdma_transport_iwarp(newxprt->sc_cm_id->device,
				 newxprt->sc_cm_id->port_num))
  		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 8, 2015, 8:51 a.m. UTC | #4
On 04/07/2015 07:42 PM, Jason Gunthorpe wrote:
[snip]
>>> @@ -992,8 +992,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>>>  			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
>>>  		} else
>>>  			need_dma_mr = 0;
>>> -		break;
>>> -	case RDMA_TRANSPORT_IB:
>>> +	} else if (rdma_ib_mgmt(newxprt->sc_cm_id->device,
>>> +				newxprt->sc_cm_id->port_num)) {
>>>  		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
>>>  			need_dma_mr = 1;
>>>  			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
>>
>> Now I'm even more confused. How is the presence of IB management
>> related to needing a privileged lmr?
> 
> Agree, this needs to be someone else. 
> 
> I think the test is probably based on this comment:
> 
>         * NB:  iWARP requires remote write access for the data sink
>         *      of an RDMA_READ. IB does not.
> 
> So the if should be:
> 
>           if (cap_rdma_read_needs_write(..) && 
> 	  !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
>                        need_dma_mr = 1;
>                        dma_mr_acc =
>                                (IB_ACCESS_LOCAL_WRITE |
>                                 IB_ACCESS_REMOTE_WRITE);
> 
> And the identical if blocks merged.
> 
> Plus the
> 	if (rdma_transport_iwarp(newxprt->sc_cm_id->device,
> 				 newxprt->sc_cm_id->port_num))
>   		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV

Sounds good :-) I'll rework this part in the next version.

Regards,
Michael Wang

> 
> Jason
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9db8966..cae6f2d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1849,6 +1849,21 @@  static inline int cap_ib_mcast(struct ib_device *device, u8 port_num)
 }
 
 /**
+ * cap_read_multi_sge - Check if the port of device has the capability
+ * RDMA Read Multiple Scatter-Gather Entries.
+ *
+ * @device: Device to be checked
+ * @port_num: Port number of the device
+ *
+ * Return 0 when port of the device don't support
+ * RDMA Read Multiple Scatter-Gather Entries.
+ */
+static inline int cap_read_multi_sge(struct ib_device *device, u8 port_num)
+{
+	return !rdma_transport_iwarp(device, port_num);
+}
+
+/**
  * cap_ipoib - Check if the port of device has the capability
  * IP over Infiniband.
  *
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e011027..604d035 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -118,8 +118,8 @@  static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 
 static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
 {
-	if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
-	     RDMA_TRANSPORT_IWARP)
+	if (!cap_read_multi_sge(xprt->sc_cm_id->device,
+				xprt->sc_cm_id->port_num))
 		return 1;
 	else
 		return min_t(int, sge_count, xprt->sc_max_sge);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4e61880..e75175d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -979,8 +979,8 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	/*
 	 * Determine if a DMA MR is required and if so, what privs are required
 	 */
-	switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
-	case RDMA_TRANSPORT_IWARP:
+	if (rdma_transport_iwarp(newxprt->sc_cm_id->device,
+				 newxprt->sc_cm_id->port_num)) {
 		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
 		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
 			need_dma_mr = 1;
@@ -992,8 +992,8 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
 		} else
 			need_dma_mr = 0;
-		break;
-	case RDMA_TRANSPORT_IB:
+	} else if (rdma_ib_mgmt(newxprt->sc_cm_id->device,
+				newxprt->sc_cm_id->port_num)) {
 		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
 			need_dma_mr = 1;
 			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
@@ -1003,10 +1003,8 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
 		} else
 			need_dma_mr = 0;
-		break;
-	default:
+	} else
 		goto errout;
-	}
 
 	/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
 	if (need_dma_mr) {