diff mbox

[v1,04/14] xprtrdma: Use ib_device pointer safely

Message ID 20150504175720.3483.80356.stgit@manet.1015granger.net (mailing list archive)
State Rejected
Headers show

Commit Message

Chuck Lever III May 4, 2015, 5:57 p.m. UTC
The connect worker can replace ri_id, but prevents ri_id->device
from changing during the lifetime of a transport instance.

Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
The cached copy can be used safely in code that does not serialize
with the connect worker.

Other code can use it to save an extra address generation (one
pointer dereference instead of two).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
 net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
 net/sunrpc/xprtrdma/physical_ops.c |    8 +----
 net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
 net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
 5 files changed, 43 insertions(+), 48 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Sagi Grimberg May 7, 2015, 10 a.m. UTC | #1
On 5/4/2015 8:57 PM, Chuck Lever wrote:
> The connect worker can replace ri_id, but prevents ri_id->device
> from changing during the lifetime of a transport instance.
>
> Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
> The cached copy can be used safely in code that does not serialize
> with the connect worker.
>
> Other code can use it to save an extra address generation (one
> pointer dereference instead of two).
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>   net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
>   net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
>   net/sunrpc/xprtrdma/physical_ops.c |    8 +----
>   net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
>   net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
>   5 files changed, 43 insertions(+), 48 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
> index 302d4eb..0a96155 100644
> --- a/net/sunrpc/xprtrdma/fmr_ops.c
> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
> @@ -85,7 +85,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>   	   int nsegs, bool writing)
>   {
>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
> -	struct ib_device *device = ia->ri_id->device;
> +	struct ib_device *device = ia->ri_device;
>   	enum dma_data_direction direction = rpcrdma_data_dir(writing);
>   	struct rpcrdma_mr_seg *seg1 = seg;
>   	struct rpcrdma_mw *mw = seg1->rl_mw;
> @@ -137,17 +137,13 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>   {
>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>   	struct rpcrdma_mr_seg *seg1 = seg;
> -	struct ib_device *device;
>   	int rc, nsegs = seg->mr_nsegs;
>   	LIST_HEAD(l);
>
>   	list_add(&seg1->rl_mw->r.fmr->list, &l);
>   	rc = ib_unmap_fmr(&l);
> -	read_lock(&ia->ri_qplock);
> -	device = ia->ri_id->device;
>   	while (seg1->mr_nsegs--)
> -		rpcrdma_unmap_one(device, seg++);
> -	read_unlock(&ia->ri_qplock);
> +		rpcrdma_unmap_one(ia->ri_device, seg++);

Umm, I'm wandering if this is guaranteed to be the same device as
ri_id->device?

Imagine you are working on a bond device where each slave belongs to
a different adapter. When the active port toggles, you will see a
ADDR_CHANGED event (that the current code does not handle...), what
you'd want to do is just reconnect and rdma_cm will resolve the new
address for you (via the backup slave). I suspect that in case this
flow is concurrent with the reconnects you may end up with a stale
device handle.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III May 7, 2015, 1:39 p.m. UTC | #2
On May 7, 2015, at 6:00 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:

> On 5/4/2015 8:57 PM, Chuck Lever wrote:
>> The connect worker can replace ri_id, but prevents ri_id->device
>> from changing during the lifetime of a transport instance.
>> 
>> Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
>> The cached copy can be used safely in code that does not serialize
>> with the connect worker.
>> 
>> Other code can use it to save an extra address generation (one
>> pointer dereference instead of two).
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>>  net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
>>  net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
>>  net/sunrpc/xprtrdma/physical_ops.c |    8 +----
>>  net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
>>  net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
>>  5 files changed, 43 insertions(+), 48 deletions(-)
>> 
>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>> index 302d4eb..0a96155 100644
>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>> @@ -85,7 +85,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>>  	   int nsegs, bool writing)
>>  {
>>  	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>> -	struct ib_device *device = ia->ri_id->device;
>> +	struct ib_device *device = ia->ri_device;
>>  	enum dma_data_direction direction = rpcrdma_data_dir(writing);
>>  	struct rpcrdma_mr_seg *seg1 = seg;
>>  	struct rpcrdma_mw *mw = seg1->rl_mw;
>> @@ -137,17 +137,13 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>>  {
>>  	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>  	struct rpcrdma_mr_seg *seg1 = seg;
>> -	struct ib_device *device;
>>  	int rc, nsegs = seg->mr_nsegs;
>>  	LIST_HEAD(l);
>> 
>>  	list_add(&seg1->rl_mw->r.fmr->list, &l);
>>  	rc = ib_unmap_fmr(&l);
>> -	read_lock(&ia->ri_qplock);
>> -	device = ia->ri_id->device;
>>  	while (seg1->mr_nsegs--)
>> -		rpcrdma_unmap_one(device, seg++);
>> -	read_unlock(&ia->ri_qplock);
>> +		rpcrdma_unmap_one(ia->ri_device, seg++);
> 
> Umm, I'm wandering if this is guaranteed to be the same device as
> ri_id->device?
> 
> Imagine you are working on a bond device where each slave belongs to
> a different adapter. When the active port toggles, you will see a
> ADDR_CHANGED event (that the current code does not handle...), what
> you'd want to do is just reconnect and rdma_cm will resolve the new
> address for you (via the backup slave). I suspect that in case this
> flow is concurrent with the reconnects you may end up with a stale
> device handle.

I’m not sure what you mean by “stale” : freed memory?

I’m looking at this code in rpcrdma_ep_connect() :

 916                 if (ia->ri_id->device != id->device) {
 917                         printk("RPC:       %s: can't reconnect on "
 918                                 "different device!\n", __func__);
 919                         rdma_destroy_id(id);
 920                         rc = -ENETUNREACH;
 921                         goto out;
 922                 }

After reconnecting, if the ri_id has changed, the connect fails. Today,
xprtrdma does not support the device changing out from under it.

Note also that our receive completion upcall uses ri_id->device for
DMA map syncing. Would that also be a problem during a bond failover?

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg May 7, 2015, 1:56 p.m. UTC | #3
On 5/7/2015 4:39 PM, Chuck Lever wrote:
>
> On May 7, 2015, at 6:00 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>
>> On 5/4/2015 8:57 PM, Chuck Lever wrote:
>>> The connect worker can replace ri_id, but prevents ri_id->device
>>> from changing during the lifetime of a transport instance.
>>>
>>> Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
>>> The cached copy can be used safely in code that does not serialize
>>> with the connect worker.
>>>
>>> Other code can use it to save an extra address generation (one
>>> pointer dereference instead of two).
>>>
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>>   net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
>>>   net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
>>>   net/sunrpc/xprtrdma/physical_ops.c |    8 +----
>>>   net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
>>>   net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
>>>   5 files changed, 43 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>>> index 302d4eb..0a96155 100644
>>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>>> @@ -85,7 +85,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>>>   	   int nsegs, bool writing)
>>>   {
>>>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>> -	struct ib_device *device = ia->ri_id->device;
>>> +	struct ib_device *device = ia->ri_device;
>>>   	enum dma_data_direction direction = rpcrdma_data_dir(writing);
>>>   	struct rpcrdma_mr_seg *seg1 = seg;
>>>   	struct rpcrdma_mw *mw = seg1->rl_mw;
>>> @@ -137,17 +137,13 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>>>   {
>>>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>   	struct rpcrdma_mr_seg *seg1 = seg;
>>> -	struct ib_device *device;
>>>   	int rc, nsegs = seg->mr_nsegs;
>>>   	LIST_HEAD(l);
>>>
>>>   	list_add(&seg1->rl_mw->r.fmr->list, &l);
>>>   	rc = ib_unmap_fmr(&l);
>>> -	read_lock(&ia->ri_qplock);
>>> -	device = ia->ri_id->device;
>>>   	while (seg1->mr_nsegs--)
>>> -		rpcrdma_unmap_one(device, seg++);
>>> -	read_unlock(&ia->ri_qplock);
>>> +		rpcrdma_unmap_one(ia->ri_device, seg++);
>>
>> Umm, I'm wandering if this is guaranteed to be the same device as
>> ri_id->device?
>>
>> Imagine you are working on a bond device where each slave belongs to
>> a different adapter. When the active port toggles, you will see a
>> ADDR_CHANGED event (that the current code does not handle...), what
>> you'd want to do is just reconnect and rdma_cm will resolve the new
>> address for you (via the backup slave). I suspect that in case this
>> flow is concurrent with the reconnects you may end up with a stale
>> device handle.
>
> I’m not sure what you mean by “stale” : freed memory?
>
> I’m looking at this code in rpcrdma_ep_connect() :
>
>   916                 if (ia->ri_id->device != id->device) {
>   917                         printk("RPC:       %s: can't reconnect on "
>   918                                 "different device!\n", __func__);
>   919                         rdma_destroy_id(id);
>   920                         rc = -ENETUNREACH;
>   921                         goto out;
>   922                 }
>
> After reconnecting, if the ri_id has changed, the connect fails. Today,
> xprtrdma does not support the device changing out from under it.
>
> Note also that our receive completion upcall uses ri_id->device for
> DMA map syncing. Would that also be a problem during a bond failover?
>

I'm not talking about ri_id->device, this will be consistent. I'm
wandering about ia->ri_device, which might not have been updated yet.

Just asking, assuming your transport device can change between 
consecutive reconnects (the new cm_id will contain another device), is
it safe to rely on ri_device being updated?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III May 7, 2015, 2:12 p.m. UTC | #4
On May 7, 2015, at 9:56 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:

> On 5/7/2015 4:39 PM, Chuck Lever wrote:
>> 
>> On May 7, 2015, at 6:00 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>> 
>>> On 5/4/2015 8:57 PM, Chuck Lever wrote:
>>>> The connect worker can replace ri_id, but prevents ri_id->device
>>>> from changing during the lifetime of a transport instance.
>>>> 
>>>> Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
>>>> The cached copy can be used safely in code that does not serialize
>>>> with the connect worker.
>>>> 
>>>> Other code can use it to save an extra address generation (one
>>>> pointer dereference instead of two).
>>>> 
>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>> ---
>>>>  net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
>>>>  net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
>>>>  net/sunrpc/xprtrdma/physical_ops.c |    8 +----
>>>>  net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
>>>>  net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
>>>>  5 files changed, 43 insertions(+), 48 deletions(-)
>>>> 
>>>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>>>> index 302d4eb..0a96155 100644
>>>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>>>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>>>> @@ -85,7 +85,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>>>>  	   int nsegs, bool writing)
>>>>  {
>>>>  	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>> -	struct ib_device *device = ia->ri_id->device;
>>>> +	struct ib_device *device = ia->ri_device;
>>>>  	enum dma_data_direction direction = rpcrdma_data_dir(writing);
>>>>  	struct rpcrdma_mr_seg *seg1 = seg;
>>>>  	struct rpcrdma_mw *mw = seg1->rl_mw;
>>>> @@ -137,17 +137,13 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>>>>  {
>>>>  	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>>  	struct rpcrdma_mr_seg *seg1 = seg;
>>>> -	struct ib_device *device;
>>>>  	int rc, nsegs = seg->mr_nsegs;
>>>>  	LIST_HEAD(l);
>>>> 
>>>>  	list_add(&seg1->rl_mw->r.fmr->list, &l);
>>>>  	rc = ib_unmap_fmr(&l);
>>>> -	read_lock(&ia->ri_qplock);
>>>> -	device = ia->ri_id->device;
>>>>  	while (seg1->mr_nsegs--)
>>>> -		rpcrdma_unmap_one(device, seg++);
>>>> -	read_unlock(&ia->ri_qplock);
>>>> +		rpcrdma_unmap_one(ia->ri_device, seg++);
>>> 
>>> Umm, I'm wandering if this is guaranteed to be the same device as
>>> ri_id->device?
>>> 
>>> Imagine you are working on a bond device where each slave belongs to
>>> a different adapter. When the active port toggles, you will see a
>>> ADDR_CHANGED event (that the current code does not handle...), what
>>> you'd want to do is just reconnect and rdma_cm will resolve the new
>>> address for you (via the backup slave). I suspect that in case this
>>> flow is concurrent with the reconnects you may end up with a stale
>>> device handle.
>> 
>> I’m not sure what you mean by “stale” : freed memory?
>> 
>> I’m looking at this code in rpcrdma_ep_connect() :
>> 
>>  916                 if (ia->ri_id->device != id->device) {
>>  917                         printk("RPC:       %s: can't reconnect on "
>>  918                                 "different device!\n", __func__);
>>  919                         rdma_destroy_id(id);
>>  920                         rc = -ENETUNREACH;
>>  921                         goto out;
>>  922                 }
>> 
>> After reconnecting, if the ri_id has changed, the connect fails. Today,
>> xprtrdma does not support the device changing out from under it.
>> 
>> Note also that our receive completion upcall uses ri_id->device for
>> DMA map syncing. Would that also be a problem during a bond failover?
>> 
> 
> I'm not talking about ri_id->device, this will be consistent. I'm
> wandering about ia->ri_device, which might not have been updated yet.

ia->ri_device is never updated. The only place it is set is in
rpcrdma_ia_open().

> Just asking, assuming your transport device can change between consecutive reconnects (the new cm_id will contain another device), is
> it safe to rely on ri_device being updated?

My reading of the above logic is that ia->ri_id->device is guaranteed to
be the same address during the lifetime of the transport instance. If it
changes during a reconnect, rpcrdma_ep_connect() will fail the connect.

In the case of a bonded device, why are the physical slave devices exposed
to consumers? It might be saner to construct a virtual ib_device in this
case that consumers can depend on.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg May 7, 2015, 3:11 p.m. UTC | #5
On 5/7/2015 5:12 PM, Chuck Lever wrote:
>
> On May 7, 2015, at 9:56 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>
>> On 5/7/2015 4:39 PM, Chuck Lever wrote:
>>>
>>> On May 7, 2015, at 6:00 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>>>
>>>> On 5/4/2015 8:57 PM, Chuck Lever wrote:
>>>>> The connect worker can replace ri_id, but prevents ri_id->device
>>>>> from changing during the lifetime of a transport instance.
>>>>>
>>>>> Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
>>>>> The cached copy can be used safely in code that does not serialize
>>>>> with the connect worker.
>>>>>
>>>>> Other code can use it to save an extra address generation (one
>>>>> pointer dereference instead of two).
>>>>>
>>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>>> ---
>>>>>   net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
>>>>>   net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
>>>>>   net/sunrpc/xprtrdma/physical_ops.c |    8 +----
>>>>>   net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
>>>>>   net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
>>>>>   5 files changed, 43 insertions(+), 48 deletions(-)
>>>>>
>>>>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>>>>> index 302d4eb..0a96155 100644
>>>>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>>>>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>>>>> @@ -85,7 +85,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>>>>>   	   int nsegs, bool writing)
>>>>>   {
>>>>>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>>> -	struct ib_device *device = ia->ri_id->device;
>>>>> +	struct ib_device *device = ia->ri_device;
>>>>>   	enum dma_data_direction direction = rpcrdma_data_dir(writing);
>>>>>   	struct rpcrdma_mr_seg *seg1 = seg;
>>>>>   	struct rpcrdma_mw *mw = seg1->rl_mw;
>>>>> @@ -137,17 +137,13 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>>>>>   {
>>>>>   	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>>>   	struct rpcrdma_mr_seg *seg1 = seg;
>>>>> -	struct ib_device *device;
>>>>>   	int rc, nsegs = seg->mr_nsegs;
>>>>>   	LIST_HEAD(l);
>>>>>
>>>>>   	list_add(&seg1->rl_mw->r.fmr->list, &l);
>>>>>   	rc = ib_unmap_fmr(&l);
>>>>> -	read_lock(&ia->ri_qplock);
>>>>> -	device = ia->ri_id->device;
>>>>>   	while (seg1->mr_nsegs--)
>>>>> -		rpcrdma_unmap_one(device, seg++);
>>>>> -	read_unlock(&ia->ri_qplock);
>>>>> +		rpcrdma_unmap_one(ia->ri_device, seg++);
>>>>
>>>> Umm, I'm wandering if this is guaranteed to be the same device as
>>>> ri_id->device?
>>>>
>>>> Imagine you are working on a bond device where each slave belongs to
>>>> a different adapter. When the active port toggles, you will see a
>>>> ADDR_CHANGED event (that the current code does not handle...), what
>>>> you'd want to do is just reconnect and rdma_cm will resolve the new
>>>> address for you (via the backup slave). I suspect that in case this
>>>> flow is concurrent with the reconnects you may end up with a stale
>>>> device handle.
>>>
>>> I’m not sure what you mean by “stale” : freed memory?
>>>
>>> I’m looking at this code in rpcrdma_ep_connect() :
>>>
>>>   916                 if (ia->ri_id->device != id->device) {
>>>   917                         printk("RPC:       %s: can't reconnect on "
>>>   918                                 "different device!\n", __func__);
>>>   919                         rdma_destroy_id(id);
>>>   920                         rc = -ENETUNREACH;
>>>   921                         goto out;
>>>   922                 }
>>>
>>> After reconnecting, if the ri_id has changed, the connect fails. Today,
>>> xprtrdma does not support the device changing out from under it.
>>>
>>> Note also that our receive completion upcall uses ri_id->device for
>>> DMA map syncing. Would that also be a problem during a bond failover?
>>>
>>
>> I'm not talking about ri_id->device, this will be consistent. I'm
>> wandering about ia->ri_device, which might not have been updated yet.
>
> ia->ri_device is never updated. The only place it is set is in
> rpcrdma_ia_open().

So you assume that each ri_id that you will recreate contains the
same device handle?

I think that for ADDR_CHANGE event when the slave belongs to another
device you will hit a mismatch. CC'ing Sean for more info...

>
>> Just asking, assuming your transport device can change between consecutive reconnects (the new cm_id will contain another device), is
>> it safe to rely on ri_device being updated?
>
> My reading of the above logic is that ia->ri_id->device is guaranteed to
> be the same address during the lifetime of the transport instance. If it
> changes during a reconnect, rpcrdma_ep_connect() will fail the connect.

It is the same address - the bond0 IP...

>
> In the case of a bonded device, why are the physical slave devices exposed
> to consumers?

You mean ib_device handle? you need it to create PD/CQ/QP/MRs...
How else can you allocate the device resources without the device
handle?

rdma_cm simply gives you the device handle by the IP route. From there
you own the resources you create.

> It might be saner to construct a virtual ib_device in this
> case that consumers can depend on.

I'm not sure how does a virtual ib_device can work - that goes
to the verbs themselves... Seems like a layering mis-match to me...
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III May 11, 2015, 3:22 p.m. UTC | #6
On May 7, 2015, at 11:11 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:

> On 5/7/2015 5:12 PM, Chuck Lever wrote:
>> 
>> On May 7, 2015, at 9:56 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>> 
>>> On 5/7/2015 4:39 PM, Chuck Lever wrote:
>>>> 
>>>> On May 7, 2015, at 6:00 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>>>> 
>>>>> On 5/4/2015 8:57 PM, Chuck Lever wrote:
>>>>>> The connect worker can replace ri_id, but prevents ri_id->device
>>>>>> from changing during the lifetime of a transport instance.
>>>>>> 
>>>>>> Cache a copy of ri_id->device in rpcrdma_ia and in rpcrdma_rep.
>>>>>> The cached copy can be used safely in code that does not serialize
>>>>>> with the connect worker.
>>>>>> 
>>>>>> Other code can use it to save an extra address generation (one
>>>>>> pointer dereference instead of two).
>>>>>> 
>>>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>>>> ---
>>>>>>  net/sunrpc/xprtrdma/fmr_ops.c      |    8 +----
>>>>>>  net/sunrpc/xprtrdma/frwr_ops.c     |   12 +++----
>>>>>>  net/sunrpc/xprtrdma/physical_ops.c |    8 +----
>>>>>>  net/sunrpc/xprtrdma/verbs.c        |   61 +++++++++++++++++++-----------------
>>>>>>  net/sunrpc/xprtrdma/xprt_rdma.h    |    2 +
>>>>>>  5 files changed, 43 insertions(+), 48 deletions(-)
>>>>>> 
>>>>>> diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
>>>>>> index 302d4eb..0a96155 100644
>>>>>> --- a/net/sunrpc/xprtrdma/fmr_ops.c
>>>>>> +++ b/net/sunrpc/xprtrdma/fmr_ops.c
>>>>>> @@ -85,7 +85,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
>>>>>>  	   int nsegs, bool writing)
>>>>>>  {
>>>>>>  	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>>>> -	struct ib_device *device = ia->ri_id->device;
>>>>>> +	struct ib_device *device = ia->ri_device;
>>>>>>  	enum dma_data_direction direction = rpcrdma_data_dir(writing);
>>>>>>  	struct rpcrdma_mr_seg *seg1 = seg;
>>>>>>  	struct rpcrdma_mw *mw = seg1->rl_mw;
>>>>>> @@ -137,17 +137,13 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
>>>>>>  {
>>>>>>  	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>>>>>>  	struct rpcrdma_mr_seg *seg1 = seg;
>>>>>> -	struct ib_device *device;
>>>>>>  	int rc, nsegs = seg->mr_nsegs;
>>>>>>  	LIST_HEAD(l);
>>>>>> 
>>>>>>  	list_add(&seg1->rl_mw->r.fmr->list, &l);
>>>>>>  	rc = ib_unmap_fmr(&l);
>>>>>> -	read_lock(&ia->ri_qplock);
>>>>>> -	device = ia->ri_id->device;
>>>>>>  	while (seg1->mr_nsegs--)
>>>>>> -		rpcrdma_unmap_one(device, seg++);
>>>>>> -	read_unlock(&ia->ri_qplock);
>>>>>> +		rpcrdma_unmap_one(ia->ri_device, seg++);
>>>>> 
>>>>> Umm, I'm wandering if this is guaranteed to be the same device as
>>>>> ri_id->device?
>>>>> 
>>>>> Imagine you are working on a bond device where each slave belongs to
>>>>> a different adapter. When the active port toggles, you will see a
>>>>> ADDR_CHANGED event (that the current code does not handle...), what
>>>>> you'd want to do is just reconnect and rdma_cm will resolve the new
>>>>> address for you (via the backup slave). I suspect that in case this
>>>>> flow is concurrent with the reconnects you may end up with a stale
>>>>> device handle.
>>>> 
>>>> I’m not sure what you mean by “stale” : freed memory?
>>>> 
>>>> I’m looking at this code in rpcrdma_ep_connect() :
>>>> 
>>>>  916                 if (ia->ri_id->device != id->device) {
>>>>  917                         printk("RPC:       %s: can't reconnect on "
>>>>  918                                 "different device!\n", __func__);
>>>>  919                         rdma_destroy_id(id);
>>>>  920                         rc = -ENETUNREACH;
>>>>  921                         goto out;
>>>>  922                 }
>>>> 
>>>> After reconnecting, if the ri_id has changed, the connect fails. Today,
>>>> xprtrdma does not support the device changing out from under it.
>>>> 
>>>> Note also that our receive completion upcall uses ri_id->device for
>>>> DMA map syncing. Would that also be a problem during a bond failover?
>>>> 
>>> 
>>> I'm not talking about ri_id->device, this will be consistent. I'm
>>> wandering about ia->ri_device, which might not have been updated yet.
>> 
>> ia->ri_device is never updated. The only place it is set is in
>> rpcrdma_ia_open().
> 
> So you assume that each ri_id that you will recreate contains the
> same device handle?

This issue is still unresolved.

xprtrdma does not assume the device pointers are the same, it does
actually check for it. The connect worker is careful to create a new
ri_id first, then check to see that the new ri_id device pointer is
the same as the device pointer in the old (defunct) ri_id.

If the device pointer changes, the old ri_id is kept and the connect
operation fails (retried later).

That is how we guarantee that ia->ri_id->device never changes, and
thus the cached pointer in ia->ri_device never needs to be updated.

> I think that for ADDR_CHANGE event when the slave belongs to another
> device you will hit a mismatch. CC'ing Sean for more info…

Yes, I’d like some confirmation that xprtrdma is not abusing the verbs
API. :-)

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean May 11, 2015, 6:26 p.m. UTC | #7
> > ia->ri_device is never updated. The only place it is set is in
> > rpcrdma_ia_open().
> 
> So you assume that each ri_id that you will recreate contains the
> same device handle?
> 
> I think that for ADDR_CHANGE event when the slave belongs to another
> device you will hit a mismatch. CC'ing Sean for more info...

I'm not familiar with the xprtrdma code.  From the perspective of the rdma_cm, if a listen is associated with a specific IP address, then it will also be associated with a specific device.  If an address change occurs, and the address moves to another device, then the app is essentially left with an unusable listen.  Received connection requests will not find a matching listen and be dropped.

If the address moves ports on the same device, then I think this works out fine in the case where the app ignores the ADDR_CHANGE event.

> > It might be saner to construct a virtual ib_device in this
> > case that consumers can depend on.
> 
> I'm not sure how does a virtual ib_device can work - that goes
> to the verbs themselves... Seems like a layering mis-match to me...

I interpreted Chuck as asking for some lower-level way of handling this, so that all ULPs don't need to re-implement this.  I agree that this would be hard (impossible?) without ULPs operating at a slightly higher level of abstraction.  Maybe the rdma_cm could be extended for this purpose, for example, by exposing its own devices, which either map directly to a verbs device, or handle a virtual mapping.

- Sean 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever III May 11, 2015, 6:57 p.m. UTC | #8
On May 11, 2015, at 2:26 PM, Hefty, Sean <sean.hefty@intel.com> wrote:

>>> ia->ri_device is never updated. The only place it is set is in
>>> rpcrdma_ia_open().
>> 
>> So you assume that each ri_id that you will recreate contains the
>> same device handle?
>> 
>> I think that for ADDR_CHANGE event when the slave belongs to another
>> device you will hit a mismatch. CC'ing Sean for more info...
> 
> I'm not familiar with the xprtrdma code.  From the perspective of the rdma_cm, if a listen is associated with a specific IP address, then it will also be associated with a specific device.  If an address change occurs, and the address moves to another device, then the app is essentially left with an unusable listen.  Received connection requests will not find a matching listen and be dropped.

Thanks Sean.

xprtrdma is the client-side (initiator), so it drives transport connects. The
server-side (target) does the listens. My proposed change is only on the client.

> If the address moves ports on the same device, then I think this works out fine in the case where the app ignores the ADDR_CHANGE event.

xprtrdma currently doesn’t explicitly handle ADDR_CHANGE events (and neither
does the server-side, looks like).

In the deployment scenarios I’m aware of, a single HCA with two ports is
used to form the bonded pair. I’m sure that’s not the only way this can
be done, though.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg May 12, 2015, 10:01 a.m. UTC | #9
On 5/11/2015 9:26 PM, Hefty, Sean wrote:
>>> ia->ri_device is never updated. The only place it is set is in
>>> rpcrdma_ia_open().
>>
>> So you assume that each ri_id that you will recreate contains the
>> same device handle?
>>
>> I think that for ADDR_CHANGE event when the slave belongs to another
>> device you will hit a mismatch. CC'ing Sean for more info...
>
> I'm not familiar with the xprtrdma code.  From the perspective of the rdma_cm, if a listen is associated with a specific IP address, then it will also be associated with a specific device.  If an address change occurs, and the address moves to another device, then the app is essentially left with an unusable listen.  Received connection requests will not find a matching listen and be dropped.
>
> If the address moves ports on the same device, then I think this works out fine in the case where the app ignores the ADDR_CHANGE event.
>
>>> It might be saner to construct a virtual ib_device in this
>>> case that consumers can depend on.
>>
>> I'm not sure how does a virtual ib_device can work - that goes
>> to the verbs themselves... Seems like a layering mis-match to me...
>
> I interpreted Chuck as asking for some lower-level way of handling this, so that all ULPs don't need to re-implement this.

It's not really a re-implementation, address resolution attaches the
the ib device. The assumption that for the same address the same ib
device will be attached is wrong for bonding over more than one device.

It's just something that rdma_cm user needs to be aware of if/when
trying to cache the device handle.

> I agree that this would be hard (impossible?) without ULPs operating at a slightly higher level of abstraction.
> Maybe the rdma_cm could be extended for this purpose, for example, by exposing its own devices, which either map directly to a verbs device, or handle a virtual mapping.

I guess it can be done, but it will require port all the ULPs to
use some sort of abstraction layer. Something that I'm not very 
enthusiastic about...

Sagi.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 302d4eb..0a96155 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -85,7 +85,7 @@  fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	   int nsegs, bool writing)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct ib_device *device = ia->ri_id->device;
+	struct ib_device *device = ia->ri_device;
 	enum dma_data_direction direction = rpcrdma_data_dir(writing);
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_mw *mw = seg1->rl_mw;
@@ -137,17 +137,13 @@  fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mr_seg *seg1 = seg;
-	struct ib_device *device;
 	int rc, nsegs = seg->mr_nsegs;
 	LIST_HEAD(l);
 
 	list_add(&seg1->rl_mw->r.fmr->list, &l);
 	rc = ib_unmap_fmr(&l);
-	read_lock(&ia->ri_qplock);
-	device = ia->ri_id->device;
 	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(device, seg++);
-	read_unlock(&ia->ri_qplock);
+		rpcrdma_unmap_one(ia->ri_device, seg++);
 	if (rc)
 		goto out_err;
 	return nsegs;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index dff0481..66a85fa 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -137,7 +137,7 @@  static int
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+	struct ib_device *device = r_xprt->rx_ia.ri_device;
 	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
 	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
 	int i;
@@ -178,7 +178,7 @@  frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	    int nsegs, bool writing)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct ib_device *device = ia->ri_id->device;
+	struct ib_device *device = ia->ri_device;
 	enum dma_data_direction direction = rpcrdma_data_dir(writing);
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_mw *mw = seg1->rl_mw;
@@ -263,7 +263,6 @@  frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct ib_send_wr invalidate_wr, *bad_wr;
 	int rc, nsegs = seg->mr_nsegs;
-	struct ib_device *device;
 
 	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
@@ -273,10 +272,9 @@  frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
-	read_lock(&ia->ri_qplock);
-	device = ia->ri_id->device;
 	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(device, seg++);
+		rpcrdma_unmap_one(ia->ri_device, seg++);
+	read_lock(&ia->ri_qplock);
 	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
 	read_unlock(&ia->ri_qplock);
 	if (rc)
@@ -304,7 +302,7 @@  static void
 frwr_op_reset(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+	struct ib_device *device = r_xprt->rx_ia.ri_device;
 	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
 	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
 	struct rpcrdma_mw *r;
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index ba518af..da149e8 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -50,8 +50,7 @@  physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	rpcrdma_map_one(ia->ri_id->device, seg,
-			rpcrdma_data_dir(writing));
+	rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
 	seg->mr_rkey = ia->ri_bind_mem->rkey;
 	seg->mr_base = seg->mr_dma;
 	seg->mr_nsegs = 1;
@@ -65,10 +64,7 @@  physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	read_lock(&ia->ri_qplock);
-	rpcrdma_unmap_one(ia->ri_id->device, seg);
-	read_unlock(&ia->ri_qplock);
-
+	rpcrdma_unmap_one(ia->ri_device, seg);
 	return 1;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e1eb7c4..ebcb0e2 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -278,7 +278,6 @@  rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
 {
 	struct rpcrdma_rep *rep =
 			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
-	struct rpcrdma_ia *ia;
 
 	/* WARNING: Only wr_id and status are reliable at this point */
 	if (wc->status != IB_WC_SUCCESS)
@@ -291,9 +290,8 @@  rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
 	dprintk("RPC:       %s: rep %p opcode 'recv', length %u: success\n",
 		__func__, rep, wc->byte_len);
 
-	ia = &rep->rr_rxprt->rx_ia;
 	rep->rr_len = wc->byte_len;
-	ib_dma_sync_single_for_cpu(ia->ri_id->device,
+	ib_dma_sync_single_for_cpu(rep->rr_device,
 				   rdmab_addr(rep->rr_rdmabuf),
 				   rep->rr_len, DMA_FROM_DEVICE);
 	prefetch(rdmab_to_msg(rep->rr_rdmabuf));
@@ -489,7 +487,7 @@  connected:
 
 		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
 			sap, rpc_get_port(sap),
-			ia->ri_id->device->name,
+			ia->ri_device->name,
 			ia->ri_ops->ro_displayname,
 			xprt->rx_buf.rb_max_requests,
 			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@@ -590,8 +588,9 @@  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		rc = PTR_ERR(ia->ri_id);
 		goto out1;
 	}
+	ia->ri_device = ia->ri_id->device;
 
-	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+	ia->ri_pd = ib_alloc_pd(ia->ri_device);
 	if (IS_ERR(ia->ri_pd)) {
 		rc = PTR_ERR(ia->ri_pd);
 		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
@@ -599,7 +598,7 @@  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		goto out2;
 	}
 
-	rc = ib_query_device(ia->ri_id->device, devattr);
+	rc = ib_query_device(ia->ri_device, devattr);
 	if (rc) {
 		dprintk("RPC:       %s: ib_query_device failed %d\n",
 			__func__, rc);
@@ -608,7 +607,7 @@  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 
 	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
 		ia->ri_have_dma_lkey = 1;
-		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
 	}
 
 	if (memreg == RPCRDMA_FRMR) {
@@ -623,7 +622,7 @@  rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		}
 	}
 	if (memreg == RPCRDMA_MTHCAFMR) {
-		if (!ia->ri_id->device->alloc_fmr) {
+		if (!ia->ri_device->alloc_fmr) {
 			dprintk("RPC:       %s: MTHCAFMR registration "
 				"not supported by HCA\n", __func__);
 			memreg = RPCRDMA_ALLPHYSICAL;
@@ -773,9 +772,9 @@  rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	init_waitqueue_head(&ep->rep_connect_wait);
 	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
 
-	sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
-				  rpcrdma_cq_async_error_upcall, ep,
-				  ep->rep_attr.cap.max_send_wr + 1, 0);
+	sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
+			      rpcrdma_cq_async_error_upcall, ep,
+			      ep->rep_attr.cap.max_send_wr + 1, 0);
 	if (IS_ERR(sendcq)) {
 		rc = PTR_ERR(sendcq);
 		dprintk("RPC:       %s: failed to create send CQ: %i\n",
@@ -790,9 +789,9 @@  rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 		goto out2;
 	}
 
-	recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
-				  rpcrdma_cq_async_error_upcall, ep,
-				  ep->rep_attr.cap.max_recv_wr + 1, 0);
+	recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
+			      rpcrdma_cq_async_error_upcall, ep,
+			      ep->rep_attr.cap.max_recv_wr + 1, 0);
 	if (IS_ERR(recvcq)) {
 		rc = PTR_ERR(recvcq);
 		dprintk("RPC:       %s: failed to create recv CQ: %i\n",
@@ -913,7 +912,7 @@  retry:
 		 * More stuff I haven't thought of!
 		 * Rrrgh!
 		 */
-		if (ia->ri_id->device != id->device) {
+		if (ia->ri_device != id->device) {
 			printk("RPC:       %s: can't reconnect on "
 				"different device!\n", __func__);
 			rdma_destroy_id(id);
@@ -1055,6 +1054,7 @@  rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
 		goto out_free;
 	}
 
+	rep->rr_device = ia->ri_device;
 	rep->rr_rxprt = r_xprt;
 	return rep;
 
@@ -1457,9 +1457,9 @@  rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
 	/*
 	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
 	 */
-	iov->addr = ib_dma_map_single(ia->ri_id->device,
+	iov->addr = ib_dma_map_single(ia->ri_device,
 			va, len, DMA_BIDIRECTIONAL);
-	if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
 		return -ENOMEM;
 
 	iov->length = len;
@@ -1503,8 +1503,8 @@  rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
 {
 	int rc;
 
-	ib_dma_unmap_single(ia->ri_id->device,
-			iov->addr, iov->length, DMA_BIDIRECTIONAL);
+	ib_dma_unmap_single(ia->ri_device,
+			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
 
 	if (NULL == mr)
 		return 0;
@@ -1597,15 +1597,18 @@  rpcrdma_ep_post(struct rpcrdma_ia *ia,
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
 	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
-		ib_dma_sync_single_for_device(ia->ri_id->device,
-			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
-			DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_id->device,
-		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
-		DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_id->device,
-		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
-		DMA_TO_DEVICE);
+		ib_dma_sync_single_for_device(ia->ri_device,
+					      req->rl_send_iov[3].addr,
+					      req->rl_send_iov[3].length,
+					      DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_device,
+				      req->rl_send_iov[1].addr,
+				      req->rl_send_iov[1].length,
+				      DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_device,
+				      req->rl_send_iov[0].addr,
+				      req->rl_send_iov[0].length,
+				      DMA_TO_DEVICE);
 
 	if (DECR_CQCOUNT(ep) > 0)
 		send_wr.send_flags = 0;
@@ -1638,7 +1641,7 @@  rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
 	recv_wr.num_sge = 1;
 
-	ib_dma_sync_single_for_cpu(ia->ri_id->device,
+	ib_dma_sync_single_for_cpu(ia->ri_device,
 				   rdmab_addr(rep->rr_rdmabuf),
 				   rdmab_length(rep->rr_rdmabuf),
 				   DMA_BIDIRECTIONAL);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 143eb10..531ad33 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -62,6 +62,7 @@ 
 struct rpcrdma_ia {
 	const struct rpcrdma_memreg_ops	*ri_ops;
 	rwlock_t		ri_qplock;
+	struct ib_device	*ri_device;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
 	struct ib_mr		*ri_bind_mem;
@@ -173,6 +174,7 @@  struct rpcrdma_buffer;
 
 struct rpcrdma_rep {
 	unsigned int		rr_len;
+	struct ib_device	*rr_device;
 	struct rpcrdma_xprt	*rr_rxprt;
 	void			(*rr_func)(struct rpcrdma_rep *);
 	struct list_head	rr_list;