diff mbox

[v3,for-next,05/13] IB/cm: Reference count ib_cm_ids

Message ID 1431253604-9214-6-git-send-email-haggaie@mellanox.com (mailing list archive)
State Changes Requested
Headers show

Commit Message

Haggai Eran May 10, 2015, 10:26 a.m. UTC
Add reference count (kref) to the ib_cm_id to allow automatic destruction
of an ib_cm_id. This will allow multiple RDMA CM IDs to use a single
ib_cm_id when they are on different network namespaces.

Signed-off-by: Haggai Eran <haggaie@mellanox.com>
---
 drivers/infiniband/core/cm.c | 41 +++++++++++++++++++++++++++++++++++++----
 include/rdma/ib_cm.h         | 10 +++++++---
 2 files changed, 44 insertions(+), 7 deletions(-)

Comments

Jason Gunthorpe May 11, 2015, 6:34 p.m. UTC | #1
On Sun, May 10, 2015 at 01:26:36PM +0300, Haggai Eran wrote:
> Add reference count (kref) to the ib_cm_id to allow automatic destruction
> of an ib_cm_id. This will allow multiple RDMA CM IDs to use a single
> ib_cm_id when they are on different network namespaces.
> 
> Signed-off-by: Haggai Eran <haggaie@mellanox.com>
>  drivers/infiniband/core/cm.c | 41 +++++++++++++++++++++++++++++++++++++----
>  include/rdma/ib_cm.h         | 10 +++++++---
>  2 files changed, 44 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
> index 08b18044552a..6b68402fd6df 100644
> +++ b/drivers/infiniband/core/cm.c
> @@ -711,6 +711,7 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
>  	cm_id_priv->id.cm_handler = cm_handler;
>  	cm_id_priv->id.context = context;
>  	cm_id_priv->id.remote_cm_qpn = 1;
> +	kref_init(&cm_id_priv->id.ref);
>  	ret = cm_alloc_id(cm_id_priv);
>  	if (ret)
>  		goto error;

Idiomatically, once kref_init is called, kfree should not be used, you
have to kref_put to destroy it, this error path calls kfree directly.

Probably best to just move the kref_init to after the failable call.

> -void ib_destroy_cm_id(struct ib_cm_id *cm_id)
> +static void __ib_destroy_cm_id(struct kref *ref)
>  {
> +	struct ib_cm_id *cm_id = container_of(ref, struct ib_cm_id, ref);
> +
>  	cm_destroy_id(cm_id, 0);
>  }

Hum, this is quite a heavy free function. Did you check that this is
safe to do asynchronously, that there are no implicit kref's being
held by the caller?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 12, 2015, 6:50 a.m. UTC | #2
On 11/05/2015 21:34, Jason Gunthorpe wrote:
> On Sun, May 10, 2015 at 01:26:36PM +0300, Haggai Eran wrote:
>> Add reference count (kref) to the ib_cm_id to allow automatic destruction
>> of an ib_cm_id. This will allow multiple RDMA CM IDs to use a single
>> ib_cm_id when they are on different network namespaces.
>>
>> Signed-off-by: Haggai Eran <haggaie@mellanox.com>
>>  drivers/infiniband/core/cm.c | 41 +++++++++++++++++++++++++++++++++++++----
>>  include/rdma/ib_cm.h         | 10 +++++++---
>>  2 files changed, 44 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
>> index 08b18044552a..6b68402fd6df 100644
>> +++ b/drivers/infiniband/core/cm.c
>> @@ -711,6 +711,7 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
>>  	cm_id_priv->id.cm_handler = cm_handler;
>>  	cm_id_priv->id.context = context;
>>  	cm_id_priv->id.remote_cm_qpn = 1;
>> +	kref_init(&cm_id_priv->id.ref);
>>  	ret = cm_alloc_id(cm_id_priv);
>>  	if (ret)
>>  		goto error;
> 
> Idiomatically, once kref_init is called, kfree should not be used, you
> have to kref_put to destroy it, this error path calls kfree directly.
> 
> Probably best to just move the kref_init to after the failable call.

Sure.

> 
>> -void ib_destroy_cm_id(struct ib_cm_id *cm_id)
>> +static void __ib_destroy_cm_id(struct kref *ref)
>>  {
>> +	struct ib_cm_id *cm_id = container_of(ref, struct ib_cm_id, ref);
>> +
>>  	cm_destroy_id(cm_id, 0);
>>  }
> 
> Hum, this is quite a heavy free function. Did you check that this is
> safe to do asynchronously, that there are no implicit kref's being
> held by the caller?

I'm not sure what you mean. The function is called by the last kref_put,
and destroys the ID synchronously.

Looking at the code though, I now notice that the other call site to
cm_destroy_id, from within the error path of cm_process_work could now
theoretically destroy an ID with existing references. Is that what you
meant?

Since only listening CM IDs are now shared in RDMA CM, this should not
happen in this patch-set, but perhaps the code can be changed to make
sure it is safe even if that changes. How about if we decrease the
reference count in cm_process_work instead of calling cm_destroy_id
directly? The error code could be stored in the ID.

Alternatively, I can add a WARN macro similar to the one I added in
ib_destroy_cm_id, to verify the reference count is zero when reaching
cm_destroy_id.

Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 12, 2015, 6:54 p.m. UTC | #3
On Tue, May 12, 2015 at 09:50:51AM +0300, Haggai Eran wrote:
> >> -void ib_destroy_cm_id(struct ib_cm_id *cm_id)
> >> +static void __ib_destroy_cm_id(struct kref *ref)
> >>  {
> >> +	struct ib_cm_id *cm_id = container_of(ref, struct ib_cm_id, ref);
> >> +
> >>  	cm_destroy_id(cm_id, 0);
> >>  }
> > 
> > Hum, this is quite a heavy free function. Did you check that this is
> > safe to do asynchronously, that there are no implicit kref's being
> > held by the caller?
> 
> I'm not sure what you mean. The function is called by the last kref_put,
> and destroys the ID synchronously.

Sorry, I was thinking about kobjects and CONFIG_DEBUG_KOBJECT_RELEASE
when I wrote that.


> Looking at the code though, I now notice that the other call site to
> cm_destroy_id, from within the error path of cm_process_work could now
> theoretically destroy an ID with existing references. Is that what you
> meant?

No, but that is certainly a problem.
 
> Since only listening CM IDs are now shared in RDMA CM, this should not
> happen in this patch-set, but perhaps the code can be changed to
> make

I think you need to enforce those semantics..

Firstly, it looks to me like we, again, have two krefs, the one you
added and the 'ref_count'  in the priv structure which is 99% of a
kref. So, again, don't do that.

If you want to share listening CM IDs, then do exactly and only
that. Use the existing ref count scheme for keeping track of the
kfree/etc, and add some kind of sharable listen ref count. Early exit
from cm_destroy_id when the there are still people listening.

That sounds like it keeps the basic rule of cm_destroy_id being
properly paired with the alloc, and allows listen sharing without the
confusion of what does multiple destroy mean.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 13, 2015, 10:20 a.m. UTC | #4
On 12/05/2015 21:54, Jason Gunthorpe wrote:
> On Tue, May 12, 2015 at 09:50:51AM +0300, Haggai Eran wrote:
>> Looking at the code though, I now notice that the other call site to
>> cm_destroy_id, from within the error path of cm_process_work could now
>> theoretically destroy an ID with existing references. Is that what you
>> meant?
> 
> No, but that is certainly a problem.
>  
>> Since only listening CM IDs are now shared in RDMA CM, this should not
>> happen in this patch-set, but perhaps the code can be changed to
>> make
> 
> I think you need to enforce those semantics..
> 
> Firstly, it looks to me like we, again, have two krefs, the one you
> added and the 'ref_count'  in the priv structure which is 99% of a
> kref. So, again, don't do that.
> 
> If you want to share listening CM IDs, then do exactly and only
> that. Use the existing ref count scheme for keeping track of the
> kfree/etc, 
The existing reference count scheme is for synchronization in
cm_destroy_id. The function waits for active handlers to complete before
destroying the id. Decrementing ref_count to zero doesn't cause the id
to be freed.

> and add some kind of sharable listen ref count.
That's basically what the patch does. I can change it from a kref to a
direct reference count if you prefer.

> Early exit
> from cm_destroy_id when the there are still people listening.
> 
> That sounds like it keeps the basic rule of cm_destroy_id being
> properly paired with the alloc, and allows listen sharing without the
> confusion of what does multiple destroy mean.

Won't you find it confusing if a call to ib_destroy_cm_id is successful
but the id actually continues to live? I prefer the API to explicitly
show that you are only decreasing the reference count and that the id
might not be destroyed immediately.

Regards,
Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 13, 2015, 4:58 p.m. UTC | #5
On Wed, May 13, 2015 at 01:20:22PM +0300, Haggai Eran wrote:

> > If you want to share listening CM IDs, then do exactly and only
> > that. Use the existing ref count scheme for keeping track of the
> > kfree/etc,

> The existing reference count scheme is for synchronization in
> cm_destroy_id. The function waits for active handlers to complete
> before

Pedantically, this is true

> destroying the id. Decrementing ref_count to zero doesn't cause the id
> to be freed.

Think it through. cm_destroy_id does this:

	cm_deref_id(cm_id_priv);
	wait_for_completion(&cm_id_priv->comp);

The cm_create_cm_id/cm_destroy_id pair holds a value in refcount.

The only way refcount can go zero is if cm_destroy_id is waiting in
wait_for_completion. So setting the refcount to zero always triggers a
(possibly async) kfree.

> > and add some kind of sharable listen ref count.
> That's basically what the patch does. I can change it from a kref to a
> direct reference count if you prefer.

As I've said, idiomatically I prefer to see kref used to manage object
life time exclusively, not as a general purpose counter.

In this case the share count should be protected by the existing spin
lock.

> > Early exit
> > from cm_destroy_id when the there are still people listening.
> > 
> > That sounds like it keeps the basic rule of cm_destroy_id being
> > properly paired with the alloc, and allows listen sharing without the
> > confusion of what does multiple destroy mean.
> 
> Won't you find it confusing if a call to ib_destroy_cm_id is successful
> but the id actually continues to live?

No.. As the caller, I don't care what the cm layer is doing behind the scenes.

The lifetime if each cm_id is clearly defined:

cm_create_cm_id()
cm_ref_id() / cm_deref_id()
cm_destroy_id()

The fact the CM might share a listen (and only a listen) ID behind the
scenes is not the caller's problem. That is an implementation choice,
each caller stands alone and uses the API properly, assuming it is the
only user of the returned cm_id.

The caller relies on cm_destroy_id being synchronous.

> I prefer the API to explicitly show that you are only decreasing the
> reference count and that the id might not be destroyed immediately.

No, that is fuzzy thinking, and lead to this patch that tried to have
both a ib_cm_id_put and cm_destroy_id being used at once. That is broken.

As discussed, we can't easily convert all call sites of cm_destroy_id
to ib_cm_id_put because 1) We loose the error code and 2) The put_X
idiom is async while cm_destory_id users expect sync.

So the best choice is to retain the cm_create_cm_id()/cm_destroy_id()
pairing, and have cm_destroy_id 'do the right thing' when presented
with a shared listen to destroy -> decrease the share count and free
the underlying listen when no more shares are left.

That said: Are you sure this is going to work? Are all the listen
specific cases of cm_destroy_id OK with not doing the
wait_for_completion and cm_dqueue_work *for stuff related to that
client* ?

If not you'll have to change the patch to create multiple cm_id's for
the same listen and iterate over all of them.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 14, 2015, 6:48 a.m. UTC | #6
On 13/05/2015 19:58, Jason Gunthorpe wrote:
> On Wed, May 13, 2015 at 01:20:22PM +0300, Haggai Eran wrote:
...
>>> Early exit
>>> from cm_destroy_id when the there are still people listening.
>>>
>>> That sounds like it keeps the basic rule of cm_destroy_id being
>>> properly paired with the alloc, and allows listen sharing without the
>>> confusion of what does multiple destroy mean.
>>
>> Won't you find it confusing if a call to ib_destroy_cm_id is successful
>> but the id actually continues to live?
> 
> No.. As the caller, I don't care what the cm layer is doing behind the scenes.
> 
> The lifetime if each cm_id is clearly defined:
> 
> cm_create_cm_id()
> cm_ref_id() / cm_deref_id()
> cm_destroy_id()
> 
> The fact the CM might share a listen (and only a listen) ID behind the
> scenes is not the caller's problem. That is an implementation choice,
> each caller stands alone and uses the API properly, assuming it is the
> only user of the returned cm_id.
I guess that's okay. Previously, the caller relied on context
information from the CM ID to hold its own context. Now it is no longer
allowed to do that, because the listener is now shared. So With this
change in place you could argue that the caller doesn't care if the CM
ID is actually shared or not.

> 
> The caller relies on cm_destroy_id being synchronous.
> 
>> I prefer the API to explicitly show that you are only decreasing the
>> reference count and that the id might not be destroyed immediately.
> 
> No, that is fuzzy thinking, and lead to this patch that tried to have
> both a ib_cm_id_put and cm_destroy_id being used at once. That is broken.
> 
> As discussed, we can't easily convert all call sites of cm_destroy_id
> to ib_cm_id_put because 1) We loose the error code and 2) The put_X
> idiom is async while cm_destory_id users expect sync.
> 
> So the best choice is to retain the cm_create_cm_id()/cm_destroy_id()
> pairing, and have cm_destroy_id 'do the right thing' when presented
> with a shared listen to destroy -> decrease the share count and free
> the underlying listen when no more shares are left.

Okay.

> That said: Are you sure this is going to work? Are all the listen
> specific cases of cm_destroy_id OK with not doing the
> wait_for_completion and cm_dqueue_work *for stuff related to that
> client* ?

As far as I can see, a listening ib_cm_id state's can be either IDLE or
LISTEN. In these states cm_destroy_id doesn't send anything to the wire.
As for the dequeue work, I expect the queue to remain intact if the CM
ID is still used. If there are work items in the queue that would have
belonged to the particular RDMA CM ID being destroyed, then the RDMA CM
ID module will have to reject them by itself.

Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean May 15, 2015, 7:11 p.m. UTC | #7
> The lifetime if each cm_id is clearly defined:
> 
> cm_create_cm_id()
> cm_ref_id() / cm_deref_id()
> cm_destroy_id()
> 
> The fact the CM might share a listen (and only a listen) ID behind the
> scenes is not the caller's problem. That is an implementation choice,
> each caller stands alone and uses the API properly, assuming it is the
> only user of the returned cm_id.

Actually, I seriously question why the ib_cm should be modified at all for any of this.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 17, 2015, 6:27 a.m. UTC | #8
On 15/05/2015 22:11, Hefty, Sean wrote:
>> The lifetime if each cm_id is clearly defined:
>>
>> cm_create_cm_id()
>> cm_ref_id() / cm_deref_id()
>> cm_destroy_id()
>>
>> The fact the CM might share a listen (and only a listen) ID behind the
>> scenes is not the caller's problem. That is an implementation choice,
>> each caller stands alone and uses the API properly, assuming it is the
>> only user of the returned cm_id.
> 
> Actually, I seriously question why the ib_cm should be modified at all for any of this.

At first I thought of doing all the changes in the rdma_cm module. After
a little thought though, I saw that this would require having a data
structure in rdma_cm that could tell which ib_cm_id to use when
listening on a new rdma_cm_id. That data structure would be indexed by a
service ID. This is exactly what the listen_service_table rb_tree in
ib_cm does, so instead of duplicating the rb_tree's data in another
module, I prefer to make a small change to ib_cm and let it continue
manage that tree.

Haggai

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 19, 2015, 7:23 p.m. UTC | #9
On Fri, May 15, 2015 at 07:11:10PM +0000, Hefty, Sean wrote:
> > The fact the CM might share a listen (and only a listen) ID behind the
> > scenes is not the caller's problem. That is an implementation choice,
> > each caller stands alone and uses the API properly, assuming it is the
> > only user of the returned cm_id.
> 
> Actually, I seriously question why the ib_cm should be modified at all for any of this.

I find Haggai's argument compelling, it is a very small amount of code
and data to add a sharing count, and a very large amount to duplicate
the whole service id map into cma.c.

It is in-kernel after all, we should co-design module APIs to work
efficiently.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean May 19, 2015, 10:52 p.m. UTC | #10
> I find Haggai's argument compelling, it is a very small amount of code
> and data to add a sharing count, and a very large amount to duplicate
> the whole service id map into cma.c.

I get wanting to share the listen list, but we end up with this:

> drivers/infiniband/core/cm.c                       | 129 +++++-

that's more code than what the ib_cm module uses to track listens, and the result is a solution that ends up being split in a weird fashion between the ib_cm, iw_cm (TBD), and rdma_cm.

I wonder if the existing ib_cm interface is even what we want.  Currently, the rdma_cm pushes the private data (i.e. IP address) comparison into the ib_cm.  This is only used by the rdma_cm.  Should that instead be moved out of the ib_cm and handled in the rdma_cm?  And then update the ib_cm to support multiple listens on the same service id.

For example, the ib_cm could simply queue all listen requests on the same service id.  When a REQ arrives, it just calls each listener back until one 'claims' the REQ.  The destruction of a listener could then be synced with the callback.  From what I can tell, the current proposal requires that the ib_cm user be prepared to receive a REQ callback for a listen that it has already destroyed.  If needed, a flag can be added to the ib_cm_listen to indicate if the service id is shared/exclusive.

- Sean 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe May 19, 2015, 11:46 p.m. UTC | #11
On Tue, May 19, 2015 at 10:52:59PM +0000, Hefty, Sean wrote:
> > I find Haggai's argument compelling, it is a very small amount of code
> > and data to add a sharing count, and a very large amount to duplicate
> > the whole service id map into cma.c.
> 
> I get wanting to share the listen list, but we end up with this:
> 
> > drivers/infiniband/core/cm.c                       | 129 +++++-
> 
> that's more code than what the ib_cm module uses to track listens,
> and the result is a solution that ends up being split in a weird
> fashion between the ib_cm, iw_cm (TBD), and rdma_cm.

I was mostly thinking about data overheads with a second rbtree, but I
do wonder if the overhead is already in the rdma_cm between the
listen_list, bind_list, etc. So maybe that should be used instead..

> I wonder if the existing ib_cm interface is even what we want.
> Currently, the rdma_cm pushes the private data (i.e. IP address)
> comparison into the ib_cm.  This is only used by the rdma_cm.
> Should that instead be moved out of the ib_cm and handled in the
> rdma_cm?  And then update the ib_cm to support multiple listens on
> the same service id.

Well, that really is the problem here, the private data compare
doesn't really do enough because rdma cm wildcard's the IP address and
does it's own check anyhow. So I see all of this as different
proposals on how to fix that. rdma_cm basically already does most of
the private data compare itself.

It is hard to make the private data compare more fancy and still
provide EBUSY semantics..

On one hand, we don't really need the cm to maintain a list of
listeners and iterate over them for the rdma cm, that state is already
kept and tracked in rdma_cm when it goes to find the device the IP is
associated with..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean May 20, 2015, 12:49 a.m. UTC | #12
> > I wonder if the existing ib_cm interface is even what we want.
> > Currently, the rdma_cm pushes the private data (i.e. IP address)
> > comparison into the ib_cm.  This is only used by the rdma_cm.
> > Should that instead be moved out of the ib_cm and handled in the
> > rdma_cm?  And then update the ib_cm to support multiple listens on
> > the same service id.
> 
> Well, that really is the problem here, the private data compare
> doesn't really do enough because rdma cm wildcard's the IP address and
> does it's own check anyhow. So I see all of this as different
> proposals on how to fix that. rdma_cm basically already does most of
> the private data compare itself.

Maybe the first thing to fix is this.  Since the private data compare is no longer sufficient, remove it and update the rdma_cm to handle the change.  Regardless of how the code ends up structured, the namespace support should then easily fit into that solution.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran May 21, 2015, 6:51 a.m. UTC | #13
On 20/05/2015 03:49, Hefty, Sean wrote:
>>> I wonder if the existing ib_cm interface is even what we want.
>>> Currently, the rdma_cm pushes the private data (i.e. IP address)
>>> comparison into the ib_cm.  This is only used by the rdma_cm.
>>> Should that instead be moved out of the ib_cm and handled in the
>>> rdma_cm?  And then update the ib_cm to support multiple listens on
>>> the same service id.
>>
>> Well, that really is the problem here, the private data compare
>> doesn't really do enough because rdma cm wildcard's the IP address and
>> does it's own check anyhow. So I see all of this as different
>> proposals on how to fix that. rdma_cm basically already does most of
>> the private data compare itself.
> 
> Maybe the first thing to fix is this.  Since the private data compare is no longer sufficient, remove it and update the rdma_cm to handle the change.  Regardless of how the code ends up structured, the namespace support should then easily fit into that solution.
Patch 8 in this series (IB/cma: Add compare_data checks to the RDMA CM
module) basically does that (change rdma_cm so that it doesn't rely on
private_data checks in ib_cm). Indeed, it is positioned in the series
before adding the rdma_cm namespace support.

It remains to clean up ib_cm's ib_cm_listen interface now that
compare_data isn't used, but I'm not sure this belongs in this series.

Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean May 21, 2015, 12:56 p.m. UTC | #14
> It remains to clean up ib_cm's ib_cm_listen interface now that
> compare_data isn't used, but I'm not sure this belongs in this series.

This patch series is changing the behavior that the compare data solves.  Currently, the ib_cm handles all of the multiplexing for the rdma_cm -- that's the reason for the compare data.  This series changes that such that the ib_cm would handle half the multiplexing, with the other half handled by the rdma_cm.  We should not insert that sort of split.  So, I disagree that this isn't part of this series.

Either do all of the multiplexing in the ib_cm -- without exposing it to inet (add port/pkey filtering if that works) -- or move all of it out.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 08b18044552a..6b68402fd6df 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -711,6 +711,7 @@  struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 	cm_id_priv->id.cm_handler = cm_handler;
 	cm_id_priv->id.context = context;
 	cm_id_priv->id.remote_cm_qpn = 1;
+	kref_init(&cm_id_priv->id.ref);
 	ret = cm_alloc_id(cm_id_priv);
 	if (ret)
 		goto error;
@@ -921,10 +922,42 @@  retest:
 	kfree(cm_id_priv);
 }
 
-void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+static void __ib_destroy_cm_id(struct kref *ref)
 {
+	struct ib_cm_id *cm_id = container_of(ref, struct ib_cm_id, ref);
+
 	cm_destroy_id(cm_id, 0);
 }
+
+/**
+ * ib_cm_id_get - Increase the reference count on an existing ib_cm_id.
+ * @cm_id: Connection identifier to take reference of
+ */
+void ib_cm_id_get(struct ib_cm_id *cm_id)
+{
+	kref_get(&cm_id->ref);
+}
+EXPORT_SYMBOL(ib_cm_id_get);
+
+/**
+ * ib_cm_id_put - Release a connection identifier.
+ * @cm_id: Connection identifier to release
+ *
+ * This call may block until the connection identifier is destroyed.
+ * Returns 1 if the ib_cm_id has been destroyed
+ */
+int ib_cm_id_put(struct ib_cm_id *cm_id)
+{
+	return kref_put(&cm_id->ref, __ib_destroy_cm_id);
+}
+EXPORT_SYMBOL(ib_cm_id_put);
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+	int destroyed = ib_cm_id_put(cm_id);
+
+	WARN_ON_ONCE(!destroyed);
+}
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
 int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
@@ -1603,7 +1636,7 @@  rejected:
 	atomic_dec(&cm_id_priv->refcount);
 	cm_deref_id(listen_cm_id_priv);
 destroy:
-	ib_destroy_cm_id(cm_id);
+	ib_cm_id_put(cm_id);
 	return ret;
 }
 
@@ -3038,7 +3071,7 @@  static int cm_sidr_req_handler(struct cm_work *work)
 	cm_deref_id(cur_cm_id_priv);
 	return 0;
 out:
-	ib_destroy_cm_id(&cm_id_priv->id);
+	ib_cm_id_put(&cm_id_priv->id);
 	return -EINVAL;
 }
 
@@ -3197,7 +3230,7 @@  static void cm_process_send_error(struct ib_mad_send_buf *msg,
 	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
 	cm_free_msg(msg);
 	if (ret)
-		ib_destroy_cm_id(&cm_id_priv->id);
+		ib_cm_id_put(&cm_id_priv->id);
 	return;
 discard:
 	spin_unlock_irq(&cm_id_priv->lock);
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 39ed2d2fbd51..22da34de3bc7 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -287,9 +287,9 @@  struct ib_cm_event {
  * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
  * to a user's existing communication identifier.
  *
- * Users may not call ib_destroy_cm_id while in the context of this callback;
- * however, returning a non-zero value instructs the communication manager to
- * destroy the @cm_id after the callback completes.
+ * Users may not call ib_destroy_cm_id or ib_cm_id_put while in the context of
+ * this callback; however, returning a non-zero value instructs the
+ * communication manager to destroy the @cm_id after the callback completes.
  */
 typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
 			     struct ib_cm_event *event);
@@ -305,6 +305,7 @@  struct ib_cm_id {
 	__be32			local_id;
 	__be32			remote_id;
 	u32			remote_cm_qpn;  /* 1 unless redirected */
+	struct kref		ref;
 };
 
 /**
@@ -330,6 +331,9 @@  struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
  */
 void ib_destroy_cm_id(struct ib_cm_id *cm_id);
 
+void ib_cm_id_get(struct ib_cm_id *cm_id);
+int ib_cm_id_put(struct ib_cm_id *cm_id);
+
 #define IB_SERVICE_ID_AGN_MASK	cpu_to_be64(0xFF00000000000000ULL)
 #define IB_CM_ASSIGN_SERVICE_ID	cpu_to_be64(0x0200000000000000ULL)
 #define IB_CMA_SERVICE_ID	cpu_to_be64(0x0000000001000000ULL)