diff mbox

[RFC,ABI,V1,4/8] RDMA/core: Add support for custom types

Message ID 1467293971-25688-5-git-send-email-matanb@mellanox.com (mailing list archive)
State RFC
Headers show

Commit Message

Matan Barak June 30, 2016, 1:39 p.m. UTC
From: Haggai Eran <haggaie@mellanox.com>

The new ioctl infrastructure supports driver specific types.
This is implemented by having a list of uverbs_uobject_type in the
ib_device. Each element of this list corresponds to a specific type
and specifies its free function, driver's type_id, etc.
The order of elements dictates the order of process release.
The whole type_list should be initialized before any ucontext was
created.

When a ucontext is created, a new list is created in this ib_ucontext.
This list corresponds to the ib_device's type list, as any element
in the ib_dev's type list has a corresponding element in this list.
Each element in the ucontext's list points to its respective
corresponding element in the ib_dev's type list.
In addition, it has a data structure (currently implemented by a list,
but should probably move to using a hash) that maps all ib_uobjects
of the same ib_ucontext and the respective type.

+-------------------------------------------------------------------+
|    ib_device                                                      |
|   +--------------+      +--------------+      +----------------+  |
|   |uobject_type  |      |              |      |                |  |
|   |free_fn       |      |              |      |                |  |
|   |              +----->+              +----->+                |  |
|   |              |      |              |      |                |  |
|   +----^---------+      +--------------+      +----------------+  |
| +------|                                                          |
+-------------------------------------------------------------------+
  |
  |
  |
+-------------------------------------------------------------------+
| |  ib_ucontext                                                    |
| | +--------------+      +--------------+      +----------------+  |
| | |uobject_list  |      |              |      |                |  |
| +-+type          |      |              |      |                |  |
|   |list+         +----->+              +----->+                |  |
|   |    |         |      |              |      |                |  |
|   +--------------+      +--------------+      +----------------+  |
|        |                                                          |
+-------------------------------------------------------------------+
         |
         |
         |
         |
         |  +-----------+        +------------+
         |  | ib_uobject|        |ib_uobject  |
         +-->           +------> |            |
            |           |        |            |
            +-----------+        +------------+

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/uobject.c | 83 +++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/uobject.h | 16 ++++++++
 include/rdma/ib_verbs.h           |  8 ++++
 3 files changed, 107 insertions(+)

Comments

Jason Gunthorpe July 12, 2016, 7:23 p.m. UTC | #1
On Thu, Jun 30, 2016 at 04:39:27PM +0300, Matan Barak wrote:
> From: Haggai Eran <haggaie@mellanox.com>
> 
> The new ioctl infrastructure supports driver specific types.
> This is implemented by having a list of uverbs_uobject_type in the
> ib_device. Each element of this list corresponds to a specific type
> and specifies its free function, driver's type_id, etc.
> The order of elements dictates the order of process release.
> The whole type_list should be initialized before any ucontext was
> created.

Do we need to have a linked list? Can we use static const arrays like
so much other stuff in the kernel?


> +int ib_uverbs_uobject_type_add(struct list_head	*head,
> +			       void (*free)(struct uverbs_uobject_type *uobject_type,
> +					    struct ib_uobject *uobject,
> +					    struct ib_ucontext *ucontext),

Don't pass function pointers like this, create a proper 'ops' kind of
structure.

Why is this taking a list_head not a ib_device?

> +void ib_uverbs_uobject_type_remove(struct uverbs_uobject_type
> *uobject_type)

Why would a driver ever have to remove a type?

> +{
> +	/*
> +	 * Allocate a new object type for the vendor, this should be done when a
> +	 * vendor is initialized.

Please stop using the word 'vendor' and scrub it from your patches.

This is 'driver specific'

> +	/* List of object under uverbs_object_type */
> +	struct list_head	idr_list;
> +	struct uverbs_uobject_list *type;	/* ptr to ucontext type */
>  };

I'm unclear why we need a list in the driver and list in the
ucontext.. I would have thought they are almost the same.

What use is tracking the ubojects in a per-type list? Do we ever
iterate that way other than on global destruct?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak July 13, 2016, 2:44 p.m. UTC | #2
On 12/07/2016 22:23, Jason Gunthorpe wrote:
> On Thu, Jun 30, 2016 at 04:39:27PM +0300, Matan Barak wrote:
>> From: Haggai Eran <haggaie@mellanox.com>
>>
>> The new ioctl infrastructure supports driver specific types.
>> This is implemented by having a list of uverbs_uobject_type in the
>> ib_device. Each element of this list corresponds to a specific type
>> and specifies its free function, driver's type_id, etc.
>> The order of elements dictates the order of process release.
>> The whole type_list should be initialized before any ucontext was
>> created.
>
> Do we need to have a linked list? Can we use static const arrays like
> so much other stuff in the kernel?
>
>

We can, but then every driver should declare the common objects order by 
itself.
Maybe we could provide a macro that does this for you (if you don't have 
any custom types interleaved).

The basic advantage of a linked list is that we could still have a 
function that builds the common types list and a driver could still 
insert and custom type where ever it wants.

>> +int ib_uverbs_uobject_type_add(struct list_head	*head,
>> +			       void (*free)(struct uverbs_uobject_type *uobject_type,
>> +					    struct ib_uobject *uobject,
>> +					    struct ib_ucontext *ucontext),
>
> Don't pass function pointers like this, create a proper 'ops' kind of
> structure.
>
> Why is this taking a list_head not a ib_device?
>

In order to let you insert a type anywhere you want in the type list.
This affects the destruction order.

>> +void ib_uverbs_uobject_type_remove(struct uverbs_uobject_type
>> *uobject_type)
>
> Why would a driver ever have to remove a type?
>

Correct, unnecessary.

>> +{
>> +	/*
>> +	 * Allocate a new object type for the vendor, this should be done when a
>> +	 * vendor is initialized.
>
> Please stop using the word 'vendor' and scrub it from your patches.
>
> This is 'driver specific'
>

Ok, we'll replace vendor -> driver.

>> +	/* List of object under uverbs_object_type */
>> +	struct list_head	idr_list;
>> +	struct uverbs_uobject_list *type;	/* ptr to ucontext type */
>>  };
>
> I'm unclear why we need a list in the driver and list in the
> ucontext.. I would have thought they are almost the same.
>
> What use is tracking the ubojects in a per-type list? Do we ever
> iterate that way other than on global destruct?
>

The only purpose of these lists are either ucontext destruction or 
device removal (which ultimately deletes all ucontexts).
ucontext destruction requires list per ucontext x type.

> Jason
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 13, 2016, 4:39 p.m. UTC | #3
On Wed, Jul 13, 2016 at 05:44:46PM +0300, Matan Barak wrote:

> We can, but then every driver should declare the common objects
> order by itself.  Maybe we could provide a macro that does this for
> you (if you don't have any custom types interleaved).

Hum, that seems so very fragile and complex..

There are other ways to deal with destruction ordering. refcounts
only for use during descruction might be easier. Repeatedly iterate
over the uobject list deleting 0 refcount objects until the list is
empty.

So, eg when you create a MR on a PD then PD's destruct refcount gets
++'d, and when the MR is deleted it gets --'d. This naturally forces
the needed ordering.

> >What use is tracking the ubojects in a per-type list? Do we ever
> >iterate that way other than on global destruct?
> >
> 
> The only purpose of these lists are either ucontext destruction or device
> removal (which ultimately deletes all ucontexts).
> ucontext destruction requires list per ucontext x type.

I wouldn't waste memory on optimizing destruction like that, just run
over the idr repeatedly, matching against the object type.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak July 13, 2016, 4:57 p.m. UTC | #4
On 13/07/2016 19:39, Jason Gunthorpe wrote:
> On Wed, Jul 13, 2016 at 05:44:46PM +0300, Matan Barak wrote:
>
>> We can, but then every driver should declare the common objects
>> order by itself.  Maybe we could provide a macro that does this for
>> you (if you don't have any custom types interleaved).
>
> Hum, that seems so very fragile and complex..
>
> There are other ways to deal with destruction ordering. refcounts
> only for use during descruction might be easier. Repeatedly iterate
> over the uobject list deleting 0 refcount objects until the list is
> empty.
>
> So, eg when you create a MR on a PD then PD's destruct refcount gets
> ++'d, and when the MR is deleted it gets --'d. This naturally forces
> the needed ordering.
>

Yeah, that's nicer. We'll do that instead :)

>>> What use is tracking the ubojects in a per-type list? Do we ever
>>> iterate that way other than on global destruct?
>>>
>>
>> The only purpose of these lists are either ucontext destruction or device
>> removal (which ultimately deletes all ucontexts).
>> ucontext destruction requires list per ucontext x type.
>
> I wouldn't waste memory on optimizing destruction like that, just run
> over the idr repeatedly, matching against the object type.
>

Maybe instead of having an idr per device, it's better to have an idr 
per uverbs file. Doing that, if we decide to unify rdma-cm with the 
device itself, they can share the same idr. This gives us another 
advantage of destroying a ucontext without going over all opened objects.

However, when a device is removed, we need to find all its related 
objects and destroy them. In order to do that in a simple way, we could 
say that a uverbs_file is either not bound to any rdma device or bound 
to a single IB device (do you see any value of bounding a uverbs_file to 
several devices?). By this model you could have one ioctl code for 
device commands, another one for client commands (rdma-cm) and a third 
one for infrastructure based commands (query devices, query clients).

This is of course only relevant if we go for a unified fd for rdma-cm 
and rdma device.


> Jason
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 13, 2016, 5:21 p.m. UTC | #5
On Wed, Jul 13, 2016 at 07:57:04PM +0300, Matan Barak wrote:

> Maybe instead of having an idr per device, it's better to have an idr per
> uverbs file.

Oh, I thought you were doing that already!

Yes, the idr must be per ucontext, otherwise we create a big security
problem. Access to one files object #'s cannot be allowed from another
file.

> However, when a device is removed, we need to find all its related objects
> and destroy them. In order to do that in a simple way, we could say that a
> uverbs_file is either not bound to any rdma device or bound to a
> single IB

This is just more searching on the disassociate path. Search all
ucontexts for any uobj connected to the victim device.

If we drop the file == device scheme then we need a generic op to
tell what device a uobj is associated with. All our kobjs already
store this information, so it isn't a big deal.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak July 14, 2016, 5:30 a.m. UTC | #6
On 13/07/2016 20:21, Jason Gunthorpe wrote:
> On Wed, Jul 13, 2016 at 07:57:04PM +0300, Matan Barak wrote:
>
>> Maybe instead of having an idr per device, it's better to have an idr per
>> uverbs file.
>
> Oh, I thought you were doing that already!
>
> Yes, the idr must be per ucontext, otherwise we create a big security
> problem. Access to one files object #'s cannot be allowed from another
> file.
>

We don't have a security problem, as the ucontext pointer is store on 
the uobject. Therefore, the uobject is returned only if it matches the 
current ucontext.

>> However, when a device is removed, we need to find all its related objects
>> and destroy them. In order to do that in a simple way, we could say that a
>> uverbs_file is either not bound to any rdma device or bound to a
>> single IB
>
> This is just more searching on the disassociate path. Search all
> ucontexts for any uobj connected to the victim device.
>
> If we drop the file == device scheme then we need a generic op to
> tell what device a uobj is associated with. All our kobjs already
> store this information, so it isn't a big deal.
>

I've thought about ditching the lists, but there's one fundamental 
problem. Keeping reference counts in kernel doesn't encompass dependency 
relations in the user space. For example, since memory windows could be 
bounded and unbounded from a MR, we need to delete all MWs before 
deleting the MRs.
Secondly, we could safely change the list to hlist. The memory overhead 
will be minimal (probably better than adding and IDR per type).

If we break the file == device scheme, it also requires a few new object 
types (for example, DEVICE and CLIENT object) to pass them from user-space.
So the flow might be using the QUERY ioctl code and query the 
devices/clients. Creating a device/client and get a IDR handle for that. 
For every IOCTL code (either device or client commands), you pass this 
idr handle as part of the ioctl code specific header.

However, does it really worth it? Do you see any usage of binding the 
same uverbs_file to several rdma devices?
If not, we could have a simpler approach of having a file either bounded 
to a device or not bounded to any device. We could still share the IDR 
with clients on the same fd, but they are all guaranteed to set or use 
this exact device.

> Jason
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky July 14, 2016, 6:30 a.m. UTC | #7
On Wed, Jul 13, 2016 at 11:21:16AM -0600, Jason Gunthorpe wrote:
> On Wed, Jul 13, 2016 at 07:57:04PM +0300, Matan Barak wrote:
> 
> > Maybe instead of having an idr per device, it's better to have an idr per
> > uverbs file.
> 
> Oh, I thought you were doing that already!
> 
> Yes, the idr must be per ucontext, otherwise we create a big security
> problem. Access to one files object #'s cannot be allowed from another
> file.

On top of the Matan's response, regarding no security issue here. I
didn't see added value in multiplying IDRs, which would be if
creation per-file is done.
Jason Gunthorpe July 14, 2016, 4:41 p.m. UTC | #8
On Thu, Jul 14, 2016 at 08:30:05AM +0300, Matan Barak wrote:

> We don't have a security problem, as the ucontext pointer is store on the
> uobject. Therefore, the uobject is returned only if it matches the current
> ucontext.

That still allows a different security problem, idrs have bounded
capacity and now unprivileged users can exhaust them for all users.

> I've thought about ditching the lists, but there's one fundamental problem.
> Keeping reference counts in kernel doesn't encompass dependency relations in
> the user space. For example, since memory windows could be bounded and
> unbounded from a MR, we need to delete all MWs before deleting the MRs.
> Secondly, we could safely change the list to hlist. The memory overhead will
> be minimal (probably better than adding and IDR per type).

Bleck, that is a terrible API. Destroying the MR should return
associated MW's to unbound.

Is MR the only case of this? I can't think of much else that uses the
WQ that way..

I think the lists should always go, always use repeated iteration. If
you really can't make refcount work then use a type compare instead,
but I don't think we don't need the overhead of keeping lists to
optimize destruction.

The MW case could be handled specially with a simple priority field.

> If we break the file == device scheme, it also requires a few new object
> types (for example, DEVICE and CLIENT object) to pass them from
> user-space.

Don't know what a CLIENT object is, but I think we always need to
have a DEVICE object even if there is only allowed 1 device object per
FD.

We need the device object to match what ibv_open_device does and
provide a basis for sending device specific driver commands and
issuing device specific queries.

If it is a singleton per-fd or created doesn't really matter from an
API design perspective.

> So the flow might be using the QUERY ioctl code and query the
> devices/clients. Creating a device/client and get a IDR handle for that. For
> every IOCTL code (either device or client commands), you pass this idr
> handle as part of the ioctl code specific header.

Yes, as I said, we pretty much have to do that already, and it
wouldn't be on every ioctl, just certain top level ones like PDs.

> However, does it really worth it? Do you see any usage of binding the same
> uverbs_file to several rdma devices?

I'd like to hear more arguments on this point.

The main value I see is combining rdma_cm and uverbs into one fd & IDR, and
that requires allowing multiple devices per fd.

This seems simpler for clients and helps move things in the direction
Sean was talking about where all events can all be aggregated into one
poll loop.

As far as I can tell, the only downside to that is we loose the
per-device permissions on /dev/infiniband/uverbsX, which don't really
work today anyhow.

> If not, we could have a simpler approach of having a file either bounded to
> a device or not bounded to any device. We could still share the IDR with
> clients on the same fd, but they are all guaranteed to set or use this exact
> device.

Why is this simpler?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak July 19, 2016, 2:05 p.m. UTC | #9
On 14/07/2016 19:41, Jason Gunthorpe wrote:
> On Thu, Jul 14, 2016 at 08:30:05AM +0300, Matan Barak wrote:
>
>> We don't have a security problem, as the ucontext pointer is store on the
>> uobject. Therefore, the uobject is returned only if it matches the current
>> ucontext.
>
> That still allows a different security problem, idrs have bounded
> capacity and now unprivileged users can exhaust them for all users.
>

Right, but that's an existing security issue.

>> I've thought about ditching the lists, but there's one fundamental problem.
>> Keeping reference counts in kernel doesn't encompass dependency relations in
>> the user space. For example, since memory windows could be bounded and
>> unbounded from a MR, we need to delete all MWs before deleting the MRs.
>> Secondly, we could safely change the list to hlist. The memory overhead will
>> be minimal (probably better than adding and IDR per type).
>
> Bleck, that is a terrible API. Destroying the MR should return
> associated MW's to unbound.
>
> Is MR the only case of this? I can't think of much else that uses the
> WQ that way..
>

According to the IB spec, a CI could choose either to return an error or 
to move MWs to orphaned MWs (and make sure any access to these MWs will 
be blocked). Some hardwares might implement the first approach and keep 
reference from MRs to MWs in hardware (without having a way to query 
that in software). In these cases, you can't force this unbound (as you 
don't know if they were unbound because of remote invalidate).

AFAIK, MW are the only case.

> I think the lists should always go, always use repeated iteration. If
> you really can't make refcount work then use a type compare instead,
> but I don't think we don't need the overhead of keeping lists to
> optimize destruction.
>
> The MW case could be handled specially with a simple priority field.
>

So you suggest having a priority field in the uobject and iterate over 
the IDR freeing by order. Seems simple enough.

>> If we break the file == device scheme, it also requires a few new object
>> types (for example, DEVICE and CLIENT object) to pass them from
>> user-space.
>
> Don't know what a CLIENT object is, but I think we always need to
> have a DEVICE object even if there is only allowed 1 device object per
> FD.
>

By saying CLIENT I mean rdma-cm or other such users which can handle 
rdma devices and provide services.

> We need the device object to match what ibv_open_device does and
> provide a basis for sending device specific driver commands and
> issuing device specific queries.
>
> If it is a singleton per-fd or created doesn't really matter from an
> API design perspective.
>

If it's a singleton per-fd, you don't need to pass it per action. If you 
get an action on this fd - you know exactly to which device you should 
delegate it.

>> So the flow might be using the QUERY ioctl code and query the
>> devices/clients. Creating a device/client and get a IDR handle for that. For
>> every IOCTL code (either device or client commands), you pass this idr
>> handle as part of the ioctl code specific header.
>
> Yes, as I said, we pretty much have to do that already, and it
> wouldn't be on every ioctl, just certain top level ones like PDs.
>

I meant querying the possible devices or clients. Meaning, introduce 
another ioctl code that is used to get all available clients and devices.

>> However, does it really worth it? Do you see any usage of binding the same
>> uverbs_file to several rdma devices?
>
> I'd like to hear more arguments on this point.
>
> The main value I see is combining rdma_cm and uverbs into one fd & IDR, and
> that requires allowing multiple devices per fd.
>
> This seems simpler for clients and helps move things in the direction
> Sean was talking about where all events can all be aggregated into one
> poll loop.
>
> As far as I can tell, the only downside to that is we loose the
> per-device permissions on /dev/infiniband/uverbsX, which don't really
> work today anyhow.
>

The point here isn't about unifying rdma-cm and uverbs. Assuming we do 
that, we could still allow a user to bind fd to several devices or just 
to a single device.

>> If not, we could have a simpler approach of having a file either bounded to
>> a device or not bounded to any device. We could still share the IDR with
>> clients on the same fd, but they are all guaranteed to set or use this exact
>> device.
>
> Why is this simpler?
>

It's simpler as you don't need to tie objects in the IDR to an IB device 
(as there's only one option). In addition, you don't have to pass a 
device handle to all commands. You could either have a bounded device 
and then it's clear what you're using or you didn't bind any device and 
then all device specific commands (except init_ucontext) fail. Client 
commands (i.e rdma-cm) could either fail or succeed, depending on their 
requirements. It's a closer model to what we have today - uverbs_file 
points to an ib_dev (which might be NULL).

> Jason
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 19, 2016, 8:36 p.m. UTC | #10
On Tue, Jul 19, 2016 at 05:05:46PM +0300, Matan Barak wrote:

> According to the IB spec, a CI could choose either to return an error or to
> move MWs to orphaned MWs (and make sure any access to these MWs will be
> blocked). Some hardwares might implement the first approach and keep
> reference from MRs to MWs in hardware (without having a way to query that in
> software). In these cases, you can't force this unbound (as you don't know
> if they were unbound because of remote invalidate).

Ugh, I don't why that was let in the spec, from an app perspective you
have to assume the annoying behavior...

> >The MW case could be handled specially with a simple priority field.

> So you suggest having a priority field in the uobject and iterate over the
> IDR freeing by order. Seems simple enough.

That is one option.. 

> >>So the flow might be using the QUERY ioctl code and query the
> >>devices/clients. Creating a device/client and get a IDR handle for that. For
> >>every IOCTL code (either device or client commands), you pass this idr
> >>handle as part of the ioctl code specific header.
> >
> >Yes, as I said, we pretty much have to do that already, and it
> >wouldn't be on every ioctl, just certain top level ones like PDs.

> I meant querying the possible devices or clients. Meaning, introduce another
> ioctl code that is used to get all available clients and devices.

Not necessarily, the 'create device' could accept the existing sysfs
device string as the starting point, libibverbs already has that
information.

> The point here isn't about unifying rdma-cm and uverbs. Assuming we do that,
> we could still allow a user to bind fd to several devices or just to a
> single device.

I don't understand this comment.

If we keep /dev/uverbs0 then that has to remain 1:1 with a device
because that is our permissions model.

If we move to /dev/rdma_uapi, then that would support 1:N devices.

I can't see a reason to do both concurrently even though that would be
easy enough technically..

> It's simpler as you don't need to tie objects in the IDR to an IB device (as
> there's only one option). In addition, you don't have to pass a device
> handle to all commands. You could either have a bounded device and then it's
> clear what you're using or you didn't bind any device and then all device
> specific commands (except init_ucontext) fail. Client commands (i.e rdma-cm)
> could either fail or succeed, depending on their requirements. It's a closer
> model to what we have today - uverbs_file points to an ib_dev (which might
> be NULL).

I don't see todays model as actually be very good, and those don't
seem like very strong reasons. An extra argument on a couple of
commands (and PD create is the only one that springs to mind) seems
trivial. libiverbs already keeps track of this.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak July 20, 2016, 8:01 a.m. UTC | #11
On 19/07/2016 23:36, Jason Gunthorpe wrote:
> On Tue, Jul 19, 2016 at 05:05:46PM +0300, Matan Barak wrote:
>
>> According to the IB spec, a CI could choose either to return an error or to
>> move MWs to orphaned MWs (and make sure any access to these MWs will be
>> blocked). Some hardwares might implement the first approach and keep
>> reference from MRs to MWs in hardware (without having a way to query that in
>> software). In these cases, you can't force this unbound (as you don't know
>> if they were unbound because of remote invalidate).
>
> Ugh, I don't why that was let in the spec, from an app perspective you
> have to assume the annoying behavior...
>

I agree.

>>> The MW case could be handled specially with a simple priority field.
>
>> So you suggest having a priority field in the uobject and iterate over the
>> IDR freeing by order. Seems simple enough.
>
> That is one option..
>
>>>> So the flow might be using the QUERY ioctl code and query the
>>>> devices/clients. Creating a device/client and get a IDR handle for that. For
>>>> every IOCTL code (either device or client commands), you pass this idr
>>>> handle as part of the ioctl code specific header.
>>>
>>> Yes, as I said, we pretty much have to do that already, and it
>>> wouldn't be on every ioctl, just certain top level ones like PDs.
>
>> I meant querying the possible devices or clients. Meaning, introduce another
>> ioctl code that is used to get all available clients and devices.
>
> Not necessarily, the 'create device' could accept the existing sysfs
> device string as the starting point, libibverbs already has that
> information.
>

But how would you know if rdma-cm is loaded and available? Don't we want 
to have a decent query interface rather than iterating through sysfs?
I agree that it could be lower priority than the rest.

>> The point here isn't about unifying rdma-cm and uverbs. Assuming we do that,
>> we could still allow a user to bind fd to several devices or just to a
>> single device.
>
> I don't understand this comment.
>
> If we keep /dev/uverbs0 then that has to remain 1:1 with a device
> because that is our permissions model.
>
> If we move to /dev/rdma_uapi, then that would support 1:N devices.
>
> I can't see a reason to do both concurrently even though that would be
> easy enough technically..
>

I meant moving to /dev/rdma_uapi, but when you fopen the device and get 
a fd, you could either have fd->NULL or fd->single_rdma_dev mapping.
It's a little bit simpler, however, not that much from just doing 1:N 
mapping.

>> It's simpler as you don't need to tie objects in the IDR to an IB device (as
>> there's only one option). In addition, you don't have to pass a device
>> handle to all commands. You could either have a bounded device and then it's
>> clear what you're using or you didn't bind any device and then all device
>> specific commands (except init_ucontext) fail. Client commands (i.e rdma-cm)
>> could either fail or succeed, depending on their requirements. It's a closer
>> model to what we have today - uverbs_file points to an ib_dev (which might
>> be NULL).
>
> I don't see todays model as actually be very good, and those don't
> seem like very strong reasons. An extra argument on a couple of
> commands (and PD create is the only one that springs to mind) seems
> trivial. libiverbs already keeps track of this.
>

I agree that if we move to /dev/rdma_uapi file, the gap from doing 1:N 
mapping isn't large.

> Jason
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2016, 5:49 p.m. UTC | #12
On Wed, Jul 20, 2016 at 11:01:18AM +0300, Matan Barak wrote:

> But how would you know if rdma-cm is loaded and available? Don't we want to
> have a decent query interface rather than iterating through sysfs?
> I agree that it could be lower priority than the rest.

1) 'loaded available' I think you will agree we have a fairly bad user
   experience when it comes to autoloading modules. If we have one fd,
   and the core of that FD is part of the ib_core module that is
   pulled in on driver load then the a request for an API segment (eg uverbs,
   rdma_cm) can now actually trigger a module load, just like net does
   for PF's. So we actually get to a better, saner, place.

2) Yes, an in-band query interface makes much more sense than
   schlepping around sysfs. That could be done in the ioctl, or maybe
   via netlink, but as you say, lower priority.

> I meant moving to /dev/rdma_uapi, but when you fopen the device and get a
> fd, you could either have fd->NULL or fd->single_rdma_dev mapping.
> It's a little bit simpler, however, not that much from just doing 1:N
> mapping.

A major point of the unified interface is to sanely support rdma_cm
when working with multiple devices, so I'd expect 1:N is the only
reasonable option for that interface..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/uobject.c b/drivers/infiniband/core/uobject.c
index 86f6460..bc15be1 100644
--- a/drivers/infiniband/core/uobject.c
+++ b/drivers/infiniband/core/uobject.c
@@ -35,6 +35,89 @@ 
 #include "uobject.h"
 #include "uidr.h"
 
+int ib_uverbs_uobject_type_add(struct list_head	*head,
+			       void (*free)(struct uverbs_uobject_type *uobject_type,
+					    struct ib_uobject *uobject,
+					    struct ib_ucontext *ucontext),
+			       uint16_t	obj_type)
+{
+	/*
+	 * Allocate a new object type for the vendor, this should be done when a
+	 * vendor is initialized.
+	 */
+	struct uverbs_uobject_type *uobject_type;
+
+	uobject_type = kzalloc(sizeof(*uobject_type), GFP_KERNEL);
+	if (!uobject_type)
+		return -ENOMEM;
+
+	uobject_type->free = free;
+	uobject_type->obj_type = obj_type;
+	list_add_tail(&uobject_type->type_list, head);
+	return 0;
+}
+
+void ib_uverbs_uobject_type_remove(struct uverbs_uobject_type *uobject_type)
+{
+	/*
+	 * Allocate a new object type for the vendor, this should be done when a
+	 * vendor is initialized.
+	 */
+	WARN_ON(list_empty(&uobject_type->type_list));
+	list_del(&uobject_type->type_list);
+	kfree(uobject_type);
+}
+
+void ib_uverbs_uobject_type_cleanup_ucontext(struct ib_ucontext *ucontext)
+{
+	struct uverbs_uobject_list *uobject_list, *next_list;
+
+	list_for_each_entry_safe(uobject_list, next_list,
+				 &ucontext->uobjects_lists, type_list) {
+		struct uverbs_uobject_type *type = uobject_list->type;
+		struct ib_uobject *obj, *next_obj;
+
+		list_for_each_entry_safe(obj, next_obj, &uobject_list->list,
+					 idr_list) {
+			/* TODO */
+			type->free(type, obj, ucontext);
+			list_del(&obj->idr_list);
+		}
+
+		list_del(&uobject_list->type_list);
+	}
+}
+
+int ib_uverbs_uobject_type_initialize_ucontext(struct ib_ucontext *ucontext,
+					       struct list_head *type_list)
+{
+	/* create typed list in ucontext */
+	struct uverbs_uobject_type *type;
+	int err;
+
+	INIT_LIST_HEAD(&ucontext->uobjects_lists);
+
+	list_for_each_entry(type, type_list, type_list) {
+		struct uverbs_uobject_list *cur;
+
+		cur = kzalloc(sizeof(*cur), GFP_KERNEL);
+		if (!cur) {
+			err = -ENOMEM;
+			goto err;
+		}
+
+		cur->type = type;
+		INIT_LIST_HEAD(&cur->list);
+		list_add_tail(&cur->type_list, &ucontext->uobjects_lists);
+	}
+
+	return 0;
+
+err:
+	ib_uverbs_uobject_type_cleanup_ucontext(ucontext);
+	return err;
+}
+
 /*
  * The ib_uobject locking scheme is as follows:
  *
diff --git a/drivers/infiniband/core/uobject.h b/drivers/infiniband/core/uobject.h
index 2958ef7..40f6ff1 100644
--- a/drivers/infiniband/core/uobject.h
+++ b/drivers/infiniband/core/uobject.h
@@ -42,6 +42,22 @@  struct uverbs_lock_class {
 	char			name[16];
 };
 
+struct uverbs_uobject_type {
+	struct list_head	type_list;
+	void (*free)(struct uverbs_uobject_type *uobject_type,
+		     struct ib_uobject *uobject,
+		     struct ib_ucontext *ucontext);
+	u16			obj_type;
+	struct uverbs_lock_class lock_class;
+};
+
+/* embed in ucontext per type */
+struct uverbs_uobject_list {
+	struct uverbs_uobject_type	*type;
+	struct list_head		list;
+	struct list_head		type_list;
+};
+
 void init_uobj(struct ib_uobject *uobj, u64 user_handle,
 	       struct ib_ucontext *context, struct uverbs_lock_class *c);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 14bfe3b..c18155f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1325,6 +1325,8 @@  struct ib_ucontext {
 	struct list_head	rule_list;
 	int			closing;
 
+	struct list_head	uobjects_lists;
+
 	struct pid             *tgid;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct rb_root      umem_tree;
@@ -1354,6 +1356,9 @@  struct ib_uobject {
 	struct rw_semaphore	mutex;		/* protects .live */
 	struct rcu_head		rcu;		/* kfree_rcu() overhead */
 	int			live;
+	/* List of object under uverbs_object_type */
+	struct list_head	idr_list;
+	struct uverbs_uobject_list *type;	/* ptr to ucontext type */
 };
 
 struct ib_udata {
@@ -1960,6 +1965,9 @@  struct ib_device {
 	 * in fast paths.
 	 */
 	int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *);
+	struct list_head type_list;
+
+	const struct uverbs_types	*types;
 };
 
 struct ib_client {