diff mbox

[RFC,ABI,V2,5/8] RDMA/core: Add new ioctl interface

Message ID 1468941812-32286-6-git-send-email-matanb@mellanox.com (mailing list archive)
State RFC
Headers show

Commit Message

Matan Barak July 19, 2016, 3:23 p.m. UTC
In this proposed ioctl interface, processing the command starts from
properties of the command and fetching the appropriate user objects
before calling the handler.

Parsing and validation is done according to a specifier declared by
the driver's code. In the driver, all supported types are declared.
For each type we list all supported actions. Each action has a
specifies a handler, which could be either be a standard command or
a vendor specific command.
Along with the handler, a chain is specified as well. This chain
lists all supported attributes and is used for automatically
fetching and validating the command, response and its related objects.

Each chain consists of a validator_chains and a distribution function
which maps different command attributes to the different validators.
It also maps the attribute from the kABI world (which is vendor
specific) to the common kernel language.
Currently, in order to ease transition, we have a standard
distribution function which maps the attributes using the upper
attribute type bit.

A chain is an array of attributes. Each attribute has a type (PTR_IN,
PTR_OUT, IDR and in the future maybe FD, which could be used for
completion channel) and a length. Future work here might add
validation of mandatory attributes (i.e - make sure a specific
attribute was given).

If an IDR attribute is specified, the kernel also states the object
type and the required access (NEW, WRITE, READ or DESTROY).
All uobject management is done automatically by the infrastructure,
meaning - the infrastructure will fail concurrent commands that at
least one of them requires concurrent access (WRITE/DESTROY),
synchronize actions with device removals (dissociate context events)
and take care of reference counting (increase/decrease). Automatically
reference count isn't coded yet, but the general concept here is to add
(INCREASE_REF, DECREASE_REF) to the access specification field and
update these reference in success automatically.

 types
+--------+
|        |                                                                          +--------+
|        |   actions    action      action_spec                           +-----+   |len     |
+--------+  +------+   +-------+   +----------------+   +------------+    |attr1+-> |type    |
| type   +> |action+-> |chain  +-> +validator_chains+-> |common_chain+--> +-----+   |idr_type|
+--------+  +------+   |handler|   |distribution_fn |   +------------+    |attr2|   |access  |
|        |  |      |   +-------+   +----------------+   |vendor chain|    +-----+   +--------+
|        |  |      |                                    +------------+
|        |  +------+
|        |
|        |
|        |
|        |
|        |
|        |
|        |
|        |
|        |
|        |
+--------+

Once validation and object fetching (or creation) completed, we call
the handler:
int (*handler)(struct ib_device *ib_dev, struct ib_ucontext *ucontext,
               struct uverbs_attr_array *ctx, size_t num, void *priv);

Where ctx is an array of uverbs_attr_array. Each element in this array
is an array of attributes which corresponds to one validator.
For example, in the usually used case:

 ctx                               core
+----------------------------+     +------------+
| core: uverbs_attr_array    +---> | valid      |
+----------------------------+     | cmd_attr   |
| vendor: uverbs_attr_array  |     +------------+
|----------------------------+--+  | valid      |
                                |  | cmd_attr   |
                                |  +------------+
                                |  | valid      |
                                |  | obj_attr   |
                                |  +------------+
                                |
                                |  vendor
                                |  +------------+
                                +> | valid      |
                                   | cmd_attr   |
                                   +------------+
                                   | valid      |
                                   | cmd_attr   |
                                   +------------+
                                   | valid      |
                                   | obj_attr   |
                                   +------------+

Ctx array's indices corresponds to the validators order. The indices
of core and vendor corresponds to the attributes name spaces of each
validator. Thus, we could think of the following as one object:
1. Set of attribute specification (with their attribute IDs)
2. Validator which owns (1) specifications
3. A function which could handle this attributes which the handler
   could call

That means that core and vendor are the validated function (3)
parameters and their types exist in this function namespace.

Since the requent used case, we propose a wrapper that
allows a definition of the following handler:
int (*handler)(struct ib_device *ib_dev, struct ib_ucontext *ucontext,
               struct uverbs_attr_array *common,
               struct uverbs_attr_array *vendor,
               void *priv);

Upon success, reference count of uobjects and use count will be a
updated automatically according to the specification.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/Makefile           |   2 +-
 drivers/infiniband/core/device.c           |   3 +
 drivers/infiniband/core/uverbs.h           |   4 +
 drivers/infiniband/core/uverbs_cmd.c       |   1 +
 drivers/infiniband/core/uverbs_ioctl.c     | 279 +++++++++++++++++++++++++++++
 drivers/infiniband/core/uverbs_ioctl_cmd.c |  78 ++++++++
 drivers/infiniband/core/uverbs_main.c      |   6 +
 include/rdma/ib_verbs.h                    |   2 +
 include/rdma/uverbs_ioctl_cmd.h            |  68 +++++++
 include/uapi/rdma/rdma_user_ioctl.h        |  23 +++
 10 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/uverbs_ioctl.c
 create mode 100644 drivers/infiniband/core/uverbs_ioctl_cmd.c
 create mode 100644 include/rdma/uverbs_ioctl_cmd.h

Comments

Christoph Lameter (Ampere) July 20, 2016, 1:25 a.m. UTC | #1
On Tue, 19 Jul 2016, Matan Barak wrote:

> diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h
> index 5c1117a..f6927fc 100644
> --- a/include/uapi/rdma/rdma_user_ioctl.h
> +++ b/include/uapi/rdma/rdma_user_ioctl.h
> @@ -42,6 +42,29 @@
>   */
>  #define IB_IOCTL_MAGIC			RDMA_IOCTL_MAGIC
>
> +#define RDMA_VERBS_IOCTL \
> +	_IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr)
> +
> +#define RDMA_DIRECT_IOCTL \
> +	_IOWR(RDMA_IOCTL_MAGIC, 2, struct ib_uverbs_ioctl_hdr)
> +
> +struct ib_uverbs_attr {
> +	__u16 attr_id;		/* command specific type attribute */
> +	__u16 len;		/* NA for idr */
> +	__u32 reserved;
> +	 __u64 ptr_idr;		/* ptr typeo command/idr handle */
> +};
> +
> +struct ib_uverbs_ioctl_hdr {
> +	__u16 length;
> +	__u16 flags;
> +	__u16 object_type;
> +	__u16 driver_id;

driver id??? I would have expected that the driver to be used is
already selected through the file handle as passed to the ioctl.

Having the ability to issue ioctls to multiple drivers via a
single file handle is unusual and will likely trigger security concerns.
In particular various access mechanisms rely on policing access through
file handles.

I guess we have a good reason for doing so? If so then please document
that somewhere. Or did I not see that?


+	 __u16 action; > +	__u16 num_attrs;
> +	struct ib_uverbs_attr  attrs[0];
> +};
> +
>  /* Legacy part
>   * !!!! NOTE: It uses the same command index as VERBS
>   */
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak July 20, 2016, 8:11 a.m. UTC | #2
On 20/07/2016 04:25, Christoph Lameter wrote:
> On Tue, 19 Jul 2016, Matan Barak wrote:
>
>> diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h
>> index 5c1117a..f6927fc 100644
>> --- a/include/uapi/rdma/rdma_user_ioctl.h
>> +++ b/include/uapi/rdma/rdma_user_ioctl.h
>> @@ -42,6 +42,29 @@
>>   */
>>  #define IB_IOCTL_MAGIC			RDMA_IOCTL_MAGIC
>>
>> +#define RDMA_VERBS_IOCTL \
>> +	_IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr)
>> +
>> +#define RDMA_DIRECT_IOCTL \
>> +	_IOWR(RDMA_IOCTL_MAGIC, 2, struct ib_uverbs_ioctl_hdr)
>> +
>> +struct ib_uverbs_attr {
>> +	__u16 attr_id;		/* command specific type attribute */
>> +	__u16 len;		/* NA for idr */
>> +	__u32 reserved;
>> +	 __u64 ptr_idr;		/* ptr typeo command/idr handle */
>> +};
>> +
>> +struct ib_uverbs_ioctl_hdr {
>> +	__u16 length;
>> +	__u16 flags;
>> +	__u16 object_type;
>> +	__u16 driver_id;
>
> driver id??? I would have expected that the driver to be used is
> already selected through the file handle as passed to the ioctl.
>

If I recall, Jason proposed that. I think the main reason here is for 
strace and debugging. Since the ABI is now driver specific, this helps 
you parse the structs via strace.

> Having the ability to issue ioctls to multiple drivers via a
> single file handle is unusual and will likely trigger security concerns.
> In particular various access mechanisms rely on policing access through
> file handles.
>

This is actually an ongoing discussion we have in the OFVWG. The benefit 
of unifying rdma-cm and uverbs is that sharing objects (and opening a 
device through rdma-cm and pass it to user-space) becomes easier. That 
means that security should be handled via selinux.

> I guess we have a good reason for doing so? If so then please document
> that somewhere. Or did I not see that?
>
>

The current v2 approach only adds the driver_id for information and 
strace only. I would really like to hear your insights about using a 
single file (rdma_uapi) vs multiple files as we have today.

> +	 __u16 action; > +	__u16 num_attrs;
>> +	struct ib_uverbs_attr  attrs[0];
>> +};
>> +
>>  /* Legacy part
>>   * !!!! NOTE: It uses the same command index as VERBS
>>   */
>>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 20, 2016, 10:03 a.m. UTC | #3
On Wed, 20 Jul 2016, Matan Barak wrote:

> > > +struct ib_uverbs_ioctl_hdr {
> > > +	__u16 length;
> > > +	__u16 flags;
> > > +	__u16 object_type;
> > > +	__u16 driver_id;
> >
> > driver id??? I would have expected that the driver to be used is
> > already selected through the file handle as passed to the ioctl.
> >
>
> If I recall, Jason proposed that. I think the main reason here is for strace
> and debugging. Since the ABI is now driver specific, this helps you parse the
> structs via strace.

You specify the driver_id in the ioctl data structure. The driver in an
ioctl is specified by the descriptor. strace etc would have an easier time
if we would follow the standard conventions for devices and not add
another device_id somewhere.

> This is actually an ongoing discussion we have in the OFVWG. The benefit of
> unifying rdma-cm and uverbs is that sharing objects (and opening a device
> through rdma-cm and pass it to user-space) becomes easier. That means that
> security should be handled via selinux.

Why would there be an issue of sharing data between multiple descriptors?
Data could be a subsystem specific state and not device specific if you
want to share.

> > I guess we have a good reason for doing so? If so then please document
> > that somewhere. Or did I not see that?
> The current v2 approach only adds the driver_id for information and strace
> only. I would really like to hear your insights about using a single file
> (rdma_uapi) vs multiple files as we have today.

Why do you need that information a second time if the descriptor already
provides the device information?

The simple approach would be to have either /dev or /sysfs based
descriptors that can be opened and then used for ioctls. F.e. open

/sys/class/infiniband/mlx5_0/desc or

/sys/class/infiniband/mlx5_0/ports/1/desc

or create a new hierachy using udev/systemd

f.e. /dev/rdma/mlx5_0

and then run ioctls on that file to configure the device. That is very
similar to the traditional use of ioctls. Security for each device can be
controlled separately without inspecting the data being passed (which
would require modifications to the security code). strace and
other tools would just natively know that the descriptor refers to a device.

Additional system calls would allow a straightforward war to control
passing the descriptor on exec and other fuunky things (see manpage
for fcntl f.e.). The usual machinery for device descriptor control and
management would come in almost for free.


IOCTL(2)                                            Linux Programmer's
Manual                                            IOCTL(2)

NAME
       ioctl - control device

SYNOPSIS
       #include <sys/ioctl.h>

       int ioctl(int d, int request, ...);

DESCRIPTION
       The ioctl() function manipulates the underlying device parameters
of special files.  In particular, many operating charac‐
       teristics of character special files (e.g., terminals) may be
controlled with ioctl() requests.  The argument d must be an
       open file descriptor.
Matan Barak July 20, 2016, 11:56 a.m. UTC | #4
On 20/07/2016 13:03, Christoph Lameter wrote:
> On Wed, 20 Jul 2016, Matan Barak wrote:
>
>>>> +struct ib_uverbs_ioctl_hdr {
>>>> +	__u16 length;
>>>> +	__u16 flags;
>>>> +	__u16 object_type;
>>>> +	__u16 driver_id;
>>>
>>> driver id??? I would have expected that the driver to be used is
>>> already selected through the file handle as passed to the ioctl.
>>>
>>
>> If I recall, Jason proposed that. I think the main reason here is for strace
>> and debugging. Since the ABI is now driver specific, this helps you parse the
>> structs via strace.
>
> You specify the driver_id in the ioctl data structure. The driver in an
> ioctl is specified by the descriptor. strace etc would have an easier time
> if we would follow the standard conventions for devices and not add
> another device_id somewhere.
>

If you use only one file (ie rdma_uapi), how will strace know which 
internal RDMA device is used?
It'll need to same some bits for the entire session to figure that out.

>> This is actually an ongoing discussion we have in the OFVWG. The benefit of
>> unifying rdma-cm and uverbs is that sharing objects (and opening a device
>> through rdma-cm and pass it to user-space) becomes easier. That means that
>> security should be handled via selinux.
>
> Why would there be an issue of sharing data between multiple descriptors?
> Data could be a subsystem specific state and not device specific if you
> want to share.
>

Currently uverbs and rdma-cm IDRs are separated. If we would like to 
introduce a semantics where rdma-cm opens the device (and maybe even 
access the device's objects) it needs to somehow share ids with the 
ib_dev context.
So do you suggest that rdma-cm will get a fd and open the required 
device on this fd while returning a handle bounded to this fd (you still 
need to bind the fd to its objects due to security reasons).

>>> I guess we have a good reason for doing so? If so then please document
>>> that somewhere. Or did I not see that?
>> The current v2 approach only adds the driver_id for information and strace
>> only. I would really like to hear your insights about using a single file
>> (rdma_uapi) vs multiple files as we have today.
>
> Why do you need that information a second time if the descriptor already
> provides the device information?
>
> The simple approach would be to have either /dev or /sysfs based
> descriptors that can be opened and then used for ioctls. F.e. open
>
> /sys/class/infiniband/mlx5_0/desc or
>
> /sys/class/infiniband/mlx5_0/ports/1/desc
>
> or create a new hierachy using udev/systemd
>
> f.e. /dev/rdma/mlx5_0
>
> and then run ioctls on that file to configure the device. That is very
> similar to the traditional use of ioctls. Security for each device can be
> controlled separately without inspecting the data being passed (which
> would require modifications to the security code). strace and
> other tools would just natively know that the descriptor refers to a device.
>
> Additional system calls would allow a straightforward war to control
> passing the descriptor on exec and other fuunky things (see manpage
> for fcntl f.e.). The usual machinery for device descriptor control and
> management would come in almost for free.
>
>

That's exactly where we stand right now. There are some supporters for 
unifying all devices to one file. Even if you want to unify rdma-cm to 
the same fd and open the device through it, strace can't know which 
physical device is used.

If we keep file per device as you described, we don't need this field.

> IOCTL(2)                                            Linux Programmer's
> Manual                                            IOCTL(2)
>
> NAME
>        ioctl - control device
>
> SYNOPSIS
>        #include <sys/ioctl.h>
>
>        int ioctl(int d, int request, ...);
>
> DESCRIPTION
>        The ioctl() function manipulates the underlying device parameters
> of special files.  In particular, many operating charac‐
>        teristics of character special files (e.g., terminals) may be
> controlled with ioctl() requests.  The argument d must be an
>        open file descriptor.
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 20, 2016, 5:21 p.m. UTC | #5
On Wed, Jul 20, 2016 at 05:03:46AM -0500, Christoph Lameter wrote:
> > If I recall, Jason proposed that. I think the main reason here is for strace
> > and debugging. Since the ABI is now driver specific, this helps you parse the
> > structs via strace.
> 
> You specify the driver_id in the ioctl data structure. The driver in an
> ioctl is specified by the descriptor. strace etc would have an easier time
> if we would follow the standard conventions for devices and not add
> another device_id somewhere.

There isn't enough ioctl #'s for that. We probably need something like
over 500 ioctls by the time we are done all the drivers and
interfaces.. There is only about 100 reserved for RDMA today, and the
ioctl space is looking pretty full to steal another 4 blocks.

So the basic proposal is to use only a small number of ioctls and
have an 'extended ioctl #' in the struct.

The basic requirement is the same as in ioctl, the ioctl # and
extended # in the struct must be globally unique.

The device_id part of the extension allows each driver to have its own
globally unique number space without requireing another ioctl block.

> Why would there be an issue of sharing data between multiple descriptors?
> Data could be a subsystem specific state and not device specific if you
> want to share.

The state should be fd specific.

> Why do you need that information a second time if the descriptor already
> provides the device information?

Because the descriptor only indirectly implies a specific device.

It is the same reason overlapping ioctls are discouraged in the
kernel, even though I know I opened /dev/foo I still should send
globally unique 'FOO' ioctls to what I opened.

> and then run ioctls on that file to configure the device. That is very
> similar to the traditional use of ioctls. Security for each device can be
> controlled separately without inspecting the data being passed
> (which

It turns out that doesn't work today anyhow, since we have
multi-device requirements in the rdma_cm and you end up with a rdma_cm
descriptor that is world accessible and can interact with the network
even without device permissions.

netdev doesn't work in this screwy way, I think we should move away
from it as well...

> would require modifications to the security code). strace and
> other tools would just natively know that the descriptor refers to a device.

No, strace doesn't know what an open FD is, it just inspects the ioctl
# and content to do its decode. Requiring strace to decode differently
depending on what was opened would be the deviation from what we do
today.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 21, 2016, 12:38 a.m. UTC | #6
On Wed, 20 Jul 2016, Matan Barak wrote:

> If you use only one file (ie rdma_uapi), how will strace know which internal
> RDMA device is used?
> It'll need to same some bits for the entire session to figure that out.

Yes I thought that was the current approach? I do not like that approach.

> Currently uverbs and rdma-cm IDRs are separated. If we would like to introduce
> a semantics where rdma-cm opens the device (and maybe even access the device's
> objects) it needs to somehow share ids with the ib_dev context.
> So do you suggest that rdma-cm will get a fd and open the required device on
> this fd while returning a handle bounded to this fd (you still need to bind
> the fd to its objects due to security reasons).

Both operate on the same device. So they would use the same device handle
(descriptor).

> > Additional system calls would allow a straightforward war to control
> > passing the descriptor on exec and other fuunky things (see manpage
> > for fcntl f.e.). The usual machinery for device descriptor control and
> > management would come in almost for free.
> >
> >
>
> That's exactly where we stand right now. There are some supporters for
> unifying all devices to one file. Even if you want to unify rdma-cm to the
> same fd and open the device through it, strace can't know which physical
> device is used.

strace can know the device through the descriptor in the same way as done
for character and block devices if we would follow the convention of one
descriptor referring to one device. strace cannot know that if you use a
single file for multiple devices.

I think the idea of using a single file as a way to do RPC calls to a
subsystem needs to be retired. Operations occur on devices and ioctls
refer
to devices.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 21, 2016, 12:57 a.m. UTC | #7
On Wed, 20 Jul 2016, Jason Gunthorpe wrote:

> > You specify the driver_id in the ioctl data structure. The driver in an
> > ioctl is specified by the descriptor. strace etc would have an easier time
> > if we would follow the standard conventions for devices and not add
> > another device_id somewhere.
>
> There isn't enough ioctl #'s for that. We probably need something like
> over 500 ioctls by the time we are done all the drivers and
> interfaces.. There is only about 100 reserved for RDMA today, and the
> ioctl space is looking pretty full to steal another 4 blocks.

That sounds crazy. Surely there is a way to reduce that number.

> So the basic proposal is to use only a small number of ioctls and
> have an 'extended ioctl #' in the struct.

Ok then you already have much less.

> The device_id part of the extension allows each driver to have its own
> globally unique number space without requireing another ioctl block.

But you already have that uniqueness if the ioctl filehandle provides
that.

> > Why would there be an issue of sharing data between multiple descriptors?
> > Data could be a subsystem specific state and not device specific if you
> > want to share.
>
> The state should be fd specific.

I think that is what we want. Driver specific ioctls and driver specific
types. If you want subsystem wide types then something special would have
to happen permissions wise.

> > Why do you need that information a second time if the descriptor already
> > provides the device information?
>
> Because the descriptor only indirectly implies a specific device.

If you require specify a device on ioctl then it will directly indicate
the device you are using. AFAICT this is the way its supposed to be.

> > and then run ioctls on that file to configure the device. That is very
> > similar to the traditional use of ioctls. Security for each device can be
> > controlled separately without inspecting the data being passed
> > (which
>
> It turns out that doesn't work today anyhow, since we have
> multi-device requirements in the rdma_cm and you end up with a rdma_cm
> descriptor that is world accessible and can interact with the network
> even without device permissions.

multi-device requirements in the network code require a new device that
aggregates the other ones. The aggregation device is not world accessible.
The same approach could be used for RDMA.

> > would require modifications to the security code). strace and
> > other tools would just natively know that the descriptor refers to a device.
>
> No, strace doesn't know what an open FD is, it just inspects the ioctl
> # and content to do its decode. Requiring strace to decode differently
> depending on what was opened would be the deviation from what we do
> today.

It is the  basic Unix convention to have a descriptor/file handle refer to
a device. There is nothing special required here from strace.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 21, 2016, 1:56 a.m. UTC | #8
On Wed, Jul 20, 2016 at 07:57:09PM -0500, Christoph Lameter wrote:

> > The device_id part of the extension allows each driver to have its own
> > globally unique number space without requireing another ioctl block.
> 
> But you already have that uniqueness if the ioctl filehandle provides
> that.

The argument is that ioctls should be self-describing and never rely
on the filehandle to uniq them. That is basically standard in the
kernel, and why we have Documentation/ioctl/ioctl-number.txt
describing how to uniquely assign ioctl numbers.

There is nothing special here, we are just creating a much larger
ioctl # space by using (ioctl #, operation, device_id) as the globally
unique key, following existing best-practice if self-identifying ioctls.

> multi-device requirements in the network code require a new device that
> aggregates the other ones. The aggregation device is not world accessible.
> The same approach could be used for RDMA.

Some do, some don't. The rdma_cm requirement is more like
listen(0.0.0.0) which does not require special net device aggregation.

> > No, strace doesn't know what an open FD is, it just inspects the ioctl
> > # and content to do its decode. Requiring strace to decode differently
> > depending on what was opened would be the deviation from what we do
> > today.
> 
> It is the  basic Unix convention to have a descriptor/file handle refer to
> a device. There is nothing special required here from strace.

Sometimes yes, sometimes no. There are lots of examples both ways.

/dev/pts/ptmx, /dev/mapper/control, /dev/btrfs-control, etc, etc are
all examples of multi-device control fds similar to the proposed
single char dev.

.. and bear in mind that /dev/uverbs0 doesn't even really make that
much sense as it aggregates two physical ports. There is already no
way to split permissions up by port, which is a logical thing to want
for some dual-rail configurations.

What we have today really doesn't make much sense.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 21, 2016, 2:41 a.m. UTC | #9
On Wed, 20 Jul 2016, Jason Gunthorpe wrote:

> >
> > But you already have that uniqueness if the ioctl filehandle provides
> > that.
>
> The argument is that ioctls should be self-describing and never rely
> on the filehandle to uniq them. That is basically standard in the
> kernel, and why we have Documentation/ioctl/ioctl-number.txt
> describing how to uniquely assign ioctl numbers.

Fully agree with that. But this statement does not relate to what we
were talking about. So I guess I need to restate it one more time:

device_id is useless if the driver is already determined by the device filehanle.

> There is nothing special here, we are just creating a much larger
> ioctl # space by using (ioctl #, operation, device_id) as the globally
> unique key, following existing best-practice if self-identifying ioctls.
>
> > multi-device requirements in the network code require a new device that
> > aggregates the other ones. The aggregation device is not world accessible.
> > The same approach could be used for RDMA.
>
> Some do, some don't. The rdma_cm requirement is more like
> listen(0.0.0.0) which does not require special net device aggregation.

Well that concept is for a packetized protocol stack. In that kind of a
system packets can be routed over multiple devices depending on the
address. We do not have that in the IB stack. Routing of packets would
require kernel code to run for each packet. Instead we have mostly point
to point RDMA requests. So thos consideration is not for the IB stack.

> > It is the  basic Unix convention to have a descriptor/file handle refer to
> > a device. There is nothing special required here from strace.
> Sometimes yes, sometimes no. There are lots of examples both ways.
>
> /dev/pts/ptmx, /dev/mapper/control, /dev/btrfs-control, etc, etc are
> all examples of multi-device control fds similar to the proposed
> single char dev.

Yes but those are not used for communications. They are used to control a
subsystem and access to those requires special priviledges. We are talking
about a device accessible witout special priviledges to do data
communications. /dev/rdmaXXXX to control global behavior of the stack
for all processes would be fine. But we are controlling the interaction of
a process with a device.

> .. and bear in mind that /dev/uverbs0 doesn't even really make that
> much sense as it aggregates two physical ports. There is already no
> way to split permissions up by port, which is a logical thing to want
> for some dual-rail configurations.

Right lets get rid of it. device specific files only.

> What we have today really doesn't make much sense.

Indeed lets adopt the standard filehandles for ioctls and other function
calls as much as posssible. One may then be able to reuse other existing
ioctls and other function calls for those filehandles (like fcntl which I
mentioned earlier).

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 21, 2016, 3:28 a.m. UTC | #10
On Wed, Jul 20, 2016 at 09:41:34PM -0500, Christoph Lameter wrote:
> On Wed, 20 Jul 2016, Jason Gunthorpe wrote:
> 
> > >
> > > But you already have that uniqueness if the ioctl filehandle provides
> > > that.
> >
> > The argument is that ioctls should be self-describing and never rely
> > on the filehandle to uniq them. That is basically standard in the
> > kernel, and why we have Documentation/ioctl/ioctl-number.txt
> > describing how to uniquely assign ioctl numbers.
> 
> Fully agree with that. But this statement does not relate to what we
> were talking about. So I guess I need to restate it one more time:

> device_id is useless if the driver is already determined by the device filehanle.

It isn't useless, it preserves the self describing property that is
the kernel standard for ioctls.

The filehandle major/minor will never uniquely describe the device, we
are not going to give mlx4,mlx5, etc unique major/minor numbers. Yes,
the kernel always implicitly knows what device driver the ioctl will
be delivered to.

But the kernel has that implicit knowledge for ioctls as well and we
still go through the trouble in Documentation/ioctl/ioctl-number.txt
to make them globally unique. device_id is exactly the same thing.

In otherwords, we are going to do something like this:

arg.driver_id = MLX4;
arg.driver_op = MLX4_FROBNICATE;
ioctl(IB_DRIVER_DO_SOMETHING,arg);

Something like strace looks at this and sees
IB_DRIVER_DO_SOMETHING,driver_id,driver_op and knows excatly how to
parse arg (struct mlx4_op_frobincate).

And it can tell it apart from this:

arg.driver_id = MLX5;
arg.driver_op = MLX5_BROBNICATE;
ioctl(IB_DRIVER_DO_SOMETHING,arg);

Which would have a different layout for arg.

Which is *exactly* what you expect for ioctl, and is the basic kernel
standard.

Think of driver_id and driver_op as being a 32 bit globally unique
value assigned to every driver function, run under the
IB_DRIVER_DO_SOMETHING multiplexor.

> > Some do, some don't. The rdma_cm requirement is more like
> > listen(0.0.0.0) which does not require special net device aggregation.
> 
> Well that concept is for a packetized protocol stack. In that kind of a
> system packets can be routed over multiple devices depending on the
> address. We do not have that in the IB stack.

We do have routing, RDMA_CM does it. Only once the connection is
established does it crystalize into specific hardware, which is
basically the same process as the net stack, rdma_cm uses the same
routing table functions to determine the RDMA device to route the QP
too, which is part of the problem with having things spread across all
these distinct FDs. They don't coorporate like they need to.

.. and there is also the issue of namespaces :| People want RDMA
namespaces, which I think really clouds how these per-device file
descriptors would sanely work, esepcially with the separate RDMA CM
fd.

> > /dev/pts/ptmx, /dev/mapper/control, /dev/btrfs-control, etc, etc are
> > all examples of multi-device control fds similar to the proposed
> > single char dev.
> 
> Yes but those are not used for communications. They are used to control a
> subsystem and access to those requires special priviledges. We are talking
> about a device accessible witout special priviledges to do data
> communications. /dev/rdmaXXXX to control global behavior of the stack
> for all processes would be fine. But we are controlling the interaction of
> a process with a device.

Eh? btrfs-control is mutliplexed across all mounted block devices used
by BTRFS, I wouldn't say it is any different than what we are talking
about with rdma.

Do you have an actual use for the currently somewhat broken fine-grained
permissions we have with uverbs0 ?

The only people a single char dev really impacts are users with
multiple cards, and I think that is fairly rare..

> > .. and bear in mind that /dev/uverbs0 doesn't even really make that
> > much sense as it aggregates two physical ports. There is already no
> > way to split permissions up by port, which is a logical thing to want
> > for some dual-rail configurations.
> 
> Right lets get rid of it. device specific files only.

There is a good reason why we bundle ports together - that is part of
how the IN spec works for PDs, APM and connection management. It is
not something we should throw away.

Even if we wanted to we just don't have the overall infrastructure to
restrict on a port by port basis (SELinux should ultimately fix that,
but SELinux will work the same in both single and multi char dev models)

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 21, 2016, 5 a.m. UTC | #11
On Wed, 20 Jul 2016, Jason Gunthorpe wrote:

> > device_id is useless if the driver is already determined by the device filehanle.
>
> It isn't useless, it preserves the self describing property that is
> the kernel standard for ioctls.

What? Never heard about that and certainly do not see that in the ioctls I
have used. Something like the device is specified on on some special
sockets that control network stack behavior and other control files.

> But the kernel has that implicit knowledge for ioctls as well and we
> still go through the trouble in Documentation/ioctl/ioctl-number.txt
> to make them globally unique. device_id is exactly the same thing.

Nope. device_id is not part of the ioctl number. The uniqueness only
applies to the functionality requested from the device. Please do not mix
that up with device identification.


> In otherwords, we are going to do something like this:
>
> arg.driver_id = MLX4;
> arg.driver_op = MLX4_FROBNICATE;
> ioctl(IB_DRIVER_DO_SOMETHING,arg);

Uhhh.. The first argument to ioctl specifies the device!!! The
established way of doing things would be:

f = open("/dev/rdma/mlx5_0")

err = ioctl(f, IOCTL_RDMA_DO_SOMETHING, parameters)

>> Something like strace looks at this and sees
> IB_DRIVER_DO_SOMETHING,driver_id,driver_op and knows excatly how to
> parse arg (struct mlx4_op_frobincate).

Well there is no need to do that with the standard way. The filehandle
identifies the driver. No need to modify strace and no need to
schlepp the device_id around.


> And it can tell it apart from this:
>
> arg.driver_id = MLX5;
> arg.driver_op = MLX5_BROBNICATE;
> ioctl(IB_DRIVER_DO_SOMETHING,arg);
>
> Which would have a different layout for arg.
>
> Which is *exactly* what you expect for ioctl, and is the basic kernel
> standard.

I am sorry but ioctl really has a device parameter the first one. ioctls
without specifying a device do not exist as you seem to assume here.
Please read the manpage.


> Think of driver_id and driver_op as being a 32 bit globally unique
> value assigned to every driver function, run under the
> IB_DRIVER_DO_SOMETHING multiplexor.

No multiplexer please. Lets operate on the device specified.

> > Well that concept is for a packetized protocol stack. In that kind of a
> > system packets can be routed over multiple devices depending on the
> > address. We do not have that in the IB stack.
>
> We do have routing, RDMA_CM does it. Only once the connection is
> established does it crystalize into specific hardware, which is
> basically the same process as the net stack, rdma_cm uses the same
> routing table functions to determine the RDMA device to route the QP
> too, which is part of the problem with having things spread across all
> these distinct FDs. They don't coorporate like they need to.

OMG. And now we are adding vendor specific extensions. This is going to be
great fun to use. Case statements for vendor specific extension after we
have figured out which device to use?

Trivially the routing function could return a filehandle to use

> .. and there is also the issue of namespaces :| People want RDMA
> namespaces, which I think really clouds how these per-device file
> descriptors would sanely work, esepcially with the separate RDMA CM
> fd.

Namespaces already work for filehandles and always have. Look how devices
are handled in different namespaces.

> > > /dev/pts/ptmx, /dev/mapper/control, /dev/btrfs-control, etc, etc are
> > > all examples of multi-device control fds similar to the proposed
> > > single char dev.
> >
> > Yes but those are not used for communications. They are used to control a
> > subsystem and access to those requires special priviledges. We are talking
> > about a device accessible witout special priviledges to do data
> > communications. /dev/rdmaXXXX to control global behavior of the stack
> > for all processes would be fine. But we are controlling the interaction of
> > a process with a device.
>
> Eh? btrfs-control is mutliplexed across all mounted block devices used
> by BTRFS, I wouldn't say it is any different than what we are talking
> about with rdma.

But this is used to modify the operations of btrfs and do administration.
We are discussing here an endpoint establishing connections and
configurations specific to an application.

> Do you have an actual use for the currently somewhat broken fine-grained
> permissions we have with uverbs0 ?

Well a file exposed to one namespace and not another comes to mind.

A system may have multiple RDMA adapters and you want one application
running under one userid to have access to one and not the other. The
system may run a variety of more or less trustworthy apps and you would
like to restrict access. This is not possible with the existing scheme.

Also the filehandle trivially allows security subsystems to monitor the
usage of a device. One can identify all processes using one RDMA device
that may be going down etc etc.

> The only people a single char dev really impacts are users with
> multiple cards, and I think that is fairly rare..

Hmmm.... This is not rare here. We have both Ethernet RDMA and IB RDMA
devices (and sometimes multiple) in our systems.

> > Right lets get rid of it. device specific files only.
>
> There is a good reason why we bundle ports together - that is part of
> how the IN spec works for PDs, APM and connection management. It is
> not something we should throw away.

Ok what do those terms mean?

> Even if we wanted to we just don't have the overall infrastructure to
> restrict on a port by port basis (SELinux should ultimately fix that,
> but SELinux will work the same in both single and multi char dev models)

We already have the security infrastructure to control access by
filehandle both single device and multiple device. The multiplexer device
will cause additional security concerns because the ioctl packet must be
inspected to find the device. Please do not do this.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 21, 2016, 5:44 a.m. UTC | #12
On Thu, Jul 21, 2016 at 12:00:34AM -0500, Christoph Lameter wrote:
> On Wed, 20 Jul 2016, Jason Gunthorpe wrote:
> 
> > > device_id is useless if the driver is already determined by the device filehanle.
> >
> > It isn't useless, it preserves the self describing property that is
> > the kernel standard for ioctls.
> 
> What? Never heard about that and certainly do not see that in the ioctls I
> have used. Something like the device is specified on on some special
> sockets that control network stack behavior and other control files.

How many ioctls have you used that use a complex variable sized struct
as the parameter?

There are a few and they ones I've used are self-describing one way or
another. Perhaps they have an op code or something in the struct, or a
netlink-like format, or *something* but you don't need to know what
the fd is connected to in order to parse the struct. The ioctl # and
the argument should be enough to parse.

> > But the kernel has that implicit knowledge for ioctls as well and we
> >
> > arg.driver_id = MLX4;
> > arg.driver_op = MLX4_FROBNICATE;
> > ioctl(IB_DRIVER_DO_SOMETHING,arg);
> 
> Uhhh.. The first argument to ioctl specifies the device!!! The
> established way of doing things would be:

Yeah, I dropped it for brevity, because it doesn't really matter, we
all know ioctls work on fds...

> Well there is no need to do that with the standard way. The filehandle
> identifies the driver. No need to modify strace and no need to
> schlepp the device_id around.

Nope, that isn't how strace works, strace never checks the filehandle.

> > We do have routing, RDMA_CM does it. Only once the connection is
> > established does it crystalize into specific hardware, which is
> > basically the same process as the net stack, rdma_cm uses the same
> > routing table functions to determine the RDMA device to route the QP
> > too, which is part of the problem with having things spread across all
> > these distinct FDs. They don't coorporate like they need to.
> 
> OMG. And now we are adding vendor specific extensions. This is going to be
> great fun to use. Case statements for vendor specific extension after we
> have figured out which device to use?

There won't be case statements.

You are really surprised by the rdma_cm architecture??? I know it is
goofy, but we are stuck with it..

.. and we have been doing the un-described structs like you are asking
for today. libmlx4 just assumes the fd it is talking to is a mlx4
driver (because sysfs said so) and jams in untagged driver-specific
structs to the command flow. strace cannot parse them and there is no
kernel support for debugging or accidental misuse..

Adding a little tag to the driver specific struct seems totally
harmless, every iteration of the various proposals has had some kind
of struct tag one way or another. I don't know what you see strange
about this.

> Namespaces already work for filehandles and always have. Look how devices
> are handled in different namespaces.

People want to put different ports in different name spaces, the
uverbs fd is a full device thing, it doesn't work and doesn't make
sense as the namespace control point.

Heck, people want to put certain pkeys into namespaces.

The char dev alone is totally unsuitable for the namespace needs.

> Also the filehandle trivially allows security subsystems to monitor the
> usage of a device. One can identify all processes using one RDMA device
> that may be going down etc etc.

Not really, you can have rdma_cm things open listening that don't use
uverbs until a connection is triggered. Perhaps that is rare though.

> We already have the security infrastructure to control access by
> filehandle both single device and multiple device. The multiplexer device
> will cause additional security concerns because the ioctl packet must be
> inspected to find the device. Please do not do this.

I mean in IB, we don't have the ability to securely strip a single
port out of a device. This is why /dev/uverbs0 referes to both ports
on a card. Adding that ability would damage API capabilities we have.

We already have two command multiplexor fds in the current design and
they have exactly the security concerns you allude to. This is why we
have a SELinux patch set under consideration because labeling dev
nodes is not nearly enough. This is why the  namespace patches are
incomplete, etc..

There is no proposal to eliminate the multiplexors, I don't even know
how that could work...

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 21, 2016, 7:11 a.m. UTC | #13
On Wed, 20 Jul 2016, Jason Gunthorpe wrote:

> > Well there is no need to do that with the standard way. The filehandle
> > identifies the driver. No need to modify strace and no need to
> > schlepp the device_id around.
>
> Nope, that isn't how strace works, strace never checks the filehandle.

Ok why would strace check a filehandle in the first place? The descriptor
is the filehandle and you can simply find the operation that created that
file descriptor to find the device it refers to.

> > We already have the security infrastructure to control access by
> > filehandle both single device and multiple device. The multiplexer device
> > will cause additional security concerns because the ioctl packet must be
> > inspected to find the device. Please do not do this.
>
> I mean in IB, we don't have the ability to securely strip a single
> port out of a device. This is why /dev/uverbs0 referes to both ports
> on a card. Adding that ability would damage API capabilities we have.

We could easily do that following naming conventions for partitions or so.
Why would doing so damage the API capabilities? Seems that they are
sufficiently screwed up already. Cleaning that up could help quite a bit.

> We already have two command multiplexor fds in the current design and
> they have exactly the security concerns you allude to. This is why we
> have a SELinux patch set under consideration because labeling dev
> nodes is not nearly enough. This is why the  namespace patches are
> incomplete, etc..

Ok then this is the opportunity to get rid of these things.

> There is no proposal to eliminate the multiplexors, I don't even know
> how that could work...

I thought I just tried to outline how that could work. Consistently use
the device semantics already provided in the kernel and use the
functionality through ioctls, fnctls etc etc as already provided.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig July 21, 2016, 8:04 a.m. UTC | #14
On Wed, Jul 20, 2016 at 11:44:39PM -0600, Jason Gunthorpe wrote:
> How many ioctls have you used that use a complex variable sized struct
> as the parameter?
> 
> There are a few and they ones I've used are self-describing one way or
> another. Perhaps they have an op code or something in the struct, or a
> netlink-like format, or *something* but you don't need to know what
> the fd is connected to in order to parse the struct. The ioctl # and
> the argument should be enough to parse.

In that case they are designed by taste-less fools.  No need to
to spread that BS even further.

> > Well there is no need to do that with the standard way. The filehandle
> > identifies the driver. No need to modify strace and no need to
> > schlepp the device_id around.
> 
> Nope, that isn't how strace works, strace never checks the filehandle.

Strace is just a random tracing tool.  No need to design interface
for it.  Just use a ioctl values that don't confluct with anything
heavily used and we'll be fine.

> You are really surprised by the rdma_cm architecture??? I know it is
> goofy, but we are stuck with it..

Are we?  We'll need to stick to the wire format obviously, but the
software architectures needs to die rather sooner than later.  It's the
worst pile of junk in the RDMA subsystem, and that really means
something.  It's almost impossible to use it correctly due to all the
warts and un- or under-documented assumtions in it's APIs.

> .. and we have been doing the un-described structs like you are asking
> for today. libmlx4 just assumes the fd it is talking to is a mlx4
> driver (because sysfs said so) and jams in untagged driver-specific
> structs to the command flow. strace cannot parse them and there is no
> kernel support for debugging or accidental misuse..
> 
> Adding a little tag to the driver specific struct seems totally
> harmless, every iteration of the various proposals has had some kind
> of struct tag one way or another. I don't know what you see strange
> about this.

Just use different ioctl codes for different drivers and you're done.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 21, 2016, 4:20 p.m. UTC | #15
On Thu, Jul 21, 2016 at 01:04:33AM -0700, Christoph Hellwig wrote:
> On Wed, Jul 20, 2016 at 11:44:39PM -0600, Jason Gunthorpe wrote:
> > How many ioctls have you used that use a complex variable sized struct
> > as the parameter?
> > 
> > There are a few and they ones I've used are self-describing one way or
> > another. Perhaps they have an op code or something in the struct, or a
> > netlink-like format, or *something* but you don't need to know what
> > the fd is connected to in order to parse the struct. The ioctl # and
> > the argument should be enough to parse.
> 
> In that case they are designed by taste-less fools.  No need to
> to spread that BS even further.

Well, what do you suggest then? This sounds like we are back to the
~500 ioctls. Is that OK & better?

This is not a small interface, and when you add in all the
driver-specific stuff there are *alot* of different API
call signatures..

I'm not so sure I want to see MLX5_CREATE_QP, MLX4_CREATE_QP,
QIB_CREATE_QP, etc, etc as ioctl numbers, that seems difficult
to implement the common code for.

> > You are really surprised by the rdma_cm architecture??? I know it is
> > goofy, but we are stuck with it..
> 
> Are we?  We'll need to stick to the wire format obviously, but the
> software architectures needs to die rather sooner than later.  It's the
> worst pile of junk in the RDMA subsystem, and that really means
> something.  It's almost impossible to use it correctly due to all the
> warts and un- or under-documented assumtions in it's APIs.

I think we are, it is baked into user space ...

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 21, 2016, 4:41 p.m. UTC | #16
On Thu, Jul 21, 2016 at 02:11:56AM -0500, Christoph Lameter wrote:
> On Wed, 20 Jul 2016, Jason Gunthorpe wrote:
> 
> > > Well there is no need to do that with the standard way. The filehandle
> > > identifies the driver. No need to modify strace and no need to
> > > schlepp the device_id around.
> >
> > Nope, that isn't how strace works, strace never checks the filehandle.
> 
> Ok why would strace check a filehandle in the first place? The descriptor
> is the filehandle and you can simply find the operation that created that
> file descriptor to find the device it refers to.

strace is stateless and can attach to a running process, it can't
watch for open() to figure things out. This is also why it doesn't
inspect the filehandle...

We don't *need* strace to work, but it should would be nice :|

> > I mean in IB, we don't have the ability to securely strip a single
> > port out of a device. This is why /dev/uverbs0 referes to both ports
> > on a card. Adding that ability would damage API capabilities we have.
> 
> We could easily do that following naming conventions for partitions or so.
> Why would doing so damage the API capabilities? Seems that they are
> sufficiently screwed up already. Cleaning that up could help quite a bit.

The current API is problematic because we try to both be like netdev
in that all devices are accessible (rdma_cm) and at the same with with
individual per-device chardevs (uverbs0).

The mismatch sucks and the two do not get along together as well at
they should, which leads to even more goofyness in the design of
things like rdma_cm :(

So, if you want to move fully to the per-char-dev model then I think
we'd give up the global netdev like behaviors, things like
listen(0.0.0) and output route selection, and so forth. I doubt there
is any support for that.

If we go the other way to a full netdev-like module then we give up
fine grained (currently mildly broken) file system permissions.

> > There is no proposal to eliminate the multiplexors, I don't even know
> > how that could work...
> 
> I thought I just tried to outline how that could work. Consistently use
> the device semantics already provided in the kernel and use the
> functionality through ioctls, fnctls etc etc as already provided.

You haven't explained how we can mesh the rdma_cm, netdev-like
listen(0.0.0.0) type semantics, continue to implement multi-port APM
functionality, share PDs across ports, etc, etc. These are all the
actual things done today that break when we drop the multiplexors.

This isn't a simple API that is 1:1 tied to a single physical object,
it is a sprawling thing with lots of built-in cross-device semantics. :(

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 25, 2016, 4:30 p.m. UTC | #17
On Thu, 21 Jul 2016, Jason Gunthorpe wrote:

> > Ok why would strace check a filehandle in the first place? The descriptor
> > is the filehandle and you can simply find the operation that created that
> > file descriptor to find the device it refers to.
>
> strace is stateless and can attach to a running process, it can't
> watch for open() to figure things out. This is also why it doesn't
> inspect the filehandle...

Well you can still lookup in the file handle in /proc/pid/.... if you want
that. Not sure why you are so focused on this.

> We don't *need* strace to work, but it should would be nice :|

It *is* nice. And it works fine for devices. Lets ensure that devices
are used in a standard way in the IB subsystem so that we can take full
advantage of the syscall infrastructure and the standard system calls.

> > We could easily do that following naming conventions for partitions or so.
> > Why would doing so damage the API capabilities? Seems that they are
> > sufficiently screwed up already. Cleaning that up could help quite a bit.
>
> The current API is problematic because we try to both be like netdev
> in that all devices are accessible (rdma_cm) and at the same with with
> individual per-device chardevs (uverbs0).

Device? uverbs is not a device. A particular connectx3 connected to the
pci bus is. And it should follow establish naming conventions etc. Please
lets drop the crap that is there now. If you use the notion of a device
the way it is designed to then we would have less issues.

> So, if you want to move fully to the per-char-dev model then I think
> we'd give up the global netdev like behaviors, things like
> listen(0.0.0) and output route selection, and so forth. I doubt there
> is any support for that.

Can the official listen() syscall be made to work over
infiniband devices? That would be best maybe?

I think in general one does the connection initiation via TCP and IP
protocol regardless... So really infiniband does only matter as the
underlying protocol over which we have imposed IP semantics via IPoIB.

> If we go the other way to a full netdev-like module then we give up
> fine grained (currently mildly broken) file system permissions.

Maybe go with a device semantic and not with full netdev because this is
not a classic packet based network.

> You haven't explained how we can mesh the rdma_cm, netdev-like
> listen(0.0.0.0) type semantics, continue to implement multi-port APM
> functionality, share PDs across ports, etc, etc. These are all the
> actual things done today that break when we drop the multiplexors.

I am not not *the* expert on this. Frankly this whole RDMA request stuff
is not that interesting. The basic thing that the RDMA API needs to do for
my use case is fast messaging bypassing the kernel. And having gazillion
of special ioctls on the site is not that productive. Can we please reuse
the standard system calls and ioctls as much as possible?

No idea what you mean by multiport "APMs". There is an obvius way to
aggreate devices by creating a new one like done in the storage subsystem.

Sharing PDs? Those are from the same address space using multiple devices.
It would be natural to share that in such a case since they are more bound
to the memory layout of a single process and not so much to the devices.
So PDs could be per process instead of per device.

> This isn't a simple API that is 1:1 tied to a single physical object,
> it is a sprawling thing with lots of built-in cross-device semantics. :(

Yes please simplify this sprawl as much as possible. Follow standard
convention instead of reinvention things like device aggregation.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 25, 2016, 4:32 p.m. UTC | #18
On Thu, 21 Jul 2016, Jason Gunthorpe wrote:

> I'm not so sure I want to see MLX5_CREATE_QP, MLX4_CREATE_QP,
> QIB_CREATE_QP, etc, etc as ioctl numbers, that seems difficult
> to implement the common code for.

Nobody wants that.... Why would that follow? Of course we would have
common ioctls that create QPs in the same way for all drivers.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 25, 2016, 5:39 p.m. UTC | #19
On Mon, Jul 25, 2016 at 11:32:23AM -0500, Christoph Lameter wrote:
> On Thu, 21 Jul 2016, Jason Gunthorpe wrote:
> 
> > I'm not so sure I want to see MLX5_CREATE_QP, MLX4_CREATE_QP,
> > QIB_CREATE_QP, etc, etc as ioctl numbers, that seems difficult
> > to implement the common code for.
> 
> Nobody wants that.... Why would that follow? Of course we would have
> common ioctls that create QPs in the same way for all drivers.

No, not of course. We don't have that today. Today every single
command is split between a commmon part and a variable sized (two-way)
driver-specific part. Nobody has come up with some way to avoid that,
so the plan is to continue that scheme.

Thus, every ioctl has a unique struct layout for every driver.

Christoph H. just said dynamic variable sized ioctl input structures
are ugly, so this is an alternative. Is this alternative less ugly
than the variable struct?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe July 25, 2016, 6:01 p.m. UTC | #20
On Mon, Jul 25, 2016 at 11:30:48AM -0500, Christoph Lameter wrote:

> > > We could easily do that following naming conventions for partitions or so.
> > > Why would doing so damage the API capabilities? Seems that they are
> > > sufficiently screwed up already. Cleaning that up could help quite a bit.
> >
> > The current API is problematic because we try to both be like netdev
> > in that all devices are accessible (rdma_cm) and at the same with with
> > individual per-device chardevs (uverbs0).
> 
> Device? uverbs is not a device. A particular connectx3 connected to the
> pci bus is. And it should follow establish naming conventions etc. Please
> lets drop the crap that is there now. If you use the notion of a device
> the way it is designed to then we would have less issues.

device in the RDMA sense, a device has a specific set of properties
related specifically to RDMA and the libibverbs API is designed around
this notion. We can't totally get rid of it, if we change the kernel
communication then 'device' has to be emulated somehow in userspace.

I don't know what you mean by 'device the way it is designed now' -
what infrastructure are you talking about?

> > So, if you want to move fully to the per-char-dev model then I think
> > we'd give up the global netdev like behaviors, things like
> > listen(0.0.0) and output route selection, and so forth. I doubt there
> > is any support for that.
> 
> Can the official listen() syscall be made to work over
> infiniband devices? That would be best maybe?

I have no idea if this is feasible, AFAIK nobody is looking into that
option.

If we do that, then we get a fd out of the standard scheme, what is
that FD? How does it get linked back to the actual driver ioctl interface?

> I think in general one does the connection initiation via TCP and IP
> protocol regardless... So really infiniband does only matter as the
> underlying protocol over which we have imposed IP semantics via IPoIB.

No, it is mostly all done over native protocols, not IP. IP is just
used for ARP and the routing table.

> > If we go the other way to a full netdev-like module then we give up
> > fine grained (currently mildly broken) file system permissions.
> 
> Maybe go with a device semantic and not with full netdev because this is
> not a classic packet based network.

Well, what do you mean? We are emulating netdev and strongly linking
ipoib netdev devices to the RDMA infrastructure - that is the mismatch
I keep talking about.

> > You haven't explained how we can mesh the rdma_cm, netdev-like
> > listen(0.0.0.0) type semantics, continue to implement multi-port APM
> > functionality, share PDs across ports, etc, etc. These are all the
> > actual things done today that break when we drop the multiplexors.
> 
> I am not not *the* expert on this. Frankly this whole RDMA request stuff
> is not that interesting. The basic thing that the RDMA API needs to do for
> my use case is fast messaging bypassing the kernel. And having gazillion
> of special ioctls on the site is not that productive. Can we please reuse
> the standard system calls and ioctls as much as possible?

I know it is not interesting for you, but this is the majority use
model for RDMA, so it has to be the main purpose for the design.

Your DPDK-like use case really has little need for most of the RDMA
API.

> No idea what you mean by multiport "APMs". There is an obvius way to
> aggreate devices by creating a new one like done in the storage subsystem.

That APM is a special RDMA feature for fast fail over and recovery, it
has dedicated hardware support and at least with our current API
cannot be modeled with device stacking. It is functionally different
from teaming/bonding like we see in ethernet.

> Sharing PDs? Those are from the same address space using multiple devices.
> It would be natural to share that in such a case since they are more bound
> to the memory layout of a single process and not so much to the devices.
> So PDs could be per process instead of per device.

I generally agree that we got this upside down, Devices/ports should
have been created under PDs.

But the fact remains, the hardware we have is very restricted on how
PD resources can be shared between ports. Today only ports on the same
RDMA device can share a PD. That is why we have the upside down model..

So, when I talk about sharing a PD, I mean in the sense that an app
cannot just create a single PD and use that for all RDMA. It currently
needs a PD per RDMA device.

I don't know if anyone is thinking this is a pain point.

> Yes please simplify this sprawl as much as possible. Follow standard
> convention instead of reinvention things like device aggregation.

I can't help that IBTA created a different scheme for device
aggregation, addressing, and basically everything else. 'standard
convention' in the kernel really just means ethernet, and this isn't
like ethernet, except superficially.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 27, 2016, 3:03 p.m. UTC | #21
On Mon, 25 Jul 2016, Jason Gunthorpe wrote:

> I can't help that IBTA created a different scheme for device
> aggregation, addressing, and basically everything else. 'standard
> convention' in the kernel really just means ethernet, and this isn't
> like ethernet, except superficially.

Well you can create your own non ethernet protocol. Why is there not an IB
ulp that allows basic messaging, listen() and all the other stuff using
standard system calls? There should be a netdev. I already see devices
in /sys/class/infiniband/*! But they are not listed in /proc/net/dev!!!

Why can we not use these with regular syscalls? Could provide this basic
support for native infiniband? And maybe offload some of the need for
esoteric IB ioctl functions that could just be provided by the standard
calls? F.e. statistics and device configuration would be simplified and possible using
the standard tools. IB devices have MACs as well. We could reuse quite a
bit of software there.

I think Christoph H made the point already that the wire format needs to
stay the same and also the user verbs API (to some degree) but below that
we could do pretty much what we want.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit July 27, 2016, 5:28 p.m. UTC | #22
On Wed, Jul 27, 2016 at 8:33 PM, Christoph Lameter <cl@linux.com> wrote:
>
> On Mon, 25 Jul 2016, Jason Gunthorpe wrote:
>
> > I can't help that IBTA created a different scheme for device
> > aggregation, addressing, and basically everything else. 'standard
> > convention' in the kernel really just means ethernet, and this isn't
> > like ethernet, except superficially.
>
> Well you can create your own non ethernet protocol. Why is there not an IB
> ulp that allows basic messaging, listen() and all the other stuff using
> standard system calls? There should be a netdev. I already see devices
> in /sys/class/infiniband/*! But they are not listed in /proc/net/dev!!!
>
> Why can we not use these with regular syscalls? Could provide this basic
> support for native infiniband? And maybe offload some of the need for
> esoteric IB ioctl functions that could just be provided by the standard
> calls? F.e. statistics and device configuration would be simplified and possible using
> the standard tools. IB devices have MACs as well. We could reuse quite a
> bit of software there.
>

I recall this functionality is being called as SDP (Socket Direct
Protocol) in Annex-4 of IB spec. 1.3 which used be present in older
2.6.x kernel as well as an ULP.
Vaguely recall it as non zcopy implementation.

Few older papers on this:
1. Asynchronous Zero-copy Communication for Synchronous Sockets in the
Sockets Direct Protocol (SDP) over InfiniBand
2. Zero Copy Sockets Direct Protocol over InfiniBand - Preliminary
Implementation and Performance Analysis
3. Transparently Achieving Superior Socket Performance Using Zero Copy
Socket Direct Protocol over IB
4. One more paper from Microsoft, I don't have ready reference too
which did zcopy implementation

Parav

> I think Christoph H made the point already that the wire format needs to
> stay the same and also the user verbs API (to some degree) but below that
> we could do pretty much what we want.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 27, 2016, 6:03 p.m. UTC | #23
On Wed, 27 Jul 2016, Parav Pandit wrote:

> I recall this functionality is being called as SDP (Socket Direct
> Protocol) in Annex-4 of IB spec. 1.3 which used be present in older
> 2.6.x kernel as well as an ULP.
> Vaguely recall it as non zcopy implementation.

It does not need to be zcopy if the functionality is only there for
administrative purposes and for other things where we are just too lazy to
get the whole QP stuff fully operational from user space and just fall
back to the old sockets paradigm where the QP stuff is done by th kernel
for us to give us a standard socket API. Diagnostics and configuration
would be using the standard tools and the established
infrastructure. F.e. ethtool would work without IPoIB using the native IB
addressing.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Parav Pandit July 27, 2016, 6:56 p.m. UTC | #24
On Wed, Jul 27, 2016 at 11:33 PM, Christoph Lameter <cl@linux.com> wrote:
> On Wed, 27 Jul 2016, Parav Pandit wrote:
>
>> I recall this functionality is being called as SDP (Socket Direct
>> Protocol) in Annex-4 of IB spec. 1.3 which used be present in older
>> 2.6.x kernel as well as an ULP.
>> Vaguely recall it as non zcopy implementation.
>
> It does not need to be zcopy if the functionality is only there for
> administrative purposes and for other things where we are just too lazy to
> get the whole QP stuff fully operational from user space and just fall
> back to the old sockets paradigm where the QP stuff is done by th kernel
> for us to give us a standard socket API.
Agree. Would rsockets socket emulation be sufficient/good enough or
socket has to come from kernel?
It may be likely that some handful of applications might find socket
interface easy to integrate those application with pretty minor
changes which can still benefit from transport offload acceleration,
with cost of zcopy and syscall.

> Diagnostics and configuration
> would be using the standard tools and the established
> infrastructure. F.e. ethtool would work without IPoIB using the native IB
> addressing.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) July 27, 2016, 6:58 p.m. UTC | #25
On Thu, 28 Jul 2016, Parav Pandit wrote:

> > It does not need to be zcopy if the functionality is only there for
> > administrative purposes and for other things where we are just too lazy to
> > get the whole QP stuff fully operational from user space and just fall
> > back to the old sockets paradigm where the QP stuff is done by th kernel
> > for us to give us a standard socket API.
> Agree. Would rsockets socket emulation be sufficient/good enough or
> socket has to come from kernel?

The implementation has to fully integrate with the kernel to take
advantage of all the diagnostics etc etc. User space wont help.

> changes which can still benefit from transport offload acceleration,
> with cost of zcopy and syscall.

Sure we could add ioctls etc to do QP setup etc etc. That then also has
the character of a general QP facility that could also be used by other
protocols in the future. Would make RDMA universally available if the
ioctls are also supported by other protocols. Which would be great.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 1819623..769a299 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -29,4 +29,4 @@  ib_umad-y :=			user_mad.o
 ib_ucm-y :=			ucm.o
 
 ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
-				rdma_core.o
+				rdma_core.o uverbs_ioctl.o uverbs_ioctl_cmd.o
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5f09c40..b2e2ee2 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -245,6 +245,9 @@  struct ib_device *ib_alloc_device(size_t size)
 	INIT_LIST_HEAD(&device->port_list);
 	INIT_LIST_HEAD(&device->type_list);
 
+	/* TODO: don't forget to initialize device->driver_id, so verbs handshake between
+	 * user space<->kernel space will work for other values than driver_id == 0.
+	 */
 	return device;
 }
 EXPORT_SYMBOL(ib_alloc_device);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 60d7c3d..86be861 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -41,6 +41,7 @@ 
 #include <linux/mutex.h>
 #include <linux/completion.h>
 #include <linux/cdev.h>
+#include <linux/rwsem.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
@@ -83,6 +84,8 @@ 
  * released when the CQ is destroyed.
  */
 
+long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
 struct ib_uverbs_device {
 	atomic_t				refcount;
 	int					num_comp_vectors;
@@ -121,6 +124,7 @@  struct ib_uverbs_file {
 	struct ib_uverbs_event_file	       *async_file;
 	struct list_head			list;
 	int					is_closed;
+	struct rw_semaphore			close_sem;
 };
 
 struct ib_uverbs_event {
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 6af2c29..9648b80 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -361,6 +361,7 @@  ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	}
 
 	file->ucontext = ucontext;
+	ucontext->ufile = file;
 
 	fd_install(resp.async_fd, filp);
 
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
new file mode 100644
index 0000000..57ff7cb
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -0,0 +1,279 @@ 
+/*
+ * Copyright (c) 2016, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/rdma_ioctl.h>
+#include <rdma/uverbs_ioctl.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int uverbs_validate_attr(struct ib_device *ibdev,
+				struct ib_ucontext *ucontext,
+				const struct ib_uverbs_attr *uattr,
+				u16 attr_id,
+				const struct uverbs_attr_chain_spec *chain_spec,
+				struct uverbs_attr *elements)
+{
+	const struct uverbs_attr_spec *spec;
+	struct uverbs_attr *e;
+
+	if (uattr->reserved)
+		return -EINVAL;
+
+	if (attr_id >= chain_spec->num_attrs)
+		return -EINVAL;
+
+	spec = &chain_spec->attrs[attr_id];
+	e = &elements[attr_id];
+
+	if (e->valid)
+		return -EINVAL;
+
+	switch (spec->type) {
+	case UVERBS_ATTR_TYPE_PTR_IN:
+	case UVERBS_ATTR_TYPE_PTR_OUT:
+		/* If spec->len is zero, don't validate (flexible) */
+		if (spec->len && uattr->len != spec->len)
+			return -EINVAL;
+		e->cmd_attr.ptr = (void * __user)uattr->ptr_idr;
+		e->cmd_attr.len = uattr->len;
+		break;
+
+	case UVERBS_ATTR_TYPE_IDR:
+		if (uattr->len != 0 || (uattr->ptr_idr >> 32) || (!ucontext))
+			return -EINVAL;
+
+		e->obj_attr.idr = (uint32_t)uattr->ptr_idr;
+		e->obj_attr.val = spec;
+		e->obj_attr.type = uverbs_get_type(ibdev,
+						   spec->idr.idr_type);
+		if (!e->obj_attr.type)
+			return -EINVAL;
+
+		e->obj_attr.uobject = uverbs_get_type_from_idr(e->obj_attr.type,
+							       ucontext,
+							       spec->idr.access,
+							       e->obj_attr.idr);
+		if (!e->obj_attr.uobject)
+			return -EINVAL;
+
+		break;
+	};
+
+	e->valid = 1;
+	return 0;
+}
+
+static int uverbs_validate(struct ib_device *ibdev,
+			   struct ib_ucontext *ucontext,
+			   const struct ib_uverbs_attr *uattrs,
+			   size_t num_attrs,
+			   const struct action_spec *action_spec,
+			   struct uverbs_attr_array *attr_array)
+{
+	size_t i;
+	int ret;
+	int n_val = -1;
+
+	for (i = 0; i < num_attrs; i++) {
+		const struct ib_uverbs_attr *uattr = &uattrs[i];
+		__u16 attr_id = uattr->attr_id;
+		const struct uverbs_attr_chain_spec *chain_spec;
+
+		ret = action_spec->dist(&attr_id, action_spec->priv);
+		if (ret < 0)
+			return ret;
+
+		if (ret > n_val)
+			n_val = ret;
+
+		chain_spec = action_spec->validator_chains[ret];
+		ret = uverbs_validate_attr(ibdev, ucontext, uattr, attr_id,
+					   chain_spec, attr_array[ret].attrs);
+		if (ret)
+			return ret;
+
+	}
+
+	return n_val >= 0 ? n_val + 1 : n_val;
+}
+
+static int uverbs_handle_action(const struct ib_uverbs_attr *uattrs,
+				size_t num_attrs,
+				struct ib_device *ibdev,
+				struct ib_uverbs_file *ufile,
+				const struct uverbs_action *handler,
+				struct uverbs_attr_array *attr_array)
+{
+	int ret;
+	int n_val;
+
+	n_val = uverbs_validate(ibdev, ufile->ucontext, uattrs, num_attrs,
+				&handler->chain, attr_array);
+	if (n_val <= 0)
+		return n_val;
+
+	ret = handler->handler(ibdev, ufile, attr_array, n_val,
+			       handler->priv);
+	uverbs_unlock_objects(attr_array, n_val, &handler->chain, !ret);
+
+	return ret;
+}
+
+static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev,
+				struct ib_uverbs_file *file,
+				struct ib_uverbs_ioctl_hdr *hdr,
+				void __user *buf)
+{
+	const struct uverbs_type_actions *type;
+	const struct uverbs_action *action;
+	const struct action_spec *action_spec;
+	long err = 0;
+	unsigned int num_specs = 0;
+	unsigned int i;
+	struct {
+		struct ib_uverbs_attr		*uattrs;
+		struct uverbs_attr_array	*uverbs_attr_array;
+	} *ctx = NULL;
+	struct uverbs_attr *curr_attr;
+	size_t ctx_size;
+
+	if (!ib_dev)
+		return -EIO;
+
+	if (ib_dev->types->num_types < hdr->object_type ||
+	    ib_dev->driver_id != hdr->driver_id)
+		return -EINVAL;
+
+	type = ib_dev->types->types[hdr->object_type];
+	if (!type || type->num_actions < hdr->action)
+		return -EOPNOTSUPP;
+
+	action = &type->actions[hdr->action];
+	if (!action)
+		return -EOPNOTSUPP;
+
+	action_spec = &action->chain;
+	for (i = 0; i < action_spec->num_chains;
+	     num_specs += action_spec->validator_chains[i]->num_attrs, i++)
+		;
+
+	ctx_size = sizeof(*ctx->uattrs) * hdr->num_attrs +
+		   sizeof(*ctx->uverbs_attr_array->attrs) * num_specs +
+		   sizeof(struct uverbs_attr_array) * action_spec->num_chains +
+		   sizeof(*ctx);
+
+	ctx = kzalloc(ctx_size, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uverbs_attr_array = (void *)ctx + sizeof(*ctx);
+	ctx->uattrs = (void *)(ctx->uverbs_attr_array +
+			       action_spec->num_chains);
+	curr_attr = (void *)(ctx->uattrs + hdr->num_attrs);
+	for (i = 0; i < action_spec->num_chains; i++) {
+		ctx->uverbs_attr_array[i].attrs = curr_attr;
+		ctx->uverbs_attr_array[i].num_attrs =
+			action_spec->validator_chains[i]->num_attrs;
+		curr_attr += action_spec->validator_chains[i]->num_attrs;
+	}
+
+	err = copy_from_user(ctx->uattrs, buf,
+			     sizeof(*ctx->uattrs) * hdr->num_attrs);
+	if (err) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = uverbs_handle_action(ctx->uattrs, hdr->num_attrs, ib_dev,
+				   file, action, ctx->uverbs_attr_array);
+out:
+	kfree(ctx);
+	return err;
+}
+
+#define IB_UVERBS_MAX_CMD_SZ 4096
+
+long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_ioctl_hdr __user *user_hdr =
+		(struct ib_uverbs_ioctl_hdr __user *)arg;
+	struct ib_uverbs_ioctl_hdr hdr;
+	struct ib_device *ib_dev;
+	int srcu_key;
+	long err;
+
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (!ib_dev) {
+		err = -EIO;
+		goto out;
+	}
+
+	if (cmd == RDMA_DIRECT_IOCTL) {
+		/* TODO? */
+		err = -ENOSYS;
+		goto out;
+	} else {
+		if (cmd != RDMA_VERBS_IOCTL) {
+			err = -ENOIOCTLCMD;
+			goto out;
+		}
+
+		err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
+
+		if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ ||
+		    hdr.length <= sizeof(hdr) ||
+		    hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* currently there are no flags supported */
+		if (hdr.flags) {
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+
+		/* We're closing, fail all commands */
+		if (!down_read_trylock(&file->close_sem))
+			return -EIO;
+		err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr,
+					  (__user void *)arg + sizeof(hdr));
+		up_read(&file->close_sem);
+	}
+out:
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+
+	return err;
+}
diff --git a/drivers/infiniband/core/uverbs_ioctl_cmd.c b/drivers/infiniband/core/uverbs_ioctl_cmd.c
new file mode 100644
index 0000000..4c2d30e4
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_ioctl_cmd.c
@@ -0,0 +1,78 @@ 
+/*
+ * Copyright (c) 2016, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/uverbs_ioctl_cmd.h>
+#include <rdma/ib_verbs.h>
+#include <linux/bug.h>
+#include "uverbs.h"
+
+#define IB_UVERBS_VENDOR_FLAG	0x8000
+
+int ib_uverbs_std_dist(__u16 *attr_id, void *priv)
+{
+	if (*attr_id & IB_UVERBS_VENDOR_FLAG) {
+		*attr_id &= ~IB_UVERBS_VENDOR_FLAG;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_uverbs_std_dist);
+
+int uverbs_action_std_handle(struct ib_device *ib_dev,
+			     struct ib_uverbs_file *ufile,
+			     struct uverbs_attr_array *ctx, size_t num,
+			     void *_priv)
+{
+	struct uverbs_action_std_handler *priv = _priv;
+
+	if (!ufile->ucontext)
+		return -EINVAL;
+
+	WARN_ON((num != 1) && (num != 2));
+	return priv->handler(ib_dev, ufile->ucontext, &ctx[0],
+			     (num == 2 ? &ctx[1] : NULL),
+			     priv->priv);
+}
+EXPORT_SYMBOL(uverbs_action_std_handle);
+
+int uverbs_action_std_ctx_handle(struct ib_device *ib_dev,
+				 struct ib_uverbs_file *ufile,
+				 struct uverbs_attr_array *ctx, size_t num,
+				 void *_priv)
+{
+	struct uverbs_action_std_ctx_handler *priv = _priv;
+
+	WARN_ON((num != 1) && (num != 2));
+	return priv->handler(ib_dev, ufile, &ctx[0], (num == 2 ? &ctx[1] : NULL), priv->priv);
+}
+EXPORT_SYMBOL(uverbs_action_std_ctx_handle);
+
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 333ed70..fa51568 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -49,6 +49,7 @@ 
 #include <asm/uaccess.h>
 
 #include <rdma/ib.h>
+#include <rdma/rdma_ioctl.h>
 
 #include "uverbs.h"
 
@@ -211,6 +212,7 @@  static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 {
 	struct ib_uobject *uobj, *tmp;
 
+	down_write(&file->close_sem);
 	context->closing = 1;
 
 	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -306,6 +308,7 @@  static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 	}
 
 	put_pid(context->tgid);
+	up_write(&file->close_sem);
 
 	return context->device->dealloc_ucontext(context);
 }
@@ -915,6 +918,7 @@  static int ib_uverbs_open(struct inode *inode, struct file *filp)
 		goto err;
 	}
 
+	init_rwsem(&file->close_sem);
 	file->device	 = dev;
 	file->ucontext	 = NULL;
 	file->async_file = NULL;
@@ -973,6 +977,7 @@  static const struct file_operations uverbs_fops = {
 	.open	 = ib_uverbs_open,
 	.release = ib_uverbs_close,
 	.llseek	 = no_llseek,
+	.unlocked_ioctl = ib_uverbs_ioctl,
 };
 
 static const struct file_operations uverbs_mmap_fops = {
@@ -982,6 +987,7 @@  static const struct file_operations uverbs_mmap_fops = {
 	.open	 = ib_uverbs_open,
 	.release = ib_uverbs_close,
 	.llseek	 = no_llseek,
+	.unlocked_ioctl = ib_uverbs_ioctl,
 };
 
 static struct ib_client uverbs_client = {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 85d4c41..4688463 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1314,6 +1314,7 @@  struct ib_umem;
 
 struct ib_ucontext {
 	struct ib_device       *device;
+	struct ib_uverbs_file  *ufile;
 	struct list_head	pd_list;
 	struct list_head	mr_list;
 	struct list_head	mw_list;
@@ -1976,6 +1977,7 @@  struct ib_device {
 	int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *);
 	struct list_head type_list;
 
+	u16				driver_id;
 	const struct uverbs_types	*types;
 };
 
diff --git a/include/rdma/uverbs_ioctl_cmd.h b/include/rdma/uverbs_ioctl_cmd.h
new file mode 100644
index 0000000..19806df
--- /dev/null
+++ b/include/rdma/uverbs_ioctl_cmd.h
@@ -0,0 +1,68 @@ 
+/*
+ * Copyright (c) 2016, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _UVERBS_IOCTL_CMD_
+#define _UVERBS_IOCTL_CMD_
+
+#include <rdma/uverbs_ioctl.h>
+
+int ib_uverbs_std_dist(__u16 *attr_id, void *priv);
+
+/* common validators */
+
+int uverbs_action_std_handle(struct ib_device *ib_dev,
+			     struct ib_uverbs_file *ufile,
+			     struct uverbs_attr_array *ctx, size_t num,
+			     void *_priv);
+int uverbs_action_std_ctx_handle(struct ib_device *ib_dev,
+				 struct ib_uverbs_file *ufile,
+				 struct uverbs_attr_array *ctx, size_t num,
+				 void *_priv);
+
+struct uverbs_action_std_handler {
+	int (*handler)(struct ib_device *ib_dev, struct ib_ucontext *ucontext,
+		       struct uverbs_attr_array *common,
+		       struct uverbs_attr_array *vendor,
+		       void *priv);
+	void *priv;
+};
+
+struct uverbs_action_std_ctx_handler {
+	int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile,
+		       struct uverbs_attr_array *common,
+		       struct uverbs_attr_array *vendor,
+		       void *priv);
+	void *priv;
+};
+
+#endif
+
diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h
index 5c1117a..f6927fc 100644
--- a/include/uapi/rdma/rdma_user_ioctl.h
+++ b/include/uapi/rdma/rdma_user_ioctl.h
@@ -42,6 +42,29 @@ 
  */
 #define IB_IOCTL_MAGIC			RDMA_IOCTL_MAGIC
 
+#define RDMA_VERBS_IOCTL \
+	_IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr)
+
+#define RDMA_DIRECT_IOCTL \
+	_IOWR(RDMA_IOCTL_MAGIC, 2, struct ib_uverbs_ioctl_hdr)
+
+struct ib_uverbs_attr {
+	__u16 attr_id;		/* command specific type attribute */
+	__u16 len;		/* NA for idr */
+	__u32 reserved;
+	 __u64 ptr_idr;		/* ptr typeo command/idr handle */
+};
+
+struct ib_uverbs_ioctl_hdr {
+	__u16 length;
+	__u16 flags;
+	__u16 object_type;
+	__u16 driver_id;
+	__u16 action;
+	__u16 num_attrs;
+	struct ib_uverbs_attr  attrs[0];
+};
+
 /* Legacy part
  * !!!! NOTE: It uses the same command index as VERBS
  */