diff mbox

[resend,4/4] IB/srp: Add RDMA/CM support

Message ID 20180104222842.26756-5-bart.vanassche@wdc.com (mailing list archive)
State Superseded
Delegated to: Doug Ledford
Headers show

Commit Message

Bart Van Assche Jan. 4, 2018, 10:28 p.m. UTC
Since the SRP_LOGIN_REQ defined in the SRP standard is larger than
what fits in the RDMA/CM login request private data, introduce a new
login request format for the RDMA/CM. Add a kernel module parameter
to the ib_srp kernel driver that allows to specify the port on which
to listen for RDMA/CM connections. The default value for this kernel
module parameter is 0, which means not to listen for RDMA/CM
connections.

Note: since srp_daemon and ibsrpdm rely on the subnet manager and
since there is no equivalent of the IB subnet manager in non-IB
networks, login has to be performed manually for non-IB networks.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
---
 drivers/infiniband/ulp/srp/ib_srp.c | 683 ++++++++++++++++++++++++++++--------
 drivers/infiniband/ulp/srp/ib_srp.h |  41 ++-
 include/scsi/srp.h                  |  17 +
 3 files changed, 590 insertions(+), 151 deletions(-)

Comments

Doug Ledford Jan. 5, 2018, 5:21 p.m. UTC | #1
On Thu, 2018-01-04 at 14:28 -0800, Bart Van Assche wrote:
> Since the SRP_LOGIN_REQ defined in the SRP standard is larger than
> what fits in the RDMA/CM login request private data, introduce a new
> login request format for the RDMA/CM.

Ok up to here...

>  Add a kernel module parameter
> to the ib_srp kernel driver

Unless I missed it, there is no new ib_srp kernel module parameter here.
 Furthermore, if there were, I would probably think that was a bad idea.

>  that allows to specify the port on which
> to listen for RDMA/CM connections.

Obviously, a kernel module parameter for the listen port would be an
ib_srpt option, not an ib_srp option, so that makes sense why it's not
in the patch.  The commit log message needs to be updated.  But, a kernel
module parameter instead of a config file option for a listen port that
we set with the target infrastructure sounds wrong anyway.

>  The default value for this kernel
> module parameter is 0, which means not to listen for RDMA/CM
> connections.

Again, doesn't belong in this commit log.

> 
> +static int srp_parse_in(struct sockaddr_storage *sa, const char *addr_port_str)
> +{
> +	char *addr = kstrdup(addr_port_str, GFP_KERNEL);
> +	char *port_str = addr;
> +	int ret;
> +
> +	if (!addr)
> +		return -ENOMEM;
> +	strsep(&port_str, ":");
> +	ret = inet_pton_with_scope(&init_net, AF_UNSPEC, addr, port_str, sa);
> +	kfree(addr);
> +	return ret;
> +}
> +
> 

This particular function is problematic in that it adds new namespace
unaware code.  The namespace code in the RDMA stack is in a limbo state
of partially implemented, partially not.  I'm loathe to add any more
code that is not fully namespace aware as that just worsens the
hysteresis in the stack.  So we need to figure out how to do this in a
namespace aware manner.  I haven't previously been thinking about this
specific namespace issue, so I'm not prepared to even make suggestions
for a fix for this yet...
Jason Gunthorpe Jan. 5, 2018, 5:34 p.m. UTC | #2
On Fri, Jan 05, 2018 at 12:21:10PM -0500, Doug Ledford wrote:

> > +static int srp_parse_in(struct sockaddr_storage *sa, const char *addr_port_str)
> > +{
> > +	char *addr = kstrdup(addr_port_str, GFP_KERNEL);
> > +	char *port_str = addr;
> > +	int ret;
> > +
> > +	if (!addr)
> > +		return -ENOMEM;
> > +	strsep(&port_str, ":");
> > +	ret = inet_pton_with_scope(&init_net, AF_UNSPEC, addr, port_str, sa);
> > +	kfree(addr);
> > +	return ret;
> > +}
> > +
> 
> This particular function is problematic in that it adds new namespace
> unaware code.  The namespace code in the RDMA stack is in a limbo state
> of partially implemented, partially not.  I'm loathe to add any more
> code that is not fully namespace aware as that just worsens the
> hysteresis in the stack.  So we need to figure out how to do this in a
> namespace aware manner.  I haven't previously been thinking about this
> specific namespace issue, so I'm not prepared to even make suggestions
> for a fix for this yet...

Do the userspace daemon's still manage the connection to SRP?

If yes, then the networking information should be relative to the
namespace of the thing that wrote to the sysfs file..

Also, are there srp_daemon patches for this too? I've been asking to
see the userspace side for new uAPI features before accepting the
kernel part so that everything can be well understood. If yes, please
send, even if it is RFCish..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche Jan. 5, 2018, 5:45 p.m. UTC | #3
On Fri, 2018-01-05 at 12:21 -0500, Doug Ledford wrote:
> On Thu, 2018-01-04 at 14:28 -0800, Bart Van Assche wrote:
> > Add a kernel module parameter to the ib_srp kernel driver
> 
> Unless I missed it, there is no new ib_srp kernel module parameter here.

That's right. I will remove that part from the patch description.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche Jan. 5, 2018, 5:51 p.m. UTC | #4
On Fri, 2018-01-05 at 10:34 -0700, Jason Gunthorpe wrote:
> On Fri, Jan 05, 2018 at 12:21:10PM -0500, Doug Ledford wrote:

> 

> > > +static int srp_parse_in(struct sockaddr_storage *sa, const char *addr_port_str)

> > > +{

> > > +	char *addr = kstrdup(addr_port_str, GFP_KERNEL);

> > > +	char *port_str = addr;

> > > +	int ret;

> > > +

> > > +	if (!addr)

> > > +		return -ENOMEM;

> > > +	strsep(&port_str, ":");

> > > +	ret = inet_pton_with_scope(&init_net, AF_UNSPEC, addr, port_str, sa);

> > > +	kfree(addr);

> > > +	return ret;

> > > +}

> > > +

> > 

> > This particular function is problematic in that it adds new namespace

> > unaware code.  The namespace code in the RDMA stack is in a limbo state

> > of partially implemented, partially not.  I'm loathe to add any more

> > code that is not fully namespace aware as that just worsens the

> > hysteresis in the stack.  So we need to figure out how to do this in a

> > namespace aware manner.  I haven't previously been thinking about this

> > specific namespace issue, so I'm not prepared to even make suggestions

> > for a fix for this yet...

> 

> Do the userspace daemon's still manage the connection to SRP?


srp_daemon uses management datagrams to discover SRP targets and these MADs
discover the GID of the target port instead of the IP address. So I have not
even tried to add RDMA/CM support to srp_daemon. If automatic login would be
implemented for SRP and RDMA/CM I think we should define a new discovery
mechanism based on the Internet Protocol.

> If yes, then the networking information should be relative to the

> namespace of the thing that wrote to the sysfs file..


Login occurs by writing into a sysfs file created by the SRP initiator. How
about using the networking namespace of the process that writes into that
sysfs file?

> Also, are there srp_daemon patches for this too? I've been asking to

> see the userspace side for new uAPI features before accepting the

> kernel part so that everything can be well understood. If yes, please

> send, even if it is RFCish..


As explained above there are no corresponding srp_daemon patches. But it may
be interesting to have a look at the srp-test source code since I used that
software to test RDMA/CM support. See also https://github.com/bvanassche/srp-test.

Bart.
Jason Gunthorpe Jan. 5, 2018, 5:55 p.m. UTC | #5
On Fri, Jan 05, 2018 at 05:51:18PM +0000, Bart Van Assche wrote:

> > If yes, then the networking information should be relative to the
> > namespace of the thing that wrote to the sysfs file..
> 
> Login occurs by writing into a sysfs file created by the SRP initiator. How
> about using the networking namespace of the process that writes into that
> sysfs file?

Sounds right to me.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Jan. 5, 2018, 6:06 p.m. UTC | #6
On Fri, 2018-01-05 at 10:34 -0700, Jason Gunthorpe wrote:
> On Fri, Jan 05, 2018 at 12:21:10PM -0500, Doug Ledford wrote:
> 
> > > +static int srp_parse_in(struct sockaddr_storage *sa, const char *addr_port_str)
> > > +{
> > > +	char *addr = kstrdup(addr_port_str, GFP_KERNEL);
> > > +	char *port_str = addr;
> > > +	int ret;
> > > +
> > > +	if (!addr)
> > > +		return -ENOMEM;
> > > +	strsep(&port_str, ":");
> > > +	ret = inet_pton_with_scope(&init_net, AF_UNSPEC, addr, port_str, sa);
> > > +	kfree(addr);
> > > +	return ret;
> > > +}
> > > +
> > 
> > This particular function is problematic in that it adds new namespace
> > unaware code.  The namespace code in the RDMA stack is in a limbo state
> > of partially implemented, partially not.  I'm loathe to add any more
> > code that is not fully namespace aware as that just worsens the
> > hysteresis in the stack.  So we need to figure out how to do this in a
> > namespace aware manner.  I haven't previously been thinking about this
> > specific namespace issue, so I'm not prepared to even make suggestions
> > for a fix for this yet...
> 
> Do the userspace daemon's still manage the connection to SRP?
> 
> If yes, then the networking information should be relative to the
> namespace of the thing that wrote to the sysfs file..

Maybe, maybe not.  It depends on the implementation.  IIRC you get one
daemon per port, not one daemon per mount.  In that situation, it's
conceivable that you might want to allow the init_net namespace to host
a master daemon that mounts all mounts on a specific port under a
variety of namespaces and then you don't need to run a separate daemon
in the namespace itself.  You can reduce the privilege level of
applications in the namespace this way.  But, that's an implementation
detail.

> Also, are there srp_daemon patches for this too? I've been asking to
> see the userspace side for new uAPI features before accepting the
> kernel part so that everything can be well understood. If yes, please
> send, even if it is RFCish..

No, the target code is likewise clueless about namespaces.  I suspect
(although I haven't looked) that the target code's Add RDMACM support
patch will have the same problem.

And, FWIW, the iser/isert code also has this same namespace issue.

And this is all complicated by the fact that the block devices created
by the SRP code might be mounted filesystems, or might be presented as
raw block devices to apps.  If it's a filesystem, then we get a pass as
the filesystem layer will determine if any given read/write should go
through based upon the bind mounts of the filesystem and the namespace
of the application accessing the filesystem.  But if the block device is
directly accessed by a user space application, like maybe a database
writing directly to the block device or even just mkfs/fsck, then we
need to enforce namespaces ourselves.  For that we need to know what
namespace(s) the device is created in, and what namespace the app
opening the device is in.
Bart Van Assche Jan. 5, 2018, 6:12 p.m. UTC | #7
On Fri, 2018-01-05 at 13:06 -0500, Doug Ledford wrote:
> No, the target code is likewise clueless about namespaces.  I suspect

> (although I haven't looked) that the target code's Add RDMACM support

> patch will have the same problem.


Hello Doug,

The target code already allows configuring ACLs (access control lists) that
control which initiator systems are allowed to log in and which ones are not. Are
you sure that it will be valuable to add network namespace support to the
RDMA/CM code in the SRP target driver?

Thanks,

Bart.
Doug Ledford Jan. 5, 2018, 6:15 p.m. UTC | #8
On Fri, 2018-01-05 at 18:12 +0000, Bart Van Assche wrote:
> On Fri, 2018-01-05 at 13:06 -0500, Doug Ledford wrote:
> > No, the target code is likewise clueless about namespaces.  I suspect
> > (although I haven't looked) that the target code's Add RDMACM support
> > patch will have the same problem.
> 
> Hello Doug,
> 
> The target code already allows configuring ACLs (access control lists) that
> control which initiator systems are allowed to log in and which ones are not. Are
> you sure that it will be valuable to add network namespace support to the
> RDMA/CM code in the SRP target driver?

I haven't really thought about it much.  My comment wasn't that I
thought it should be implemented, just that my quick grep of init_net in
the drivers/infiniband directory showed me that the srpt driver doesn't
distinguish any namespaces (while the isert driver does), so I suspect
adding rdmacm support will add a new init_net usage.
Jason Gunthorpe Jan. 5, 2018, 7:25 p.m. UTC | #9
On Fri, Jan 05, 2018 at 01:06:58PM -0500, Doug Ledford wrote:
> > Do the userspace daemon's still manage the connection to SRP?
> >
> > If yes, then the networking information should be relative to the
> > namespace of the thing that wrote to the sysfs file..
>
> Maybe, maybe not.  It depends on the implementation.  IIRC you get one
> daemon per port, not one daemon per mount.

I don't think it depends - if we expose this sysfs file to a container
then anything less than using the container's net namespace sounds like
it is a path to allow the container to escape its net namespace.

The complication here is that sysfs creates a device, and that device
is currently created in the host namespace.

So from a security perspective containers shouldn't even have access
to this thing at all without more work to ensure that the created
block device is also restriced inside the container.

Since it is a sysfs file, and most container systems mount syfs ro, we
can probably get away with ignoring namespaces for now?

But using the current process namespace is also a good choice.

In principle there can be multiple srp_daemons if they can coordinate
which ones do which. For instance a container could run its own
srp_daemon restricted to the pkeys the container has access to. If the
device stuff above was fixed then this would even make some sense...

Otherwise srp_daemon has to run in the host namespace, where the
created devices end up and it rightly should not see the netdevices
that are assigned to other namespaces.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Jan. 5, 2018, 8:23 p.m. UTC | #10
On Fri, 2018-01-05 at 12:25 -0700, Jason Gunthorpe wrote:
> On Fri, Jan 05, 2018 at 01:06:58PM -0500, Doug Ledford wrote:
> > > Do the userspace daemon's still manage the connection to SRP?
> > > 
> > > If yes, then the networking information should be relative to the
> > > namespace of the thing that wrote to the sysfs file..
> > 
> > Maybe, maybe not.  It depends on the implementation.  IIRC you get one
> > daemon per port, not one daemon per mount.
> 
> I don't think it depends - if we expose this sysfs file to a container

Who says we have to do that?  We could make the sysfs file only visible
in the init namespace and let the init namespace daemon control what
namespaces have what views.  That was my point, the implementation can
be flexible.  And actually, if most containers mount sysfs ro as you say
below, then the init namespace daemon would need to create the namespace
views anyway.  We could just make that mandatory by refusing to create
devices from anything other than init_net namespace.  Then even if
someone does mount sysfs rw in a container, we're still good.

> then anything less than using the contain'd net namespace sounds like
> it is a path to allow the container to escape its net namespace.

I'm a little concerned that this is a problem now regardless.

> The complication here is that sysfs creates a device, and that device
> is currently created in the host namespace.

Let's assume, for the sake of what I'm writing below, that we modify the
srp daemon so that every line in the srp_daemon.conf file can optionally
specify a namespace, and when present, the daemon will pass that to the
kernel, and when present the kernel code creates the *device* file for
that device in that specific namespace (which is really the only thing
we care about...for a filesystem based access as opposed to direct
device access, you want to create the device file in the init_net
namespace and mount the device in the init_net namespace and then follow
the typical filesystem namespace rules for determining what the client
namespaces can see, and in that situation the client need know nothing
about SRP, it is only using a filesystem in a namespace).

> So from a security perspective containers shouldn't even have access
> to this thing at all without more work to ensure that the created
> block device is also restriced inside the container.

This isn't sufficient.  The block device created must be constrained
within the container, but if we allow direct device access to the
underlying LUN on the target, then that target LUN must be exclusively
owned by the container.  No other container, nor the host, can be
allowed to have any access of any sort or it becomes a message passing
bypass around containerization.  It becomes easier then to allow the
init_net daemon to create all of the devices, and once it creates a
single mapping to any LUN, that LUN can not be reused for any other
mapping.  So, a LUN can be either A) a mounted filesystem in the
init_net namespace with other namespaces carved out of the filesystem as
appropriate or B) a direct access device that is accessible in exactly
one namespace only.  We can't actually rely on the srp_daemon to enforce
this, we have to do it at the kernel level, but I think that's what we
need to do (if we don't simply bar direct device access from a
container, period).  The only difficulty I see here is multipath.  You
still want to support it, especially for the host OS, but at the same
time, you can't allow a container to get one path and a different
container to get another path to the same device.

> Since it is a sysfs file, and most container systems mount sysfs ro, we
> can probably get away with ignoring namespaces for now?
> 
> But using the current process namespace is also a good choice.
> 
> In principle there can be multiple srp_daemons if they can coordinate
> which ones do which. For instance a container could run its own
> srp_daemon restricted to the pkeys the container has access to. If the
> device stuff above was fixed then this would even make some sense...
> 
> Otherwise srp_daemon has to run in the host namespace, where the
> created devices end up and it rightly should not see the netdevices
> that are assigned to other namespaces.

This problem is made more difficult by the fact that there is persistent
storage at the other end of the connection.  It doesn't really matter
what netdevice we access a target through.  If the accesses go to the
same physical media at the other end, then they can't be shared across
namespaces without creating a containerization leak.  With netdevices we
have a unique MAC/vlan/IP tuple of data, and remote systems only know us
by that and our containerized code can't reach beyond those boundaries. 
But with disks, the issue is different.  If we allow direct device
access in the container, then (as best we can, there may be problems we
simply can't solve) we need the container bubble to extend all the way
around the physical media we are allowing access to on the remote target
system.

We might just have to turn off all direct device file access in
containers for iser and srp and nvmeof...
Jason Gunthorpe Jan. 5, 2018, 8:35 p.m. UTC | #11
On Fri, Jan 05, 2018 at 03:23:55PM -0500, Doug Ledford wrote:
> On Fri, 2018-01-05 at 12:25 -0700, Jason Gunthorpe wrote:
> > On Fri, Jan 05, 2018 at 01:06:58PM -0500, Doug Ledford wrote:
> > > > Do the userspace daemon's still manage the connection to SRP?
> > > > 
> > > > If yes, then the networking information should be relative to the
> > > > namespace of the thing that wrote to the sysfs file..
> > > 
> > > Maybe, maybe not.  It depends on the implementation.  IIRC you get one
> > > daemon per port, not one daemon per mount.
> > 
> > I don't think it depends - if we expose this sysfs file to a container
> 
> Who says we have to do that?  We could make the sysfs file only visible
> in the init namespace and let the init namespace daemon control what
> namespaces have what views.

What 'views'? It is a sysfs file controlled by the kernel - srp_daemon
has no control over it.

> views anyway.  We could just make that mandatory by refusing to create
> devices from anything other than init_net namespace.  Then even if
> someone does mount sysfs rw in a container, we're still good.

Usually we don't put that kind of policy in the kernel.

Someone could run a privileged container with full device access and
expect this stuff to work right. In that case it is certainly correct
for the srp_daemon and kernel to be in the namespace of the calling
process.

> > So from a security perspective containers shouldn't even have access
> > to this thing at all without more work to ensure that the created
> > block device is also restriced inside the container.
> 
> This isn't sufficient.  The block device created must be constrained
> within the container, but if we allow direct device access to the
> underlying LUN on the target, then that target LUN must be exclusively
> owned by the container.

Yes. That is done on the storage controller via ACLs of that LUN. The
container's net namespace would be restricted in some way that the ACL
can uniquely identify it - and the srp_daemon could run inside the
container.

Otherwise the LUN has to be ACL'd to the host and the srp_daemon
running on the host creates the device and orchestartion sends it into
the contiainer.

> No other container, nor the host, can be allowed to have any access
> of any sort or it becomes a message passing bypass around
> containerization.

But maybe that is the user-intent. shared storage between VMs is a
real thing. We can't put that kind of policy choice in the kernel.

> This problem is made more difficult by the fact that there is persistent
> storage at the other end of the connection.  It doesn't really matter
> what netdevice we access a target through.

Well, it does. The netdevice is part of the ACL system the target
uses. It is actually super important that the right netdevice be
used..

Anyhow. Bart should look at what iscsi does for namespaces and copy
that.. No sense in inventing something unique for RDMA.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche Jan. 5, 2018, 8:53 p.m. UTC | #12
On Fri, 2018-01-05 at 13:35 -0700, Jason Gunthorpe wrote:
> Anyhow. Bart should look at what iscsi does for namespaces and copy

> that.. No sense in inventing something unique for RDMA.


Unless I missed something, network namespace support seems to be missing
from the iSCSI and FCoE initiator drivers:

$ git grep -nH 'struct net[^[:alnum:]_]' */scsi
$ git grep -nH 'init_net' */scsi
drivers/scsi/cxgbi/libcxgbi.c:593:	rt = ip_route_output_ports(&init_net, fl4, NULL, daddr, saddr,
drivers/scsi/cxgbi/libcxgbi.c:642:		ndev = ip_dev_find(&init_net, daddr->sin_addr.s_addr);
drivers/scsi/cxgbi/libcxgbi.c:708:	return (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
drivers/scsi/cxgbi/libcxgbi.c:790:		err = ipv6_dev_get_saddr(&init_net, idev ? idev->dev : NULL,
drivers/scsi/fcoe/fcoe.c:1815:	netdev = dev_get_by_index(&init_net, entry->ifindex);
drivers/scsi/fcoe/fcoe_transport.c:727:		return dev_get_by_name(&init_net, ifname);
drivers/scsi/scsi_netlink.c:133:	scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT,
drivers/scsi/scsi_transport_iscsi.c:4533:	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, &cfg);

Bart.
Doug Ledford Jan. 5, 2018, 11:13 p.m. UTC | #13
On Fri, 2018-01-05 at 13:35 -0700, Jason Gunthorpe wrote:
> On Fri, Jan 05, 2018 at 03:23:55PM -0500, Doug Ledford wrote:
> > On Fri, 2018-01-05 at 12:25 -0700, Jason Gunthorpe wrote:
> > > On Fri, Jan 05, 2018 at 01:06:58PM -0500, Doug Ledford wrote:
> > > > > Do the userspace daemon's still manage the connection to SRP?
> > > > > 
> > > > > If yes, then the networking information should be relative to the
> > > > > namespace of the thing that wrote to the sysfs file..
> > > > 
> > > > Maybe, maybe not.  It depends on the implementation.  IIRC you get one
> > > > daemon per port, not one daemon per mount.
> > > 
> > > I don't think it depends - if we expose this sysfs file to a container
> > 
> > Who says we have to do that?  We could make the sysfs file only visible
> > in the init namespace and let the init namespace daemon control what
> > namespaces have what views.
> 
> What 'views'? It is a sysfs file controlled by the kernel - srp_daemon
> has no control ove rit.

Ok, allow me to clarify: restrict the sysfs file to create mappings to
only the init_net namespace, and by views I meant allow the host
srp_daemon to create a mapping with a specific namespace and that would
then create a device file in that namespace, not a sysfs file.

> > views anyway.  We could just make that mandatory by refusing to create
> > devices from anything other than init_net namespace.  Then even if
> > someone does mount sysfs rw in a container, we're still good.
> 
> Usually we don't put that kind of policy in the kernel.

No, we normally don't.  However....

> Someone could run a privileged container with full device access and
> expect this stuff to work right. In that case it is certainly correct
> for the srp_daemon and kernel to be in the namespace of the calling
> process.
> 
> > > So from a security perspective containers shouldn't even have access
> > > to this thing at all without more work to ensure that the created
> > > block device is also restriced inside the container.
> > 
> > This isn't sufficient.  The block device created must be constrained
> > within the container, but if we allow direct device access to the
> > underlying LUN on the target, then that target LUN must be exclusively
> > owned by the container.
> 
> Yes. That is done on the storage controller via ACLs of that LUN.

But we broke that already...

>  The
> container's net namespace would be restricted in some way that the ACL
> can uniquely identify it - and the srp_daemon could run inside the
> container.

When we arguing over namespaces, especially as they related to IPoIB
devices, we decided to allow the tuple to be p_key/qp/gid so that you
can have two separate containers on the same p_key and gid with the
differentiating factor being only the qp.  If we then use that to target
our SRP RDMACM connection, we've gone into an area where the target
can't differentiate our container.

So, yes, I think the storage target should control the ACLs too, but I'm
concerned that we've gone down a path where that can't currently be done
 and changes will be required on the target for things to work.
Jason Gunthorpe Jan. 5, 2018, 11:27 p.m. UTC | #14
On Fri, Jan 05, 2018 at 06:13:08PM -0500, Doug Ledford wrote:
> Ok, allow me to clarify: restrict the sysfs file to create mappings to
> only the init_net namespace, and by views I meant allow the host
> srp_daemon to create a mapping with a specific namespace and that would
> then create a device file in that namespace, not a sysfs file.

I'm not familiar enough with the status of the 'device namespace'
stuff, but..

AFAIK today this works with the orchestration software just putting
the device nodes it wants the container to have in /dev/ tmpfs and
then the kernel prevents the container from creating new device nodes.

So, in that configuration plugging new block devices into the
container is a userspace problem, not the kernel, and you'd never run
something like srp_daemon inside a container..

> When we arguing over namespaces, especially as they related to IPoIB
> devices, we decided to allow the tuple to be p_key/qp/gid so that you
> can have two separate containers on the same p_key and gid with the

Well, the PKey and GID is supposed to be the differentiator for ACL
like purposes.

And in roce we can have a full MAC address assigned to the container
(for iser and what not)

So it isn't broken, it is just limited. (ie by the gid table size)

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 9472f5989d27..6f59bf715a0d 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -41,6 +41,7 @@ 
 #include <linux/random.h>
 #include <linux/jiffies.h>
 #include <linux/lockdep.h>
+#include <linux/inet.h>
 #include <rdma/ib_cache.h>
 
 #include <linux/atomic.h>
@@ -144,7 +145,9 @@  static void srp_remove_one(struct ib_device *device, void *client_data);
 static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
 		const char *opname);
-static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
+static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
+static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id,
+			       struct rdma_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
 static struct workqueue_struct *srp_remove_wq;
@@ -265,8 +268,8 @@  static void srp_qp_event(struct ib_event *event, void *context)
 		 ib_event_msg(event->event), event->event);
 }
 
-static int srp_init_qp(struct srp_target_port *target,
-		       struct ib_qp *qp)
+static int srp_init_ib_qp(struct srp_target_port *target,
+			  struct ib_qp *qp)
 {
 	struct ib_qp_attr *attr;
 	int ret;
@@ -277,7 +280,7 @@  static int srp_init_qp(struct srp_target_port *target,
 
 	ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev,
 				  target->srp_host->port,
-				  be16_to_cpu(target->pkey),
+				  be16_to_cpu(target->ib_cm.pkey),
 				  &attr->pkey_index);
 	if (ret)
 		goto out;
@@ -298,32 +301,110 @@  static int srp_init_qp(struct srp_target_port *target,
 	return ret;
 }
 
-static int srp_new_cm_id(struct srp_rdma_ch *ch)
+static int srp_new_ib_cm_id(struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
 	struct ib_cm_id *new_cm_id;
 
 	new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev,
-				    srp_cm_handler, ch);
+				    srp_ib_cm_handler, ch);
 	if (IS_ERR(new_cm_id))
 		return PTR_ERR(new_cm_id);
 
-	if (ch->cm_id)
-		ib_destroy_cm_id(ch->cm_id);
-	ch->cm_id = new_cm_id;
+	if (ch->ib_cm.cm_id)
+		ib_destroy_cm_id(ch->ib_cm.cm_id);
+	ch->ib_cm.cm_id = new_cm_id;
 	if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev,
 			    target->srp_host->port))
-		ch->path.rec_type = SA_PATH_REC_TYPE_OPA;
+		ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_OPA;
 	else
-		ch->path.rec_type = SA_PATH_REC_TYPE_IB;
-	ch->path.sgid = target->sgid;
-	ch->path.dgid = target->orig_dgid;
-	ch->path.pkey = target->pkey;
-	ch->path.service_id = target->service_id;
+		ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_IB;
+	ch->ib_cm.path.sgid = target->sgid;
+	ch->ib_cm.path.dgid = target->ib_cm.orig_dgid;
+	ch->ib_cm.path.pkey = target->ib_cm.pkey;
+	ch->ib_cm.path.service_id = target->ib_cm.service_id;
 
 	return 0;
 }
 
+static const char *inet_ntop(const void *sa, char *dst, unsigned int size)
+{
+	switch (((struct sockaddr *)sa)->sa_family) {
+	case AF_INET:
+		snprintf(dst, size, "%pI4",
+			 &((struct sockaddr_in *)sa)->sin_addr);
+		break;
+	case AF_INET6:
+		snprintf(dst, size, "%pI6",
+			 &((struct sockaddr_in6 *)sa)->sin6_addr);
+		break;
+	default:
+		snprintf(dst, size, "???");
+		break;
+	}
+	return dst;
+}
+
+static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch)
+{
+	struct srp_target_port *target = ch->target;
+	struct rdma_cm_id *new_cm_id;
+	char src_addr[64], dst_addr[64];
+	int ret;
+
+	new_cm_id = rdma_create_id(&init_net, srp_rdma_cm_handler, ch,
+				   RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(new_cm_id)) {
+		ret = PTR_ERR(new_cm_id);
+		new_cm_id = NULL;
+		goto out;
+	}
+
+	init_completion(&ch->done);
+	ret = rdma_resolve_addr(new_cm_id, target->rdma_cm.src_specified ?
+				(struct sockaddr *)&target->rdma_cm.src : NULL,
+				(struct sockaddr *)&target->rdma_cm.dst,
+				SRP_PATH_REC_TIMEOUT_MS);
+	if (ret) {
+		pr_err("No route available from %s to %s (%d)\n",
+		       target->rdma_cm.src_specified ?
+		       inet_ntop(&target->rdma_cm.src, src_addr,
+				 sizeof(src_addr)) : "(any)",
+		       inet_ntop(&target->rdma_cm.dst, dst_addr,
+				 sizeof(dst_addr)),
+		       ret);
+		goto out;
+	}
+	ret = wait_for_completion_interruptible(&ch->done);
+	if (ret < 0)
+		goto out;
+
+	ret = ch->status;
+	if (ret) {
+		pr_err("Resolving address %s failed (%d)\n",
+		       inet_ntop(&target->rdma_cm.dst, dst_addr,
+				 sizeof(dst_addr)),
+		       ret);
+		goto out;
+	}
+
+	swap(ch->rdma_cm.cm_id, new_cm_id);
+
+out:
+	if (new_cm_id)
+		rdma_destroy_id(new_cm_id);
+
+	return ret;
+}
+
+static int srp_new_cm_id(struct srp_rdma_ch *ch)
+{
+	struct srp_target_port *target = ch->target;
+
+	return target->using_rdma_cm ? srp_new_rdma_cm_id(ch) :
+		srp_new_ib_cm_id(ch);
+}
+
 static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
 {
 	struct srp_device *dev = target->srp_host->srp_dev;
@@ -521,16 +602,25 @@  static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	init_attr->send_cq             = send_cq;
 	init_attr->recv_cq             = recv_cq;
 
-	qp = ib_create_qp(dev->pd, init_attr);
-	if (IS_ERR(qp)) {
-		ret = PTR_ERR(qp);
+	if (target->using_rdma_cm) {
+		ret = rdma_create_qp(ch->rdma_cm.cm_id, dev->pd, init_attr);
+		qp = ch->rdma_cm.cm_id->qp;
+	} else {
+		qp = ib_create_qp(dev->pd, init_attr);
+		if (!IS_ERR(qp)) {
+			ret = srp_init_ib_qp(target, qp);
+			if (ret)
+				ib_destroy_qp(qp);
+		} else {
+			ret = PTR_ERR(qp);
+		}
+	}
+	if (ret) {
+		pr_err("QP creation failed for dev %s: %d\n",
+		       dev_name(&dev->dev->dev), ret);
 		goto err_send_cq;
 	}
 
-	ret = srp_init_qp(target, qp);
-	if (ret)
-		goto err_qp;
-
 	if (dev->use_fast_reg) {
 		fr_pool = srp_alloc_fr_pool(target);
 		if (IS_ERR(fr_pool)) {
@@ -574,7 +664,10 @@  static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	return 0;
 
 err_qp:
-	ib_destroy_qp(qp);
+	if (target->using_rdma_cm)
+		rdma_destroy_qp(ch->rdma_cm.cm_id);
+	else
+		ib_destroy_qp(qp);
 
 err_send_cq:
 	ib_free_cq(send_cq);
@@ -600,9 +693,16 @@  static void srp_free_ch_ib(struct srp_target_port *target,
 	if (!ch->target)
 		return;
 
-	if (ch->cm_id) {
-		ib_destroy_cm_id(ch->cm_id);
-		ch->cm_id = NULL;
+	if (target->using_rdma_cm) {
+		if (ch->rdma_cm.cm_id) {
+			rdma_destroy_id(ch->rdma_cm.cm_id);
+			ch->rdma_cm.cm_id = NULL;
+		}
+	} else {
+		if (ch->ib_cm.cm_id) {
+			ib_destroy_cm_id(ch->ib_cm.cm_id);
+			ch->ib_cm.cm_id = NULL;
+		}
 	}
 
 	/* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */
@@ -658,16 +758,16 @@  static void srp_path_rec_completion(int status,
 		shost_printk(KERN_ERR, target->scsi_host,
 			     PFX "Got failed path rec status %d\n", status);
 	else
-		ch->path = *pathrec;
+		ch->ib_cm.path = *pathrec;
 	complete(&ch->done);
 }
 
-static int srp_lookup_path(struct srp_rdma_ch *ch)
+static int srp_ib_lookup_path(struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
 	int ret = -ENODEV;
 
-	ch->path.numb_path = 1;
+	ch->ib_cm.path.numb_path = 1;
 
 	init_completion(&ch->done);
 
@@ -678,10 +778,10 @@  static int srp_lookup_path(struct srp_rdma_ch *ch)
 	if (!scsi_host_get(target->scsi_host))
 		goto out;
 
-	ch->path_query_id = ib_sa_path_rec_get(&srp_sa_client,
+	ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client,
 					       target->srp_host->srp_dev->dev,
 					       target->srp_host->port,
-					       &ch->path,
+					       &ch->ib_cm.path,
 					       IB_SA_PATH_REC_SERVICE_ID |
 					       IB_SA_PATH_REC_DGID	 |
 					       IB_SA_PATH_REC_SGID	 |
@@ -690,8 +790,8 @@  static int srp_lookup_path(struct srp_rdma_ch *ch)
 					       SRP_PATH_REC_TIMEOUT_MS,
 					       GFP_KERNEL,
 					       srp_path_rec_completion,
-					       ch, &ch->path_query);
-	ret = ch->path_query_id;
+					       ch, &ch->ib_cm.path_query);
+	ret = ch->ib_cm.path_query_id;
 	if (ret < 0)
 		goto put;
 
@@ -703,9 +803,9 @@  static int srp_lookup_path(struct srp_rdma_ch *ch)
 	if (ret < 0)
 		shost_printk(KERN_WARNING, target->scsi_host,
 			     PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n",
-			     ch->path.sgid.raw, ch->path.dgid.raw,
-			     be16_to_cpu(target->pkey),
-			     be64_to_cpu(target->service_id));
+			     ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw,
+			     be16_to_cpu(target->ib_cm.pkey),
+			     be64_to_cpu(target->ib_cm.service_id));
 
 put:
 	scsi_host_put(target->scsi_host);
@@ -714,6 +814,34 @@  static int srp_lookup_path(struct srp_rdma_ch *ch)
 	return ret;
 }
 
+static int srp_rdma_lookup_path(struct srp_rdma_ch *ch)
+{
+	struct srp_target_port *target = ch->target;
+	int ret;
+
+	init_completion(&ch->done);
+
+	ret = rdma_resolve_route(ch->rdma_cm.cm_id, SRP_PATH_REC_TIMEOUT_MS);
+	if (ret)
+		return ret;
+
+	wait_for_completion_interruptible(&ch->done);
+
+	if (ch->status != 0)
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Path resolution failed\n");
+
+	return ch->status;
+}
+
+static int srp_lookup_path(struct srp_rdma_ch *ch)
+{
+	struct srp_target_port *target = ch->target;
+
+	return target->using_rdma_cm ? srp_rdma_lookup_path(ch) :
+		srp_ib_lookup_path(ch);
+}
+
 static u8 srp_get_subnet_timeout(struct srp_host *host)
 {
 	struct ib_port_attr attr;
@@ -735,8 +863,10 @@  static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
 {
 	struct srp_target_port *target = ch->target;
 	struct {
-		struct ib_cm_req_param param;
-		struct srp_login_req   priv;
+		struct rdma_conn_param	  rdma_param;
+		struct srp_login_req_rdma rdma_req;
+		struct ib_cm_req_param	  ib_param;
+		struct srp_login_req	  ib_req;
 	} *req = NULL;
 	char *ipi, *tpi;
 	int status;
@@ -745,44 +875,62 @@  static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
 	if (!req)
 		return -ENOMEM;
 
-	req->param.flow_control 	      = 1;
-	req->param.retry_count                = target->tl_retry_count;
+	req->ib_param.flow_control = 1;
+	req->ib_param.retry_count = target->tl_retry_count;
 
 	/*
 	 * Pick some arbitrary defaults here; we could make these
 	 * module parameters if anyone cared about setting them.
 	 */
-	req->param.responder_resources	      = 4;
-	req->param.rnr_retry_count 	      = 7;
-	req->param.max_cm_retries 	      = 15;
-
-	req->priv.opcode     	= SRP_LOGIN_REQ;
-	req->priv.tag        	= 0;
-	req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len);
-	req->priv.req_buf_fmt 	= cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
+	req->ib_param.responder_resources = 4;
+	req->ib_param.rnr_retry_count = 7;
+	req->ib_param.max_cm_retries = 15;
+
+	req->ib_req.opcode = SRP_LOGIN_REQ;
+	req->ib_req.tag = 0;
+	req->ib_req.req_it_iu_len = cpu_to_be32(target->max_iu_len);
+	req->ib_req.req_buf_fmt	= cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
 					      SRP_BUF_FORMAT_INDIRECT);
-	req->priv.req_flags	= (multich ? SRP_MULTICHAN_MULTI :
-				   SRP_MULTICHAN_SINGLE);
-
-	{
+	req->ib_req.req_flags = (multich ? SRP_MULTICHAN_MULTI :
+				 SRP_MULTICHAN_SINGLE);
+
+	if (target->using_rdma_cm) {
+		req->rdma_param.flow_control = req->ib_param.flow_control;
+		req->rdma_param.responder_resources =
+			req->ib_param.responder_resources;
+		req->rdma_param.initiator_depth = req->ib_param.initiator_depth;
+		req->rdma_param.retry_count = req->ib_param.retry_count;
+		req->rdma_param.rnr_retry_count = req->ib_param.rnr_retry_count;
+		req->rdma_param.private_data = &req->rdma_req;
+		req->rdma_param.private_data_len = sizeof(req->rdma_req);
+
+		req->rdma_req.opcode = req->ib_req.opcode;
+		req->rdma_req.tag = req->ib_req.tag;
+		req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len;
+		req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt;
+		req->rdma_req.req_flags	= req->ib_req.req_flags;
+
+		ipi = req->rdma_req.initiator_port_id;
+		tpi = req->rdma_req.target_port_id;
+	} else {
 		u8 subnet_timeout;
 
 		subnet_timeout = srp_get_subnet_timeout(target->srp_host);
 
-		req->param.primary_path = &ch->path;
-		req->param.alternate_path = NULL;
-		req->param.service_id = target->service_id;
-		get_random_bytes(&req->param.starting_psn, 4);
-		req->param.starting_psn &= 0xffffff;
-		req->param.qp_num = ch->qp->qp_num;
-		req->param.qp_type = ch->qp->qp_type;
-		req->param.local_cm_response_timeout = subnet_timeout + 2;
-		req->param.remote_cm_response_timeout = subnet_timeout + 2;
-		req->param.private_data = &req->priv;
-		req->param.private_data_len = sizeof(req->priv);
-
-		ipi = req->priv.initiator_port_id;
-		tpi = req->priv.target_port_id;
+		req->ib_param.primary_path = &ch->ib_cm.path;
+		req->ib_param.alternate_path = NULL;
+		req->ib_param.service_id = target->ib_cm.service_id;
+		get_random_bytes(&req->ib_param.starting_psn, 4);
+		req->ib_param.starting_psn &= 0xffffff;
+		req->ib_param.qp_num = ch->qp->qp_num;
+		req->ib_param.qp_type = ch->qp->qp_type;
+		req->ib_param.local_cm_response_timeout = subnet_timeout + 2;
+		req->ib_param.remote_cm_response_timeout = subnet_timeout + 2;
+		req->ib_param.private_data = &req->ib_req;
+		req->ib_param.private_data_len = sizeof(req->ib_req);
+
+		ipi = req->ib_req.initiator_port_id;
+		tpi = req->ib_req.target_port_id;
 	}
 
 	/*
@@ -820,7 +968,10 @@  static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
 		memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8);
 	}
 
-	status = ib_send_cm_req(ch->cm_id, &req->param);
+	if (target->using_rdma_cm)
+		status = rdma_connect(ch->rdma_cm.cm_id, &req->rdma_param);
+	else
+		status = ib_send_cm_req(ch->ib_cm.cm_id, &req->ib_param);
 
 	kfree(req);
 
@@ -847,14 +998,23 @@  static bool srp_queue_remove_work(struct srp_target_port *target)
 static void srp_disconnect_target(struct srp_target_port *target)
 {
 	struct srp_rdma_ch *ch;
-	int i;
+	int i, ret;
 
 	/* XXX should send SRP_I_LOGOUT request */
 
 	for (i = 0; i < target->ch_count; i++) {
 		ch = &target->ch[i];
 		ch->connected = false;
-		if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) {
+		ret = 0;
+		if (target->using_rdma_cm) {
+			if (ch->rdma_cm.cm_id)
+				rdma_disconnect(ch->rdma_cm.cm_id);
+		} else {
+			if (ch->ib_cm.cm_id)
+				ret = ib_send_cm_dreq(ch->ib_cm.cm_id,
+						      NULL, 0);
+		}
+		if (ret < 0) {
 			shost_printk(KERN_DEBUG, target->scsi_host,
 				     PFX "Sending CM DREQ failed\n");
 		}
@@ -2355,7 +2515,7 @@  static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
 	struct srp_target_port *target = ch->target;
 	struct ib_qp_attr *qp_attr = NULL;
 	int attr_mask = 0;
-	int ret;
+	int ret = 0;
 	int i;
 
 	if (lrsp->opcode == SRP_LOGIN_RSP) {
@@ -2385,40 +2545,42 @@  static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
 			goto error;
 	}
 
-	ret = -ENOMEM;
-	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
-	if (!qp_attr)
-		goto error;
-
-	qp_attr->qp_state = IB_QPS_RTR;
-	ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
-	if (ret)
-		goto error_free;
-
-	ret = ib_modify_qp(ch->qp, qp_attr, attr_mask);
-	if (ret)
-		goto error_free;
-
 	for (i = 0; i < target->queue_size; i++) {
 		struct srp_iu *iu = ch->rx_ring[i];
 
 		ret = srp_post_recv(ch, iu);
 		if (ret)
-			goto error_free;
+			goto error;
 	}
 
-	qp_attr->qp_state = IB_QPS_RTS;
-	ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
-	if (ret)
-		goto error_free;
+	if (!target->using_rdma_cm) {
+		ret = -ENOMEM;
+		qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL);
+		if (!qp_attr)
+			goto error;
 
-	target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask);
+		qp_attr->qp_state = IB_QPS_RTR;
+		ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
+		if (ret)
+			goto error_free;
 
-	ret = ib_modify_qp(ch->qp, qp_attr, attr_mask);
-	if (ret)
-		goto error_free;
+		ret = ib_modify_qp(ch->qp, qp_attr, attr_mask);
+		if (ret)
+			goto error_free;
+
+		qp_attr->qp_state = IB_QPS_RTS;
+		ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
+		if (ret)
+			goto error_free;
 
-	ret = ib_send_cm_rtu(cm_id, NULL, 0);
+		target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask);
+
+		ret = ib_modify_qp(ch->qp, qp_attr, attr_mask);
+		if (ret)
+			goto error_free;
+
+		ret = ib_send_cm_rtu(cm_id, NULL, 0);
+	}
 
 error_free:
 	kfree(qp_attr);
@@ -2427,41 +2589,43 @@  static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
 	ch->status = ret;
 }
 
-static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
-			       struct ib_cm_event *event,
-			       struct srp_rdma_ch *ch)
+static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id,
+				  struct ib_cm_event *event,
+				  struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
 	struct Scsi_Host *shost = target->scsi_host;
 	struct ib_class_port_info *cpi;
 	int opcode;
+	u16 dlid;
 
 	switch (event->param.rej_rcvd.reason) {
 	case IB_CM_REJ_PORT_CM_REDIRECT:
 		cpi = event->param.rej_rcvd.ari;
-		sa_path_set_dlid(&ch->path, ntohs(cpi->redirect_lid));
-		ch->path.pkey = cpi->redirect_pkey;
+		dlid = be16_to_cpu(cpi->redirect_lid);
+		sa_path_set_dlid(&ch->ib_cm.path, dlid);
+		ch->ib_cm.path.pkey = cpi->redirect_pkey;
 		cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff;
-		memcpy(ch->path.dgid.raw, cpi->redirect_gid, 16);
+		memcpy(ch->ib_cm.path.dgid.raw, cpi->redirect_gid, 16);
 
-		ch->status = sa_path_get_dlid(&ch->path) ?
-			SRP_DLID_REDIRECT : SRP_PORT_REDIRECT;
+		ch->status = dlid ? SRP_DLID_REDIRECT : SRP_PORT_REDIRECT;
 		break;
 
 	case IB_CM_REJ_PORT_REDIRECT:
 		if (srp_target_is_topspin(target)) {
+			union ib_gid *dgid = &ch->ib_cm.path.dgid;
+
 			/*
 			 * Topspin/Cisco SRP gateways incorrectly send
 			 * reject reason code 25 when they mean 24
 			 * (port redirect).
 			 */
-			memcpy(ch->path.dgid.raw,
-			       event->param.rej_rcvd.ari, 16);
+			memcpy(dgid->raw, event->param.rej_rcvd.ari, 16);
 
 			shost_printk(KERN_DEBUG, shost,
 				     PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
-				     be64_to_cpu(ch->path.dgid.global.subnet_prefix),
-				     be64_to_cpu(ch->path.dgid.global.interface_id));
+				     be64_to_cpu(dgid->global.subnet_prefix),
+				     be64_to_cpu(dgid->global.interface_id));
 
 			ch->status = SRP_PORT_REDIRECT;
 		} else {
@@ -2490,7 +2654,8 @@  static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 				shost_printk(KERN_WARNING, shost, PFX
 					     "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n",
 					     target->sgid.raw,
-					     target->orig_dgid.raw, reason);
+					     target->ib_cm.orig_dgid.raw,
+					     reason);
 		} else
 			shost_printk(KERN_WARNING, shost,
 				     "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
@@ -2510,7 +2675,7 @@  static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 	}
 }
 
-static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 {
 	struct srp_rdma_ch *ch = cm_id->context;
 	struct srp_target_port *target = ch->target;
@@ -2533,7 +2698,7 @@  static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 		shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n");
 		comp = 1;
 
-		srp_cm_rej_handler(cm_id, event, ch);
+		srp_ib_cm_rej_handler(cm_id, event, ch);
 		break;
 
 	case IB_CM_DREQ_RECEIVED:
@@ -2571,6 +2736,135 @@  static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 	return 0;
 }
 
+static void srp_rdma_cm_rej_handler(struct srp_rdma_ch *ch,
+				    struct rdma_cm_event *event)
+{
+	struct srp_target_port *target = ch->target;
+	struct Scsi_Host *shost = target->scsi_host;
+	int opcode;
+
+	switch (event->status) {
+	case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID:
+		shost_printk(KERN_WARNING, shost,
+			    "  REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
+		ch->status = -ECONNRESET;
+		break;
+
+	case IB_CM_REJ_CONSUMER_DEFINED:
+		opcode = *(u8 *) event->param.conn.private_data;
+		if (opcode == SRP_LOGIN_REJ) {
+			struct srp_login_rej *rej =
+				(struct srp_login_rej *)
+				event->param.conn.private_data;
+			u32 reason = be32_to_cpu(rej->reason);
+
+			if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE)
+				shost_printk(KERN_WARNING, shost,
+					     PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
+			else
+				shost_printk(KERN_WARNING, shost,
+					    PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason);
+		} else {
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED, opcode 0x%02x\n",
+				     opcode);
+		}
+		ch->status = -ECONNRESET;
+		break;
+
+	case IB_CM_REJ_STALE_CONN:
+		shost_printk(KERN_WARNING, shost,
+			     "  REJ reason: stale connection\n");
+		ch->status = SRP_STALE_CONN;
+		break;
+
+	default:
+		shost_printk(KERN_WARNING, shost, "  REJ reason 0x%x\n",
+			     event->status);
+		ch->status = -ECONNRESET;
+		break;
+	}
+}
+
+static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id,
+			       struct rdma_cm_event *event)
+{
+	struct srp_rdma_ch *ch = cm_id->context;
+	struct srp_target_port *target = ch->target;
+	int comp = 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		ch->status = 0;
+		comp = 1;
+		break;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		ch->status = -ENXIO;
+		comp = 1;
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		ch->status = 0;
+		comp = 1;
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+		ch->status = -EHOSTUNREACH;
+		comp = 1;
+		break;
+
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Sending CM REQ failed\n");
+		comp = 1;
+		ch->status = -ECONNRESET;
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		comp = 1;
+		srp_cm_rep_handler(NULL, event->param.conn.private_data, ch);
+		break;
+
+	case RDMA_CM_EVENT_REJECTED:
+		shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n");
+		comp = 1;
+
+		srp_rdma_cm_rej_handler(ch, event);
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		if (ch->connected) {
+			shost_printk(KERN_WARNING, target->scsi_host,
+				     PFX "received DREQ\n");
+			rdma_disconnect(ch->rdma_cm.cm_id);
+			comp = 1;
+			ch->status = 0;
+			queue_work(system_long_wq, &target->tl_err_work);
+		}
+		break;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "connection closed\n");
+
+		comp = 1;
+		ch->status = 0;
+		break;
+
+	default:
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled CM event %d\n", event->event);
+		break;
+	}
+
+	if (comp)
+		complete(&ch->done);
+
+	return 0;
+}
+
 /**
  * srp_change_queue_depth - setting device queue depth
  * @sdev: scsi device struct
@@ -2772,7 +3066,10 @@  static ssize_t show_service_id(struct device *dev,
 {
 	struct srp_target_port *target = host_to_target(class_to_shost(dev));
 
-	return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->service_id));
+	if (target->using_rdma_cm)
+		return -ENOENT;
+	return sprintf(buf, "0x%016llx\n",
+		       be64_to_cpu(target->ib_cm.service_id));
 }
 
 static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,
@@ -2780,7 +3077,9 @@  static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,
 {
 	struct srp_target_port *target = host_to_target(class_to_shost(dev));
 
-	return sprintf(buf, "0x%04x\n", be16_to_cpu(target->pkey));
+	if (target->using_rdma_cm)
+		return -ENOENT;
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey));
 }
 
 static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,
@@ -2797,7 +3096,9 @@  static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
 	struct srp_target_port *target = host_to_target(class_to_shost(dev));
 	struct srp_rdma_ch *ch = &target->ch[0];
 
-	return sprintf(buf, "%pI6\n", ch->path.dgid.raw);
+	if (target->using_rdma_cm)
+		return -ENOENT;
+	return sprintf(buf, "%pI6\n", ch->ib_cm.path.dgid.raw);
 }
 
 static ssize_t show_orig_dgid(struct device *dev,
@@ -2805,7 +3106,9 @@  static ssize_t show_orig_dgid(struct device *dev,
 {
 	struct srp_target_port *target = host_to_target(class_to_shost(dev));
 
-	return sprintf(buf, "%pI6\n", target->orig_dgid.raw);
+	if (target->using_rdma_cm)
+		return -ENOENT;
+	return sprintf(buf, "%pI6\n", target->ib_cm.orig_dgid.raw);
 }
 
 static ssize_t show_req_lim(struct device *dev,
@@ -3050,6 +3353,9 @@  static bool srp_conn_unique(struct srp_host *host,
 		if (t != target &&
 		    target->id_ext == t->id_ext &&
 		    target->ioc_guid == t->ioc_guid &&
+		    (!target->using_rdma_cm ||
+		     memcmp(&target->rdma_cm.dst, &t->rdma_cm.dst,
+			    sizeof(target->rdma_cm.dst)) == 0) &&
 		    target->initiator_ext == t->initiator_ext) {
 			ret = false;
 			break;
@@ -3066,6 +3372,9 @@  static bool srp_conn_unique(struct srp_host *host,
  *
  *     id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,dgid=<dest GID>,
  *     pkey=<P_Key>,service_id=<service ID>
+ * or
+ *     id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,
+ *     [src=<IPv4 address>,]dest=<IPv4 address>:<port number>
  *
  * to the add_target sysfs attribute.
  */
@@ -3086,11 +3395,19 @@  enum {
 	SRP_OPT_COMP_VECTOR	= 1 << 12,
 	SRP_OPT_TL_RETRY_COUNT	= 1 << 13,
 	SRP_OPT_QUEUE_SIZE	= 1 << 14,
-	SRP_OPT_ALL		= (SRP_OPT_ID_EXT	|
-				   SRP_OPT_IOC_GUID	|
-				   SRP_OPT_DGID		|
-				   SRP_OPT_PKEY		|
-				   SRP_OPT_SERVICE_ID),
+	SRP_OPT_IP_SRC		= 1 << 15,
+	SRP_OPT_IP_DEST		= 1 << 16,
+};
+
+static unsigned int srp_opt_mandatory[] = {
+	SRP_OPT_ID_EXT		|
+	SRP_OPT_IOC_GUID	|
+	SRP_OPT_DGID		|
+	SRP_OPT_PKEY		|
+	SRP_OPT_SERVICE_ID,
+	SRP_OPT_ID_EXT		|
+	SRP_OPT_IOC_GUID	|
+	SRP_OPT_IP_DEST,
 };
 
 static const match_table_t srp_opt_tokens = {
@@ -3109,9 +3426,25 @@  static const match_table_t srp_opt_tokens = {
 	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
 	{ SRP_OPT_TL_RETRY_COUNT,	"tl_retry_count=%u"	},
 	{ SRP_OPT_QUEUE_SIZE,		"queue_size=%d"		},
+	{ SRP_OPT_IP_SRC,		"src=%s"		},
+	{ SRP_OPT_IP_DEST,		"dest=%s"		},
 	{ SRP_OPT_ERR,			NULL 			}
 };
 
+static int srp_parse_in(struct sockaddr_storage *sa, const char *addr_port_str)
+{
+	char *addr = kstrdup(addr_port_str, GFP_KERNEL);
+	char *port_str = addr;
+	int ret;
+
+	if (!addr)
+		return -ENOMEM;
+	strsep(&port_str, ":");
+	ret = inet_pton_with_scope(&init_net, AF_UNSPEC, addr, port_str, sa);
+	kfree(addr);
+	return ret;
+}
+
 static int srp_parse_options(const char *buf, struct srp_target_port *target)
 {
 	char *options, *sep_opt;
@@ -3180,7 +3513,7 @@  static int srp_parse_options(const char *buf, struct srp_target_port *target)
 				goto out;
 			}
 
-			ret = hex2bin(target->orig_dgid.raw, p, 16);
+			ret = hex2bin(target->ib_cm.orig_dgid.raw, p, 16);
 			kfree(p);
 			if (ret < 0)
 				goto out;
@@ -3191,7 +3524,7 @@  static int srp_parse_options(const char *buf, struct srp_target_port *target)
 				pr_warn("bad P_Key parameter '%s'\n", p);
 				goto out;
 			}
-			target->pkey = cpu_to_be16(token);
+			target->ib_cm.pkey = cpu_to_be16(token);
 			break;
 
 		case SRP_OPT_SERVICE_ID:
@@ -3206,7 +3539,39 @@  static int srp_parse_options(const char *buf, struct srp_target_port *target)
 				kfree(p);
 				goto out;
 			}
-			target->service_id = cpu_to_be64(ull);
+			target->ib_cm.service_id = cpu_to_be64(ull);
+			kfree(p);
+			break;
+
+		case SRP_OPT_IP_SRC:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			ret = srp_parse_in(&target->rdma_cm.src.ss, p);
+			if (ret < 0) {
+				pr_warn("bad source parameter '%s'\n", p);
+				kfree(p);
+				goto out;
+			}
+			target->rdma_cm.src_specified = true;
+			kfree(p);
+			break;
+
+		case SRP_OPT_IP_DEST:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			ret = srp_parse_in(&target->rdma_cm.dst.ss, p);
+			if (ret < 0) {
+				pr_warn("bad dest parameter '%s'\n", p);
+				kfree(p);
+				goto out;
+			}
+			target->using_rdma_cm = true;
 			kfree(p);
 			break;
 
@@ -3321,14 +3686,14 @@  static int srp_parse_options(const char *buf, struct srp_target_port *target)
 		}
 	}
 
-	if ((opt_mask & SRP_OPT_ALL) == SRP_OPT_ALL)
-		ret = 0;
-	else
-		for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i)
-			if ((srp_opt_tokens[i].token & SRP_OPT_ALL) &&
-			    !(srp_opt_tokens[i].token & opt_mask))
-				pr_warn("target creation request is missing parameter '%s'\n",
-					srp_opt_tokens[i].pattern);
+	for (i = 0; i < ARRAY_SIZE(srp_opt_mandatory); i++) {
+		if ((opt_mask & srp_opt_mandatory[i]) == srp_opt_mandatory[i]) {
+			ret = 0;
+			break;
+		}
+	}
+	if (ret)
+		pr_warn("target creation request is missing one or more parameters\n");
 
 	if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue
 	    && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
@@ -3397,11 +3762,22 @@  static ssize_t srp_create_target(struct device *dev,
 	target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE;
 
 	if (!srp_conn_unique(target->srp_host, target)) {
-		shost_printk(KERN_INFO, target->scsi_host,
-			     PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n",
-			     be64_to_cpu(target->id_ext),
-			     be64_to_cpu(target->ioc_guid),
-			     be64_to_cpu(target->initiator_ext));
+		if (target->using_rdma_cm) {
+			char dst_addr[64];
+
+			shost_printk(KERN_INFO, target->scsi_host,
+				     PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%s\n",
+				     be64_to_cpu(target->id_ext),
+				     be64_to_cpu(target->ioc_guid),
+				     inet_ntop(&target->rdma_cm.dst, dst_addr,
+					       sizeof(dst_addr)));
+		} else {
+			shost_printk(KERN_INFO, target->scsi_host,
+				     PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n",
+				     be64_to_cpu(target->id_ext),
+				     be64_to_cpu(target->ioc_guid),
+				     be64_to_cpu(target->initiator_ext));
+		}
 		ret = -EEXIST;
 		goto out;
 	}
@@ -3502,11 +3878,18 @@  static ssize_t srp_create_target(struct device *dev,
 
 			ret = srp_connect_ch(ch, multich);
 			if (ret) {
+				char dst[64];
+
+				if (target->using_rdma_cm)
+					inet_ntop(&target->rdma_cm.dst, dst,
+						  sizeof(dst));
+				else
+					snprintf(dst, sizeof(dst), "%pI6",
+						 target->ib_cm.orig_dgid.raw);
 				shost_printk(KERN_ERR, target->scsi_host,
-					     PFX "Connection %d/%d to %pI6 failed\n",
+					     PFX "Connection %d/%d to %s failed\n",
 					     ch_start + cpu_idx,
-					     target->ch_count,
-					     ch->target->orig_dgid.raw);
+					     target->ch_count, dst);
 				if (node_idx == 0 && cpu_idx == 0) {
 					goto free_ch;
 				} else {
@@ -3531,13 +3914,25 @@  static ssize_t srp_create_target(struct device *dev,
 		goto err_disconnect;
 
 	if (target->state != SRP_TARGET_REMOVED) {
-		shost_printk(KERN_DEBUG, target->scsi_host, PFX
-			     "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n",
-			     be64_to_cpu(target->id_ext),
-			     be64_to_cpu(target->ioc_guid),
-			     be16_to_cpu(target->pkey),
-			     be64_to_cpu(target->service_id),
-			     target->sgid.raw, target->orig_dgid.raw);
+		if (target->using_rdma_cm) {
+			char dst[64];
+
+			inet_ntop(&target->rdma_cm.dst, dst, sizeof(dst));
+			shost_printk(KERN_DEBUG, target->scsi_host, PFX
+				     "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %s\n",
+				     be64_to_cpu(target->id_ext),
+				     be64_to_cpu(target->ioc_guid),
+				     target->sgid.raw, dst);
+		} else {
+			shost_printk(KERN_DEBUG, target->scsi_host, PFX
+				     "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n",
+				     be64_to_cpu(target->id_ext),
+				     be64_to_cpu(target->ioc_guid),
+				     be16_to_cpu(target->ib_cm.pkey),
+				     be64_to_cpu(target->ib_cm.service_id),
+				     target->sgid.raw,
+				     target->ib_cm.orig_dgid.raw);
+		}
 	}
 
 	ret = count;
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index a814f5ef16f9..ce9d8e0f84dc 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -45,6 +45,7 @@ 
 #include <rdma/ib_sa.h>
 #include <rdma/ib_cm.h>
 #include <rdma/ib_fmr_pool.h>
+#include <rdma/rdma_cm.h>
 
 enum {
 	SRP_PATH_REC_TIMEOUT_MS	= 1000,
@@ -153,11 +154,18 @@  struct srp_rdma_ch {
 	struct completion	done;
 	int			status;
 
-	struct sa_path_rec	path;
-	struct ib_sa_query     *path_query;
-	int			path_query_id;
+	union {
+		struct ib_cm {
+			struct sa_path_rec	path;
+			struct ib_sa_query	*path_query;
+			int			path_query_id;
+			struct ib_cm_id		*cm_id;
+		} ib_cm;
+		struct rdma_cm {
+			struct rdma_cm_id	*cm_id;
+		} rdma_cm;
+	};
 
-	struct ib_cm_id	       *cm_id;
 	struct srp_iu	      **tx_ring;
 	struct srp_iu	      **rx_ring;
 	struct srp_request     *req_ring;
@@ -194,7 +202,6 @@  struct srp_target_port {
 	union ib_gid		sgid;
 	__be64			id_ext;
 	__be64			ioc_guid;
-	__be64			service_id;
 	__be64			initiator_ext;
 	u16			io_class;
 	struct srp_host	       *srp_host;
@@ -210,8 +217,28 @@  struct srp_target_port {
 	int			comp_vector;
 	int			tl_retry_count;
 
-	union ib_gid		orig_dgid;
-	__be16			pkey;
+	bool			using_rdma_cm;
+
+	union {
+		struct {
+			__be64			service_id;
+			union ib_gid		orig_dgid;
+			__be16			pkey;
+		} ib_cm;
+		struct {
+			union {
+				struct sockaddr_in	ip4;
+				struct sockaddr_in6	ip6;
+				struct sockaddr_storage ss;
+			} src;
+			union {
+				struct sockaddr_in	ip4;
+				struct sockaddr_in6	ip6;
+				struct sockaddr_storage ss;
+			} dst;
+			bool src_specified;
+		} rdma_cm;
+	};
 
 	u32			rq_tmo_jiffies;
 
diff --git a/include/scsi/srp.h b/include/scsi/srp.h
index 5be834de491a..c16a3c9a4d9b 100644
--- a/include/scsi/srp.h
+++ b/include/scsi/srp.h
@@ -129,6 +129,23 @@  struct srp_login_req {
 	u8	target_port_id[16];
 };
 
+/**
+ * struct srp_login_req_rdma - RDMA/CM login parameters.
+ *
+ * RDMA/CM over InfiniBand can only carry 92 - 36 = 56 bytes of private
+ * data. The %srp_login_req_rdma structure contains the same information as
+ * %srp_login_req but with the reserved data removed.
+ */
+struct srp_login_req_rdma {
+	u64	tag;
+	__be16	req_buf_fmt;
+	u8	req_flags;
+	u8	opcode;
+	__be32	req_it_iu_len;
+	u8	initiator_port_id[16];
+	u8	target_port_id[16];
+};
+
 /*
  * The SRP spec defines the size of the LOGIN_RSP structure to be 52
  * bytes, so it needs to be packed to avoid having it padded to 56