[RFC,1/5] IB/core: Add Core Capability flags to ib_device

Message ID 1430720099-32512-2-git-send-email-ira.weiny@intel.com (mailing list archive)
State Rejected

Commit Message

Ira Weiny May 4, 2015, 6:14 a.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

Add Core capability flags to each port attribute and read those into ib_device
upon registration for each port.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/infiniband/core/device.c |   41 ++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          |   22 ++++++++++++++++++++
 2 files changed, 63 insertions(+), 0 deletions(-)

Comments

Doug Ledford May 4, 2015, 2:41 p.m. UTC | #1
On Mon, 2015-05-04 at 02:14 -0400, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> Add Core capability flags to each port attribute and read those into ib_device
> upon registration for each port.
> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ---
>  drivers/infiniband/core/device.c |   41 ++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h          |   22 ++++++++++++++++++++
>  2 files changed, 63 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index b360350..6a37255 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -262,6 +262,37 @@ out:
>  	return ret;
>  }
>  
> +static int read_core_cap_flags(struct ib_device *device)
> +{
> +	struct ib_port_attr tprops;
> +	int num_ports, ret = -ENOMEM;
> +	u8 port_index;
> +
> +	num_ports = device->phys_port_cnt;
> +
> +	device->core_cap_flags = kzalloc(sizeof(*device->core_cap_flags)
> +						* (num_ports+1),
> +					 GFP_KERNEL);
> +	if (!device->core_cap_flags)
> +		return -ENOMEM;
> +
> +	for (port_index = 0; port_index <= num_ports; ++port_index) {
> +		if ((port_index == 0 && device->node_type != RDMA_NODE_IB_SWITCH))
> +			continue;
> +
> +		ret = ib_query_port(device, port_index, &tprops);
> +		if (ret)
> +			goto err;
> +
> +		device->core_cap_flags[port_index] = tprops.core_cap_flags;
> +	}
> +
> +	return 0;
> +err:
> +	kfree(device->core_cap_flags);
> +	return ret;
> +}
> +
>  /**
>   * ib_register_device - Register an IB device with IB core
>   * @device:Device to register
> @@ -302,12 +333,21 @@ int ib_register_device(struct ib_device *device,
>  		goto out;
>  	}
>  
> +	ret = read_core_cap_flags(device);
> +	if (ret) {
> +		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
> +		kfree(device->gid_tbl_len);
> +		kfree(device->pkey_tbl_len);
> +		goto out;
> +	}
> +
>  	ret = ib_device_register_sysfs(device, port_callback);
>  	if (ret) {
>  		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
>  		       device->name);
>  		kfree(device->gid_tbl_len);
>  		kfree(device->pkey_tbl_len);
> +		kfree(device->core_cap_flags);
>  		goto out;
>  	}
>  
> @@ -351,6 +391,7 @@ void ib_unregister_device(struct ib_device *device)
>  
>  	kfree(device->gid_tbl_len);
>  	kfree(device->pkey_tbl_len);
> +	kfree(device->core_cap_flags);
>  
>  	mutex_unlock(&device_mutex);
>  
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index c724114..4de2758 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -353,11 +353,32 @@ union rdma_protocol_stats {
>  	struct iw_protocol_stats	iw;
>  };
>  
> +/* Define bits for the various functionality this port needs to be supported by
> + * the core.
> + */
> +/* Management                           0x00000000FFFFFFFF */
> +#define RDMA_CORE_CAP_IB_MAD		0x0000000000000001ULL
> +#define RDMA_CORE_CAP_IB_SMI		0x0000000000000002ULL
> +#define RDMA_CORE_CAP_IB_CM		0x0000000000000004ULL
> +#define RDMA_CORE_CAP_IW_CM		0x0000000000000008ULL
> +#define RDMA_CORE_CAP_IB_SA		0x0000000000000010ULL
> +
> +/* Address format                       0x0000FFFF00000000 */
> +#define RDMA_CORE_CAP_AF_IB		0x0000000100000000ULL
> +#define RDMA_CORE_CAP_ETH_AH		0x0000000200000000ULL
> +
> +/* Protocol                             0xFFFF000000000000 */
> +#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IBOE		0x0002000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
> +#define RDMA_CORE_CAP_PROT_USNIC_UDP	0x0008000000000000ULL

In accordance with what we've been talking about, drop IBOE for ROCE.

Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
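For illustration, that scheme might look like the following; the exact names and values are hypothetical, not part of the posted patch:

	/* Protocol                             0xFFFF000000000000 */
	#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
	#define RDMA_CORE_CAP_PROT_ROCE		0x0002000000000000ULL
	#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
	#define RDMA_CORE_CAP_PROT_USNIC	0x0008000000000000ULL
	#define RDMA_CORE_CAP_PROT_UDP_ENCAP	0x0010000000000000ULL

	/* RoCEv1    = PROT_ROCE
	 * RoCEv2    = PROT_ROCE  | PROT_UDP_ENCAP
	 * usNIC     = PROT_USNIC
	 * usNIC UDP = PROT_USNIC | PROT_UDP_ENCAP
	 */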

> +
>  struct ib_port_attr {
>  	enum ib_port_state	state;
>  	enum ib_mtu		max_mtu;
>  	enum ib_mtu		active_mtu;
>  	int			gid_tbl_len;
> +	u64                     core_cap_flags;

I think u32 should be enough here, and will help keep our footprint
smaller.

>  	u32			port_cap_flags;
>  	u32			max_msg_sz;
>  	u32			bad_pkey_cntr;
> @@ -1684,6 +1705,7 @@ struct ib_device {
>  	u32			     local_dma_lkey;
>  	u8                           node_type;
>  	u8                           phys_port_cnt;
> +	u64                         *core_cap_flags; /* Per port core capability flags */

Ditto.

>  };
>  
>  struct ib_client {
Hefty, Sean May 4, 2015, 4:40 p.m. UTC | #2
> > +/* Protocol                             0xFFFF000000000000 */
> > +#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
> > +#define RDMA_CORE_CAP_PROT_IBOE		0x0002000000000000ULL
> > +#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
> > +#define RDMA_CORE_CAP_PROT_USNIC_UDP	0x0008000000000000ULL
> 
> In accordance with what we've been talking about, drop IBOE for ROCE.
> 
> Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.

USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC | UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being encapsulated.

RoCEv2 is IB transport over UDP. 

I'm not sure what the protocol field is intended to imply.  If we want to expose the link, network, transport, and RDMA protocols in use, shouldn't these be separate fields or bits?  And even then, I'm not sure what use this has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would work fine over any link.  And the core layer should not assume that a device is limited to supporting only one protocol, especially at the network and transport levels.  I vote for deprecating the protocol goofiness.

- Sean
Hefty, Sean May 4, 2015, 4:42 p.m. UTC | #3
> @@ -302,12 +333,21 @@ int ib_register_device(struct ib_device *device,
>  		goto out;
>  	}
> 
> +	ret = read_core_cap_flags(device);
> +	if (ret) {
> +		dev_err(&device->dev, "Couldn't create Core Capability
> flags\n");
> +		kfree(device->gid_tbl_len);
> +		kfree(device->pkey_tbl_len);
> +		goto out;
> +	}
> +
>  	ret = ib_device_register_sysfs(device, port_callback);
>  	if (ret) {
>  		printk(KERN_WARNING "Couldn't register device %s with driver
> model\n",
>  		       device->name);
>  		kfree(device->gid_tbl_len);
>  		kfree(device->pkey_tbl_len);
> +		kfree(device->core_cap_flags);

Use a common exit location to avoid duplicating the kfree's on errors.

>  		goto out;
>  	}
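For illustration, a minimal sketch of the common-exit pattern being suggested; the label names are hypothetical:

	ret = read_core_cap_flags(device);
	if (ret) {
		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
		goto err_tables;
	}

	ret = ib_device_register_sysfs(device, port_callback);
	if (ret) {
		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
		       device->name);
		goto err_caps;
	}
	...

err_caps:
	kfree(device->core_cap_flags);	/* falls through to the table frees */
err_tables:
	kfree(device->gid_tbl_len);
	kfree(device->pkey_tbl_len);
	goto out;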
> 
> @@ -351,6 +391,7 @@ void ib_unregister_device(struct ib_device *device)
> 
>  	kfree(device->gid_tbl_len);
>  	kfree(device->pkey_tbl_len);
> +	kfree(device->core_cap_flags);
> 
>  	mutex_unlock(&device_mutex);
> 
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index c724114..4de2758 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -353,11 +353,32 @@ union rdma_protocol_stats {
>  	struct iw_protocol_stats	iw;
>  };
> 
> +/* Define bits for the various functionality this port needs to be
> supported by
> + * the core.
> + */
> +/* Management                           0x00000000FFFFFFFF */
> +#define RDMA_CORE_CAP_IB_MAD		0x0000000000000001ULL
> +#define RDMA_CORE_CAP_IB_SMI		0x0000000000000002ULL
> +#define RDMA_CORE_CAP_IB_CM		0x0000000000000004ULL
> +#define RDMA_CORE_CAP_IW_CM		0x0000000000000008ULL
> +#define RDMA_CORE_CAP_IB_SA		0x0000000000000010ULL
> +
> +/* Address format                       0x0000FFFF00000000 */
> +#define RDMA_CORE_CAP_AF_IB		0x0000000100000000ULL
> +#define RDMA_CORE_CAP_ETH_AH		0x0000000200000000ULL
> +
> +/* Protocol                             0xFFFF000000000000 */
> +#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IBOE		0x0002000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
> +#define RDMA_CORE_CAP_PROT_USNIC_UDP	0x0008000000000000ULL
> +
>  struct ib_port_attr {
>  	enum ib_port_state	state;
>  	enum ib_mtu		max_mtu;
>  	enum ib_mtu		active_mtu;
>  	int			gid_tbl_len;
> +	u64                     core_cap_flags;
>  	u32			port_cap_flags;
>  	u32			max_msg_sz;
>  	u32			bad_pkey_cntr;
> @@ -1684,6 +1705,7 @@ struct ib_device {
>  	u32			     local_dma_lkey;
>  	u8                           node_type;
>  	u8                           phys_port_cnt;
> +	u64                         *core_cap_flags; /* Per port core
> capability flags */

Why are the core_cap_flags duplicated in the struct and in port attributes?

Doug Ledford May 4, 2015, 4:48 p.m. UTC | #4
On Mon, 2015-05-04 at 16:42 +0000, Hefty, Sean wrote:

> >  struct ib_port_attr {
> >  	enum ib_port_state	state;
> >  	enum ib_mtu		max_mtu;
> >  	enum ib_mtu		active_mtu;
> >  	int			gid_tbl_len;
> > +	u64                     core_cap_flags;
> >  	u32			port_cap_flags;
> >  	u32			max_msg_sz;
> >  	u32			bad_pkey_cntr;
> > @@ -1684,6 +1705,7 @@ struct ib_device {
> >  	u32			     local_dma_lkey;
> >  	u8                           node_type;
> >  	u8                           phys_port_cnt;
> > +	u64                         *core_cap_flags; /* Per port core
> > capability flags */
> 
> Why are the core_cap_flags duplicated in the struct and in port attributes?

Because the per port ib_port_attr struct is not visible to the core
code, it must call ib_query_port each time it wants to see it.  In order
to make the helpers avoid a call, you have to stash the cap_flags into
the ib_device struct.  In the past, the only thing there was the
node_type, and it was per device and not per port.  In order to make all
of this work and not require calls into the driver all the time, that
has to change to a per-port array.  Eventually we could get rid of the
node_type element once everything is in place.
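
A sketch of the kind of helper this caching enables (the helper name is hypothetical); note there is no ib_query_port() call back into the driver:

	static inline bool rdma_port_has_cap(const struct ib_device *dev,
					     u8 port_num, u64 cap)
	{
		/* core_cap_flags was filled in once at registration time */
		return dev->core_cap_flags[port_num] & cap;
	}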
Hefty, Sean May 4, 2015, 4:53 p.m. UTC | #5
> > >  struct ib_port_attr {
> > >  	enum ib_port_state	state;
> > >  	enum ib_mtu		max_mtu;
> > >  	enum ib_mtu		active_mtu;
> > >  	int			gid_tbl_len;
> > > +	u64                     core_cap_flags;
> > >  	u32			port_cap_flags;
> > >  	u32			max_msg_sz;
> > >  	u32			bad_pkey_cntr;
> > > @@ -1684,6 +1705,7 @@ struct ib_device {
> > >  	u32			     local_dma_lkey;
> > >  	u8                           node_type;
> > >  	u8                           phys_port_cnt;
> > > +	u64                         *core_cap_flags; /* Per port core
> > > capability flags */
> >
> > Why are the core_cap_flags duplicated in the struct and in port
> > attributes?
> 
> Because the per port ib_port_attr struct is not visible to the core
> code, it must call ib_query_port each time it wants to see it.  In order
> to make the helpers avoid a call, you have to stash the cap_flags into
> the ib_device struct.  In the past, the only thing there was the
> node_type, and it was per device and not per port.  In order to make all
> of this work and not require calls into the driver all the time, that
> has to change to a per-port array.  Eventually we could get rid of the
> node_type element once everything is in place.

I understand why they are in struct ib_device, but why are they duplicated in struct ib_port_attr?
Doug Ledford May 4, 2015, 4:56 p.m. UTC | #6
On Mon, 2015-05-04 at 16:53 +0000, Hefty, Sean wrote:
> > > >  struct ib_port_attr {
> > > >  	enum ib_port_state	state;
> > > >  	enum ib_mtu		max_mtu;
> > > >  	enum ib_mtu		active_mtu;
> > > >  	int			gid_tbl_len;
> > > > +	u64                     core_cap_flags;
> > > >  	u32			port_cap_flags;
> > > >  	u32			max_msg_sz;
> > > >  	u32			bad_pkey_cntr;
> > > > @@ -1684,6 +1705,7 @@ struct ib_device {
> > > >  	u32			     local_dma_lkey;
> > > >  	u8                           node_type;
> > > >  	u8                           phys_port_cnt;
> > > > +	u64                         *core_cap_flags; /* Per port core
> > > > capability flags */
> > >
> > > Why are the core_cap_flags duplicated in the struct and in port
> > attributes?
> > 
> > Because the per port ib_port_attr struct is not visible to the core
> > code, it must call ib_query_port each time it wants to see it.  In order
> > to make the helpers avoid a call, you have to stash the cap_flags into
> > the ib_device struct.  In the past, the only thing there was the
> > node_type, and it was per device and not per port.  In order to make all
> > of this work and not require calls into the driver all the time, that
> > has to change to a per-port array.  Eventually we could get rid of the
> > node_type element once everything is in place.
> 
> I understand why they are in struct ib_device, but why are they duplicated in struct ib_port_attr?

Because ib_query_port returns a struct, so this must be an element of
that struct or it can't be returned (without adding a new, single use
driver upcall).
Hefty, Sean May 4, 2015, 5:25 p.m. UTC | #7
> Because ib_query_port returns a struct, so this must be an element of
> that struct or it can't be returned (without adding a new, single use
> driver upcall).

Nm - This could be clearer if the functionality were folded into the 'read_port_table_lengths' function.
Ira Weiny May 4, 2015, 5:31 p.m. UTC | #8
> > Because ib_query_port returns a struct, so this must be an element of
> > that struct or it can't be returned (without adding a new, single use
> > driver upcall).
> 
> Nm - This could be clearer if the functionality were folded into the
> 'read_port_table_lengths' function.


I thought of doing that but then the function name no longer matches the functionality.

I could change it to:

read_per_port_info(...)

And add the bits there.

Ira
Hefty, Sean May 4, 2015, 5:34 p.m. UTC | #9
> > Nm - This could be clearer if the functionality were folded into the
> > 'read_port_table_lengths' function.
> 
> I thought of doing that but then the function name no longer matches the
> functionality.
> 
> I could change it to:
> 
> read_per_port_info(...)

That makes sense.
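
A rough sketch of the combined function, assuming the 1-based array layout that read_core_cap_flags() uses in this patch; allocation and error unwinding are elided:

	static int read_per_port_info(struct ib_device *device)
	{
		struct ib_port_attr tprops;
		u8 port_index;
		int ret;

		for (port_index = 0; port_index <= device->phys_port_cnt; ++port_index) {
			/* index 0 is only valid for switches */
			if (port_index == 0 && device->node_type != RDMA_NODE_IB_SWITCH)
				continue;

			ret = ib_query_port(device, port_index, &tprops);
			if (ret)
				return ret;

			device->gid_tbl_len[port_index]    = tprops.gid_tbl_len;
			device->pkey_tbl_len[port_index]   = tprops.pkey_tbl_len;
			device->core_cap_flags[port_index] = tprops.core_cap_flags;
		}
		return 0;
	}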
Doug Ledford May 4, 2015, 5:38 p.m. UTC | #10
On Mon, 2015-05-04 at 16:40 +0000, Hefty, Sean wrote:
> > > +/* Protocol                             0xFFFF000000000000 */
> > > +#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
> > > +#define RDMA_CORE_CAP_PROT_IBOE		0x0002000000000000ULL
> > > +#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
> > > +#define RDMA_CORE_CAP_PROT_USNIC_UDP	0x0008000000000000ULL
> > 
> > In accordance with what we've been talking about, drop IBOE for ROCE.
> > 
> > Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> > USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> > will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
> 
> USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC | UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being encapsulated.

I thought USNIC_UDP had an embedded USNIC protocol header inside the UDP
header.  That would make it a UDP_ENCAP protocol.

> RoCEv2 is IB transport over UDP.

Right, ROCE (or IB, whichever you prefer) encapsulated in UDP.

> I'm not sure what the protocol field is intended to imply.

There is still information in those bits that we can't get elsewhere.
For instance, even though this patch replaces the CAP_* stuff with bits,
if you took away the CAP_PROT_* entries, then there would be no entry to
identify USNIC at all.

Right now, you could infer iWARP from CAP_IW_CM.
You could infer InfiniBand from any of the CAP_IB_* (but later will need
a way to differentiate between IB and OPA)
You could infer ROCE from CAP_ETH_AH (but later will need a way to
differentiate between ROCE and ROCEv2)
The only way to differentiate USNIC at the moment, is that the CAPS
would be all 0.  That's not the sort of positive identification I would
prefer.

So you *could* reduce this to just one bit for USNIC.

And if you then add a UDP_ENCAP bit, then that single bit can do double
duty in telling apart USNIC and USNIC_UDP and ROCE and ROCEv2.

>   If we want
> to expose the link, network, transport, and RDMA protocols in use,
> shouldn't these be separate fields or bits?  And even then, I'm not
> sure what use this has for the ULPs.  iWarp does not require Ethernet
> or TCP.  RoCEv2 would work fine over any link.


>   And the core layer
> should not assume that a device is limited to supporting only one
> protocol, especially at the network and transport levels.

Given that this is a per port thing, there is no assumption about a
device only supporting a single protocol.

>   I vote for
> deprecating the protocol goofiness.
Hefty, Sean May 4, 2015, 6:26 p.m. UTC | #11
> I thought USNIC_UDP had an embedded USNIC protocol header inside the UDP
> header.  That would make it a UDP_ENCAP protocol.

Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just plain UDP, and a proprietary protocol that runs over Ethernet, but uses the same EtherType as RoCE.  I thought these could both be active on the same port at the same time.

> > RoCEv2 is IB transport over UDP.
> 
> Right, ROCE (or IB, whichever you prefer) encapsulated in UDP.
> 
> > I'm not sure what the protocol field is intended to imply.
> 
> There is still information in those bits that we can't get elsewhere.
> For instance, even though this patch replaces the CAP_* stuff with bits,
> if you took away the CAP_PROT_* entries, then there would be no entry to
> identify USNIC at all.
> 
> Right now, you could infer iWARP from CAP_IW_CM.
> You could infer InfiniBand from any of the CAP_IB_* (but later will need
> a way to differentiate between IB and OPA)
> You could infer ROCE from CAP_ETH_AH (but later will need a way to
> differentiate between ROCE and ROCEv2)
> The only way to differentiate USNIC at the moment, is that the CAPS
> would be all 0.  That's not the sort of positive identification I would
> prefer.
> 
> So you *could* reduce this to just one bit for USNIC.
> 
> And if you then add a UDP_ENCAP bit, then that single bit can do double
> duty in telling apart USNIC and USNIC_UDP and ROCE and ROCEv2.

My question is who needs these bits and why?  The primary reason it was exposed was to do the job that the new cap flags are accomplishing.

I still believe that RoCEv2 is conceptually the same as iWarp.  An RDMA protocol has been layered over some other transport.  In the case of iWarp, it's TCP.  In the case of RoCEv2, it's UDP.  Do we define a TCP_ENCAP bit that corresponds with UDP_ENCAP?  And why should an app care?  We don't specify whether the port is running IPv4 or IPv6.  Why is the transport level called out, but not the network layer?  Architecturally, neither iWarp or RoCEv2 (despite its name) cares what the link layer is.

> >   And the core layer
> > should not assume that a device is limited to supporting only one
> > protocol, especially at the network and transport levels.
> 
> Given that this is a per port thing, there is no assumption about a
> device only supporting a single protocol.

Device, no, but we are assuming this per port.  I don't think this is true for USNIC.  For that matter, it's entirely possible for a RoCEv2 device to expose UDP directly to user space, same as USNIC.  (I'd actually be surprised if no devices have this capability, if for debugging capabilities, even if for nothing else.)  What are we going to do if there's a device that supports both iWarp and RoCEv2?  That's easily doable today through software.

Basically, I question how protocol is defined and what it means to expose it as a device attribute.  Should it instead be negotiated (i.e. more like sockets)?  If an app needs a specific "RDMA protocol" (IB or iWarp) or "application protocol" (IB or iWarp or UDP or MADs?), they can request it.  Otherwise, the app gets assigned some protocol.  And the protocol should really be associated with a QP, rather than a port.

- Sean
Jason Gunthorpe May 4, 2015, 6:36 p.m. UTC | #12
On Mon, May 04, 2015 at 02:14:55AM -0400, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> Add Core capability flags to each port attribute and read those into ib_device
> upon registration for each port.

+1 on adding it to read_port_table_lengths

But this whole thing is starting to get goofy, 3rd add gets the work
to fix it up I guess.

Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
and allocate an array of them.

Actually, why not just allocate an array of ib_port_attrs and fill
that? Then you can use it for the mad size too.

Not in your patch, but why does read_port_table_lengths use a kmalloc
for what should be a stack allocation? Yuk.

Jason
Doug Ledford May 4, 2015, 7:40 p.m. UTC | #13
On Mon, 2015-05-04 at 18:26 +0000, Hefty, Sean wrote:

> > Given that this is a per port thing, there is no assumption about a
> > device only supporting a single protocol.
> 
> Device, no, but we are assuming this per port.

At a high level, there are certain things that will be per port because
they are tied to the link layer in question (you won't have a port with
IB_SA that doesn't have IB_SA on all its queue pairs on that port).

>   I don't think this is
> true for USNIC.

I can't speak for USNIC, I haven't read the driver for it closely
enough.

>   For that matter, it's entirely possible for a RoCEv2
> device to expose UDP directly to user space, same as USNIC.

I'm not sure I'm following what you are saying here.  USNIC is unique in
that it presents a NIC to the application that it has exclusive control
over.

>   (I'd
> actually be surprised if no devices have this capability, if for
> debugging capabilities, even if for nothing else.)  What are we going
> to do if there's a device that supports both iWarp and RoCEv2?  That's
> easily doable today through software.

It's doable, but nobody is actually doing it.

> Basically, I question how protocol is defined and what it means to
> expose it as a device attribute.  Should it instead be negotiated (i.e.
> more like sockets)?  If an app needs a specific "RDMA protocol" (IB or
> iWarp) or "application protocol" (IB or iWarp or UDP or MADs?), they
> can request it.  Otherwise, the app gets assigned some protocol.  And
> the protocol should really be associated with a QP, rather than a
> port.

If you're going down to that level (which we will when the RoCEv2
patches are integrated), then it doesn't quite map to QPs.  Since a UD
QP may need to talk to both RoCEv1 and RoCEv2 hosts, it's at the AH
level for RoCE.

To a certain extent you are right.  RoCE, RoCEv2, iWARP, and USNIC could
all reside on a single port by virtue of them all sharing an Ethernet
link layer.  IB link layer (and associated attributes) and OPA link
layer are the outliers in this method of thinking in that their ports
will only support their own transport.  So if we were to actually be
forward thinking about this, then the link layer is the important bit
you need to capture on a per port basis, not the transport technology.
We would have to modify lots of the stack to support per QP or per AH
transport settings though.
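
Hypothetically, the per-AH piece could be as small as one extra attribute; this is illustrative only, not a concrete proposal from the thread:

	struct ib_ah_attr {
		/* ... existing fields ... */
		u8	roce_version;	/* 1 = RoCEv1, 2 = RoCEv2, chosen per destination */
	};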
Dave Goodell (dgoodell) May 4, 2015, 8:21 p.m. UTC | #14
On May 4, 2015, at 1:26 PM, Hefty, Sean <sean.hefty@intel.com> wrote:

>> I thought USNIC_UDP had an embedded USNIC protocol header inside the UDP
>> header.  That would make it a UDP_ENCAP protocol.
> 
> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just plain UDP, and a proprietary protocol that runs over Ethernet, but uses the same EtherType as RoCE.  I thought these could both be active on the same port at the same time.

Sean's statements above are correct.  "USNIC_UDP" in today's code really is plain-old-UDP (over IP over Ethernet).

>>>  And the core layer
>>> should not assume that a device is limited to supporting only one
>>> protocol, especially at the network and transport levels.
>> 
>> Given that this is a per port thing, there is no assumption about a
>> device only supporting a single protocol.
> 
> Device, no, but we are assuming this per port.  I don't think this is true for USNIC.  

Correct, usNIC devices and ports can speak either format concurrently, IIRC even for different QPs in the same context/PD/etc.

Incidentally, usNIC devices only ever have a single port right now, but there's no reason that has to remain true.

> For that matter, it's entirely possible for a RoCEv2 device to expose UDP directly to user space, same as USNIC.  (I'd actually be surprised if no devices have this capability, if for debugging capabilities, even if for nothing else.)  What are we going to do if there's a device that supports both iWarp and RoCEv2?  That's easily doable today through software.


+1

-Dave

Jason Gunthorpe May 4, 2015, 9:07 p.m. UTC | #15
On Mon, May 04, 2015 at 03:40:25PM -0400, Doug Ledford wrote:

> So if we were to actually be forward thinking about this, then the
> link layer is the important bit you need to capture on a per port
> basis, not the transport technology.  We would have to modify lots
> of the stack to support per QP or per AH transport settings though.

We are going to have to do this, at least in the kernel side, the idea
that the port can imply the one and only address family of the QP is
certainly antiquated now. I think the roceev2 patches will require
this work.

As far as this patch series goes, I actually don't think the bits
matter one bit - the intent is to optimize the cap tests, so a
one-bit-per-cap test method is fine. Don't add extra bits we don't need
right now.

As an optimization, be aware of some of the restrictions: it is
actually pretty expensive on some arch's to load large 32 or 64 bit
numbers; smaller numbers are better, and a single bit test is better.

If this is combined with my thought to use a #define for all the
common driver cases then we can trivially rework the bit-field to
future needs. No reason to over architect something here.

Jason
Ira Weiny May 4, 2015, 10:32 p.m. UTC | #16
On Mon, May 04, 2015 at 12:36:57PM -0600, Jason Gunthorpe wrote:
> On Mon, May 04, 2015 at 02:14:55AM -0400, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > Add Core capability flags to each port attribute and read those into ib_device
> > upon registration for each port.
> 
> +1 on adding it to read_port_table_lengths
> 
> But this whole thing is starting to get goofy,
>

<quote>
> 3rd add gets the work
> to fix it up I guess.
</quote>

I don't understand this comment?

> 
> Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
> and allocate an array of them.
> 
> Actually, why not just allocate an array of ib_port_attrs and fill
> that? Then you can use it for the mad size too.

That was debated before and I was hoping to leave that for another day.

It does make some sense to roll it in here.

> 
> Not in your patch, but why does read_port_table_lengths use a kmalloc
> for what should be a stack allocation? Yuk.

I don't know.  I almost submitted a separate patch for that.  I will clean it
up when I combine the functions.

I also don't like the way the arrays in read_port_table_lengths are handled.
This requires a start_port call when comparing bits.  It is easier to just make
the array 1 based with index 0 valid only for switches.

This is part of the reason there is a separate function and array.
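
The difference being pointed at here, roughly:

	/* 0-based arrays need the start-port offset on every lookup: */
	tbl_len = device->pkey_tbl_len[port_num - start_port(device)];

	/* a 1-based array (index 0 reserved for switches) does not: */
	caps = device->core_cap_flags[port_num];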

Ira

> 
> Jason
Ira Weiny May 4, 2015, 11:16 p.m. UTC | #17
> > Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
> > and allocate an array of them.
> > 
> > Actually, why not just allocate an array of ib_port_attrs and fill
> > that? Then you can use it for the mad size too.
> 
> That was debated before and I was hoping to leave that for another day.
> 
> It does make some sense to roll it in here.

I remember more details now...

This came up in my original OPA patch series when Sean objected to me calling
the port attributes "cached_port_attr".

There are a number of values in the ib_port_attr structure which are not fixed
(state, active_mtu, lid, sm_lid, etc...)

Putting another copy of this data in the device will be confusing to know which
of these one can count on for the correct data.

I did not write the table length code so I am assuming that data is immutable.
As is the max MAD size, and these new capability bits.  But storing the entire
ib_port_attr structure is probably going to mislead someone.

Do we still think this is a good idea?  I think putting in a single struct with
the proper immutable data is the proper way to go.

Ira

Jason Gunthorpe May 4, 2015, 11:52 p.m. UTC | #18
On Mon, May 04, 2015 at 07:16:23PM -0400, ira.weiny wrote:
> > > Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
> > > and allocate an array of them.
> > > 
> > > Actually, why not just allocate an array of ib_port_attrs and fill
> > > that? Then you can use it for the mad size too.
> > 
> > That was debated before and I was hoping to leave that for another day.
> > 
> > It does make some sense to roll it in here.
> 
> I remember more details now...
> 
> This came up in my original OPA patch series when Sean objected to me calling
> the port attributes "cached_port_attr".

Yeah, not a great name for storing immutable values. port_properties
or something.

> There are a number of values in the ib_port_attr structure which are not fixed
> (state, active_mtu, lid, sm_lid, etc...)

Yes, there are a lot of those...

So, a new struct is better, and this has just gotten so messy.

- Update ib_alloc_device to accept a num_ports argument and
  create the port-port array at that point
- Have the drivers fill in their per port values before calling
  ib_register. Delete read_port_table_lengths
- Get ib_dealloc_device to free the list instead of unregister,
  feels like keeping that memory around for the duration of the kref
  is smarter..
- Drop gid_tbl_len and pkey_tbl_len for the new scheme
- Mark all the old immutable attrs in ib_port_attrs as deprecated and
  if any are easy to remove then do so..
- Don't add the caps or the max mad size immutables to port_attrs
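
One reading of that plan, sketched; the struct name and the ib_alloc_device() signature are hypothetical:

	/* Immutable per-port data, filled in by the driver before
	 * ib_register_device() and freed from ib_dealloc_device().
	 */
	struct ib_port_immutable {
		int	pkey_tbl_len;
		int	gid_tbl_len;
		u32	core_cap_flags;
		u32	max_mad_size;
	};

	/* alloc takes the port count so the array lives as long as the kref */
	struct ib_device *ib_alloc_device(size_t size, u8 num_ports);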

> <quote>
> > 3rd add gets the work
> > to fix it up I guess.
> </quote>
> I don't understand this comment?

gid_tbl_len, pkey_tbl_len were the first two to use a 'shortcut' here,
your caps and/or max mad size are the third to take the 'shortcut'.

Rule of threes: The third person to extend the same badly designed
widget gets to fix it properly.

Jason
Liran Liss May 5, 2015, 7:27 p.m. UTC | #19
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Hefty, Sean
> >
> > In accordance with what we've been talking about, drop IBOE for ROCE.
> >
> > Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> > USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> > will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
> 
> USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC |
> UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being
> encapsulated.

I agree that the UDP_ENCAP notion is confusing.
We should stick to ROCE_V1 and ROCE_V2.

> RoCEv2 is IB transport over UDP.
> 
> I'm not sure what the protocol field is intended to imply.  If we want to
> expose the link, network, transport, and RDMA protocols in use, shouldn't
> these be separate fields or bits?  And even then, I'm not sure what use this
> has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would
> work fine over any link.  And the core layer should not assume that a device is
> limited to supporting only one protocol, especially at the network and
> transport levels.  I vote for deprecating the protocol goofiness.
> 
> - Sean

The protocol notion might not have any value for ULPs, but it is useful for core
management code. Also, when we extend these notions to user-space, admins will
actually want to know what wire protocols a certain device can support, even
just for the sake of interoperability.

--Liran
Liran Liss May 5, 2015, 7:51 p.m. UTC | #20
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Hefty, Sean
> 
> > I thought USNIC_UDP had an embedded USNIC protocol header inside the
> > UDP header.  That would make it a UDP_ENCAP protocol.
> 
> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
> same EtherType as RoCE.  I thought these could both be active on the same
> port at the same time.

'protocol' refers to what your device generates when you do a post_send() on 
some QP.
In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
over IP. In the USNIC case, you might want the protocol to refer to addition
information the distinguishes this wire protocol rather than just "I am sending
UDP packets"...
In other words, I think that 'protocol' should uniquely distinguish interoperable
peers at the Verbs API level. We are *not* trying to describe a certain header,
but rather a stack of protocols.

> > > RoCEv2 is IB transport over UDP.
> >
> > Right, ROCE (or IB, whichever you prefer) encapsulated in UDP.
> >
> > > I'm not sure what the protocol field is intended to imply.
> >
> > There is still information in those bits that we can't get elsewhere.
> > For instance, even though this patch replaces the CAP_* stuff with
> > bits, if you took away the CAP_PROT_* entries, then there would be no
> > entry to identify USNIC at all.
> >
> > Right now, you could infer iWARP from CAP_IW_CM.
> > You could infer InfiniBand from any of the CAP_IB_* (but later will
> > need a way to differentiate between IB and OPA) You could infer ROCE
> > from CAP_ETH_AH (but later will need a way to differentiate between
> > ROCE and ROCEv2) The only way to differentiate USNIC at the moment, is
> > that the CAPS would be all 0.  That's not the sort of positive
> > identification I would prefer.
> >
> > So you *could* reduce this to just one bit for USNIC.
> >
> > And if you then add a UDP_ENCAP bit, then that single bit can do
> > double duty in telling apart USNIC and USNIC_UDP and ROCE and ROCEv2.
> 
> My question is who needs these bits and why?  The primary reason it was
> exposed was to do the job that the new cap flags are accomplishing.
> 
> I still believe that RoCEv2 is conceptually the same as iWarp.  An RDMA
> protocol has been layered over some other transport.  In the case of iWarp,
> it's TCP.  In the case of RoCEv2, it's UDP.  Do we define a TCP_ENCAP bit that
> corresponds with UDP_ENCAP?  And why should an app care?  We don't
> specify whether the port is running IPv4 or IPv6.  Why is the transport level
> called out, but not the network layer?  Architecturally, neither iWarp or
> RoCEv2 (despite its name) cares what the link layer is.

We don't call out the transport layer.
That's why we call it 'protocol' and not 'transport' !

> > >   And the core layer
> > > should not assume that a device is limited to supporting only one
> > > protocol, especially at the network and transport levels.
> >
> > Given that this is a per port thing, there is no assumption about a
> > device only supporting a single protocol.
> 
> Device, no, but we are assuming this per port.  I don't think this is true for
> USNIC.  For that matter, it's entirely possible for a RoCEv2 device to expose
> UDP directly to user space, same as USNIC.  (I'd actually be surprised if no
> devices have this capability, if for debugging capabilities, even if for nothing
> else.)  What are we going to do if there's a device that supports both iWarp
> and RoCEv2?  That's easily doable today through software.

If and when there are such devices, they can advertise multiple protocols.

> Basically, I question how protocol is defined and what it means to expose it
> as a device attribute.  Should it instead be negotiated (i.e. more like sockets)?
> If an app needs a specific "RDMA protocol" (IB or iWarp) or "application
> protocol" (IB or iWarp or UDP or MADs?), they can request it.  Otherwise, the
> app gets assigned some protocol.  And the protocol should really be
> associated with a QP, rather than a port.
> 
> - Sean

The port capabilities do just that: advertise capabilities.
The actual protocol selection for each transport entity happens later.

For ROCE RC QPs, for example, this occurs while modifying the QP to refer
to the desired GID index.

--Liran
Liran Liss May 5, 2015, 7:59 p.m. UTC | #21
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Jason Gunthorpe

> > So if we were to actually be forward thinking about this, then the
> > link layer is the important bit you need to capture on a per port
> > basis, not the transport technology.  We would have to modify lots of
> > the stack to support per QP or per AH transport settings though.
> 
> We are going to have to do this, at least in the kernel side, the idea that the
> port can imply the one and only address family of the QP is certainly
> antiquated now. I think the roceev2 patches will require this work.
> 

I second that the link layer should not be used for this purpose.
RDMA devices can and will advertise multiple rdma 'protocols', such as
ROCE devices that support both ROCE_V1 and ROCE_V2.
 
--Liran

Dave Goodell (dgoodell) May 5, 2015, 8:22 p.m. UTC | #22
On May 5, 2015, at 2:51 PM, Liran Liss <liranl@mellanox.com> wrote:

>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>> owner@vger.kernel.org] On Behalf Of Hefty, Sean
> 
>>> I thought USNIC_UDP had an embedded USNIC protocol header inside the
>>> UDP header.  That would make it a UDP_ENCAP protocol.
>> 
>> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
>> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
>> same EtherType as RoCE.  I thought these could both be active on the same
>> port at the same time.
>> 
> 
> 'protocol' refers to what your device generates when you do a post_send() on 
> some QP.
> In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
> over IP. In the USNIC case, you might want the protocol to refer to addition
> information the distinguishes this wire protocol rather than just "I am sending
> UDP packets"...

In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.  

[^^^^^ there continues to be confusion about this for some reason, but I don't know why]

> In other words, I think that 'protocol' should uniquely distinguish interoperable
> peers at the Verbs API level. We are *not* trying to describe a certain header,
> but rather a stack of protocols.

Any non-UDP "protocol" that might currently be in use over usNIC is an entirely application-level protocol outside of the view of the Verbs API level.

-Dave

Hefty, Sean May 5, 2015, 8:29 p.m. UTC | #23
> > In other words, I think that 'protocol' should uniquely distinguish
> > interoperable peers at the Verbs API level. We are *not* trying to
> > describe a certain header, but rather a stack of protocols.
> 
> Any non-UDP "protocol" that might currently be in use over usNIC is an
> entirely application-level protocol outside of the view of the Verbs API
> level.

I agree with this for usNIC.  For other technologies, the definition isn't even this simple.  UD QPs and RC QPs cannot interoperate, yet are treated as the same 'protocol'.  Just defining protocol as a 'stack of protocols' shows how broken the approach is.  And we haven't even tried to incorporate other 'protocols' needed to make this work, such as the CM or management protocols.
Liran Liss May 6, 2015, 2:25 p.m. UTC | #24
> From: Hefty, Sean [mailto:sean.hefty@intel.com]
> Sent: Tuesday, May 05, 2015 11:29 PM

> > > In other words, I think that 'protocol' should uniquely distinguish
> > > interoperable peers at the Verbs API level. We are *not* trying to
> > > describe a certain header, but rather a stack of protocols.
> >
> > Any non-UDP "protocol" that might currently be in use over usNIC is an
> > entirely application-level protocol outside of the view of the Verbs
> > API level.
> 

Fine. Then we can add a 'udp/ip' protocol for usNIC.
Similarly, 'raw Ethernet' can designate the ability to inject all headers starting from L2.

> I agree with this for usNIC.  For other technologies, the definition isn't even
> this simple.  UD QPs and RC QPs cannot interoperate, yet are treated as the
> same 'protocol'.  Just defining protocol as a 'stack of protocols' shows how
> broken the approach is.  And we haven't even tried to incorporate other
> 'protocols' needed to make this work, such as the CM or management
> protocols.

The interoperability between different transport objects depends on the technology.
I don't think that we should even attempt to describe these dependencies in any
abstract way. RTFM...

However, users should have a way of knowing what transports and what wire
protocol stacks a certain rdma device supports.

No one using IB or RoCE would think that UD and RC interoperate.
(If someone tries it, he would be disappointed 'gracefully'...)
But people that use RoCE would like to know which devices they can use to send
RoCE packets (or select a device that supports a certain RoCE version).

--Liran

Doug Ledford May 8, 2015, 6:56 p.m. UTC | #25
On Tue, 2015-05-05 at 19:27 +0000, Liran Liss wrote:
> > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> 
> > owner@vger.kernel.org] On Behalf Of Hefty, Sean
> > >
> > > In accordance with what we've been talking about, drop IBOE for ROCE.
> > >
> > > Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> > > USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> > > will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
> > 
> > USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC |
> > UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being
> > encapsulated.
> > 
> 
> I agree that the UDP_ENCAP notion is confusing.
> We should stick to ROCE_V1 and ROCE_V2.
> 
> > RoCEv2 is IB transport over UDP.
> > 
> > I'm not sure what the protocol field is intended to imply.  If we want to
> > expose the link, network, transport, and RDMA protocols in use, shouldn't
> > these be separate fields or bits?  And even then, I'm not sure what use this
> > has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would
> > work fine over any link.  And the core layer should not assume that a device is
> > limited to supporting only one protocol, especially at the network and
> > transport levels.  I vote for deprecating the protocol goofiness.
> > 
> > - Sean
> 
> The protocol notion might not have any value for ULPs, but it is useful for core
> management code. Also, when we extend these notions to user-space, admins will
> actually want to know what wire protocols a certain device can support, even
> just for the sake of interoperability.

So I've been thinking a bit more about the overall architecture of the
underlying patch set from Michael.  The structure of that patch set is
to a certain extent dictating this follow on patch set and so any
problems in it end up being transmitted here as well.

Sean's comments about transports I think were spot on, and they got me
doing a little thought exercise in my head.

Theoretically, if we were to take the SoftiWARP and SoftRoCE drivers, we
could merge them down to one driver that supported both iWARP and RoCE
(and we could add USNIC too I suspect).  Because all of these things
work on an Ethernet link layer, there is no reason they couldn't all be
presented on the same device.  So if you had a single verbs device that
could support all of these things, where would you need to capture and
store the relevant information?

This is what I have so far:

On a per device basis: not much to be honest, node_type IB_CA/RNIC for
back compatibility and that's pretty much it

On a per port basis: link layer (Ethernet, IB, or OPA).  Notice that I
didn't include the transport here.  Two of the above link layers
strictly imply their transport.  The third link layer could have
multiple transports.  Which brings me to my next level...

On a per queue pair basis: if (link_layer == Ethernet) iWARP/RoCE/USNIC
                           if (qp_type == RoCE && qp != UD) RoCEv1/RoCEv2

On a per ah basis: if (qp_type == RoCE && qp == UD) RoCEv1/RoCEv2

Michael's patch set was intended to give us a framework within which we
can start cleaning things up.  Once we get to cleaning things up, I
think the above is where we should target storing the relevant
information.

Thoughts?
Hefty, Sean May 8, 2015, 8:06 p.m. UTC | #26
> This is what I have so far:
> 
> On a per device basis: not much to be honest, node_type IB_CA/RNIC for
> back compatibility and that's pretty much it
> 
> On a per port basis: link layer (Ethernet, IB, or OPA).  Notice that I
> didn't include the transport here.  Two of the above link layers
> strictly imply their transport.  The third link layer could have
> multiple transports.  Which brings me to my next level...
> 
> On a per queue pair basis: if (link_layer == Ethernet) iWARP/RoCE/USNIC
>                            if (qp_type == RoCE && qp != UD) RoCEv1/RoCEv2
> 
> On a per ah basis: if (qp_type == RoCE && qp == UD) RoCEv1/RoCEv2
> 
> Michael's patch set was intended to give us a framework within which we
> can start cleaning things up.  Once we get to cleaning things up, I
> think the above is where we should target storing the relevant
> information.
> 
> Thoughts?

I'm not following your intent here.

The qp_type values are currently defined as RC, UC, UD, XRC, plus some weirdness like SMI, GSI.  Are you suggesting that we store the relevant information as part of the qp_type, or that we keep the qp_type as-is?
 
Once Michael's patches are integrated, do apps need anything else beyond the qp type as currently defined?
Doug Ledford May 8, 2015, 8:30 p.m. UTC | #27
On Fri, 2015-05-08 at 20:06 +0000, Hefty, Sean wrote:
> > This is what I have so far:
> > 
> > On a per device basis: not much to be honest, node_type IB_CA/RNIC for
> > back compatibility and that's pretty much it
> > 
> > On a per port basis: link layer (Ethernet, IB, or OPA).  Notice that I
> > didn't include the transport here.  Two of the above link layers
> > strictly imply their transport.  The third link layer could have
> > multiple transports.  Which brings me to my next level...
> > 
> > On a per queue pair basis: if (link_layer == Ethernet) iWARP/RoCE/USNIC
> >                          if (qp_type == RoCE && qp != UD) RoCEv1/RoCEv2
> > 
> > On a per ah basis: if (qp_type == RoCE && qp == UD) RoCEv1/RoCEv2
> > 
> > Michael's patch set was intended to give us a framework within which we
> > can start cleaning things up.  Once we get to cleaning things up, I
> > think the above is where we should target storing the relevant
> > information.
> > 
> > Thoughts?
> 
> I'm not following your intent here.
> 
> The qp_type values are currently defined as RC, UC, UD, XRC, plus some
> weirdness like SMI, GSI.  Are you suggesting that we store the relevant
> information as part of the qp_type, or that we keep the qp_type as-is?

Well, you note I wrote qp != UD, whereas that's really the qp_type, so
the above was pseudocode at best.  I wasn't necessarily suggesting where
in the qp data struct to store it, just that even though there isn't
hardware that does this (yet), there's no reason hardware couldn't be
designed to support both iWARP and RoCE and USNIC over the same Ethernet
link layer, and merging the SoftRoCE and SoftiWARP drivers would make a
proof of concept, and so we would *have* to store that information on a
per QP basis.

> Once Michael's patches are integrated, do apps need anything else
> beyond the qp type as currently defined?

If you had a device that supported iWARP and RoCE on the same physical
link layer, then yes, the app would need a means of saying which
transport to use in addition to the type of QP to establish.
Hefty, Sean May 8, 2015, 8:56 p.m. UTC | #28
> Well, you note I wrote qp != UD, whereas that's really the qp_type, so
> the above was pseudocode at best.  I wasn't necessarily suggesting where
> in the qp data struct to store it, just that even though there isn't
> hardware that does this (yet), there's no reason hardware couldn't be
> designed to support both iWARP and RoCE and USNIC over the same Ethernet
> link layer, and merging the SoftRoCE and SoftiWARP drivers would make a
> proof of concept, and so we would *have* to store that information on a
> per QP basis.
> 
> > Once Michael's patches are integrated, do apps need anything else
> > beyond the qp type as currently defined?
> 
> If you had a device that supported iWARP and RoCE on the same physical
> link layer, then yes, the app would need a means of saying which
> transport to use in addition to the type of QP to establish.

Ah - got it now.  And I agree, there should be some way to specify this at the QP level.
Jason Gunthorpe May 8, 2015, 9:48 p.m. UTC | #29
On Fri, May 08, 2015 at 08:56:16PM +0000, Hefty, Sean wrote:

> > If you had a device that supported iWARP and RoCE on the same physical
> > link layer, then yes, the app would need a means of saying which
> > transport to use in addition to the type of QP to establish.
> 
> Ah - got it now.  And I agree, there should be some way to specify
> this at the QP level.

Yes, the only way out is to specify on a per QP basis the addressing
and protocol/transport/whatever thing. Socket uses the AF,SOCK,PROTO
tuple to specify this information. We can probably productively use a
similar breakdown:

AF_IB,SOCK_RC,PROTO_IBA  // InfiniBand
AF_OPA,SOCK_RC,PROTO_IBA // Future 32 bit LID OPA 'InfiniBand'
AF_ETH,SOCK_RC,PROTO_IBA // RoCEv1
AF_INET,SOCK_RC,PROTO_IBA // InfiniBand or RoCEv2, depending on the Link Layer
AF_ETH,SOCK_RC,PROTO_USNIC
AF_INET,SOCK_RC,PROTO_USNIC
AF_INET,SOCK_RC,PROTO_IWARP

The expectation would be that any SOCK,PROTO combination can
'interoperate' with any AF combination - which is true with the above
examples, ie with the right magic NAT hardware RoCEv2/v1/IB/(OPA?) can
all interwork at the BTH layer.
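
Speculatively, such a tuple could be supplied at QP creation time; the names below are made up to mirror the sockets analogy:

	struct rdma_qp_proto {
		int	af;	/* AF_IB, AF_OPA, AF_ETH, AF_INET, ... */
		int	sock;	/* SOCK_RC, SOCK_UC, SOCK_UD, ... */
		int	proto;	/* PROTO_IBA, PROTO_IWARP, PROTO_USNIC */
	};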

Which is also to say, it unambiguously implies which standard
defines the subtle verbs variations that the app has to cope with.

Incompatible AH's can not be used with the wrong type of QP, the CM
layers would auto create QPs with the correct type to reflect how the
CM process was run, etc.

But Michael's patches are still an improvement, ie a RoCE + iWarp port
would still have the cap_mad as true and would still have to create a
QP1 interface, which I think is captured OK. The CM side probably
explodes if a port is both iWarp and RoCE, but that is a lot of work to
fix anyhow..

I think we have to tackle the addressing problem as part of the RoCEv2
stuff, just burying all this in a qp index is really horrible.

Realistically, today, if a RoCE/iWarp driver appeared then it would
have to present to the system as two RDMA devices.

Jason
Doug Ledford May 9, 2015, 3:32 a.m. UTC | #30
On Tue, 2015-05-05 at 20:22 +0000, Dave Goodell (dgoodell) wrote:
> On May 5, 2015, at 2:51 PM, Liran Liss <liranl@mellanox.com> wrote:
> 
> >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> >> owner@vger.kernel.org] On Behalf Of Hefty, Sean
> > 
> >>> I thought USNIC_UDP had an embedded USNIC protocol header inside the
> >>> UDP header.  That would make it a UDP_ENCAP protocol.
> >> 
> >> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
> >> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
> >> same EtherType as RoCE.  I thought these could both be active on the same
> >> port at the same time.
> >> 
> > 
> > 'protocol' refers to what your device generates when you do a post_send() on 
> > some QP.
> > In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
> > over IP. In the USNIC case, you might want the protocol to refer to additional
> > information that distinguishes this wire protocol rather than just "I am sending
> > UDP packets"...
> 
> In the case that usNIC is operating in UDP mode (which is the
> overwhelming majority of the cases), there is absolutely no additional
> protocol that ends up on the wire or headers in the user buffers
> besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just
> happen to be sent via OS-bypass queues instead of traveling through the
> kernel networking stack.  
> 
> [^^^^^ there continues to be confusion about this for some reason, but
> I don't know why]

I really need to just sit down and read your driver front to back.  The
confusion probably comes from people (such as myself) who first think
about this and go "Well, if you have multiple queue pairs, and UDP, then
you need a header to tell what QP a packet goes to" (which assumes a
RoCE-like usage of UDP where all packets go to the same UDP address),
whereas your actual usage is probably more iWARP-like in that the IP/UDP
SRC/DST combo maps to a specific QP, right?
Ira Weiny May 11, 2015, 6:42 a.m. UTC | #31
On Tue, May 05, 2015 at 08:22:21PM +0000, Dave Goodell (dgoodell) wrote:
> On May 5, 2015, at 2:51 PM, Liran Liss <liranl@mellanox.com> wrote:
> 
> >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> >> owner@vger.kernel.org] On Behalf Of Hefty, Sean
> > 
> >>> I thought USNIC_UDP had an embedded USNIC protocol header inside the
> >>> UDP header.  That would make it a UDP_ENCAP protocol.
> >> 
> >> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
> >> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
> >> same EtherType as RoCE.  I thought these could both be active on the same
> >> port at the same time.
> >> 
> > 
> > 'protocol' refers to what your device generates when you do a post_send() on 
> > some QP.
> > In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
> > over IP. In the USNIC case, you might want the protocol to refer to additional
> > information that distinguishes this wire protocol rather than just "I am sending
> > UDP packets"...
> 
> In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.  
> 
> [^^^^^ there continues to be confusion about this for some reason, but I don't know why]

So what is this patch for?

commit 248567f79304b953ea492fb92ade097b62ed09b2
Author: Upinder Malhi <umalhi@cisco.com>
Date:   Thu Jan 9 14:48:19 2014 -0800

IB/core: Add RDMA_TRANSPORT_USNIC_UDP

Add RDMA_TRANSPORT_USNIC_UDP which will be used by usNIC.

Signed-off-by: Upinder Malhi <umalhi@cisco.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>

This is probably where a lot of the confusion is coming from.

Ira

Dave Goodell (dgoodell) May 11, 2015, 10:19 p.m. UTC | #32
On May 8, 2015, at 8:32 PM, Doug Ledford <dledford@redhat.com> wrote:

> On Tue, 2015-05-05 at 20:22 +0000, Dave Goodell (dgoodell) wrote:
>> 
>> In the case that usNIC is operating in UDP mode (which is the
>> overwhelming majority of the cases), there is absolutely no additional
>> protocol that ends up on the wire or headers in the user buffers
>> besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just
>> happen to be sent via OS-bypass queues instead of traveling through the
>> kernel networking stack.  
>> 
>> [^^^^^ there continues to be confusion about this for some reason, but
>> I don't know why]
> 
> I really need to just sit down and read your driver front to back.

Feel free to ping me off-list if you want any explanation about what's going on in the usnic_verbs module.  I'm happy to help.

> The
> confusion probably comes from people (such as myself) who first think
> about this and go "Well, if you have multiple queue pairs, and UDP, then
> you need a header to tell what QP a packet goes to" (which assumes a
> RoCE-like usage of UDP where all packets go to the same UDP address),
> whereas your actual usage is probably more iWARP-like in that the IP/UDP
> SRC/DST combo maps to a specific QP, right?

I don't know much about the details of iWARP, but yes, a UDP/IP src/dst port combo maps to a specific QP when using USNIC_UDP QPs.  We do _not_ use a single well-known port.
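
In socket terms, a rough sketch of that demux (illustrative only, not
the driver's real structures):

#include <linux/types.h>

/* Each key is owned by exactly one QP; the adapter steers any received
 * packet whose UDP/IP tuple matches straight to that QP's receive
 * queue, much like a kernel socket lookup done in hardware. */
struct usnic_flow_key {
	__be32	src_ip;
	__be32	dst_ip;
	__be16	src_port;
	__be16	dst_port;	/* no single well-known port */
};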

-Dave

Dave Goodell (dgoodell) May 11, 2015, 10:26 p.m. UTC | #33
On May 10, 2015, at 11:42 PM, ira.weiny <ira.weiny@intel.com> wrote:

> On Tue, May 05, 2015 at 08:22:21PM +0000, Dave Goodell (dgoodell) wrote:
>> 
>> In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.  
>> 
>> [^^^^^ there continues to be confusion about this for some reason, but I don't know why]
> 
> So what is this patch for?

Does my earlier email clarify the situation any?  http://marc.info/?l=linux-rdma&m=142972178630720&w=2

> commit 248567f79304b953ea492fb92ade097b62ed09b2
> Author: Upinder Malhi <umalhi@cisco.com>
> Date:   Thu Jan 9 14:48:19 2014 -0800
> 
> IB/core: Add RDMA_TRANSPORT_USNIC_UDP
> 
> Add RDMA_TRANSPORT_USNIC_UDP which will be used by usNIC.
> 
> Signed-off-by: Upinder Malhi <umalhi@cisco.com>
> Signed-off-by: Roland Dreier <roland@purestorage.com>
> 
> This is probably where a lot of the confusion is coming from.

Arguably RDMA_TRANSPORT_USNIC_UDP could/should have simply been named RDMA_TRANSPORT_UDP.

-Dave

Jason Gunthorpe May 11, 2015, 10:39 p.m. UTC | #34
On Mon, May 11, 2015 at 10:19:29PM +0000, Dave Goodell (dgoodell) wrote:

> > I really need to just sit down and read your driver front to back.
> 
> Feel free to ping me off-list if you want any explanation about
> what's going on in the usnic_verbs module.  I'm happy to help.

So, I look at this:

int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                                struct ib_send_wr **bad_wr)
{
        usnic_dbg("\n");
        return -EINVAL;
}

And conclude that usnic doesn't actually provide any RDMA services to
in-kernel users, correct?

I'd then guess the entire point of this is to hack some non-RDMA
device to export its interface via uverbs?  Presumably because someone
nack'd using your own char device or whatever?

Jason
Ira Weiny May 12, 2015, 2:13 a.m. UTC | #35
> 
> On May 10, 2015, at 11:42 PM, ira.weiny <ira.weiny@intel.com> wrote:
> 
> > On Tue, May 05, 2015 at 08:22:21PM +0000, Dave Goodell (dgoodell) wrote:
> >>
> >> In the case that usNIC is operating in UDP mode (which is the
> >> overwhelming majority of the cases), there is absolutely no additional
> >> protocol that ends up on the wire or headers in the user buffers
> >> besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just
> >> happen to be sent via OS-bypass queues instead of traveling through
> >> the kernel networking stack.
> >>
> >> [^^^^^ there continues to be confusion about this for some reason,
> >> but I don't know why]
> >
> > So what is this patch for?
> 
> Does my earlier email clarify the situation any?  http://marc.info/?l=linux-rdma&m=142972178630720&w=2

Somewhat.  Is there any reason applications need to distinguish between the "legacy RDMA_TRANSPORT_USNIC type" and the "current RDMA_TRANSPORT_USNIC_UDP type"?

Or does the former no longer exist?

> 
> > commit 248567f79304b953ea492fb92ade097b62ed09b2
> > Author: Upinder Malhi <umalhi@cisco.com>
> > Date:   Thu Jan 9 14:48:19 2014 -0800
> >
> > IB/core: Add RDMA_TRANSPORT_USNIC_UDP
> >
> > Add RDMA_TRANSPORT_USNIC_UDP which will be used by usNIC.
> >
> > Signed-off-by: Upinder Malhi <umalhi@cisco.com>
> > Signed-off-by: Roland Dreier <roland@purestorage.com>
> >
> > This is probably where a lot of the confusion is coming from.
> 
> Arguably RDMA_TRANSPORT_USNIC_UDP could/should have simply been
> named RDMA_TRANSPORT_UDP.

I guess I'm wondering if there needs to be an RDMA_TRANSPORT_USNIC to represent the "legacy RDMA_TRANSPORT_USNIC type" you mention in the link above.

Ira

Dave Goodell (dgoodell) May 12, 2015, 5:41 p.m. UTC | #36
On May 11, 2015, at 3:39 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> So, I look at this:
> 
> int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
>                                struct ib_send_wr **bad_wr)
> {
>        usnic_dbg("\n");
>        return -EINVAL;
> }
> 
> And conclude that usnic doesn't actually provide any RDMA services to
> in-kernel users, correct?

Correct.

> I'd then guess the entire point of this is to hack some non-RDMA
> device to export it's interface via uverbs?

If by "non-RDMA", you mean "does not currently support hardware offload of direct data placement", then yes.  There are a couple of things to keep in mind:

1) usNIC is UD-only at this time.  Are there any kernel clients that make use of UD without also requiring RC? (this is a genuine question) 

2) When using UDP, usNIC deposits 42-byte UDP/IP/Ethernet headers into the start of the recv buffer, not a 40-byte GRH.  Unfortunately, the current verbs stack has no way to indicate the device's/port's/QP's header size, let alone format, to any client.  So clients must be aware they are using a usNIC device under the hood.
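
As a sketch of what that means for a client walking its receive buffers
(the constant and function names here are made up):

/* 42 = 14 (Ethernet) + 20 (IPv4) + 8 (UDP), versus the 40-byte GRH a
 * classic verbs UD client expects at the head of the buffer. */
#define USNIC_UDP_HDR_LEN	42
#define VERBS_GRH_LEN		40

static inline void *usnic_udp_payload(void *recv_buf)
{
	return (char *)recv_buf + USNIC_UDP_HDR_LEN;
}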

> Presumably because someone
> nack'd using your own char device or whatever?

I wasn't participating on the kernel side of the usNIC project at the time that some of the early decisions were made, but to my knowledge the uverbs stack was selected as the most appropriate way to expose this technology to users after evaluation of several approaches.  I don't think that any other approach was previously submitted and NACK-ed.

-Dave

Dave Goodell (dgoodell) May 12, 2015, 5:56 p.m. UTC | #37
On May 11, 2015, at 7:13 PM, Weiny, Ira <ira.weiny@intel.com> wrote:

>> Does my earlier email clarify the situation any?  http://marc.info/?l=linux-
>> rdma&m=142972178630720&w=2
> 
> Somewhat, is there any reason applications need to distinguish between the " The legacy RDMA_TRANSPORT_USNIC type" and " The current RDMA_TRANSPORT_USNIC_UDP type"?

Addressing is different between the two (Ethernet_MAC+QP# vs. IP+UDP_port, respectively).

Also, the received UD header format and size are different between them (40-byte synthetic GRH vs. 42-byte UDP/IP/Ethernet header).
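
Roughly, the two address forms compare like this (illustrative
structures, not the actual uAPI):

#include <linux/types.h>

struct usnic_addr_legacy {	/* RDMA_TRANSPORT_USNIC */
	u8	dmac[6];	/* Ethernet MAC (ETH_ALEN) */
	u32	qp_num;		/* QP number completes the address */
};

struct usnic_addr_udp {		/* RDMA_TRANSPORT_USNIC_UDP */
	__be32	ip_addr;	/* IPv4 address */
	__be16	udp_port;	/* UDP port selects the QP */
};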

> Or does the former no longer exist?

The former is no longer being deployed and should be rare to find in the wild, though I do not know precisely how many customers may still be using it.  It is definitely legacy at this point.

>> Arguably RDMA_TRANSPORT_USNIC_UDP could/should have simply been
>> named RDMA_TRANSPORT_UDP.
> 
> I guess I'm wondering if there needs to be an RDMA_TRANSPORT_USNIC to represent the " The legacy RDMA_TRANSPORT_USNIC type" you mention in the link above.

It probably needs to exist for a little while still.

-Dave

Jason Gunthorpe May 12, 2015, 6:08 p.m. UTC | #38
On Tue, May 12, 2015 at 05:41:14PM +0000, Dave Goodell (dgoodell) wrote:

> > I'd then guess the entire point of this is to hack some non-RDMA
> > device to export it's interface via uverbs?
> 
> If by "non-RDMA", you mean "does not currently support hardware
> offload of direct data placement", then yes.  There are a couple of
> things to keep in mind:

I actually mean 'does not provide RDMA services to the kernel'.

What actually happens if you, for instance, load the iser module and
try and use it with usnic?

What parts of the RDMA core code does usnic actually use beyond
command marshaling through uverbs?

Would things be easier for you if usnic lived within DPDK?

Jason
Liran Liss May 13, 2015, 8:41 p.m. UTC | #39
> From: Jason Gunthorpe [mailto:jgunthorpe@obsidianresearch.com]

> > > If you had a device that supported iWARP and RoCE on the same
> > > physical link layer, then yes, the app would need a means of saying
> > > which transport to use in addition to the type of QP to establish.
> >
> > Ah - got it now.  And I agree, there should be some way to specify
> > this at the QP level.

Supporting iWARP and RoCE over the same RDMA device makes no sense.
First of all, it is confusing to the user. It's like supporting SRP and iSCSI
using the same SCSI device instance...

Second, different technologies affect much more than the QP. 
A technology determines the format and semantics of the whole interface.
For example, AHs, memory windows, atomics, and CQE formats are different as well.
Just thinking of how to abstract all these in a single device is mind-boggling.

Finally, there is no sane way to describe device capabilities. There is no
single set of capabilities that applies to all of these technologies consistently.

> 
> Yes, the only way out is to specify on a per QP basis the addressing and
> protocol/transport/whatever thing. Socket uses the AF,SOCK,PROTO tuple to
> specify this information. We can probably productively use a similar
> breakdown:

We might consider such abstractions at the CMA layer, not at the device level.
Essentially, this is what the CMA was intended for.

> 
> AF_IB,SOCK_RC,PROTO_IBA  // InfiniBand
> AF_OPA,SOCK_RC,PROTO_IBA // Future 32 bit LID OPA 'InfiniBand'

This is not PROTO_IBA.

> AF_ETH,SOCK_RC,PROTO_IBA // RoCEv1
> AF_INET,SOCK_RC,PROTO_IBA // InfiniBand or RoCEv2, depending on the Link Layer
> AF_ETH,SOCK_RC,PROTO_USNIC
> AF_INET,SOCK_RC,PROTO_USNIC
> AF_INET,SOCK_RC,PROTO_IWARP
> 

[snip]
> 
> 
> Realistically, today, if a RoCE/iWarp driver appeared then it would have to
> present to the system as two RDMA devices.
> 

Exactly!
--Liran
diff mbox

Patch

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b360350..6a37255 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -262,6 +262,37 @@  out:
 	return ret;
 }
 
+static int read_core_cap_flags(struct ib_device *device)
+{
+	struct ib_port_attr tprops;
+	int num_ports, ret = -ENOMEM;
+	u8 port_index;
+
+	num_ports = device->phys_port_cnt;
+
+	device->core_cap_flags = kzalloc(sizeof(*device->core_cap_flags)
+						* (num_ports+1),
+					 GFP_KERNEL);
+	if (!device->core_cap_flags)
+		return -ENOMEM;
+
+	for (port_index = 0; port_index <= num_ports; ++port_index) {
+		if ((port_index == 0 && device->node_type != RDMA_NODE_IB_SWITCH))
+			continue;
+
+		ret = ib_query_port(device, port_index, &tprops);
+		if (ret)
+			goto err;
+
+		device->core_cap_flags[port_index] = tprops.core_cap_flags;
+	}
+
+	return 0;
+err:
+	kfree(device->core_cap_flags);
+	return ret;
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
@@ -302,12 +333,21 @@  int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
+	ret = read_core_cap_flags(device);
+	if (ret) {
+		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
+		kfree(device->gid_tbl_len);
+		kfree(device->pkey_tbl_len);
+		goto out;
+	}
+
 	ret = ib_device_register_sysfs(device, port_callback);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
 		kfree(device->gid_tbl_len);
 		kfree(device->pkey_tbl_len);
+		kfree(device->core_cap_flags);
 		goto out;
 	}
 
@@ -351,6 +391,7 @@  void ib_unregister_device(struct ib_device *device)
 
 	kfree(device->gid_tbl_len);
 	kfree(device->pkey_tbl_len);
+	kfree(device->core_cap_flags);
 
 	mutex_unlock(&device_mutex);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c724114..4de2758 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -353,11 +353,32 @@  union rdma_protocol_stats {
 	struct iw_protocol_stats	iw;
 };
 
+/* Define bits for the various functionality this port needs to be supported by
+ * the core.
+ */
+/* Management                           0x00000000FFFFFFFF */
+#define RDMA_CORE_CAP_IB_MAD		0x0000000000000001ULL
+#define RDMA_CORE_CAP_IB_SMI		0x0000000000000002ULL
+#define RDMA_CORE_CAP_IB_CM		0x0000000000000004ULL
+#define RDMA_CORE_CAP_IW_CM		0x0000000000000008ULL
+#define RDMA_CORE_CAP_IB_SA		0x0000000000000010ULL
+
+/* Address format                       0x0000FFFF00000000 */
+#define RDMA_CORE_CAP_AF_IB		0x0000000100000000ULL
+#define RDMA_CORE_CAP_ETH_AH		0x0000000200000000ULL
+
+/* Protocol                             0xFFFF000000000000 */
+#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
+#define RDMA_CORE_CAP_PROT_IBOE		0x0002000000000000ULL
+#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
+#define RDMA_CORE_CAP_PROT_USNIC_UDP	0x0008000000000000ULL
+
 struct ib_port_attr {
 	enum ib_port_state	state;
 	enum ib_mtu		max_mtu;
 	enum ib_mtu		active_mtu;
 	int			gid_tbl_len;
+	u64                     core_cap_flags;
 	u32			port_cap_flags;
 	u32			max_msg_sz;
 	u32			bad_pkey_cntr;
@@ -1684,6 +1705,7 @@  struct ib_device {
 	u32			     local_dma_lkey;
 	u8                           node_type;
 	u8                           phys_port_cnt;
+	u64                         *core_cap_flags; /* Per port core capability flags */
 };
 
 struct ib_client {