Message ID | 1430720099-32512-2-git-send-email-ira.weiny@intel.com (mailing list archive) |
---|---|
State | Rejected |
On Mon, 2015-05-04 at 02:14 -0400, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
>
> Add Core capability flags to each port attribute and read those into ib_device
> upon registration for each port.
>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ---
>  drivers/infiniband/core/device.c | 41 ++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h          | 22 ++++++++++++++++++++
>  2 files changed, 63 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index b360350..6a37255 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -262,6 +262,37 @@ out:
>  	return ret;
>  }
>
> +static int read_core_cap_flags(struct ib_device *device)
> +{
> +	struct ib_port_attr tprops;
> +	int num_ports, ret = -ENOMEM;
> +	u8 port_index;
> +
> +	num_ports = device->phys_port_cnt;
> +
> +	device->core_cap_flags = kzalloc(sizeof(*device->core_cap_flags)
> +					 * (num_ports+1),
> +					 GFP_KERNEL);
> +	if (!device->core_cap_flags)
> +		return -ENOMEM;
> +
> +	for (port_index = 0; port_index <= num_ports; ++port_index) {
> +		if ((port_index == 0 && device->node_type != RDMA_NODE_IB_SWITCH))
> +			continue;
> +
> +		ret = ib_query_port(device, port_index, &tprops);
> +		if (ret)
> +			goto err;
> +
> +		device->core_cap_flags[port_index] = tprops.core_cap_flags;
> +	}
> +
> +	return 0;
> +err:
> +	kfree(device->core_cap_flags);
> +	return ret;
> +}
> +
>  /**
>   * ib_register_device - Register an IB device with IB core
>   * @device:Device to register
> @@ -302,12 +333,21 @@ int ib_register_device(struct ib_device *device,
>  		goto out;
>  	}
>
> +	ret = read_core_cap_flags(device);
> +	if (ret) {
> +		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
> +		kfree(device->gid_tbl_len);
> +		kfree(device->pkey_tbl_len);
> +		goto out;
> +	}
> +
>  	ret = ib_device_register_sysfs(device, port_callback);
>  	if (ret) {
>  		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
>  		       device->name);
>  		kfree(device->gid_tbl_len);
>  		kfree(device->pkey_tbl_len);
> +		kfree(device->core_cap_flags);
>  		goto out;
>  	}
>
> @@ -351,6 +391,7 @@ void ib_unregister_device(struct ib_device *device)
>
>  	kfree(device->gid_tbl_len);
>  	kfree(device->pkey_tbl_len);
> +	kfree(device->core_cap_flags);
>
>  	mutex_unlock(&device_mutex);
>
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index c724114..4de2758 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -353,11 +353,32 @@ union rdma_protocol_stats {
>  	struct iw_protocol_stats iw;
>  };
>
> +/* Define bits for the various functionality this port needs to be supported by
> + * the core.
> + */
> +/* Management                          0x00000000FFFFFFFF */
> +#define RDMA_CORE_CAP_IB_MAD           0x0000000000000001ULL
> +#define RDMA_CORE_CAP_IB_SMI           0x0000000000000002ULL
> +#define RDMA_CORE_CAP_IB_CM            0x0000000000000004ULL
> +#define RDMA_CORE_CAP_IW_CM            0x0000000000000008ULL
> +#define RDMA_CORE_CAP_IB_SA            0x0000000000000010ULL
> +
> +/* Address format                      0x0000FFFF00000000 */
> +#define RDMA_CORE_CAP_AF_IB            0x0000000100000000ULL
> +#define RDMA_CORE_CAP_ETH_AH           0x0000000200000000ULL
> +
> +/* Protocol                            0xFFFF000000000000 */
> +#define RDMA_CORE_CAP_PROT_IB          0x0001000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IBOE        0x0002000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IWARP       0x0004000000000000ULL
> +#define RDMA_CORE_CAP_PROT_USNIC_UDP   0x0008000000000000ULL

In accordance with what we've been talking about, drop IBOE for ROCE.

Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP. USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1 will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.

> +
>  struct ib_port_attr {
>  	enum ib_port_state	state;
>  	enum ib_mtu		max_mtu;
>  	enum ib_mtu		active_mtu;
>  	int			gid_tbl_len;
> +	u64			core_cap_flags;

I think u32 should be enough here, and will help keep our footprint smaller.

>  	u32			port_cap_flags;
>  	u32			max_msg_sz;
>  	u32			bad_pkey_cntr;
> @@ -1684,6 +1705,7 @@ struct ib_device {
>  	u32			     local_dma_lkey;
>  	u8                           node_type;
>  	u8                           phys_port_cnt;
> +	u64			     *core_cap_flags; /* Per port core capability flags */

Ditto.

>  };
>
>  struct ib_client {
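(A sketch of the bit layout being proposed here -- the ROCE, USNIC, and UDP_ENCAP names and values below are hypothetical, not part of the posted patch:)

/* Hypothetical bit values illustrating the suggestion, not the patch: */
#define RDMA_CORE_CAP_PROT_ROCE        0x0002000000000000ULL /* replaces IBOE */
#define RDMA_CORE_CAP_PROT_USNIC       0x0008000000000000ULL /* UDP dropped */
#define RDMA_CORE_CAP_PROT_UDP_ENCAP   0x0010000000000000ULL /* new bit */

/* The four wire formats then fall out as combinations: */
#define RDMA_PROT_ROCE_V1   (RDMA_CORE_CAP_PROT_ROCE)
#define RDMA_PROT_ROCE_V2   (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_UDP_ENCAP)
#define RDMA_PROT_USNIC     (RDMA_CORE_CAP_PROT_USNIC)
#define RDMA_PROT_USNIC_UDP (RDMA_CORE_CAP_PROT_USNIC | RDMA_CORE_CAP_PROT_UDP_ENCAP)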
> > +/* Protocol                            0xFFFF000000000000 */
> > +#define RDMA_CORE_CAP_PROT_IB          0x0001000000000000ULL
> > +#define RDMA_CORE_CAP_PROT_IBOE        0x0002000000000000ULL
> > +#define RDMA_CORE_CAP_PROT_IWARP       0x0004000000000000ULL
> > +#define RDMA_CORE_CAP_PROT_USNIC_UDP   0x0008000000000000ULL
>
> In accordance with what we've been talking about, drop IBOE for ROCE.
>
> Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.

USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC | UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being encapsulated.  RoCEv2 is IB transport over UDP.

I'm not sure what the protocol field is intended to imply.  If we want to expose the link, network, transport, and RDMA protocols in use, shouldn't these be separate fields or bits?  And even then, I'm not sure what use this has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would work fine over any link.  And the core layer should not assume that a device is limited to supporting only one protocol, especially at the network and transport levels.

I vote for deprecating the protocol goofiness.

- Sean
> @@ -302,12 +333,21 @@ int ib_register_device(struct ib_device *device,
>  		goto out;
>  	}
>
> +	ret = read_core_cap_flags(device);
> +	if (ret) {
> +		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
> +		kfree(device->gid_tbl_len);
> +		kfree(device->pkey_tbl_len);
> +		goto out;
> +	}
> +
>  	ret = ib_device_register_sysfs(device, port_callback);
>  	if (ret) {
>  		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
>  		       device->name);
>  		kfree(device->gid_tbl_len);
>  		kfree(device->pkey_tbl_len);
> +		kfree(device->core_cap_flags);

Use a common exit location to avoid duplicating the kfree's on errors.

>  		goto out;
>  	}
>
> @@ -351,6 +391,7 @@ void ib_unregister_device(struct ib_device *device)
>
>  	kfree(device->gid_tbl_len);
>  	kfree(device->pkey_tbl_len);
> +	kfree(device->core_cap_flags);
>
>  	mutex_unlock(&device_mutex);
>
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index c724114..4de2758 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -353,11 +353,32 @@ union rdma_protocol_stats {
>  	struct iw_protocol_stats iw;
>  };
>
> +/* Define bits for the various functionality this port needs to be supported by
> + * the core.
> + */
> +/* Management                          0x00000000FFFFFFFF */
> +#define RDMA_CORE_CAP_IB_MAD           0x0000000000000001ULL
> +#define RDMA_CORE_CAP_IB_SMI           0x0000000000000002ULL
> +#define RDMA_CORE_CAP_IB_CM            0x0000000000000004ULL
> +#define RDMA_CORE_CAP_IW_CM            0x0000000000000008ULL
> +#define RDMA_CORE_CAP_IB_SA            0x0000000000000010ULL
> +
> +/* Address format                      0x0000FFFF00000000 */
> +#define RDMA_CORE_CAP_AF_IB            0x0000000100000000ULL
> +#define RDMA_CORE_CAP_ETH_AH           0x0000000200000000ULL
> +
> +/* Protocol                            0xFFFF000000000000 */
> +#define RDMA_CORE_CAP_PROT_IB          0x0001000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IBOE        0x0002000000000000ULL
> +#define RDMA_CORE_CAP_PROT_IWARP       0x0004000000000000ULL
> +#define RDMA_CORE_CAP_PROT_USNIC_UDP   0x0008000000000000ULL
> +
>  struct ib_port_attr {
>  	enum ib_port_state	state;
>  	enum ib_mtu		max_mtu;
>  	enum ib_mtu		active_mtu;
>  	int			gid_tbl_len;
> +	u64			core_cap_flags;
>  	u32			port_cap_flags;
>  	u32			max_msg_sz;
>  	u32			bad_pkey_cntr;
> @@ -1684,6 +1705,7 @@ struct ib_device {
>  	u32			     local_dma_lkey;
>  	u8                           node_type;
>  	u8                           phys_port_cnt;
> +	u64			     *core_cap_flags; /* Per port core capability flags */

Why are the core_cap_flags duplicated in the struct and in port attributes?
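(A minimal sketch of the common-exit pattern being asked for here -- label names are illustrative, and it assumes ib_register_device still holds device_mutex at the out label, as it does in the code above:)

	ret = read_core_cap_flags(device);
	if (ret) {
		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
		goto err_free_tables;
	}

	ret = ib_device_register_sysfs(device, port_callback);
	if (ret) {
		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
		       device->name);
		goto err_free_caps;
	}

	/* success path continues, then falls through to out */
	goto out;

err_free_caps:
	kfree(device->core_cap_flags);
err_free_tables:
	kfree(device->gid_tbl_len);
	kfree(device->pkey_tbl_len);
out:
	mutex_unlock(&device_mutex);
	return ret;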
On Mon, 2015-05-04 at 16:42 +0000, Hefty, Sean wrote:
> >  struct ib_port_attr {
> >  	enum ib_port_state	state;
> >  	enum ib_mtu		max_mtu;
> >  	enum ib_mtu		active_mtu;
> >  	int			gid_tbl_len;
> > +	u64			core_cap_flags;
> >  	u32			port_cap_flags;
> >  	u32			max_msg_sz;
> >  	u32			bad_pkey_cntr;
> > @@ -1684,6 +1705,7 @@ struct ib_device {
> >  	u32			     local_dma_lkey;
> >  	u8                           node_type;
> >  	u8                           phys_port_cnt;
> > +	u64			     *core_cap_flags; /* Per port core capability flags */
>
> Why are the core_cap_flags duplicated in the struct and in port attributes?

Because the per port ib_port_attr struct is not visible to the core code, it must call ib_query_port each time it wants to see it.  In order to make the helpers avoid a call, you have to stash the cap_flags into the ib_device struct.  In the past, the only thing there was the node_type, and it was per device and not per port.  In order to make all of this work and not require calls into the driver all the time, that has to change to a per-port array.  Eventually we could get rid of the node_type element once everything is in place.
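(To make the trade-off concrete, a hypothetical core-layer helper built on the cached array -- this exact function is not in the patch:)

/* With the cached per-port array, a capability test is a single load and
 * mask instead of an ib_query_port() upcall into the driver: */
static inline bool rdma_port_has_cap(struct ib_device *dev, u8 port, u64 cap)
{
	return dev->core_cap_flags[port] & cap;
}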
> > >  struct ib_port_attr {
> > >  	enum ib_port_state	state;
> > >  	enum ib_mtu		max_mtu;
> > >  	enum ib_mtu		active_mtu;
> > >  	int			gid_tbl_len;
> > > +	u64			core_cap_flags;
> > >  	u32			port_cap_flags;
> > >  	u32			max_msg_sz;
> > >  	u32			bad_pkey_cntr;
> > > @@ -1684,6 +1705,7 @@ struct ib_device {
> > >  	u32			     local_dma_lkey;
> > >  	u8                           node_type;
> > >  	u8                           phys_port_cnt;
> > > +	u64			     *core_cap_flags; /* Per port core capability flags */
> >
> > Why are the core_cap_flags duplicated in the struct and in port attributes?
>
> Because the per port ib_port_attr struct is not visible to the core
> code, it must call ib_query_port each time it wants to see it.  In order
> to make the helpers avoid a call, you have to stash the cap_flags into
> the ib_device struct.  In the past, the only thing there was the
> node_type, and it was per device and not per port.  In order to make all
> of this work and not require calls into the driver all the time, that
> has to change to a per-port array.  Eventually we could get rid of the
> node_type element once everything is in place.

I understand why they are in struct ib_device, but why are they duplicated in struct ib_port_attr?
On Mon, 2015-05-04 at 16:53 +0000, Hefty, Sean wrote:
> > > >  struct ib_port_attr {
> > > >  	enum ib_port_state	state;
> > > >  	enum ib_mtu		max_mtu;
> > > >  	enum ib_mtu		active_mtu;
> > > >  	int			gid_tbl_len;
> > > > +	u64			core_cap_flags;
> > > >  	u32			port_cap_flags;
> > > >  	u32			max_msg_sz;
> > > >  	u32			bad_pkey_cntr;
> > > > @@ -1684,6 +1705,7 @@ struct ib_device {
> > > >  	u32			     local_dma_lkey;
> > > >  	u8                           node_type;
> > > >  	u8                           phys_port_cnt;
> > > > +	u64			     *core_cap_flags; /* Per port core capability flags */
> > >
> > > Why are the core_cap_flags duplicated in the struct and in port attributes?
> >
> > Because the per port ib_port_attr struct is not visible to the core
> > code, it must call ib_query_port each time it wants to see it.  In order
> > to make the helpers avoid a call, you have to stash the cap_flags into
> > the ib_device struct.  In the past, the only thing there was the
> > node_type, and it was per device and not per port.  In order to make all
> > of this work and not require calls into the driver all the time, that
> > has to change to a per-port array.  Eventually we could get rid of the
> > node_type element once everything is in place.
>
> I understand why they are in struct ib_device, but why are they duplicated in struct ib_port_attr?

Because ib_query_port returns a struct, so this must be an element of that struct or it can't be returned (without adding a new, single use driver upcall).
> Because ib_query_port returns a struct, so this must be an element of
> that struct or it can't be returned (without adding a new, single use
> driver upcall).

Nm - This could be clearer if the functionality were folded into the 'read_port_table_lengths' function.
> > Because ib_query_port returns a struct, so this must be an element of
> > that struct or it can't be returned (without adding a new, single use
> > driver upcall).
>
> Nm - This could be clearer if the functionality were folded into the
> 'read_port_table_lengths' function.

I thought of doing that but then the function name no longer matches the functionality.

I could change it to:

	read_per_port_info(...)

And add the bits there.

Ira
> > Nm - This could be clearer if the functionality were folded into the
> > 'read_port_table_lengths' function.
>
> I thought of doing that but then the function name no longer matches the
> functionality.
>
> I could change it to:
>
> 	read_per_port_info(...)

That makes sense.
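(A sketch of what a merged read_per_port_info() might look like -- not the posted patch; it assumes the per-port arrays have already been allocated, and mirrors the switch-port-0 handling from the patch:)

static int read_per_port_info(struct ib_device *device)
{
	struct ib_port_attr tprops;
	int ret;
	u8 port;

	for (port = 0; port <= device->phys_port_cnt; ++port) {
		/* port 0 is only meaningful for switches */
		if (port == 0 && device->node_type != RDMA_NODE_IB_SWITCH)
			continue;

		ret = ib_query_port(device, port, &tprops);
		if (ret)
			return ret;

		/* fill in all the immutable per-port data in one pass */
		device->pkey_tbl_len[port]   = tprops.pkey_tbl_len;
		device->gid_tbl_len[port]    = tprops.gid_tbl_len;
		device->core_cap_flags[port] = tprops.core_cap_flags;
	}
	return 0;
}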
On Mon, 2015-05-04 at 16:40 +0000, Hefty, Sean wrote:
> > > +/* Protocol                            0xFFFF000000000000 */
> > > +#define RDMA_CORE_CAP_PROT_IB          0x0001000000000000ULL
> > > +#define RDMA_CORE_CAP_PROT_IBOE        0x0002000000000000ULL
> > > +#define RDMA_CORE_CAP_PROT_IWARP       0x0004000000000000ULL
> > > +#define RDMA_CORE_CAP_PROT_USNIC_UDP   0x0008000000000000ULL
> >
> > In accordance with what we've been talking about, drop IBOE for ROCE.
> >
> > Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> > USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> > will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
>
> USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC | UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being encapsulated.

I thought USNIC_UDP had an embedded USNIC protocol header inside the UDP header.  That would make it a UDP_ENCAP protocol.

> RoCEv2 is IB transport over UDP.

Right, ROCE (or IB, whichever you prefer) encapsulated in UDP.

> I'm not sure what the protocol field is intended to imply.

There is still information in those bits that we can't get elsewhere.  For instance, even though this patch replaces the CAP_* stuff with bits, if you took away the CAP_PROT_* entries, then there would be no entry to identify USNIC at all.

Right now, you could infer iWARP from CAP_IW_CM.
You could infer InfiniBand from any of the CAP_IB_* (but later will need a way to differentiate between IB and OPA).
You could infer ROCE from CAP_ETH_AH (but later will need a way to differentiate between ROCE and ROCEv2).
The only way to differentiate USNIC at the moment is that the CAPS would be all 0.  That's not the sort of positive identification I would prefer.

So you *could* reduce this to just one bit for USNIC.

And if you then add a UDP_ENCAP bit, then that single bit can do double duty in telling apart USNIC and USNIC_UDP and ROCE and ROCEv2.

> If we want to expose the link, network, transport, and RDMA protocols in use, shouldn't these be separate fields or bits?  And even then, I'm not sure what use this has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would work fine over any link.
> And the core layer should not assume that a device is limited to supporting only one protocol, especially at the network and transport levels.

Given that this is a per port thing, there is no assumption about a device only supporting a single protocol.

> I vote for deprecating the protocol goofiness.
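(The inference described above, as it might look in code -- the helper names are illustrative, not from the patch:)

/* Illustrative only: inferring a port's protocol from the cap bits above. */
static inline bool cap_implies_iwarp(u64 caps)
{
	return caps & RDMA_CORE_CAP_IW_CM;
}

static inline bool cap_implies_ib(u64 caps)
{
	return caps & (RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI |
		       RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA);
}

static inline bool cap_implies_roce(u64 caps)
{
	return caps & RDMA_CORE_CAP_ETH_AH;
}

/* USNIC is the problem case: all of the above come back false. */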
> I thought USNIC_UDP had an embedded USNIC protocol header inside the UDP
> header.  That would make it a UDP_ENCAP protocol.

Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just plain UDP, and a proprietary protocol that runs over Ethernet, but uses the same EtherType as RoCE.  I thought these could both be active on the same port at the same time.

> > RoCEv2 is IB transport over UDP.
>
> Right, ROCE (or IB, whichever you prefer) encapsulated in UDP.
>
> > I'm not sure what the protocol field is intended to imply.
>
> There is still information in those bits that we can't get elsewhere.
> For instance, even though this patch replaces the CAP_* stuff with bits,
> if you took away the CAP_PROT_* entries, then there would be no entry to
> identify USNIC at all.
>
> Right now, you could infer iWARP from CAP_IW_CM.
> You could infer InfiniBand from any of the CAP_IB_* (but later will need
> a way to differentiate between IB and OPA).
> You could infer ROCE from CAP_ETH_AH (but later will need a way to
> differentiate between ROCE and ROCEv2).
> The only way to differentiate USNIC at the moment is that the CAPS
> would be all 0.  That's not the sort of positive identification I would
> prefer.
>
> So you *could* reduce this to just one bit for USNIC.
>
> And if you then add a UDP_ENCAP bit, then that single bit can do double
> duty in telling apart USNIC and USNIC_UDP and ROCE and ROCEv2.

My question is who needs these bits and why?  The primary reason it was exposed was to do the job that the new cap flags are accomplishing.

I still believe that RoCEv2 is conceptually the same as iWarp.  An RDMA protocol has been layered over some other transport.  In the case of iWarp, it's TCP.  In the case of RoCEv2, it's UDP.  Do we define a TCP_ENCAP bit that corresponds with UDP_ENCAP?  And why should an app care?  We don't specify whether the port is running IPv4 or IPv6.  Why is the transport level called out, but not the network layer?  Architecturally, neither iWarp nor RoCEv2 (despite its name) cares what the link layer is.

> > And the core layer
> > should not assume that a device is limited to supporting only one
> > protocol, especially at the network and transport levels.
>
> Given that this is a per port thing, there is no assumption about a
> device only supporting a single protocol.

Device, no, but we are assuming this per port.  I don't think this is true for USNIC.  For that matter, it's entirely possible for a RoCEv2 device to expose UDP directly to user space, same as USNIC.  (I'd actually be surprised if no devices have this capability, if for debugging capabilities, even if for nothing else.)  What are we going to do if there's a device that supports both iWarp and RoCEv2?  That's easily doable today through software.

Basically, I question how protocol is defined and what it means to expose it as a device attribute.  Should it instead be negotiated (i.e. more like sockets)?  If an app needs a specific "RDMA protocol" (IB or iWarp) or "application protocol" (IB or iWarp or UDP or MADs?), they can request it.  Otherwise, the app gets assigned some protocol.  And the protocol should really be associated with a QP, rather than a port.

- Sean
On Mon, May 04, 2015 at 02:14:55AM -0400, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
>
> Add Core capability flags to each port attribute and read those into ib_device
> upon registration for each port.

+1 on adding it to read_port_table_lengths

But this whole thing is starting to get goofy, 3rd add gets the work to fix it up I guess.

Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct and allocate an array of them.

Actually, why not just allocate an array of ib_port_attrs and fill that?  Then you can use it for the mad size too.

Not in your patch, but why does read_port_table_lengths use a kmalloc for what should be a stack allocation?  Yuk.

Jason
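(One possible shape for the single-struct suggestion -- the struct name and field set are illustrative, nothing here was settled on the list at this point:)

/* Illustrative only: one entry per port, replacing the parallel arrays. */
struct ib_port_info {
	int pkey_tbl_len;
	int gid_tbl_len;
	u64 core_cap_flags;
};

/* in struct ib_device, instead of three separately allocated arrays:
 *	struct ib_port_info *port_info;   // kcalloc(phys_port_cnt + 1, ...)
 */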
On Mon, 2015-05-04 at 18:26 +0000, Hefty, Sean wrote:
> > Given that this is a per port thing, there is no assumption about a
> > device only supporting a single protocol.
>
> Device, no, but we are assuming this per port.

At a high level, there are certain things that will be per port because they are tied to the link layer in question (you won't have a port with IB_SA that doesn't have IB_SA on all its queue pairs on that port).

> I don't think this is true for USNIC.

I can't speak for USNIC, I haven't read the driver for it closely enough.

> For that matter, it's entirely possible for a RoCEv2 device to expose
> UDP directly to user space, same as USNIC.

I'm not sure I'm following what you are saying here.  USNIC is unique in that it presents a NIC to the application that it has exclusive control over.

> (I'd actually be surprised if no devices have this capability, if for
> debugging capabilities, even if for nothing else.)  What are we going
> to do if there's a device that supports both iWarp and RoCEv2?  That's
> easily doable today through software.

It's doable, but nobody is actually doing it.

> Basically, I question how protocol is defined and what it means to
> expose it as a device attribute.  Should it instead be negotiated (i.e.
> more like sockets)?  If an app needs a specific "RDMA protocol" (IB or
> iWarp) or "application protocol" (IB or iWarp or UDP or MADs?), they
> can request it.  Otherwise, the app gets assigned some protocol.  And
> the protocol should really be associated with a QP, rather than a
> port.

If you're going down to that level (which we will when the RoCEv2 patches are integrated), then it doesn't quite map to QPs.  Since a UD QP may need to talk to both RoCEv1 and RoCEv2 hosts, it's at the AH level for RoCE.

To a certain extent you are right.  RoCE, RoCEv2, iWARP, and USNIC could all reside on a single port by virtue of them all sharing an Ethernet link layer.  IB link layer (and associated attributes) and OPA link layer are the outliers in this method of thinking in that their ports will only support their own transport.

So if we were to actually be forward thinking about this, then the link layer is the important bit you need to capture on a per port basis, not the transport technology.  We would have to modify lots of the stack to support per QP or per AH transport settings though.
On May 4, 2015, at 1:26 PM, Hefty, Sean <sean.hefty@intel.com> wrote:
>> I thought USNIC_UDP had an embedded USNIC protocol header inside the UDP
>> header.  That would make it a UDP_ENCAP protocol.
>
> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just plain UDP, and a proprietary protocol that runs over Ethernet, but uses the same EtherType as RoCE.  I thought these could both be active on the same port at the same time.

Sean's statements above are correct.  "USNIC_UDP" in today's code really is plain-old-UDP (over IP over Ethernet).

>>> And the core layer
>>> should not assume that a device is limited to supporting only one
>>> protocol, especially at the network and transport levels.
>>
>> Given that this is a per port thing, there is no assumption about a
>> device only supporting a single protocol.
>
> Device, no, but we are assuming this per port.  I don't think this is true for USNIC.

Correct, usNIC devices and ports can speak either format concurrently, IIRC even for different QPs in the same context/PD/etc.  Incidentally, usNIC devices only ever have a single port right now, but there's no reason that has to remain true.

> For that matter, it's entirely possible for a RoCEv2 device to expose UDP directly to user space, same as USNIC.  (I'd actually be surprised if no devices have this capability, if for debugging capabilities, even if for nothing else.)  What are we going to do if there's a device that supports both iWarp and RoCEv2?  That's easily doable today through software.

+1

-Dave
On Mon, May 04, 2015 at 03:40:25PM -0400, Doug Ledford wrote:
> So if we were to actually be forward thinking about this, then the
> link layer is the important bit you need to capture on a per port
> basis, not the transport technology.  We would have to modify lots
> of the stack to support per QP or per AH transport settings though.

We are going to have to do this, at least in the kernel side; the idea that the port can imply the one and only address family of the QP is certainly antiquated now.  I think the RoCEv2 patches will require this work.

As far as this patch series goes, I actually don't think the bits matter one bit - the intent is to optimize the cap tests, so a one cap test-one-bit method is fine.  Don't add extra we don't need right now.

As an optimization, be aware of some of the restrictions: it is actually pretty expensive on some arch's to load large 32 or 64 bit numbers, smaller numbers are better, single bit test is better.

If this is combined with my thought to use a #define for all the common driver cases then we can trivially rework the bit-field to future needs.  No reason to over architect something here.

Jason
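(The sort of #define for the common driver cases mentioned above might look like this -- the composite name is hypothetical, built from the cap bits in the patch:)

/* Hypothetical composite for a plain IB port; a driver would just assign
 * this instead of OR-ing the individual bits itself: */
#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB  \
			       | RDMA_CORE_CAP_IB_MAD \
			       | RDMA_CORE_CAP_IB_SMI \
			       | RDMA_CORE_CAP_IB_CM  \
			       | RDMA_CORE_CAP_IB_SA  \
			       | RDMA_CORE_CAP_AF_IB)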
On Mon, May 04, 2015 at 12:36:57PM -0600, Jason Gunthorpe wrote:
> On Mon, May 04, 2015 at 02:14:55AM -0400, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> >
> > Add Core capability flags to each port attribute and read those into ib_device
> > upon registration for each port.
>
> +1 on adding it to read_port_table_lengths
>
> But this whole thing is starting to get goofy,

<quote>
> 3rd add gets the work
> to fix it up I guess.
</quote>

I don't understand this comment?

> Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
> and allocate an array of them.
>
> Actually, why not just allocate an array of ib_port_attrs and fill
> that?  Then you can use it for the mad size too.

That was debated before and I was hoping to leave that for another day.

It does make some sense to roll it in here.

> Not in your patch, but why does read_port_table_lengths use a kmalloc
> for what should be a stack allocation?  Yuk.

I don't know.  I almost submitted a separate patch for that.  I will clean it up when I combine the functions.

I also don't like the way the arrays in read_port_table_lengths are handled.  This requires a start_port call when comparing bits.  It is easier to just make the array 1 based with index 0 valid only for switches.  This is part of the reason there is a separate function and array.

Ira

> Jason
> > Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
> > and allocate an array of them.
> >
> > Actually, why not just allocate an array of ib_port_attrs and fill
> > that?  Then you can use it for the mad size too.
>
> That was debated before and I was hoping to leave that for another day.
>
> It does make some sense to roll it in here.

I remember more details now...

This came up in my original OPA patch series when Sean objected to me calling the port attributes "cached_port_attr".

There are a number of values in the ib_port_attr structure which are not fixed (state, active_mtu, lid, sm_lid, etc...).  Putting another copy of this data in the device will make it confusing to know which copy one can count on for the correct data.

I did not write the table length code, so I am assuming that data is immutable.  As is the max MAD size, and these new capability bits.  But storing the entire ib_port_attr structure is probably going to mislead someone.

Do we still think this is a good idea?

I think putting in a single struct with the proper immutable data is the proper way to go.

Ira
On Mon, May 04, 2015 at 07:16:23PM -0400, ira.weiny wrote:
> > > Pull pkey_tbl_len, gid_tbl_len and your new thing into a single struct
> > > and allocate an array of them.
> > >
> > > Actually, why not just allocate an array of ib_port_attrs and fill
> > > that?  Then you can use it for the mad size too.
> >
> > That was debated before and I was hoping to leave that for another day.
> >
> > It does make some sense to roll it in here.
>
> I remember more details now...
>
> This came up in my original OPA patch series when Sean objected to me calling
> the port attributes "cached_port_attr".

Yeah, not a great name for storing immutable values.  port_properties or something.

> There are a number of values in the ib_port_attr structure which are not fixed
> (state, active_mtu, lid, sm_lid, etc...)

Yes, there are a lot of those..

So, a new struct is better, and this has just gotten so messy:

 - Update ib_alloc_device to accept a num_ports argument and create the
   per-port array at that point
 - Have the drivers fill in their per port values before calling
   ib_register.  Delete read_port_table_lengths
 - Get ib_dealloc_device to free the list instead of unregister, feels
   like keeping that memory around for the duration of the kref is
   smarter..
 - Drop gid_tbl_len and pkey_tbl_len for the new scheme
 - Mark all the old immutable attrs in ib_port_attrs as deprecated and
   if any are easy to remove then do so..
 - Don't add the caps or the max mad size immutables to port_attrs

> <quote>
> > 3rd add gets the work
> > to fix it up I guess.
> </quote>
> I don't understand this comment?

gid_tbl_len and pkey_tbl_len were the first two to use a 'shortcut' here; your caps and/or max mad size are the third to take the 'shortcut'.  Rule of threes: the third person to extend the same badly designed widget gets to fix it properly.

Jason
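(Sketching the scheme laid out above -- the struct and field names are guesses at this point, not a settled API:)

/* Immutable per-port data, filled in by the driver before registration: */
struct ib_port_immutable {
	int pkey_tbl_len;
	int gid_tbl_len;
	u64 core_cap_flags;
	u32 max_mad_size;
};

/* A hypothetical ib_alloc_device(size, num_ports) would kzalloc this array
 * alongside the device, and ib_dealloc_device() would free it, so the data
 * lives for the whole lifetime of the device kref. */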
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Hefty, Sean
> >
> > In accordance with what we've been talking about, drop IBOE for ROCE.
> >
> > Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> > USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> > will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
>
> USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC |
> UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being
> encapsulated.

I agree that the UDP_ENCAP notion is confusing.
We should stick to ROCE_V1 and ROCE_V2.

> RoCEv2 is IB transport over UDP.
>
> I'm not sure what the protocol field is intended to imply.  If we want to
> expose the link, network, transport, and RDMA protocols in use, shouldn't
> these be separate fields or bits?  And even then, I'm not sure what use this
> has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would
> work fine over any link.  And the core layer should not assume that a device is
> limited to supporting only one protocol, especially at the network and
> transport levels.  I vote for deprecating the protocol goofiness.
>
> - Sean

The protocol notion might not have any value for ULPs, but it is useful for core management code.  Also, when we extend these notions to user-space, admins will actually want to know what wire protocols a certain device can support, even just for the sake of interoperability.

--Liran
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Hefty, Sean
>
> > I thought USNIC_UDP had an embedded USNIC protocol header inside the
> > UDP header.  That would make it a UDP_ENCAP protocol.
>
> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
> same EtherType as RoCE.  I thought these could both be active on the same
> port at the same time.

'protocol' refers to what your device generates when you do a post_send() on some QP.  In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation over IP.  In the USNIC case, you might want the protocol to refer to additional information that distinguishes this wire protocol rather than just "I am sending UDP packets"...

In other words, I think that 'protocol' should uniquely distinguish interoperable peers at the Verbs API level.  We are *not* trying to describe a certain header, but rather a stack of protocols.

> > > RoCEv2 is IB transport over UDP.
> >
> > Right, ROCE (or IB, whichever you prefer) encapsulated in UDP.
> >
> > > I'm not sure what the protocol field is intended to imply.
> >
> > There is still information in those bits that we can't get elsewhere.
> > For instance, even though this patch replaces the CAP_* stuff with
> > bits, if you took away the CAP_PROT_* entries, then there would be no
> > entry to identify USNIC at all.
> >
> > Right now, you could infer iWARP from CAP_IW_CM.
> > You could infer InfiniBand from any of the CAP_IB_* (but later will
> > need a way to differentiate between IB and OPA).  You could infer ROCE
> > from CAP_ETH_AH (but later will need a way to differentiate between
> > ROCE and ROCEv2).  The only way to differentiate USNIC at the moment is
> > that the CAPS would be all 0.  That's not the sort of positive
> > identification I would prefer.
> >
> > So you *could* reduce this to just one bit for USNIC.
> >
> > And if you then add a UDP_ENCAP bit, then that single bit can do
> > double duty in telling apart USNIC and USNIC_UDP and ROCE and ROCEv2.
>
> My question is who needs these bits and why?  The primary reason it was
> exposed was to do the job that the new cap flags are accomplishing.
>
> I still believe that RoCEv2 is conceptually the same as iWarp.  An RDMA
> protocol has been layered over some other transport.  In the case of iWarp,
> it's TCP.  In the case of RoCEv2, it's UDP.  Do we define a TCP_ENCAP bit that
> corresponds with UDP_ENCAP?  And why should an app care?  We don't
> specify whether the port is running IPv4 or IPv6.  Why is the transport level
> called out, but not the network layer?  Architecturally, neither iWarp nor
> RoCEv2 (despite its name) cares what the link layer is.

We don't call out the transport layer.  That's why we call it 'protocol' and not 'transport'!

> > > And the core layer
> > > should not assume that a device is limited to supporting only one
> > > protocol, especially at the network and transport levels.
> >
> > Given that this is a per port thing, there is no assumption about a
> > device only supporting a single protocol.
>
> Device, no, but we are assuming this per port.  I don't think this is true for
> USNIC.  For that matter, it's entirely possible for a RoCEv2 device to expose
> UDP directly to user space, same as USNIC.  (I'd actually be surprised if no
> devices have this capability, if for debugging capabilities, even if for nothing
> else.)  What are we going to do if there's a device that supports both iWarp
> and RoCEv2?  That's easily doable today through software.

If and when there are such devices, they can advertise multiple protocols.

> Basically, I question how protocol is defined and what it means to expose it
> as a device attribute.  Should it instead be negotiated (i.e. more like sockets)?
> If an app needs a specific "RDMA protocol" (IB or iWarp) or "application
> protocol" (IB or iWarp or UDP or MADs?), they can request it.  Otherwise, the
> app gets assigned some protocol.  And the protocol should really be
> associated with a QP, rather than a port.
>
> - Sean

The port capabilities do just that: advertise capabilities.  The actual protocol selection for each transport entity happens later.  For ROCE RC QPs, for example, this occurs while modifying the QP to refer to the desired GID index.

--Liran
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
>
> > So if we were to actually be forward thinking about this, then the
> > link layer is the important bit you need to capture on a per port
> > basis, not the transport technology.  We would have to modify lots of
> > the stack to support per QP or per AH transport settings though.
>
> We are going to have to do this, at least in the kernel side, the idea that the
> port can imply the one and only address family of the QP is certainly
> antiquated now.  I think the RoCEv2 patches will require this work.

I second that the link layer should not be used for this purpose.  RDMA devices can and will advertise multiple rdma 'protocols', such as ROCE devices that support both ROCE_V1 and ROCE_V2.

--Liran
On May 5, 2015, at 2:51 PM, Liran Liss <liranl@mellanox.com> wrote:
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Hefty, Sean
>
>>> I thought USNIC_UDP had an embedded USNIC protocol header inside the
>>> UDP header.  That would make it a UDP_ENCAP protocol.
>>
>> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
>> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
>> same EtherType as RoCE.  I thought these could both be active on the same
>> port at the same time.
>
> 'protocol' refers to what your device generates when you do a post_send() on
> some QP.
> In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
> over IP.  In the USNIC case, you might want the protocol to refer to additional
> information that distinguishes this wire protocol rather than just "I am sending
> UDP packets"...

In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.

[^^^^^ there continues to be confusion about this for some reason, but I don't know why]

> In other words, I think that 'protocol' should uniquely distinguish interoperable
> peers at the Verbs API level.  We are *not* trying to describe a certain header,
> but rather a stack of protocols.

Any non-UDP "protocol" that might currently be in use over usNIC is an entirely application-level protocol outside of the view of the Verbs API level.

-Dave
> > In other words, I think that 'protocol' should uniquely distinguish
> > interoperable peers at the Verbs API level.  We are *not* trying to
> > describe a certain header, but rather a stack of protocols.
>
> Any non-UDP "protocol" that might currently be in use over usNIC is an
> entirely application-level protocol outside of the view of the Verbs API
> level.

I agree with this for usNIC.  For other technologies, the definition isn't even this simple.  UD QPs and RC QPs cannot interoperate, yet are treated as the same 'protocol'.  Just defining protocol as a 'stack of protocols' shows how broken the approach is.  And we haven't even tried to incorporate other 'protocols' needed to make this work, such as the CM or management protocols.
> From: Hefty, Sean [mailto:sean.hefty@intel.com]
> Sent: Tuesday, May 05, 2015 11:29 PM
>
> > > In other words, I think that 'protocol' should uniquely distinguish
> > > interoperable peers at the Verbs API level.  We are *not* trying to
> > > describe a certain header, but rather a stack of protocols.
> >
> > Any non-UDP "protocol" that might currently be in use over usNIC is an
> > entirely application-level protocol outside of the view of the Verbs
> > API level.

Fine.  Then we can add a 'udp/ip' protocol for usNIC.  Similarly, 'raw Ethernet' can designate the ability to inject all headers starting from L2.

> I agree with this for usNIC.  For other technologies, the definition isn't even
> this simple.  UD QPs and RC QPs cannot interoperate, yet are treated as the
> same 'protocol'.  Just defining protocol as a 'stack of protocols' shows how
> broken the approach is.  And we haven't even tried to incorporate other
> 'protocols' needed to make this work, such as the CM or management
> protocols.

The interoperability between different transport objects depends on the technology.  I don't think that we should even attempt to describe these dependencies in any abstract way.  RTFM...

However, users should have a way of knowing what transports and what wire protocol stacks a certain rdma device supports.  No one using IB or RoCE would think that UD and RC interoperate.  (If someone tries it, he would be disappointed 'gracefully'...)  But people that use RoCE would like to know which devices they can use to send RoCE packets (or select a device that supports a certain RoCE version).

--Liran
On Tue, 2015-05-05 at 19:27 +0000, Liran Liss wrote:
> > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Hefty, Sean
> > >
> > > In accordance with what we've been talking about, drop IBOE for ROCE.
> > >
> > > Drop the UDP off of USNIC, then define a bit for CAP_PROT_UDP_ENCAP.
> > > USNIC will be just USNIC, USNIC_UDP will be USNIC | UDP_ENCAP, ROCE v1
> > > will be ROCE, and ROCEv2 will be ROCE | UDP_ENCAP.
> >
> > USNIC_UDP is just UDP.  I don't understand why we would want 'USNIC |
> > UDP_ENCAP', or what UDP_ENCAP is intended to convey.  Nothing is being
> > encapsulated.
>
> I agree that the UDP_ENCAP notion is confusing.
> We should stick to ROCE_V1 and ROCE_V2.
>
> > RoCEv2 is IB transport over UDP.
> >
> > I'm not sure what the protocol field is intended to imply.  If we want to
> > expose the link, network, transport, and RDMA protocols in use, shouldn't
> > these be separate fields or bits?  And even then, I'm not sure what use this
> > has for the ULPs.  iWarp does not require Ethernet or TCP.  RoCEv2 would
> > work fine over any link.  And the core layer should not assume that a device is
> > limited to supporting only one protocol, especially at the network and
> > transport levels.  I vote for deprecating the protocol goofiness.
> >
> > - Sean
>
> The protocol notion might not have any value for ULPs, but it is useful for core
> management code.  Also, when we extend these notions to user-space, admins will
> actually want to know what wire protocols a certain device can support, even
> just for the sake of interoperability.

So I've been thinking a bit more about the overall architecture of the underlying patch set from Michael.  The structure of that patch set is to a certain extent dictating this follow on patch set, and so any problems in it end up being transmitted here as well.

Sean's comments about transports I think were spot on, and they got me doing a little thought exercise in my head.  Theoretically, if we were to take the SoftiWARP and SoftRoCE drivers, we could merge them down to one driver that supported both iWARP and RoCE (and we could add USNIC too, I suspect).  Because all of these things work on an Ethernet link layer, there is no reason they couldn't all be presented on the same device.

So if you had a single verbs device that could support all of these things, where would you need to capture and store the relevant information?  This is what I have so far:

On a per device basis: not much to be honest, node_type IB_CA/RNIC for back compatibility and that's pretty much it.

On a per port basis: link layer (Ethernet, IB, or OPA).  Notice that I didn't include the transport here.  Two of the above link layers strictly imply their transport.  The third link layer could have multiple transports.  Which brings me to my next level...

On a per queue pair basis:
	if (link_layer == Ethernet) iWARP/RoCE/USNIC
	if (qp_type == RoCE && qp != UD) RoCEv1/RoCEv2

On a per ah basis:
	if (qp_type == RoCE && qp == UD) RoCEv1/RoCEv2

Michael's patch set was intended to give us a framework within which we can start cleaning things up.  Once we get to cleaning things up, I think the above is where we should target storing the relevant information.

Thoughts?
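(The breakdown above, rendered as data-structure placement -- every type and name here is purely illustrative; none of these exist in the tree:)

/* Illustrative only: where each piece of state would live. */
enum ex_link_layer { EX_LL_IB, EX_LL_ETHERNET, EX_LL_OPA };
enum ex_transport  { EX_XPORT_IWARP, EX_XPORT_ROCE, EX_XPORT_USNIC };
enum ex_roce_ver   { EX_ROCE_V1, EX_ROCE_V2 };

struct ex_port {                      /* per port */
	enum ex_link_layer link_layer;
};

struct ex_qp {                        /* per QP, Ethernet link layer only */
	enum ex_transport xport;      /* iWARP, RoCE, or USNIC */
	enum ex_roce_ver  roce_ver;   /* meaningful for non-UD RoCE QPs */
};

struct ex_ah {                        /* per AH, for UD RoCE QPs */
	enum ex_roce_ver  roce_ver;
};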
> This is what I have so far:
>
> On a per device basis: not much to be honest, node_type IB_CA/RNIC for
> back compatibility and that's pretty much it.
>
> On a per port basis: link layer (Ethernet, IB, or OPA).  Notice that I
> didn't include the transport here.  Two of the above link layers
> strictly imply their transport.  The third link layer could have
> multiple transports.  Which brings me to my next level...
>
> On a per queue pair basis:
> 	if (link_layer == Ethernet) iWARP/RoCE/USNIC
> 	if (qp_type == RoCE && qp != UD) RoCEv1/RoCEv2
>
> On a per ah basis:
> 	if (qp_type == RoCE && qp == UD) RoCEv1/RoCEv2
>
> Michael's patch set was intended to give us a framework within which we
> can start cleaning things up.  Once we get to cleaning things up, I
> think the above is where we should target storing the relevant
> information.
>
> Thoughts?

I'm not following your intent here.

The qp_type values are currently defined as RC, UC, UD, XRC, plus some weirdness like SMI, GSI.  Are you suggesting that we store the relevant information as part of the qp_type, or that we keep the qp_type as-is?

Once Michael's patches are integrated, do apps need anything else beyond the qp type as currently defined?
On Fri, 2015-05-08 at 20:06 +0000, Hefty, Sean wrote:
> > This is what I have so far:
> >
> > On a per device basis: not much to be honest, node_type IB_CA/RNIC for
> > back compatibility and that's pretty much it.
> >
> > On a per port basis: link layer (Ethernet, IB, or OPA).  Notice that I
> > didn't include the transport here.  Two of the above link layers
> > strictly imply their transport.  The third link layer could have
> > multiple transports.  Which brings me to my next level...
> >
> > On a per queue pair basis:
> > 	if (link_layer == Ethernet) iWARP/RoCE/USNIC
> > 	if (qp_type == RoCE && qp != UD) RoCEv1/RoCEv2
> >
> > On a per ah basis:
> > 	if (qp_type == RoCE && qp == UD) RoCEv1/RoCEv2
> >
> > Michael's patch set was intended to give us a framework within which we
> > can start cleaning things up.  Once we get to cleaning things up, I
> > think the above is where we should target storing the relevant
> > information.
> >
> > Thoughts?
>
> I'm not following your intent here.
>
> The qp_type values are currently defined as RC, UC, UD, XRC, plus some
> weirdness like SMI, GSI.  Are you suggesting that we store the relevant
> information as part of the qp_type, or that we keep the qp_type as-is?

Well, you note I wrote qp != UD, whereas that's really the qp_type, so the above was pseudo code at best.  I wasn't necessarily suggesting where in the qp data struct to store it, just that even though there isn't hardware that does this (yet), there's no reason hardware couldn't be designed to support both iWARP and RoCE and USNIC over the same Ethernet link layer, and merging the SoftRoCE and SoftiWARP drivers would make a proof of concept, and so we would *have* to store that information on a per QP basis.

> Once Michael's patches are integrated, do apps need anything else
> beyond the qp type as currently defined?

If you had a device that supported iWARP and RoCE on the same physical link layer, then yes, the app would need a means of saying which transport to use in addition to the type of QP to establish.
> Well, you note I wrote qp != UD, whereas that's really the qp_type, so
> the above was pseudo code at best.  I wasn't necessarily suggesting where
> in the qp data struct to store it, just that even though there isn't
> hardware that does this (yet), there's no reason hardware couldn't be
> designed to support both iWARP and RoCE and USNIC over the same Ethernet
> link layer, and merging the SoftRoCE and SoftiWARP drivers would make a
> proof of concept, and so we would *have* to store that information on a
> per QP basis.
>
> > Once Michael's patches are integrated, do apps need anything else
> > beyond the qp type as currently defined?
>
> If you had a device that supported iWARP and RoCE on the same physical
> link layer, then yes, the app would need a means of saying which
> transport to use in addition to the type of QP to establish.

Ah - got it now.  And I agree, there should be some way to specify this at the QP level.
On Fri, May 08, 2015 at 08:56:16PM +0000, Hefty, Sean wrote:
> > If you had a device that supported iWARP and RoCE on the same physical
> > link layer, then yes, the app would need a means of saying which
> > transport to use in addition to the type of QP to establish.
>
> Ah - got it now.  And I agree, there should be some way to specify
> this at the QP level.

Yes, the only way out is to specify on a per QP basis the addressing and protocol/transport/whatever thing.

Socket uses the AF,SOCK,PROTO tuple to specify this information.  We can probably productively use a similar breakdown:

 AF_IB,SOCK_RC,PROTO_IBA     // InfiniBand
 AF_OPA,SOCK_RC,PROTO_IBA    // Future 32 bit LID OPB 'InfiniBand'
 AF_ETH,SOCK_RC,PROTO_IBA    // RoCEv1
 AF_INET,SOCK_RC,PROTO_IBA   // InfiniBand or RoCEv2, depending on the Link Layer
 AF_ETH,SOCK_RC,PROTO_USNIC
 AF_INET,SOCK_RC,PROTO_USNIC
 AF_INET,SOCK_RC,PROTO_IWARP

The expectation would be that any SOCK,PROTO combination can 'interoperate' with any AF combination - which is true with the above examples, ie with the right magic NAT hardware RoCEv2/v1/IB/(OPA?) can all interwork at the BTH layer.  Which is to also say, it also unambiguously implies which standard defines the subtle verbs variations that the app has to cope with.

Incompatible AH's can not be used with the wrong type of QP, the CM layers would auto create QPs with the correct type to reflect how the CM process was run, etc.

But Michael's patches are still an improvement, ie a RoCE + iWarp port would still have the cap_mad as true and would still have to create a QP1 interface, which I think is captured OK.  The CM side probably explodes if a port is both iWarp and RoCE, but that is a lot of work to fix anyhow..

I think we have to tackle the addressing problem as part of the RoCEv2 stuff; just burying all this in a qp index is really horrible.  Realistically, today, if a RoCE/iWarp driver appeared then it would have to present to the system as two RDMA devices.

Jason
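(A rough C rendering of that tuple idea -- every name below is hypothetical, chosen so as not to clash with the real AF_* socket constants:)

/* Hypothetical: a socket-style identity chosen per QP, not per port. */
enum rdma_ex_af    { RDMA_EX_AF_IB, RDMA_EX_AF_OPA, RDMA_EX_AF_ETH, RDMA_EX_AF_INET };
enum rdma_ex_sock  { RDMA_EX_SOCK_RC, RDMA_EX_SOCK_UC, RDMA_EX_SOCK_UD };
enum rdma_ex_proto { RDMA_EX_PROTO_IBA, RDMA_EX_PROTO_IWARP, RDMA_EX_PROTO_USNIC };

struct rdma_ex_ident {
	enum rdma_ex_af    af;    /* addressing / wire address family */
	enum rdma_ex_sock  sock;  /* service type */
	enum rdma_ex_proto proto; /* RDMA wire protocol */
};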
On Tue, 2015-05-05 at 20:22 +0000, Dave Goodell (dgoodell) wrote:
> On May 5, 2015, at 2:51 PM, Liran Liss <liranl@mellanox.com> wrote:
>
> >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Hefty, Sean
> >
> >>> I thought USNIC_UDP had an embedded USNIC protocol header inside the
> >>> UDP header.  That would make it a UDP_ENCAP protocol.
> >>
> >> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
> >> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
> >> same EtherType as RoCE.  I thought these could both be active on the same
> >> port at the same time.
> >
> > 'protocol' refers to what your device generates when you do a post_send() on
> > some QP.
> > In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
> > over IP.  In the USNIC case, you might want the protocol to refer to additional
> > information that distinguishes this wire protocol rather than just "I am sending
> > UDP packets"...
>
> In the case that usNIC is operating in UDP mode (which is the
> overwhelming majority of the cases), there is absolutely no additional
> protocol that ends up on the wire or headers in the user buffers
> besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just
> happen to be sent via OS-bypass queues instead of traveling through the
> kernel networking stack.
>
> [^^^^^ there continues to be confusion about this for some reason, but
> I don't know why]

I really need to just sit down and read your driver front to back.  The confusion probably comes from people (such as myself) that first think about this and go "Well, if you have multiple queue pairs, and UDP, then you need a header to tell what QP a packet goes to" (which assumes a RoCE like usage of UDP where all packets go to the same UDP address) where your actual usage is probably more iWARP like in that the IP/UDP SRC/DST combo maps to a specific QP, right?
On Tue, May 05, 2015 at 08:22:21PM +0000, Dave Goodell (dgoodell) wrote:
> On May 5, 2015, at 2:51 PM, Liran Liss <liranl@mellanox.com> wrote:
>
> >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Hefty, Sean
> >
> >>> I thought USNIC_UDP had an embedded USNIC protocol header inside the
> >>> UDP header.  That would make it a UDP_ENCAP protocol.
> >>
> >> Someone from Cisco can correct me, but USNIC supports 2 protocols.  Just
> >> plain UDP, and a proprietary protocol that runs over Ethernet, but uses the
> >> same EtherType as RoCE.  I thought these could both be active on the same
> >> port at the same time.
> >
> > 'protocol' refers to what your device generates when you do a post_send() on
> > some QP.
> > In the RoCE case, it is IBTA transport headers + payload over UDP encapsulation
> > over IP.  In the USNIC case, you might want the protocol to refer to additional
> > information that distinguishes this wire protocol rather than just "I am sending
> > UDP packets"...
>
> In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet.  They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.
>
> [^^^^^ there continues to be confusion about this for some reason, but I don't know why]

So what is this patch for?

commit 248567f79304b953ea492fb92ade097b62ed09b2
Author: Upinder Malhi <umalhi@cisco.com>
Date:   Thu Jan 9 14:48:19 2014 -0800

    IB/core: Add RDMA_TRANSPORT_USNIC_UDP

    Add RDMA_TRANSPORT_USNIC_UDP which will be used by usNIC.

    Signed-off-by: Upinder Malhi <umalhi@cisco.com>
    Signed-off-by: Roland Dreier <roland@purestorage.com>

This is probably where a lot of the confusion is coming from.

Ira
On May 8, 2015, at 8:32 PM, Doug Ledford <dledford@redhat.com> wrote:

> On Tue, 2015-05-05 at 20:22 +0000, Dave Goodell (dgoodell) wrote:
>>
>> In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet. They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.
>>
>> [^^^^^ there continues to be confusion about this for some reason, but I don't know why]
>
> I really need to just sit down and read your driver front to back.

Feel free to ping me off-list if you want any explanation about what's going on in the usnic_verbs module. I'm happy to help.

> The confusion probably comes from people (such as myself) that first think about this and go "Well, if you have multiple queue pairs, and UDP, then you need a header to tell what QP a packet goes to" (which assumes a RoCE-like usage of UDP where all packets go to the same UDP address), where your actual usage is probably more iWARP-like in that the IP/UDP SRC/DST combo maps to a specific QP, right?

I don't know much about the details of iWARP, but yes, a UDP/IP src/dst port combo maps to a specific QP when using USNIC_UDP QPs. We do _not_ use a single well-known port.

-Dave
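[To spell out the two demux models being contrasted above, here is a toy sketch; it is purely illustrative and not code from any driver. In the iWARP-like model Dave confirms, the receive side maps the UDP/IP 4-tuple straight to a QP; a RoCE-style use of UDP would instead land every packet on one well-known port and read a QP number out of a transport header.]

#include <stdint.h>
#include <stddef.h>

/* Hypothetical flow key: the UDP/IP src/dst combo itself names the QP,
 * so no extra QP-number header is needed on the wire. */
struct flow_key {
	uint32_t src_ip, dst_ip;
	uint16_t src_port, dst_port;
};

struct qp_entry {
	struct flow_key key;
	void *qp;		/* the queue pair bound to this flow */
};

/* A linear search stands in for whatever lookup the hardware does. */
static void *lookup_qp(struct qp_entry *tbl, size_t n, struct flow_key k)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].key.src_ip == k.src_ip &&
		    tbl[i].key.dst_ip == k.dst_ip &&
		    tbl[i].key.src_port == k.src_port &&
		    tbl[i].key.dst_port == k.dst_port)
			return tbl[i].qp;
	return NULL;	/* contrast: RoCE demuxes on the BTH QPN instead */
}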
On May 10, 2015, at 11:42 PM, ira.weiny <ira.weiny@intel.com> wrote:

> On Tue, May 05, 2015 at 08:22:21PM +0000, Dave Goodell (dgoodell) wrote:
>>
>> In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet. They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.
>>
>> [^^^^^ there continues to be confusion about this for some reason, but I don't know why]
>
> So what is this patch for?

Does my earlier email clarify the situation any? http://marc.info/?l=linux-rdma&m=142972178630720&w=2

> commit 248567f79304b953ea492fb92ade097b62ed09b2
> Author: Upinder Malhi <umalhi@cisco.com>
> Date:   Thu Jan 9 14:48:19 2014 -0800
>
>     IB/core: Add RDMA_TRANSPORT_USNIC_UDP
>
>     Add RDMA_TRANSPORT_USNIC_UDP which will be used by usNIC.
>
>     Signed-off-by: Upinder Malhi <umalhi@cisco.com>
>     Signed-off-by: Roland Dreier <roland@purestorage.com>
>
> This is probably where a lot of the confusion is coming from.

Arguably RDMA_TRANSPORT_USNIC_UDP could/should have simply been named RDMA_TRANSPORT_UDP.

-Dave
On Mon, May 11, 2015 at 10:19:29PM +0000, Dave Goodell (dgoodell) wrote:
> > I really need to just sit down and read your driver front to back.
>
> Feel free to ping me off-list if you want any explanation about what's going on in the usnic_verbs module. I'm happy to help.

So, I look at this:

int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
			struct ib_send_wr **bad_wr)
{
	usnic_dbg("\n");
	return -EINVAL;
}

And conclude that usnic doesn't actually provide any RDMA services to in-kernel users, correct?

I'd then guess the entire point of this is to hack some non-RDMA device to export its interface via uverbs? Presumably because someone nack'd using your own char device or whatever?

Jason
> 
> On May 10, 2015, at 11:42 PM, ira.weiny <ira.weiny@intel.com> wrote:
> 
> > On Tue, May 05, 2015 at 08:22:21PM +0000, Dave Goodell (dgoodell) wrote:
> >>
> >> In the case that usNIC is operating in UDP mode (which is the overwhelming majority of the cases), there is absolutely no additional protocol that ends up on the wire or headers in the user buffers besides UDP/IP/Ethernet. They are 100% plain UDP packets, they just happen to be sent via OS-bypass queues instead of traveling through the kernel networking stack.
> >>
> >> [^^^^^ there continues to be confusion about this for some reason, but I don't know why]
> >
> > So what is this patch for?
> 
> Does my earlier email clarify the situation any? http://marc.info/?l=linux-rdma&m=142972178630720&w=2

Somewhat. Is there any reason applications need to distinguish between the "legacy RDMA_TRANSPORT_USNIC type" and the "current RDMA_TRANSPORT_USNIC_UDP type"? Or does the former no longer exist?

> > commit 248567f79304b953ea492fb92ade097b62ed09b2
> > Author: Upinder Malhi <umalhi@cisco.com>
> > Date:   Thu Jan 9 14:48:19 2014 -0800
> >
> >     IB/core: Add RDMA_TRANSPORT_USNIC_UDP
> >
> >     Add RDMA_TRANSPORT_USNIC_UDP which will be used by usNIC.
> >
> >     Signed-off-by: Upinder Malhi <umalhi@cisco.com>
> >     Signed-off-by: Roland Dreier <roland@purestorage.com>
> >
> > This is probably where a lot of the confusion is coming from.
> 
> Arguably RDMA_TRANSPORT_USNIC_UDP could/should have simply been named RDMA_TRANSPORT_UDP.

I guess I'm wondering if there needs to be an RDMA_TRANSPORT_USNIC to represent the "legacy RDMA_TRANSPORT_USNIC type" you mention in the link above.

Ira
On May 11, 2015, at 3:39 PM, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:

> So, I look at this:
>
> int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
> 			struct ib_send_wr **bad_wr)
> {
> 	usnic_dbg("\n");
> 	return -EINVAL;
> }
>
> And conclude that usnic doesn't actually provide any RDMA services to in-kernel users, correct?

Correct.

> I'd then guess the entire point of this is to hack some non-RDMA device to export its interface via uverbs?

If by "non-RDMA" you mean "does not currently support hardware offload of direct data placement", then yes. There are a couple of things to keep in mind:

1) usNIC is UD-only at this time. Are there any kernel clients that make use of UD without also requiring RC? (this is a genuine question)

2) When using UDP, usNIC deposits 42-byte UDP/IP/Ethernet headers into the start of the recv buffer, not a 40-byte GRH. Unfortunately, the current verbs stack has no way to indicate the device's/port's/QP's header size, let alone format, to any client. So clients must be aware they are using a usNIC device under the hood.

> Presumably because someone nack'd using your own char device or whatever?

I wasn't participating on the kernel side of the usNIC project at the time that some of the early decisions were made, but to my knowledge the uverbs stack was selected as the most appropriate way to expose this technology to users after evaluation of several approaches. I don't think that any other approach was previously submitted and NACK-ed.

-Dave
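[For reference, a minimal userspace sketch of where those 42 bytes come from; my own illustration, not Cisco code, and the macro/helper names are invented. The arithmetic is just 14 bytes of Ethernet + 20 bytes of IPv4 (no options) + 8 bytes of UDP, after which the application payload starts, where a classic UD QP would instead have placed a 40-byte GRH.]

#include <net/ethernet.h>	/* struct ether_header, 14 bytes */
#include <netinet/ip.h>		/* struct iphdr, 20 bytes w/o options */
#include <netinet/udp.h>	/* struct udphdr, 8 bytes */

#define USNIC_UDP_HDR_LEN (sizeof(struct ether_header) + \
			   sizeof(struct iphdr) +         \
			   sizeof(struct udphdr))	/* 14 + 20 + 8 = 42 */

/* The payload begins 42 bytes into the recv buffer. A client that is
 * unaware it is talking to a usNIC device would misparse this prefix
 * as a 40-byte GRH, which is exactly the problem described above. */
static inline void *usnic_udp_payload(void *recv_buf)
{
	return (char *)recv_buf + USNIC_UDP_HDR_LEN;
}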
On May 11, 2015, at 7:13 PM, Weiny, Ira <ira.weiny@intel.com> wrote:

>> Does my earlier email clarify the situation any? http://marc.info/?l=linux-rdma&m=142972178630720&w=2
>
> Somewhat. Is there any reason applications need to distinguish between the "legacy RDMA_TRANSPORT_USNIC type" and the "current RDMA_TRANSPORT_USNIC_UDP type"?

Addressing is different between the two (Ethernet_MAC+QP# vs. IP+UDP_port, respectively). Also, the received UD header format and size are different between them (40-byte synthetic GRH vs. 42-byte UDP/IP/Ethernet header).

> Or does the former no longer exist?

The former is no longer being deployed and should be rare to find in the wild at this point, though I do not know precisely how many customers may be using it still. It is definitely legacy at this point.

>> Arguably RDMA_TRANSPORT_USNIC_UDP could/should have simply been named RDMA_TRANSPORT_UDP.
>
> I guess I'm wondering if there needs to be an RDMA_TRANSPORT_USNIC to represent the "legacy RDMA_TRANSPORT_USNIC type" you mention in the link above.

It probably needs to exist for a little while still.

-Dave
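[As a sketch of how a userspace application can already tell the two apart, assuming a libibverbs recent enough to define both USNIC transport enums; the printed strings are my own wording:]

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	int i, num;
	struct ibv_device **list = ibv_get_device_list(&num);

	if (!list)
		return 1;
	for (i = 0; i < num; i++) {
		switch (list[i]->transport_type) {
		case IBV_TRANSPORT_USNIC:	/* legacy: MAC+QP# addressing */
			printf("%s: legacy usNIC\n", ibv_get_device_name(list[i]));
			break;
		case IBV_TRANSPORT_USNIC_UDP:	/* current: IP+UDP_port addressing */
			printf("%s: usNIC over UDP\n", ibv_get_device_name(list[i]));
			break;
		default:
			break;
		}
	}
	ibv_free_device_list(list);
	return 0;
}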
On Tue, May 12, 2015 at 05:41:14PM +0000, Dave Goodell (dgoodell) wrote:
> > I'd then guess the entire point of this is to hack some non-RDMA device to export its interface via uverbs?
>
> If by "non-RDMA" you mean "does not currently support hardware offload of direct data placement", then yes. There are a couple of things to keep in mind:

I actually mean 'does not provide RDMA services to the kernel'.

What actually happens if you, for instance, load the iser module and try and use it with usnic?

What parts of the RDMA core code does usnic actually use beyond command marshaling through uverbs?

Would things be easier for you if usnic lived within DPDK?

Jason
> From: Jason Gunthorpe [mailto:jgunthorpe@obsidianresearch.com]
> > > If you had a device that supported iWARP and RoCE on the same physical link layer, then yes, the app would need a means of saying which transport to use in addition to the type of QP to establish.
> >
> > Ah - got it now. And I agree, there should be some way to specify this at the QP level.

Supporting iWARP and RoCE over the same RDMA device makes no sense.

First of all, it is confusing to the user. It's like supporting SRP and iSCSI using the same SCSI device instance...

Second, different technologies affect much more than the QP. A technology determines the format and semantics of the whole interface. For example, AHs, memory windows, atomics, and CQE formats are different as well. Just thinking of how to abstract all these in a single device is mind-boggling.

Finally, there is no sane way to describe device capabilities. There is no single set of capabilities that applies to all of these technologies consistently.

> Yes, the only way out is to specify on a per QP basis the addressing and protocol/transport/whatever thing. Socket uses the AF,SOCK,PROTO tuple to specify this information. We can probably productively use a similar breakdown:

We might consider such abstractions at the CMA layer, not at the device level. Essentially, this is what the CMA was intended for.

> AF_IB,SOCK_RC,PROTO_IBA	// InfiniBand
> AF_OPA,SOCK_RC,PROTO_IBA	// Future 32 bit LID OPA 'InfiniBand'

This is not PROTO_IBA.

> AF_ETH,SOCK_RC,PROTO_IBA	// RoCEv1
> AF_INET,SOCK_RC,PROTO_IBA	// InfiniBand or RoCEv2, depending on the Link Layer

AF_ETH,SOCK_RC,PROTO_USNIC
AF_INET,SOCK_RC,PROTO_USNIC

> AF_INET,SOCK_RC,PROTO_IWARP
> [snip]

> Realistically, today, if a RoCE/iWarp driver appeared then it would have to present to the system as two RDMA devices.

Exactly!

--Liran
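[A rough sketch of what the socket-style triple proposed above might look like if written down in code. This is purely illustrative: none of these enums or structs exist in the tree, and the names are suffixed to avoid clashing with the real socket AF_* macros.]

/* Illustrative only: a socket-style (af, type, proto) triple for RDMA QPs. */
enum rdma_af_ex    { AF_IB_EX, AF_OPA_EX, AF_ETH_EX, AF_INET_EX };
enum rdma_sock_ex  { SOCK_RC_EX, SOCK_UD_EX };
enum rdma_proto_ex { PROTO_IBA_EX, PROTO_IWARP_EX, PROTO_USNIC_EX };

struct rdma_qp_spec {
	enum rdma_af_ex    af;		/* address format used to reach the peer */
	enum rdma_sock_ex  type;	/* QP service type */
	enum rdma_proto_ex proto;	/* wire protocol the device emits */
};

/* e.g., per the table quoted above, RoCEv2 on an Ethernet link would be
 * { AF_INET_EX, SOCK_RC_EX, PROTO_IBA_EX }. Whether this belongs at the
 * device level or in the CMA is exactly the disagreement in this thread. */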
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b360350..6a37255 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -262,6 +262,37 @@ out:
 	return ret;
 }
 
+static int read_core_cap_flags(struct ib_device *device)
+{
+	struct ib_port_attr tprops;
+	int num_ports, ret = -ENOMEM;
+	u8 port_index;
+
+	num_ports = device->phys_port_cnt;
+
+	device->core_cap_flags = kzalloc(sizeof(*device->core_cap_flags)
+					 * (num_ports+1),
+					 GFP_KERNEL);
+	if (!device->core_cap_flags)
+		return -ENOMEM;
+
+	for (port_index = 0; port_index <= num_ports; ++port_index) {
+		if ((port_index == 0 && device->node_type != RDMA_NODE_IB_SWITCH))
+			continue;
+
+		ret = ib_query_port(device, port_index, &tprops);
+		if (ret)
+			goto err;
+
+		device->core_cap_flags[port_index] = tprops.core_cap_flags;
+	}
+
+	return 0;
+err:
+	kfree(device->core_cap_flags);
+	return ret;
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
@@ -302,12 +333,21 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
+	ret = read_core_cap_flags(device);
+	if (ret) {
+		dev_err(&device->dev, "Couldn't create Core Capability flags\n");
+		kfree(device->gid_tbl_len);
+		kfree(device->pkey_tbl_len);
+		goto out;
+	}
+
 	ret = ib_device_register_sysfs(device, port_callback);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
 		kfree(device->gid_tbl_len);
 		kfree(device->pkey_tbl_len);
+		kfree(device->core_cap_flags);
 		goto out;
 	}
 
@@ -351,6 +391,7 @@ void ib_unregister_device(struct ib_device *device)
 
 	kfree(device->gid_tbl_len);
 	kfree(device->pkey_tbl_len);
+	kfree(device->core_cap_flags);
 
 	mutex_unlock(&device_mutex);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c724114..4de2758 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -353,11 +353,32 @@ union rdma_protocol_stats {
 	struct iw_protocol_stats	iw;
 };
 
+/* Define bits for the various functionality this port needs to be supported by
+ * the core.
+ */
+/* Management                          0x00000000FFFFFFFF */
+#define RDMA_CORE_CAP_IB_MAD		0x0000000000000001ULL
+#define RDMA_CORE_CAP_IB_SMI		0x0000000000000002ULL
+#define RDMA_CORE_CAP_IB_CM		0x0000000000000004ULL
+#define RDMA_CORE_CAP_IW_CM		0x0000000000000008ULL
+#define RDMA_CORE_CAP_IB_SA		0x0000000000000010ULL
+
+/* Address format                      0x0000FFFF00000000 */
+#define RDMA_CORE_CAP_AF_IB		0x0000000100000000ULL
+#define RDMA_CORE_CAP_ETH_AH		0x0000000200000000ULL
+
+/* Protocol                            0xFFFF000000000000 */
+#define RDMA_CORE_CAP_PROT_IB		0x0001000000000000ULL
+#define RDMA_CORE_CAP_PROT_IBOE		0x0002000000000000ULL
+#define RDMA_CORE_CAP_PROT_IWARP	0x0004000000000000ULL
+#define RDMA_CORE_CAP_PROT_USNIC_UDP	0x0008000000000000ULL
+
 struct ib_port_attr {
 	enum ib_port_state	state;
 	enum ib_mtu		max_mtu;
 	enum ib_mtu		active_mtu;
 	int			gid_tbl_len;
+	u64			core_cap_flags;
 	u32			port_cap_flags;
 	u32			max_msg_sz;
 	u32			bad_pkey_cntr;
@@ -1684,6 +1705,7 @@ struct ib_device {
 	u32			     local_dma_lkey;
 	u8                           node_type;
 	u8                           phys_port_cnt;
+	u64			     *core_cap_flags; /* Per port core capability flags */
 };
 
 struct ib_client {
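[To illustrate how a core consumer might use the per-port array this patch introduces, here is a hedged sketch built only on what the patch itself defines; the helper and the setup_mad_agent() consumer are hypothetical names, not part of the patch. device->core_cap_flags[] is indexed by port number (slot 0 is filled only for switches), and each u64 holds RDMA_CORE_CAP_* bits.]

/* Hypothetical helper, not in the patch above. */
static inline bool rdma_port_has_cap(struct ib_device *device,
				     u8 port_num, u64 cap)
{
	return !!(device->core_cap_flags[port_num] & cap);
}

/* e.g., the MAD layer could then gate its per-port setup like so: */
if (rdma_port_has_cap(device, port, RDMA_CORE_CAP_IB_MAD))
	setup_mad_agent(device, port);	/* hypothetical consumer */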