diff mbox

[5/7] libmlx4: add IBoE support

Message ID alpine.LRH.2.00.1107191231540.5580@ogerlitz.voltaire.com (mailing list archive)
State New, archived
Headers show

Commit Message

Or Gerlitz July 19, 2011, 9:32 a.m. UTC
Modify libmlx4 to support IBoE, where the only user space piece
to handle is the creation of UD address handles - the L2 Ethernet
attributes have to be resolved from the DGID.
Derived from work by Eli Cohen <eli@mellanox.co.il>

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 src/mlx4.h  |    1 +
 src/qp.c    |    1 +
 src/verbs.c |   49 +++++++++++++++++++++++++++++++++++++++++++++++--
 src/wqe.h   |    3 ++-
 4 files changed, 51 insertions(+), 3 deletions(-)

Comments

Jason Gunthorpe July 19, 2011, 4:49 p.m. UTC | #1
On Tue, Jul 19, 2011 at 12:32:52PM +0300, Or Gerlitz wrote:
> Modify libmlx4 to support IBoE, where the only user space piece
> to handle is the creation of UD address handles - the L2 Ethernet
> attributes have to be resolved from the DGID.

It seems a shame to add a kernel syscall to every ah creation just to
learn the link type.

Other than rejecting some GIDs it doesn't seem like there is any
operational need to learn the link type..

Also, did anyone document the GID format for IBoE? IIRC we invented
this for Linux since IBA refused to standardize anything.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz July 19, 2011, 7:34 p.m. UTC | #2
Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
>> Modify libmlx4 to support IBoE, where the only user space piece
>> to handle is the creation of UD address handles - the L2 Ethernet
>> attributes have to be resolved from the DGID.

> Other than rejecting some GIDs it doesn't seem like there is any
> operational need to learn the link type..

Jason, as the change log explains, under the Ethernet link type the code
(see mlx4_resolve_grh_to_l2) goes and extracts the Ethernet L2 info
(MAC and VLAN; look at the last patch) from the GID.

> It seems a shame to add a kernel syscall to every ah creation just to learn the link type

AH creation is on the app's slow path, i.e. the equivalent of session
creation, so I wouldn't worry too much about performance. On the elegance
side, I'll check if/how I can plant/cache the link type for the AH
creation code to use.

> Also, did anyone document the GID format for IBoE? IIRC we invented
> this for Linux since IBA refused to standardize anything.

Not really. Maybe we can/should add a document to the kernel IB
documentation folder with some details, as this data is more for deep
divers. The thing is that applications that use the rdma-cm don't
deal directly with the GIDs; they just get them from the kernel and hand
them over to libibverbs (e.g. UD apps use address/route resolution and
then call AH create; RC apps don't even have to call libibverbs, as
the QP modifications happen under the covers, with librdmacm calling
libibverbs).

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz July 19, 2011, 8:18 p.m. UTC | #3
Jason Gunthorpe <jgunthorpe@obsidianresearch.com> wrote:
> Or Gerlitz wrote:

>> Jason, as the change log explains, under Ethernet link type the code
>> (see mlx4_resolve_grh_to_l2) goes and extracts the Ethernet L2 info
>> (mac and vlan, look on the last patch) from the GID.
>
> Looking at the control flow of the actual code, there is no reason it
> needs to know the link type, except to generate that error code. Why
> can't it set the slid, dlid and MAC fields in mlx4_ah simultaneously?

If you look on patch 6/7 @ mlx4_resolve_grh_to_l2, you would see this section

+               if (vid != 0xffff) {
+                       ah->av.port_pd |= htonl(1 << 29);
+                       ah->vlan = vid | ((attr->sl & 7) << 13);
+               }

This bit in the PD tells the HW to add a VLAN header to the packet, and
the user priority bits for that VLAN are set from the SL; we want to do
that only for the Ethernet link type.

>> AH creation is app's slow path, e.g the equivalent of session
>> creation, so I wouldn't bother too much on perf, on the being elegant
>> side, I'll check if/how I can plant/cache the link type for the ah
>> creation code to use.

> Still, the MPI people regularly seem to have job startup problems, no
> idea how many AH's they end up creating at startup though..


Under IB, job startup problems relate to SA scalability. Generally, under any
transport at large scale there could be more issues, but I don't think AH
creation time is among them.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/mlx4.h b/src/mlx4.h
index 4445998..b277b06 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -241,6 +241,7 @@  struct mlx4_av {
 struct mlx4_ah {
 	struct ibv_ah			ibv_ah;
 	struct mlx4_av			av;
+	uint8_t				mac[6];	/* L2 address taken from the DGID on Ethernet ports */
 };

 static inline unsigned long align(unsigned long val, unsigned long align)
diff --git a/src/qp.c b/src/qp.c
index ec138cd..4d79e38 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -144,6 +144,7 @@  static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
 	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
 	dseg->dqpn = htonl(wr->wr.ud.remote_qpn);
 	dseg->qkey = htonl(wr->wr.ud.remote_qkey);
+	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
 }

 static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
diff --git a/src/verbs.c b/src/verbs.c
index 1ac1362..6620ac2 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -614,9 +614,45 @@  int mlx4_destroy_qp(struct ibv_qp *ibqp)
 	return 0;
 }

+/* Return 1 if gid has the link-local prefix fe80::/64, 0 otherwise. */
+static int link_local_gid(const union ibv_gid *gid)
+{
+	/* Compare bytes: reading raw[] through a uint32_t * would break
+	 * strict aliasing and cast away const. */
+	static const uint8_t ll_prefix[8] = { 0xfe, 0x80 };
+
+	return memcmp(gid->raw, ll_prefix, sizeof ll_prefix) == 0;
+}
+
+/* VLAN id from GID bytes 11-12 (big-endian); 0xffff if not below 0x1000. */
+static uint16_t get_vlan_id(union ibv_gid *gid)
+{
+	uint16_t tag = (uint16_t)((gid->raw[11] << 8) | gid->raw[12]);
+	return tag >= 0x1000 ? 0xffff : tag;
+}
+
+
+/* Derive ah->mac from the link-local DGID in attr.  Returns 0 on success,
+ * 1 if the DGID encodes a VLAN id or lacks the link-local prefix. */
+static int mlx4_resolve_grh_to_l2(struct mlx4_ah *ah, struct ibv_ah_attr *attr)
+{
+	union ibv_gid *dgid = &attr->grh.dgid;
+
+	if (get_vlan_id(dgid) != 0xffff || !link_local_gid(dgid))
+		return 1;
+	memcpy(ah->mac, &dgid->raw[8], 3);	/* GID bytes 8-10  -> MAC 0-2 */
+	memcpy(ah->mac + 3, &dgid->raw[13], 3);	/* GID bytes 13-15 -> MAC 3-5 */
+	ah->mac[0] ^= 2;			/* toggle bit 1 of first octet */
+	return 0;
+}
+
 struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
 {
 	struct mlx4_ah *ah;
+	struct ibv_port_attr port_attr;
+
+	if (ibv_query_port(pd->context, attr->port_num, &port_attr))
+		return NULL;

 	ah = malloc(sizeof *ah);
 	if (!ah)
@@ -625,8 +661,11 @@  struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
 	memset(&ah->av, 0, sizeof ah->av);

 	ah->av.port_pd   = htonl(to_mpd(pd)->pdn | (attr->port_num << 24));
-	ah->av.g_slid    = attr->src_path_bits;
-	ah->av.dlid      = htons(attr->dlid);
+
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		ah->av.g_slid = attr->src_path_bits;
+		ah->av.dlid   = htons(attr->dlid);
+	}
 	if (attr->static_rate) {
 		ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
 		/* XXX check rate cap? */
@@ -642,6 +681,12 @@  struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
 		memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
 	}

+	if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET)
+		if (mlx4_resolve_grh_to_l2(ah, attr)) {
+			free(ah);
+			return NULL;
+		}
+
 	return &ah->ibv_ah;
 }

diff --git a/src/wqe.h b/src/wqe.h
index 6f7f309..043f0da 100644
--- a/src/wqe.h
+++ b/src/wqe.h
@@ -78,7 +78,8 @@  struct mlx4_wqe_datagram_seg {
 	uint32_t		av[8];
 	uint32_t		dqpn;
 	uint32_t		qkey;
-	uint32_t		reserved[2];
+	uint16_t		reserved;
+	uint8_t			mac[6];
 };

 struct mlx4_wqe_data_seg {