@@ -300,6 +300,7 @@ struct torus {
struct coord_dirs *origin;
struct t_switch ****sw;
+ struct t_switch *master_stree_root;
unsigned flags;
int debug;
@@ -8515,6 +8516,241 @@ bool torus_lft(struct torus *t, struct t_switch *sw)
}
static
+/*
+ * Recursively build one branch of the spanning tree for multicast group
+ * mgb, mirroring the torus master spanning tree rooted at sw.
+ *
+ * On success, returns the osm_mtree_node_t for this switch with
+ * child_array[] filled in for every downstream port that carries the
+ * group, and programs the multicast forwarding tables (in both link
+ * directions) for mgb->mlid.  Returns NULL if nothing at or below this
+ * switch participates in the group, or on error.
+ *
+ * sw, osm_sw:  the torus and opensm descriptions of the same switch;
+ *              cross-checked below to guard against stale cached pointers.
+ * depth:       depth of the caller; incremented on entry, so the root
+ *              must be called with depth == -1 (see torus_mcast_stree()).
+ * port_cnt:    incremented by the number of end (CA) ports added here
+ *              and in all recursive calls.
+ * max_depth:   raised to the deepest level that contributed to the tree.
+ */
+osm_mtree_node_t *mcast_stree_branch(struct t_switch *sw, osm_switch_t *osm_sw,
+				     osm_mgrp_box_t *mgb, unsigned depth,
+				     unsigned *port_cnt, unsigned *max_depth)
+{
+	osm_mtree_node_t *mtn = NULL;
+	osm_mcast_tbl_t *mcast_tbl, *ds_mcast_tbl;
+	osm_node_t *ds_node;
+	struct t_switch *ds_sw;
+	struct port_grp *ptgrp;
+	struct link *link;
+	struct endpoint *port;
+	unsigned g, p;
+	unsigned mcast_fwd_ports = 0, mcast_end_ports = 0;
+
+	depth++;
+
+	/*
+	 * NOTE(review): "0x%04llx" width looks too small for a 64-bit
+	 * GUID in the log messages below — confirm %016llx wasn't intended.
+	 */
+	if (osm_sw->priv != sw) {
+		OSM_LOG(&sw->torus->osm->log, OSM_LOG_INFO,
+			"Error: osm_sw (GUID 0x%04llx) "
+			"not in our fabric description\n",
+			ntohllu(osm_node_get_node_guid(osm_sw->p_node)));
+		goto out;
+	}
+	if (!osm_switch_supports_mcast(osm_sw)) {
+		OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+			"Error: osm_sw (GUID 0x%04llx) "
+			"does not support multicast\n",
+			ntohllu(osm_node_get_node_guid(osm_sw->p_node)));
+		goto out;
+	}
+	mtn = osm_mtree_node_new(osm_sw);
+	if (!mtn) {
+		OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+			"Insufficient memory to build multicast tree\n");
+		goto out;
+	}
+	mcast_tbl = osm_switch_get_mcast_tbl_ptr(osm_sw);
+	/*
+	 * Recurse to downstream switches, i.e. those closer to master
+	 * spanning tree branch tips.
+	 *
+	 * Note that if there are multiple ports in this port group, i.e.,
+	 * multiple parallel links, we can pick any one of them to use for
+	 * any individual MLID without causing loops. Pick one based on MLID
+	 * for now, until someone turns up evidence we need to be smarter.
+	 *
+	 * Also, it might be we got called in a window between a switch getting
+	 * removed from the fabric, and torus-2QoS getting to rebuild its
+	 * fabric representation. If that were to happen, our next hop
+	 * osm_switch pointer might be stale. Look it up via opensm's fabric
+	 * description to be sure it's not.
+	 */
+	for (g = 0; g < 2 * TORUS_MAX_DIM; g++) {
+		ptgrp = &sw->ptgrp[g];
+		if (!ptgrp->to_stree_tip)
+			continue;
+
+		p = mgb->mlid % ptgrp->port_cnt;/* port # in port group */
+		p = ptgrp->port[p]->port;	/* now port # in switch */
+
+		ds_node = osm_node_get_remote_node(osm_sw->p_node, p, NULL);
+		ds_sw = ptgrp->to_stree_tip->sw;
+
+		if (!(ds_node && ds_node->sw &&
+		      ds_sw->osm_switch == ds_node->sw)) {
+			OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+				"Error: stale pointer to osm_sw "
+				"(GUID 0x%04llx)\n", ntohllu(ds_sw->n_id));
+			continue;
+		}
+		mtn->child_array[p] =
+			mcast_stree_branch(ds_sw, ds_node->sw, mgb,
+					   depth, port_cnt, max_depth);
+		/* NULL means nothing below this port uses the group. */
+		if (!mtn->child_array[p])
+			continue;
+
+		osm_mcast_tbl_set(mcast_tbl, mgb->mlid, p);
+		mcast_fwd_ports++;
+		/*
+		 * Since we forward traffic for this multicast group on this
+		 * port, cause the switch on the other end of the link
+		 * to forward traffic back to us. Do it now since have at
+		 * hand the link used; otherwise it'll be hard to figure out
+		 * later, and if we get it wrong we get a MC routing loop.
+		 */
+		link = sw->port[p]->link;
+		ds_mcast_tbl = osm_switch_get_mcast_tbl_ptr(ds_node->sw);
+
+		/* Program the far end of whichever link end we own. */
+		if (&link->end[0] == sw->port[p])
+			osm_mcast_tbl_set(ds_mcast_tbl, mgb->mlid,
+					  link->end[1].port);
+		else
+			osm_mcast_tbl_set(ds_mcast_tbl, mgb->mlid,
+					  link->end[0].port);
+	}
+	/*
+	 * Add any host ports marked as in mcast group into spanning tree.
+	 * The marks (endpoint->tmp) were set by torus_mcast_stree(); they
+	 * are consumed (cleared) here so they don't leak into the next
+	 * group's tree build.
+	 */
+	ptgrp = &sw->ptgrp[2 * TORUS_MAX_DIM];
+	for (p = 0; p < ptgrp->port_cnt; p++) {
+		port = ptgrp->port[p];
+		if (port->tmp) {
+			port->tmp = NULL;
+			mtn->child_array[port->port] = OSM_MTREE_LEAF;
+			osm_mcast_tbl_set(mcast_tbl, mgb->mlid, port->port);
+			mcast_end_ports++;
+		}
+	}
+	if (!(mcast_end_ports || mcast_fwd_ports)) {
+		/*
+		 * Nothing at or below this switch uses the group: prune
+		 * this branch.  NOTE(review): mtn came from
+		 * osm_mtree_node_new() — confirm plain free() is the
+		 * matching release and no deeper teardown is needed.
+		 */
+		free(mtn);
+		mtn = NULL;
+	} else if (depth > *max_depth)
+		*max_depth = depth;
+
+	*port_cnt += mcast_end_ports;
+out:
+	return mtn;
+}
+
+static
+/*
+ * Resumable iterator over every member port of every multicast group in
+ * mgrp box mgb: walks the mgb->mgrp_list list of groups, and within each
+ * group the mcm_port_tbl map of member ports.
+ *
+ * list_iterator/map_iterator are the caller-held cursors; initialize both
+ * to NULL before the first call.  Each call returns the next osm_port_t,
+ * or NULL when the iteration is exhausted (at which point both cursors
+ * have been reset to NULL, so the iterator can be reused).
+ */
+osm_port_t *next_mgrp_box_port(osm_mgrp_box_t *mgb,
+			       cl_list_item_t **list_iterator,
+			       cl_map_item_t **map_iterator)
+{
+	osm_mgrp_t *mgrp;
+	osm_mcm_port_t *mcm_port;
+	osm_port_t *osm_port = NULL;
+	cl_map_item_t *m_item = *map_iterator;
+	cl_list_item_t *l_item = *list_iterator;
+
+next_mgrp:
+	/* NULL list cursor means "start from the first group". */
+	if (!l_item)
+		l_item = cl_qlist_head(&mgb->mgrp_list);
+	if (l_item == cl_qlist_end(&mgb->mgrp_list)) {
+		l_item = NULL;
+		goto out;
+	}
+	mgrp = cl_item_obj(l_item, mgrp, list_item);
+
+	/* NULL map cursor means "start from this group's first port". */
+	if (!m_item)
+		m_item = cl_qmap_head(&mgrp->mcm_port_tbl);
+	if (m_item == cl_qmap_end(&mgrp->mcm_port_tbl)) {
+		/* Group exhausted; advance to the next group. */
+		m_item = NULL;
+		l_item = cl_qlist_next(l_item);
+		goto next_mgrp;
+	}
+	mcm_port = cl_item_obj(m_item, mcm_port, map_item);
+	/* Advance the map cursor past the item we are about to return. */
+	m_item = cl_qmap_next(m_item);
+	osm_port = mcm_port->port;
+out:
+	*list_iterator = l_item;
+	*map_iterator = m_item;
+	return osm_port;
+}
+
+static
+/*
+ * Routing-engine callback (r->mcast_build_stree): build the spanning tree
+ * and program the switch multicast forwarding tables for the group(s) in
+ * mgrp box mgb.  Any previous tree for the box is purged first.
+ *
+ * Returns IB_SUCCESS on success, or IB_ERROR if the cached master
+ * spanning tree root no longer matches opensm's fabric description.
+ */
+ib_api_status_t torus_mcast_stree(void *context, osm_mgrp_box_t *mgb)
+{
+	struct torus_context *ctx = context;
+	struct torus *t = ctx->torus;
+	cl_map_item_t *m_item = NULL;
+	cl_list_item_t *l_item = NULL;
+	osm_port_t *osm_port;
+	osm_switch_t *osm_sw;
+	struct endpoint *port;
+	unsigned port_cnt = 0, max_depth = 0;
+
+	osm_purge_mtree(&ctx->osm->sm, mgb);
+
+	/*
+	 * Build a spanning tree for a multicast group by first marking
+	 * the torus endpoints that are participating in the group.
+	 * Then do a depth-first search of the torus master spanning
+	 * tree to build up the spanning tree specific to this group.
+	 *
+	 * Since the torus master spanning tree is constructed specifically
+	 * to guarantee that multicast will not deadlock against unicast
+	 * when they share VLs, we can be sure that any multicast group
+	 * spanning tree constructed this way has the same property.
+	 */
+	while ((osm_port = next_mgrp_box_port(mgb, &l_item, &m_item))) {
+		port = osm_port->priv;
+		/* Re-link if our cached endpoint pointer has gone stale. */
+		if (!(port && port->osm_port == osm_port)) {
+			port = osm_port_relink_endpoint(osm_port);
+			if (!port) {
+				guid_t id;
+				id = osm_node_get_node_guid(osm_port->p_node);
+				OSM_LOG(&ctx->osm->log, OSM_LOG_ERROR,
+					"Error: osm_port (GUID 0x%04llx) "
+					"not in our fabric description\n",
+					ntohllu(id));
+				continue;
+			}
+		}
+		/*
+		 * If this is a CA port, mark the switch port at the
+		 * other end of this port's link.
+		 *
+		 * By definition, a CA port is connected to end[1] of a link,
+		 * and the switch port is end[0]. See build_ca_link() and
+		 * link_srcsink().
+		 */
+		if (port->link)
+			port = &port->link->end[0];
+		/* Mark for mcast_stree_branch(), which clears the mark. */
+		port->tmp = osm_port;
+	}
+	/*
+	 * It might be we got called in a window between a switch getting
+	 * removed from the fabric, and torus-2QoS getting to rebuild its
+	 * fabric representation. If that were to happen, our
+	 * master_stree_root->osm_switch pointer might be stale. Look up
+	 * the osm_switch by GUID to be sure it's not.
+	 *
+	 * Also, call into mcast_stree_branch with depth = -1, because
+	 * depth at root switch needs to be 0.
+	 */
+	osm_sw = (osm_switch_t *)cl_qmap_get(&ctx->osm->subn.sw_guid_tbl,
+					     t->master_stree_root->n_id);
+	if (!(osm_sw && t->master_stree_root->osm_switch == osm_sw)) {
+		OSM_LOG(&ctx->osm->log, OSM_LOG_ERROR,
+			"Error: stale pointer to osm_sw (GUID 0x%04llx)\n",
+			ntohllu(t->master_stree_root->n_id));
+		return IB_ERROR;
+	}
+	/* mgb->root may legitimately be NULL if no ports joined. */
+	mgb->root = mcast_stree_branch(t->master_stree_root, osm_sw,
+				       mgb, -1, &port_cnt, &max_depth);
+
+	OSM_LOG(&ctx->osm->log, OSM_LOG_VERBOSE,
+		"Configured MLID 0x%X for %u ports, max tree depth = %u\n",
+		mgb->mlid, port_cnt, max_depth);
+
+	return IB_SUCCESS;
+}
+
+static
bool good_xy_ring(struct torus *t, int x, int y, int z)
{
struct t_switch ****sw = t->sw;
@@ -8740,6 +8976,7 @@ bool torus_master_stree(struct torus *t)
if (t->sw[i][j][k])
build_master_stree_branch(t->sw[i][j][k], 2);
}
+ t->master_stree_root = stree_root;
/*
* At this point we should have a master spanning tree that contains
* every present switch, for all fabrics that torus-2QoS can route
@@ -8855,17 +9092,7 @@ uint8_t torus_path_sl(void *context, uint8_t path_sl_hint,
/*
* If QoS was not requested by user, force path SLs into 8-15 range.
- * This leaves SL 0 available for multicast, and SL2VL mappings
- * will keep multicast traffic from deadlocking with unicast traffic.
- *
- * However, multicast might still deadlock against itself if multiple
- * multicast groups each use their own spanning tree.
- *
- * FIXME: it is possible to construct a spanning tree that can
- * overlay the DOR routing used for unicast in a way that multicast
- * and unicast can share VLs but cannot deadlock against each other.
- * Need to implement that and cause it to be used whenever the
- * torus-2QoS routing engine is used.
+ * This leaves SL 0 available for multicast.
*/
if (t->flags & QOS_ENABLED)
sl |= sl_set_qos(sl_get_qos(path_sl_hint));
@@ -8963,6 +9190,7 @@ int osm_ucast_torus2QoS_setup(struct osm_routing_engine *r,
r->ucast_build_fwd_tables = torus_build_lfts;
r->update_sl2vl = torus_update_osm_sl2vl;
r->path_sl = torus_path_sl;
+ r->mcast_build_stree = torus_mcast_stree;
r->delete = torus_context_delete;
return 0;
}