From patchwork Fri Dec 18 20:51:00 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jim Schutt X-Patchwork-Id: 68805 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.2) with ESMTP id nBIKpGpe024927 for ; Fri, 18 Dec 2009 20:51:21 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932419AbZLRUvU (ORCPT ); Fri, 18 Dec 2009 15:51:20 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1754650AbZLRUvT (ORCPT ); Fri, 18 Dec 2009 15:51:19 -0500 Received: from sentry-three.sandia.gov ([132.175.109.17]:47416 "EHLO sentry-three.sandia.gov" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932418AbZLRUvR (ORCPT ); Fri, 18 Dec 2009 15:51:17 -0500 X-WSS-ID: 0KUV8LL-08-7PW-02 X-M-MSG: Received: from sentry.sandia.gov (sentry.sandia.gov [132.175.109.20]) by sentry-three.sandia.gov (Postfix) with ESMTP id 2ECA78E9564; Fri, 18 Dec 2009 13:51:20 -0700 (MST) Received: from [132.175.109.1] by sentry.sandia.gov with ESMTP (SMTP Relay 01 (Email Firewall v6.3.2)); Fri, 18 Dec 2009 13:51:10 -0700 X-Server-Uuid: 6BFC7783-7E22-49B4-B610-66D6BE496C0E Received: from localhost.localdomain (sale659.sandia.gov [134.253.4.20]) by mailgate.sandia.gov (8.14.1/8.14.1) with ESMTP id nBIKp1i4008814; Fri, 18 Dec 2009 13:51:09 -0700 From: "Jim Schutt" To: linux-rdma@vger.kernel.org cc: sashak@voltaire.com, eitan@mellanox.co.il, jaschut@sandia.gov Subject: [PATCH 11/12] opensm: Implement multicast support for torus-2QoS. 
Date: Fri, 18 Dec 2009 13:51:00 -0700 Message-ID: <1261169461-2516-11-git-send-email-jaschut@sandia.gov> X-Mailer: git-send-email 1.5.6.GIT In-Reply-To: <1258744509-11148-1-git-send-email-jaschut@sandia.gov> References: <1258744509-11148-1-git-send-email-jaschut@sandia.gov> X-PMX-Version: 5.5.7.378829, Antispam-Engine: 2.7.2.376379, Antispam-Data: 2009.12.18.204217 X-PerlMx-Spam: Gauge=IIIIIIII, Probability=8%, Report=' BODY_SIZE_10000_PLUS 0, TO_NO_NAME 0, __HAS_MSGID 0, __HAS_X_MAILER 0, __MIME_TEXT_ONLY 0, __SANE_MSGID 0, __STOCK_PHRASE_7 0, __TO_MALFORMED_2 0, __URI_NS ' X-TMWD-Spam-Summary: TS=20091218205111; ID=1; SEV=2.3.1; DFV=B2009121816; IFV=NA; AIF=B2009121816; RPD=5.03.0010; ENG=NA; RPDID=7374723D303030312E30413031303230332E34423242454233462E303043383A534346535441543838363133332C73733D312C6667733D30; CAT=NONE; CON=NONE; SIG=AAAAAAAAAAAAAAAAAAAAAAAAfQ== X-MMS-Spam-Filter-ID: B2009121816_5.03.0010 MIME-Version: 1.0 X-WSS-ID: 673534B42OC300013-01-01 Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org

[Reviewer note: line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch, and the position of blank context
lines was inferred from the hunk line counts, so apply with "patch -l" and
check the context by eye. Hunk headers were updated for the lines added in
review; the index line still names the originally submitted blobs.]

diff --git a/opensm/opensm/osm_ucast_torus.c b/opensm/opensm/osm_ucast_torus.c
index 082fcf5..e2eb324 100644
--- a/opensm/opensm/osm_ucast_torus.c
+++ b/opensm/opensm/osm_ucast_torus.c
@@ -300,6 +300,7 @@ struct torus {
 	struct coord_dirs *origin;
 
 	struct t_switch ****sw;
+	struct t_switch *master_stree_root;
 
 	unsigned flags;
 	int debug;
@@ -8515,6 +8516,282 @@ bool torus_lft(struct torus *t, struct t_switch *sw)
 }
 
+/*
+ * Build the multicast spanning tree for mgb->mlid at and below sw by
+ * walking the master spanning tree toward its branch tips, setting
+ * multicast forwarding table entries along the way.
+ *
+ * depth is the caller's depth; it is incremented on entry, so the root
+ * call passes -1 to get depth 0. *port_cnt accumulates the number of
+ * marked end ports added to the tree, and *max_depth is raised to the
+ * deepest level that ended up in the tree.
+ *
+ * Returns the tree node for sw, or NULL if sw cannot be used or nothing
+ * at or below it belongs to the group.
+ */
 static
+osm_mtree_node_t *mcast_stree_branch(struct t_switch *sw, osm_switch_t *osm_sw,
+				     osm_mgrp_box_t *mgb, unsigned depth,
+				     unsigned *port_cnt, unsigned *max_depth)
+{
+	osm_mtree_node_t *mtn = NULL;
+	osm_mcast_tbl_t *mcast_tbl, *ds_mcast_tbl;
+	osm_node_t *ds_node;
+	struct t_switch *ds_sw;
+	struct port_grp *ptgrp;
+	struct link *link;
+	struct endpoint *port;
+	unsigned g, p;
+	unsigned mcast_fwd_ports = 0, mcast_end_ports = 0;
+
+	depth++;
+
+	if (osm_sw->priv != sw) {
+		OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+			"Error: osm_sw (GUID 0x%04llx) "
+			"not in our fabric description\n",
+			ntohllu(osm_node_get_node_guid(osm_sw->p_node)));
+		goto out;
+	}
+	if (!osm_switch_supports_mcast(osm_sw)) {
+		OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+			"Error: osm_sw (GUID 0x%04llx) "
+			"does not support multicast\n",
+			ntohllu(osm_node_get_node_guid(osm_sw->p_node)));
+		goto out;
+	}
+	mtn = osm_mtree_node_new(osm_sw);
+	if (!mtn) {
+		OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+			"Insufficient memory to build multicast tree\n");
+		goto out;
+	}
+	mcast_tbl = osm_switch_get_mcast_tbl_ptr(osm_sw);
+	/*
+	 * Recurse to downstream switches, i.e. those closer to master
+	 * spanning tree branch tips.
+	 *
+	 * Note that if there are multiple ports in this port group, i.e.,
+	 * multiple parallel links, we can pick any one of them to use for
+	 * any individual MLID without causing loops. Pick one based on MLID
+	 * for now, until someone turns up evidence we need to be smarter.
+	 *
+	 * Also, it might be we got called in a window between a switch getting
+	 * removed from the fabric, and torus-2QoS getting to rebuild its
+	 * fabric representation. If that were to happen, our next hop
+	 * osm_switch pointer might be stale. Look it up via opensm's fabric
+	 * description to be sure it's not.
+	 */
+	for (g = 0; g < 2 * TORUS_MAX_DIM; g++) {
+		ptgrp = &sw->ptgrp[g];
+		/* An empty port group would make the modulus below fault. */
+		if (!ptgrp->to_stree_tip || !ptgrp->port_cnt)
+			continue;
+
+		p = mgb->mlid % ptgrp->port_cnt;/* port # in port group */
+		p = ptgrp->port[p]->port;	/* now port # in switch */
+
+		ds_node = osm_node_get_remote_node(osm_sw->p_node, p, NULL);
+		ds_sw = ptgrp->to_stree_tip->sw;
+
+		if (!(ds_node && ds_node->sw &&
+		      ds_sw->osm_switch == ds_node->sw)) {
+			OSM_LOG(&sw->torus->osm->log, OSM_LOG_ERROR,
+				"Error: stale pointer to osm_sw "
+				"(GUID 0x%04llx)\n", ntohllu(ds_sw->n_id));
+			continue;
+		}
+		mtn->child_array[p] =
+			mcast_stree_branch(ds_sw, ds_node->sw, mgb,
+					   depth, port_cnt, max_depth);
+		if (!mtn->child_array[p])
+			continue;
+
+		osm_mcast_tbl_set(mcast_tbl, mgb->mlid, p);
+		mcast_fwd_ports++;
+		/*
+		 * Since we forward traffic for this multicast group on this
+		 * port, cause the switch on the other end of the link
+		 * to forward traffic back to us. Do it now since we have at
+		 * hand the link used; otherwise it'll be hard to figure out
+		 * later, and if we get it wrong we get a MC routing loop.
+		 */
+		link = sw->port[p]->link;
+		ds_mcast_tbl = osm_switch_get_mcast_tbl_ptr(ds_node->sw);
+
+		if (&link->end[0] == sw->port[p])
+			osm_mcast_tbl_set(ds_mcast_tbl, mgb->mlid,
+					  link->end[1].port);
+		else
+			osm_mcast_tbl_set(ds_mcast_tbl, mgb->mlid,
+					  link->end[0].port);
+	}
+	/*
+	 * Add any host ports marked as in mcast group into spanning tree.
+	 */
+	ptgrp = &sw->ptgrp[2 * TORUS_MAX_DIM];
+	for (p = 0; p < ptgrp->port_cnt; p++) {
+		port = ptgrp->port[p];
+		if (port->tmp) {
+			port->tmp = NULL;
+			mtn->child_array[port->port] = OSM_MTREE_LEAF;
+			osm_mcast_tbl_set(mcast_tbl, mgb->mlid, port->port);
+			mcast_end_ports++;
+		}
+	}
+	if (!(mcast_end_ports || mcast_fwd_ports)) {
+		free(mtn);
+		mtn = NULL;
+	} else if (depth > *max_depth)
+		*max_depth = depth;
+
+	*port_cnt += mcast_end_ports;
+out:
+	return mtn;
+}
+
+/*
+ * Step through every member port of every multicast group in mgb.
+ *
+ * *list_iterator and *map_iterator hold the position between calls and
+ * must both be NULL on the first call. Returns the next member port, or
+ * NULL once all groups are exhausted, at which point both iterators have
+ * been reset to NULL.
+ */
+static
+osm_port_t *next_mgrp_box_port(osm_mgrp_box_t *mgb,
+			       cl_list_item_t **list_iterator,
+			       cl_map_item_t **map_iterator)
+{
+	osm_mgrp_t *mgrp;
+	osm_mcm_port_t *mcm_port;
+	osm_port_t *osm_port = NULL;
+	cl_map_item_t *m_item = *map_iterator;
+	cl_list_item_t *l_item = *list_iterator;
+
+next_mgrp:
+	if (!l_item)
+		l_item = cl_qlist_head(&mgb->mgrp_list);
+	if (l_item == cl_qlist_end(&mgb->mgrp_list)) {
+		l_item = NULL;
+		goto out;
+	}
+	mgrp = cl_item_obj(l_item, mgrp, list_item);
+
+	if (!m_item)
+		m_item = cl_qmap_head(&mgrp->mcm_port_tbl);
+	if (m_item == cl_qmap_end(&mgrp->mcm_port_tbl)) {
+		m_item = NULL;
+		l_item = cl_qlist_next(l_item);
+		goto next_mgrp;
+	}
+	mcm_port = cl_item_obj(m_item, mcm_port, map_item);
+	m_item = cl_qmap_next(m_item);
+	osm_port = mcm_port->port;
+out:
+	*list_iterator = l_item;
+	*map_iterator = m_item;
+	return osm_port;
+}
+
+/*
+ * Routing engine mcast_build_stree hook (see osm_ucast_torus2QoS_setup()):
+ * rebuild the multicast spanning tree for the group box mgb.
+ *
+ * Returns IB_ERROR, without building a new tree, if there is no usable
+ * master spanning tree root; otherwise IB_SUCCESS with mgb->root set to
+ * the result of mcast_stree_branch(), which may be NULL.
+ */
+static
+ib_api_status_t torus_mcast_stree(void *context, osm_mgrp_box_t *mgb)
+{
+	struct torus_context *ctx = context;
+	struct torus *t = ctx->torus;
+	cl_map_item_t *m_item = NULL;
+	cl_list_item_t *l_item = NULL;
+	osm_port_t *osm_port;
+	osm_switch_t *osm_sw;
+	struct endpoint *port;
+	unsigned port_cnt = 0, max_depth = 0;
+
+	osm_purge_mtree(&ctx->osm->sm, mgb);
+
+	if (!(t && t->master_stree_root)) {
+		OSM_LOG(&ctx->osm->log, OSM_LOG_ERROR,
+			"Error: no master spanning tree for MLID 0x%X\n",
+			mgb->mlid);
+		return IB_ERROR;
+	}
+	/*
+	 * It might be we got called in a window between a switch getting
+	 * removed from the fabric, and torus-2QoS getting to rebuild its
+	 * fabric representation. If that were to happen, our
+	 * master_stree_root->osm_switch pointer might be stale. Look up
+	 * the osm_switch by GUID to be sure it's not.
+	 *
+	 * Do this before marking any endpoints. The marks are cleared only
+	 * as mcast_stree_branch() consumes them, so bailing out after
+	 * marking would leave them set for the next group that gets built.
+	 */
+	osm_sw = (osm_switch_t *)cl_qmap_get(&ctx->osm->subn.sw_guid_tbl,
+					     t->master_stree_root->n_id);
+	if (!(osm_sw && t->master_stree_root->osm_switch == osm_sw)) {
+		OSM_LOG(&ctx->osm->log, OSM_LOG_ERROR,
+			"Error: stale pointer to osm_sw (GUID 0x%04llx)\n",
+			ntohllu(t->master_stree_root->n_id));
+		return IB_ERROR;
+	}
+	/*
+	 * Build a spanning tree for a multicast group by first marking
+	 * the torus endpoints that are participating in the group.
+	 * Then do a depth-first search of the torus master spanning
+	 * tree to build up the spanning tree specific to this group.
+	 *
+	 * Since the torus master spanning tree is constructed specifically
+	 * to guarantee that multicast will not deadlock against unicast
+	 * when they share VLs, we can be sure that any multicast group
+	 * spanning tree constructed this way has the same property.
+	 */
+	while ((osm_port = next_mgrp_box_port(mgb, &l_item, &m_item))) {
+		port = osm_port->priv;
+		if (!(port && port->osm_port == osm_port)) {
+			port = osm_port_relink_endpoint(osm_port);
+			if (!port) {
+				guid_t id;
+				id = osm_node_get_node_guid(osm_port->p_node);
+				OSM_LOG(&ctx->osm->log, OSM_LOG_ERROR,
+					"Error: osm_port (GUID 0x%04llx) "
+					"not in our fabric description\n",
+					ntohllu(id));
+				continue;
+			}
+		}
+		/*
+		 * If this is a CA port, mark the switch port at the
+		 * other end of this port's link.
+		 *
+		 * By definition, a CA port is connected to end[1] of a link,
+		 * and the switch port is end[0]. See build_ca_link() and
+		 * link_srcsink().
+		 */
+		if (port->link)
+			port = &port->link->end[0];
+		port->tmp = osm_port;
+	}
+	/*
+	 * Call into mcast_stree_branch with depth = -1, because
+	 * depth at root switch needs to be 0.
+	 */
+	mgb->root = mcast_stree_branch(t->master_stree_root, osm_sw,
+				       mgb, -1, &port_cnt, &max_depth);
+
+	OSM_LOG(&ctx->osm->log, OSM_LOG_VERBOSE,
+		"Configured MLID 0x%X for %u ports, max tree depth = %u\n",
+		mgb->mlid, port_cnt, max_depth);
+
+	return IB_SUCCESS;
+}
+
+static
 bool good_xy_ring(struct torus *t, int x, int y, int z)
 {
 	struct t_switch ****sw = t->sw;
@@ -8740,6 +9017,7 @@ bool torus_master_stree(struct torus *t)
 		if (t->sw[i][j][k])
 			build_master_stree_branch(t->sw[i][j][k], 2);
 	}
+	t->master_stree_root = stree_root;
 	/*
 	 * At this point we should have a master spanning tree that contains
 	 * every present switch, for all fabrics that torus-2QoS can route
@@ -8855,17 +9133,7 @@ uint8_t torus_path_sl(void *context, uint8_t path_sl_hint,
 
 	/*
 	 * If QoS was not requested by user, force path SLs into 8-15 range.
-	 * This leaves SL 0 available for multicast, and SL2VL mappings
-	 * will keep multicast traffic from deadlocking with unicast traffic.
-	 *
-	 * However, multicast might still deadlock against itself if multiple
-	 * multicast groups each use their own spanning tree.
-	 *
-	 * FIXME: it is possible to construct a spanning tree that can
-	 * overlay the DOR routing used for unicast in a way that multicast
-	 * and unicast can share VLs but cannot deadlock against each other.
-	 * Need to implement that and cause it to be used whenever the
-	 * torus-2QoS routing engine is used.
+	 * This leaves SL 0 available for multicast.
 	 */
 	if (t->flags & QOS_ENABLED)
 		sl |= sl_set_qos(sl_get_qos(path_sl_hint));
@@ -8963,6 +9231,7 @@ int osm_ucast_torus2QoS_setup(struct osm_routing_engine *r,
 	r->ucast_build_fwd_tables = torus_build_lfts;
 	r->update_sl2vl = torus_update_osm_sl2vl;
 	r->path_sl = torus_path_sl;
+	r->mcast_build_stree = torus_mcast_stree;
 	r->delete = torus_context_delete;
 	return 0;
 }