From patchwork Wed Jan 27 10:45:03 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Sasha Khapyorsky X-Patchwork-Id: 75428 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.2) with ESMTP id o0RAfdOs013236 for ; Wed, 27 Jan 2010 10:41:39 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751564Ab0A0Kld (ORCPT ); Wed, 27 Jan 2010 05:41:33 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751228Ab0A0Kld (ORCPT ); Wed, 27 Jan 2010 05:41:33 -0500 Received: from mail-ew0-f219.google.com ([209.85.219.219]:33068 "EHLO mail-ew0-f219.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751564Ab0A0Klb (ORCPT ); Wed, 27 Jan 2010 05:41:31 -0500 Received: by ewy19 with SMTP id 19so275146ewy.21 for ; Wed, 27 Jan 2010 02:41:30 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:received:received:sender:received:date:from:to :cc:subject:message-id:references:mime-version:content-type :content-disposition:in-reply-to:user-agent; bh=oMQgkoa6O24j/2L3PQror8BOJk2UCPRZ/NQ1k8EpgFM=; b=LTnx17xLwZ2Kq8odiZ1AQN4y1+j/l59ZaZXgfZ7TNIG57vMGkIleM43IP2fihaBomN M1FmLpCG2PwF2E/CHWmdm+dE2HMCfjj4SLcI6aI7Yq4D4HzBMUqq5NY+rty2uQDegfb8 6YGp/UN/RxRm18hDfo+z+9qpZhdDNYFVWoCyg= DomainKey-Signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=sender:date:from:to:cc:subject:message-id:references:mime-version :content-type:content-disposition:in-reply-to:user-agent; b=UtDIEUMRlq4cZQJr5LB0zofgaUf6HyPazEX1H/hDxdrDGvJzyx/U2PWj1LJCYAK703 1qDzcwZUvJR6YTqK6INW1pU8xOm7QKRLqO4FSginohRtKYKll9bg7ifXIsQPgVGxLZk+ EytaNqNw7sEqMhBv84t7FKlekKevp5P3KYTH8= Received: by 10.213.96.212 with SMTP id i20mr5028488ebn.35.1264588889940; Wed, 27 Jan 2010 02:41:29 -0800 (PST) Received: from me.localdomain (85.64.35.106.dynamic.barak-online.net [85.64.35.106]) by mx.google.com with ESMTPS id 
24sm11796615eyx.30.2010.01.27.02.41.26 (version=TLSv1/SSLv3 cipher=RC4-MD5); Wed, 27 Jan 2010 02:41:26 -0800 (PST) Received: by me.localdomain (Postfix, from userid 1000) id A03DC11DF8; Wed, 27 Jan 2010 12:45:03 +0200 (IST) Date: Wed, 27 Jan 2010 12:45:03 +0200 From: Sasha Khapyorsky To: Slava Strebkov Cc: linux-rdma@vger.kernel.org, Eli Dorfman , Or Gerlitz , Yevgeny Kliteynik Subject: Re: [PATCH v2] opensm: Multicast root switch calculation Message-ID: <20100127104503.GM26338@me> References: <4B17C712.9010109@Voltaire.COM> <20100120102703.GB25576@me> <39C75744D164D948A170E9792AF8E7CA01F6FA8A@exil.voltaire.com> <20100120115936.GC25576@me> MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: <20100120115936.GC25576@me> User-Agent: Mutt/1.5.20 (2009-06-14) Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h index 205896d..cb6e5ac 100644 --- a/opensm/include/opensm/osm_switch.h +++ b/opensm/include/opensm/osm_switch.h @@ -109,6 +109,9 @@ typedef struct osm_switch { unsigned endport_links; unsigned need_update; void *priv; + cl_map_item_t mgrp_item; + uint32_t num_of_mcm; + uint8_t is_mc_member; } osm_switch_t; /* * FIELDS @@ -151,6 +154,15 @@ typedef struct osm_switch { * When set indicates that switch was probably reset, so * fwd tables and rest cached data should be flushed * +* mgrp_item +* map item for switch in building mcast tree +* +* num_of_mcm +* number of mcast members(ports) connected to switch +* +* is_mc_member +* whether switch is a mcast member itself +* * SEE ALSO * Switch object *********/ diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c index dce9f2b..5c9d0bc 100644 --- a/opensm/opensm/osm_mcast_mgr.c +++ b/opensm/opensm/osm_mcast_mgr.c @@ -157,50 +157,119 @@ static void mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox) OSM_LOG_EXIT(sm->p_log); } -static float 
osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qlist_t * l, - const osm_switch_t * p_sw) +static void mcast_mgr_build_switch_map(osm_sm_t * sm, + const cl_qlist_t * port_list, + cl_qmap_t * p_mcast_member_sw_tbl) { - float avg_hops = 0; - uint32_t hops = 0; - uint32_t num_ports = 0; - cl_list_item_t *i; + osm_switch_t *remote_sw; + cl_list_item_t *list_item; + osm_port_t *p_port; + ib_net64_t port_guid; + osm_physp_t *p_physp_remote; + osm_node_t *remote_node; osm_mcast_work_obj_t *wobj; OSM_LOG_ENTER(sm->p_log); - /* - For each member of the multicast group, compute the - number of hops to its base LID. - */ - for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) { - wobj = cl_item_obj(i, wobj, list_item); - hops += osm_switch_get_port_least_hops(p_sw, wobj->p_port); - num_ports++; + cl_qmap_init(p_mcast_member_sw_tbl); + for (list_item = cl_qlist_head(port_list); + list_item != cl_qlist_end(port_list); + list_item = cl_qlist_next(list_item)) { + wobj = cl_item_obj(list_item, wobj, list_item); + p_port = wobj->p_port; + if (!p_port) + continue; + if (p_port->p_node->sw) { + /* for switches - remote switch would be the switch itself */ + remote_node = osm_physp_get_node_ptr(p_port->p_physp); + } else { + p_physp_remote = osm_physp_get_remote(p_port->p_physp); + remote_node = osm_physp_get_node_ptr(p_physp_remote); + } + /* get the remote switch of the mcmember */ + remote_sw = remote_node->sw; + port_guid = osm_node_get_node_guid(remote_node); + if (cl_qmap_get(p_mcast_member_sw_tbl, port_guid) == + cl_qmap_end(p_mcast_member_sw_tbl)) { + /* insert switch to table */ + cl_qmap_insert(p_mcast_member_sw_tbl, port_guid, &remote_sw->mgrp_item); + /* New element in the table */ + if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA) + /* for HCA update the MC count on the remote switch */ + remote_sw->num_of_mcm++; + else + /* the switch is MC member */ + remote_sw->is_mc_member = 1; + } } + OSM_LOG_EXIT(sm->p_log); +} - /* - We should be here if 
there aren't any ports in the group. - */ - CL_ASSERT(num_ports); +static void mcast_mgr_destroy_switch_map(osm_sm_t * sm, + cl_qmap_t *p_mcast_member_sw_tbl) +{ + cl_map_item_t *p_item; + osm_switch_t *p_sw; - if (num_ports != 0) - avg_hops = (float)(hops / num_ports); + OSM_LOG_ENTER(sm->p_log); + p_item = cl_qmap_head(p_mcast_member_sw_tbl); + while (p_item != cl_qmap_end(p_mcast_member_sw_tbl)) { + p_sw = PARENT_STRUCT(p_item, osm_switch_t, mgrp_item); + p_sw->num_of_mcm = 0; + p_sw->is_mc_member = 0; + p_item = cl_qmap_next(p_item); + } + cl_qmap_remove_all(p_mcast_member_sw_tbl); OSM_LOG_EXIT(sm->p_log); - return avg_hops; } /********************************************************************** Calculate the maximal "min hops" from the given switch to any of the group HCAs **********************************************************************/ -static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l, - const osm_switch_t * p_sw) +#ifdef OSM_VENDOR_INTF_ANAFA +static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m, + const osm_switch_t * this_sw) { - uint32_t max_hops = 0; + float avg_hops = 0; uint32_t hops = 0; - cl_list_item_t *i; - osm_mcast_work_obj_t *wobj; + uint32_t num_ports = 0; + uint16_t lid; + uint32_t least_hops; + cl_map_item_t *i; + osm_switch_t *sw; + + OSM_LOG_ENTER(sm->p_log); + + for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) { + sw = cl_item_obj(i, sw, mcast_item); + lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0)); + least_hops = osm_switch_get_least_hops(this_sw, lid); + /* for all hosts that are MC members and attached to the switch, + we should add the (least_hops + 1) * number_of_such_hosts. + If switch itself is in the MC, we should add the least_hops only */ + hops += (least_hops + 1) * sw->num_of_mcm + + least_hops * sw->is_mc_member; + num_ports += sw->num_of_mcm + sw->is_mc_member; + } + + /* We shouldn't be here if there aren't any ports in the group. 
*/ + CL_ASSERT(num_ports); + + avg_hops = (float)(hops / num_ports); + + OSM_LOG_EXIT(sm->p_log); + return avg_hops; +} +#else +static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m, + const osm_switch_t * this_sw) +{ + uint32_t max_hops = 0, hops; + uint16_t lid; + cl_map_item_t *i; + osm_switch_t *sw; OSM_LOG_ENTER(sm->p_log); @@ -208,9 +277,11 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l, For each member of the multicast group, compute the number of hops to its base LID. */ - for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) { - wobj = cl_item_obj(i, wobj, list_item); - hops = osm_switch_get_port_least_hops(p_sw, wobj->p_port); + for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) { + sw = cl_item_obj(i, sw, mgrp_item); + lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0)); + hops = osm_switch_get_least_hops(this_sw, lid); + hops = (hops + 1) * sw->num_of_mcm + hops * sw->is_mc_member; if (hops > max_hops) max_hops = hops; } @@ -222,6 +293,7 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l, OSM_LOG_EXIT(sm->p_log); return (float)max_hops; } +#endif /********************************************************************** This function attempts to locate the optimal switch for the @@ -230,32 +302,30 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l, of the multicast group. 
**********************************************************************/ static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm, - cl_qlist_t *list) + cl_qlist_t * list) { + cl_qmap_t mgrp_sw_map; cl_qmap_t *p_sw_tbl; osm_switch_t *p_sw, *p_best_sw = NULL; float hops = 0; float best_hops = 10000; /* any big # will do */ -#ifdef OSM_VENDOR_INTF_ANAFA - boolean_t use_avg_hops = TRUE; /* anafa2 - bug hca on switch *//* use max hops for root */ -#else - boolean_t use_avg_hops = FALSE; /* use max hops for root */ -#endif OSM_LOG_ENTER(sm->p_log); p_sw_tbl = &sm->p_subn->sw_guid_tbl; + mcast_mgr_build_switch_map(sm, list, &mgrp_sw_map); for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl); p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) { if (!osm_switch_supports_mcast(p_sw)) continue; - if (use_avg_hops) - hops = osm_mcast_mgr_compute_avg_hops(sm, list, p_sw); - else - hops = osm_mcast_mgr_compute_max_hops(sm, list, p_sw); +#ifdef OSM_VENDOR_INTF_ANAFA + hops = osm_mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw); +#else + hops = osm_mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw); +#endif OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Switch 0x%016" PRIx64 ", hops = %f\n", @@ -276,6 +346,7 @@ static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm, OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, "No multicast capable switches detected\n"); + mcast_mgr_destroy_switch_map(sm, &mgrp_sw_map); OSM_LOG_EXIT(sm->p_log); return p_best_sw; }