From patchwork Fri Feb 11 01:33:34 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Al Chu X-Patchwork-Id: 548591 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id p1B1hCun020519 for ; Fri, 11 Feb 2011 01:43:12 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932213Ab1BKBnK (ORCPT ); Thu, 10 Feb 2011 20:43:10 -0500 Received: from nspiron-2.llnl.gov ([128.115.41.82]:49730 "EHLO nspiron-2.llnl.gov" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932094Ab1BKBnJ (ORCPT ); Thu, 10 Feb 2011 20:43:09 -0500 X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.6 (demeter1.kernel.org [140.211.167.41]); Fri, 11 Feb 2011 01:43:12 +0000 (UTC) X-Greylist: delayed 573 seconds by postgrey-1.27 at vger.kernel.org; Thu, 10 Feb 2011 20:43:08 EST X-Attachments: None Received: from auk59.llnl.gov (HELO [134.9.93.24]) ([134.9.93.24]) by nspiron-2.llnl.gov with ESMTP; 10 Feb 2011 17:33:34 -0800 Subject: [opensm] RFC: new routing options (repost) From: Albert Chu To: "linux-rdma@vger.kernel.org" Date: Thu, 10 Feb 2011 17:33:34 -0800 Message-Id: <1297388014.18394.302.camel@auk59.llnl.gov> Mime-Version: 1.0 X-Mailer: Evolution 2.12.3 (2.12.3-19.el5) Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org diff --git a/include/opensm/osm_subnet.h b/include/opensm/osm_subnet.h index 42ae416..59f877e 100644 --- a/include/opensm/osm_subnet.h +++ b/include/opensm/osm_subnet.h @@ -199,6 +199,7 @@ typedef struct osm_subn_opt { char *root_guid_file; char *cn_guid_file; char *io_guid_file; + boolean_t port_shifting; uint16_t max_reverse_hops; char *ids_guid_file; char *guid_routing_order_file; @@ -418,6 +419,9 @@ typedef struct osm_subn_opt { * Name of the file that contains list of I/O node guids that * will be used by fat-tree 
routing (provided by User) * +* port_shifting +* If TRUE, routing shifts its choice of outgoing port so that LID routes do not align in the LFTs. +* * ids_guid_file * Name of the file that contains list of ids which should be * used by Up/Down algorithm instead of node GUIDs diff --git a/include/opensm/osm_switch.h b/include/opensm/osm_switch.h index f407dd9..8eae119 100644 --- a/include/opensm/osm_switch.h +++ b/include/opensm/osm_switch.h @@ -919,7 +919,8 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, IN unsigned start_from, IN boolean_t ignore_existing, IN boolean_t routing_for_lmc, - IN boolean_t dor); + IN boolean_t dor, + IN boolean_t port_shifting); /* * PARAMETERS * p_sw @@ -955,6 +956,9 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, * dor * [in] If TRUE, Dimension Order Routing will be done. * +* port_shifting +* [in] If TRUE, the usable ports are re-examined in a shifted order so that LIDs do not align in LFTs. +* * RETURN VALUE * Returns the recommended port on which to route this LID. * diff --git a/man/opensm.8.in b/man/opensm.8.in index cd3a24f..db48d52 100644 --- a/man/opensm.8.in +++ b/man/opensm.8.in @@ -25,6 +25,7 @@ opensm \- InfiniBand subnet manager and administration (SM/SA) [\-a | \-\-root_guid_file ] [\-u | \-\-cn_guid_file ] [\-G | \-\-io_guid_file ] +[\-\-port\-shifting] [\-H | \-\-max_reverse_hops ] [\-X | \-\-guid_routing_order_file ] [\-m | \-\-ids_guid_file ] @@ -208,6 +209,13 @@ to the guids provided in the given file (one to a line). I/O nodes are non-CN nodes allowed to use up to max_reverse_hops switches the wrong way around to improve connectivity. .TP +\fB\-\-port\-shifting\fR +This option enables a feature called \fBport shifting\fR. In some +fabrics, particularly cluster environments, routes commonly align and +congest with other routes due to algorithmically unchanging traffic +patterns. This routing option will "shift" routing around in an +attempt to alleviate this problem. +.TP \fB\-H\fR, \fB\-\-max_reverse_hops\fR Set the maximum number of reverse hops an I/O node is allowed to make. 
A reverse hop is the use of a switch the wrong way around. diff --git a/opensm/main.c b/opensm/main.c index 756fe6f..abb32ec 100644 --- a/opensm/main.c +++ b/opensm/main.c @@ -223,6 +223,9 @@ static void show_usage(void) printf("--io_guid_file, -G \n" " Set the I/O nodes for the Fat-Tree routing algorithm\n" " to the guids provided in the given file (one to a line)\n\n"); + printf("--port-shifting\n" + " Attempt to shift port routes around to remove alignment problems\n" + " in routing tables\n\n"); printf("--max_reverse_hops, -H \n" " Set the max number of hops the wrong way around\n" " an I/O node is allowed to do (connectivity for I/O nodes on top swithces)\n\n"); @@ -601,6 +604,7 @@ int main(int argc, char *argv[]) {"root_guid_file", 1, NULL, 'a'}, {"cn_guid_file", 1, NULL, 'u'}, {"io_guid_file", 1, NULL, 'G'}, + {"port-shifting", 0, NULL, 11}, {"max_reverse_hops", 1, NULL, 'H'}, {"ids_guid_file", 1, NULL, 'm'}, {"guid_routing_order_file", 1, NULL, 'X'}, @@ -937,6 +941,10 @@ int main(int argc, char *argv[]) opt.io_guid_file = optarg; printf(" I/O Node Guid File: %s\n", opt.io_guid_file); break; + case 11: + opt.port_shifting = TRUE; + printf(" Port Shifting is on\n"); + break; case 'H': opt.max_reverse_hops = atoi(optarg); printf(" Max Reverse Hops: %d\n", opt.max_reverse_hops); diff --git a/opensm/osm_dump.c b/opensm/osm_dump.c index 535a03f..a1ff168 100644 --- a/opensm/osm_dump.c +++ b/opensm/osm_dump.c @@ -221,7 +221,7 @@ static void dump_ucast_routes(cl_map_item_t * item, FILE * file, void *cxt) /* No LMC Optimization */ best_port = osm_switch_recommend_path(p_sw, p_port, lid_ho, 1, TRUE, - FALSE, dor); + FALSE, dor, FALSE); fprintf(file, "No %u hop path possible via port %u!", best_hops, best_port); } diff --git a/opensm/osm_subnet.c b/opensm/osm_subnet.c index 228418f..c62192c 100644 --- a/opensm/osm_subnet.c +++ b/opensm/osm_subnet.c @@ -347,6 +347,7 @@ static const opt_rec_t opt_tbl[] = { { "root_guid_file", OPT_OFFSET(root_guid_file), opts_parse_charp, 
NULL, 0 }, { "cn_guid_file", OPT_OFFSET(cn_guid_file), opts_parse_charp, NULL, 0 }, { "io_guid_file", OPT_OFFSET(io_guid_file), opts_parse_charp, NULL, 0 }, + { "port_shifting", OPT_OFFSET(port_shifting), opts_parse_boolean, NULL, 1 }, { "max_reverse_hops", OPT_OFFSET(max_reverse_hops), opts_parse_uint16, NULL, 0 }, { "ids_guid_file", OPT_OFFSET(ids_guid_file), opts_parse_charp, NULL, 0 }, { "guid_routing_order_file", OPT_OFFSET(guid_routing_order_file), opts_parse_charp, NULL, 0 }, @@ -740,6 +741,7 @@ void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt) p_opt->root_guid_file = NULL; p_opt->cn_guid_file = NULL; p_opt->io_guid_file = NULL; + p_opt->port_shifting = FALSE; p_opt->max_reverse_hops = 0; p_opt->ids_guid_file = NULL; p_opt->guid_routing_order_file = NULL; @@ -1440,6 +1442,11 @@ int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts) p_opts->lash_start_vl); fprintf(out, + "# Port Shifting (use FALSE if unsure)\n" + "port_shifting %s\n\n", + p_opts->port_shifting ? "TRUE" : "FALSE"); + + fprintf(out, "# SA database file name\nsa_db_file %s\n\n", p_opts->sa_db_file ? 
p_opts->sa_db_file : null_str); diff --git a/opensm/osm_switch.c b/opensm/osm_switch.c index 9785a9d..f24d9ea 100644 --- a/opensm/osm_switch.c +++ b/opensm/osm_switch.c @@ -51,6 +51,14 @@ #include #include +struct switch_port_path { + uint8_t port_num; + uint32_t path_count; + int found_sys_guid; + int found_node_guid; + uint32_t forwarded_to; +}; + cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho, IN uint8_t port_num, IN uint8_t num_hops) { @@ -217,7 +225,8 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, IN unsigned start_from, IN boolean_t ignore_existing, IN boolean_t routing_for_lmc, - IN boolean_t dor) + IN boolean_t dor, + IN boolean_t port_shifting) { /* We support an enhanced LMC aware routing mode: @@ -259,6 +268,11 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, osm_node_t *p_rem_node_first = NULL; struct osm_remote_node *p_remote_guid = NULL; struct osm_remote_node null_remote_node = {NULL, 0, 0}; + struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX]; + unsigned int port_paths_total_paths = 0; + unsigned int port_paths_count = 0; + int found_sys_guid = 0; /* zeroed: not every LMC branch assigns both flags before they are copied into port_paths[] */ + int found_node_guid = 0; CL_ASSERT(lid_ho > 0); @@ -369,6 +383,7 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, check_count = osm_port_prof_path_count_get(&p_sw->p_prof[port_num]); + if (dor) { /* Get the Remote Node */ p_rem_physp = osm_physp_get_remote(p_physp); @@ -412,7 +427,10 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, best_port_other_sys = port_num; least_forwarded_to = 0; } + found_sys_guid = 0; } else { /* same sys found - try node */ + + /* Else is the node guid already used ? 
*/ p_remote_guid = switch_find_node_guid_count(p_sw, p_port->priv, @@ -427,9 +445,27 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, } /* else prior sys and node guid already used */ + if (!p_remote_guid) + found_node_guid = 0; + else + found_node_guid = 1; + found_sys_guid = 1; } /* same sys found */ } + port_paths[port_paths_count].port_num = port_num; + port_paths[port_paths_count].path_count = check_count; + if (routing_for_lmc) { + port_paths[port_paths_count].found_sys_guid = found_sys_guid; + port_paths[port_paths_count].found_node_guid = found_node_guid; + } + if (routing_for_lmc && p_remote_guid) + port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to; + else + port_paths[port_paths_count].forwarded_to = 0; + port_paths_total_paths += check_count; + port_paths_count++; + /* routing for LMC mode */ /* the count is min but also lower then the max subscribed @@ -454,6 +490,66 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, if (port_found == FALSE) return OSM_NO_PATH; + if (port_shifting && port_paths_count) { + /* In the port_paths[] array, we now have all the ports that we + * can route out of. Using some shifting math below, possibly + * select a different one so that lids won't align in LFTs + * + * If lmc > 0, we need to loop through these ports to find the + * least_forwarded_to port, best_port_other_sys, and + * best_port_other_node just like before but through the different + * ordering. 
+ */ + + least_paths = 0xFFFFFFFF; + least_paths_other_sys = 0xFFFFFFFF; + least_paths_other_nodes = 0xFFFFFFFF; + least_forwarded_to = 0xFFFFFFFF; + best_port = 0; + best_port_other_sys = 0; + best_port_other_node = 0; + + for (i = 0; i < port_paths_count; i++) { + unsigned int idx; + + idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count; + + if (routing_for_lmc) { + if (!port_paths[idx].found_sys_guid + && port_paths[idx].path_count < least_paths_other_sys) { + least_paths_other_sys = port_paths[idx].path_count; + best_port_other_sys = port_paths[idx].port_num; + least_forwarded_to = 0; + } + else if (!port_paths[idx].found_node_guid + && port_paths[idx].path_count < least_paths_other_nodes) { + least_paths_other_nodes = port_paths[idx].path_count; + best_port_other_node = port_paths[idx].port_num; + least_forwarded_to = 0; + } + } + + if (port_paths[idx].path_count < least_paths) { + best_port = port_paths[idx].port_num; + least_paths = port_paths[idx].path_count; + if (routing_for_lmc + && (port_paths[idx].found_sys_guid + || port_paths[idx].found_node_guid) + && port_paths[idx].forwarded_to < least_forwarded_to) + least_forwarded_to = port_paths[idx].forwarded_to; + } + else if (routing_for_lmc + && (port_paths[idx].found_sys_guid + || port_paths[idx].found_node_guid) + && port_paths[idx].path_count == least_paths + && port_paths[idx].forwarded_to < least_forwarded_to) { + least_forwarded_to = port_paths[idx].forwarded_to; + best_port = port_paths[idx].port_num; + } + + } + } + /* if we are in enhanced routing mode and the best port is not the local port 0 diff --git a/opensm/osm_ucast_mgr.c b/opensm/osm_ucast_mgr.c index 4019589..d32eb60 100644 --- a/opensm/osm_ucast_mgr.c +++ b/opensm/osm_ucast_mgr.c @@ -255,7 +255,8 @@ static void ucast_mgr_process_port(IN osm_ucast_mgr_t * p_mgr, port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from, p_mgr->p_subn->ignore_existing_lfts, p_mgr->p_subn->opt.lmc, - p_mgr->is_dor); + 
p_mgr->is_dor, + p_mgr->p_subn->opt.port_shifting); if (port == OSM_NO_PATH) { /* do not try to overwrite the ppro of non existing port ... */