diff mbox

[opensm] RFC: new routing options (repost)

Message ID 1297388014.18394.302.camel@auk59.llnl.gov (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Al Chu Feb. 11, 2011, 1:33 a.m. UTC
None
diff mbox

Patch

diff --git a/include/opensm/osm_subnet.h b/include/opensm/osm_subnet.h
index 42ae416..59f877e 100644
--- a/include/opensm/osm_subnet.h
+++ b/include/opensm/osm_subnet.h
@@ -199,6 +199,7 @@  typedef struct osm_subn_opt {
 	char *root_guid_file;
 	char *cn_guid_file;
 	char *io_guid_file;
+	boolean_t port_shifting;
 	uint16_t max_reverse_hops;
 	char *ids_guid_file;
 	char *guid_routing_order_file;
@@ -418,6 +419,9 @@  typedef struct osm_subn_opt {
 *		Name of the file that contains list of I/O node guids that
 *		will be used by fat-tree routing (provided by User)
 *
+*	port_shifting
+*		If TRUE, shift port selection in routing to avoid aligned routes.
+*
 *	ids_guid_file
 *		Name of the file that contains list of ids which should be
 *		used by Up/Down algorithm instead of node GUIDs
diff --git a/include/opensm/osm_switch.h b/include/opensm/osm_switch.h
index f407dd9..8eae119 100644
--- a/include/opensm/osm_switch.h
+++ b/include/opensm/osm_switch.h
@@ -919,7 +919,8 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 				  IN unsigned start_from,
 				  IN boolean_t ignore_existing,
 				  IN boolean_t routing_for_lmc,
-				  IN boolean_t dor);
+				  IN boolean_t dor,
+				  IN boolean_t port_shifting);
 /*
 * PARAMETERS
 *	p_sw
@@ -955,6 +956,9 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 *	dor
 *		[in] If TRUE, Dimension Order Routing will be done.
 *
+*	port_shifting
+*		[in] If TRUE, port_shifting will be done.
+*
 * RETURN VALUE
 *	Returns the recommended port on which to route this LID.
 *
diff --git a/man/opensm.8.in b/man/opensm.8.in
index cd3a24f..db48d52 100644
--- a/man/opensm.8.in
+++ b/man/opensm.8.in
@@ -25,6 +25,7 @@  opensm \- InfiniBand subnet manager and administration (SM/SA)
 [\-a | \-\-root_guid_file <path to file>]
 [\-u | \-\-cn_guid_file <path to file>]
 [\-G | \-\-io_guid_file <path to file>]
+[\-\-port\-shifting]
 [\-H | \-\-max_reverse_hops <max reverse hops allowed>]
 [\-X | \-\-guid_routing_order_file <path to file>]
 [\-m | \-\-ids_guid_file <path to file>]
@@ -208,6 +209,13 @@  to the guids provided in the given file (one to a line).
 I/O nodes are non-CN nodes allowed to use up to max_reverse_hops switches
 the wrong way around to improve connectivity.
 .TP
+\fB\-\-port\-shifting\fR
+This option enables a feature called \fBport shifting\fR.  In some
+fabrics, particularly cluster environments, routes commonly align and
+congest with other routes due to algorithmically unchanging traffic
+patterns.  This routing option will "shift" routing around in an
+attempt to alleviate this problem.
+.TP
 \fB\-H\fR, \fB\-\-max_reverse_hops\fR <file name>
 Set the maximum number of reverse hops an I/O node is allowed
 to make. A reverse hop is the use of a switch the wrong way around.
diff --git a/opensm/main.c b/opensm/main.c
index 756fe6f..abb32ec 100644
--- a/opensm/main.c
+++ b/opensm/main.c
@@ -223,6 +223,9 @@  static void show_usage(void)
 	printf("--io_guid_file, -G <path to file>\n"
 	       "          Set the I/O nodes for the Fat-Tree routing algorithm\n"
 	       "          to the guids provided in the given file (one to a line)\n\n");
+	printf("--port-shifting\n"
+	       "          Attempt to shift port routes around to remove alignment problems\n"
+	       "          in routing tables\n\n");
 	printf("--max_reverse_hops, -H <hop_count>\n"
 	       "          Set the max number of hops the wrong way around\n"
 	       "          an I/O node is allowed to do (connectivity for I/O nodes on top swithces)\n\n");
@@ -601,6 +604,7 @@  int main(int argc, char *argv[])
 		{"root_guid_file", 1, NULL, 'a'},
 		{"cn_guid_file", 1, NULL, 'u'},
 		{"io_guid_file", 1, NULL, 'G'},
+		{"port-shifting", 0, NULL, 11},
 		{"max_reverse_hops", 1, NULL, 'H'},
 		{"ids_guid_file", 1, NULL, 'm'},
 		{"guid_routing_order_file", 1, NULL, 'X'},
@@ -937,6 +941,10 @@  int main(int argc, char *argv[])
 			opt.io_guid_file = optarg;
 			printf(" I/O Node Guid File: %s\n", opt.io_guid_file);
 			break;
+		case 11:
+			opt.port_shifting = TRUE;
+			printf(" Port Shifting is on\n");
+			break;
 		case 'H':
 			opt.max_reverse_hops = atoi(optarg);
 			printf(" Max Reverse Hops: %d\n", opt.max_reverse_hops);
diff --git a/opensm/osm_dump.c b/opensm/osm_dump.c
index 535a03f..a1ff168 100644
--- a/opensm/osm_dump.c
+++ b/opensm/osm_dump.c
@@ -221,7 +221,7 @@  static void dump_ucast_routes(cl_map_item_t * item, FILE * file, void *cxt)
 			/* No LMC Optimization */
 			best_port = osm_switch_recommend_path(p_sw, p_port,
 							      lid_ho, 1, TRUE,
-							      FALSE, dor);
+							      FALSE, dor, FALSE);
 			fprintf(file, "No %u hop path possible via port %u!",
 				best_hops, best_port);
 		}
diff --git a/opensm/osm_subnet.c b/opensm/osm_subnet.c
index 228418f..c62192c 100644
--- a/opensm/osm_subnet.c
+++ b/opensm/osm_subnet.c
@@ -347,6 +347,7 @@  static const opt_rec_t opt_tbl[] = {
 	{ "root_guid_file", OPT_OFFSET(root_guid_file), opts_parse_charp, NULL, 0 },
 	{ "cn_guid_file", OPT_OFFSET(cn_guid_file), opts_parse_charp, NULL, 0 },
 	{ "io_guid_file", OPT_OFFSET(io_guid_file), opts_parse_charp, NULL, 0 },
+	{ "port_shifting", OPT_OFFSET(port_shifting), opts_parse_boolean, NULL, 1 },
 	{ "max_reverse_hops", OPT_OFFSET(max_reverse_hops), opts_parse_uint16, NULL, 0 },
 	{ "ids_guid_file", OPT_OFFSET(ids_guid_file), opts_parse_charp, NULL, 0 },
 	{ "guid_routing_order_file", OPT_OFFSET(guid_routing_order_file), opts_parse_charp, NULL, 0 },
@@ -740,6 +741,7 @@  void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt)
 	p_opt->root_guid_file = NULL;
 	p_opt->cn_guid_file = NULL;
 	p_opt->io_guid_file = NULL;
+	p_opt->port_shifting = FALSE;
 	p_opt->max_reverse_hops = 0;
 	p_opt->ids_guid_file = NULL;
 	p_opt->guid_routing_order_file = NULL;
@@ -1440,6 +1442,11 @@  int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts)
 		p_opts->lash_start_vl);
 
 	fprintf(out,
+		"# Port Shifting (use FALSE if unsure)\n"
+		"port_shifting %s\n\n",
+		p_opts->port_shifting ? "TRUE" : "FALSE");
+
+	fprintf(out,
 		"# SA database file name\nsa_db_file %s\n\n",
 		p_opts->sa_db_file ? p_opts->sa_db_file : null_str);
 
diff --git a/opensm/osm_switch.c b/opensm/osm_switch.c
index 9785a9d..f24d9ea 100644
--- a/opensm/osm_switch.c
+++ b/opensm/osm_switch.c
@@ -51,6 +51,14 @@ 
 #include <iba/ib_types.h>
 #include <opensm/osm_switch.h>
 
+struct switch_port_path {	/* one candidate port seen by osm_switch_recommend_path() */
+	uint8_t port_num;	/* candidate egress port */
+	uint32_t path_count;	/* path count from that port's profile */
+	int found_sys_guid;	/* LMC: 1 if same remote system was found */
+	int found_node_guid;	/* LMC: 1 if remote node GUID was found */
+	uint32_t forwarded_to;	/* LMC: matched remote's forwarded_to, else 0 */
+};
+
 cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
 				IN uint8_t port_num, IN uint8_t num_hops)
 {
@@ -217,7 +225,8 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 				  IN unsigned start_from,
 				  IN boolean_t ignore_existing,
 				  IN boolean_t routing_for_lmc,
-				  IN boolean_t dor)
+				  IN boolean_t dor,
+				  IN boolean_t port_shifting)
 {
 	/*
 	   We support an enhanced LMC aware routing mode:
@@ -259,6 +268,11 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 	osm_node_t *p_rem_node_first = NULL;
 	struct osm_remote_node *p_remote_guid = NULL;
 	struct osm_remote_node null_remote_node = {NULL, 0, 0};
+	struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
+	unsigned int port_paths_total_paths = 0;
+	unsigned int port_paths_count = 0;
+	int found_sys_guid;
+	int found_node_guid;
 
 	CL_ASSERT(lid_ho > 0);
 
@@ -369,6 +383,7 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 		check_count =
 		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
 
+
 		if (dor) {
 			/* Get the Remote Node */
 			p_rem_physp = osm_physp_get_remote(p_physp);
@@ -412,7 +427,10 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 					best_port_other_sys = port_num;
 					least_forwarded_to = 0;
 				}
+				found_sys_guid = 0;
+				found_node_guid = 0;	/* new system: never leave this unset/stale */
 			} else {	/* same sys found - try node */
+
 				/* Else is the node guid already used ? */
 				p_remote_guid = switch_find_node_guid_count(p_sw,
 									    p_port->priv,
@@ -427,9 +445,27 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 				}
 				/* else prior sys and node guid already used */
 
+				if (!p_remote_guid)
+					found_node_guid = 0;
+				else
+					found_node_guid = 1;
+				found_sys_guid = 1;
 			}	/* same sys found */
 		}
 
+		port_paths[port_paths_count].port_num = port_num;
+		port_paths[port_paths_count].path_count = check_count;
+		if (routing_for_lmc) {
+			port_paths[port_paths_count].found_sys_guid = found_sys_guid;
+			port_paths[port_paths_count].found_node_guid = found_node_guid;
+		}
+		if (routing_for_lmc && p_remote_guid)
+			port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to;
+		else
+			port_paths[port_paths_count].forwarded_to = 0;
+		port_paths_total_paths += check_count;
+		port_paths_count++;
+
 		/* routing for LMC mode */
 		/*
 		   the count is min but also lower then the max subscribed
@@ -454,6 +490,66 @@  uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
 	if (port_found == FALSE)
 		return OSM_NO_PATH;
 
+	if (port_shifting && port_paths_count) {
+		/* In the port_paths[] array, we now have all the ports that we
+		 * can route out of.  Using some shifting math below, possibly
+		 * select a different one so that lids won't align in LFTs
+		 *
+		 * If lmc > 0, we need to loop through these ports to find the
+		 * least_forwarded_to port, best_port_other_sys, and
+		 * best_port_other_node just like before but through the different
+		 * ordering.
+		 */
+
+		least_paths = 0xFFFFFFFF;
+		least_paths_other_sys = 0xFFFFFFFF;
+		least_paths_other_nodes = 0xFFFFFFFF;
+		least_forwarded_to = 0xFFFFFFFF;
+		best_port = 0;
+		best_port_other_sys = 0;
+		best_port_other_node = 0;
+
+		for (i = 0; i < port_paths_count; i++) {
+			unsigned int idx;
+			/* rotate the scan start by the average path count per port */
+			idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count;
+
+			if (routing_for_lmc) {
+				if (!port_paths[idx].found_sys_guid
+				    && port_paths[idx].path_count < least_paths_other_sys) {
+					least_paths_other_sys = port_paths[idx].path_count;
+					best_port_other_sys = port_paths[idx].port_num;
+					least_forwarded_to = 0;
+				}
+				else if (!port_paths[idx].found_node_guid
+					 && port_paths[idx].path_count < least_paths_other_nodes) {
+					least_paths_other_nodes = port_paths[idx].path_count;
+					best_port_other_node = port_paths[idx].port_num;
+					least_forwarded_to = 0;
+				}
+			}
+
+			if (port_paths[idx].path_count < least_paths) {
+				best_port = port_paths[idx].port_num;
+				least_paths = port_paths[idx].path_count;
+				if (routing_for_lmc
+				    && (port_paths[idx].found_sys_guid
+					|| port_paths[idx].found_node_guid)
+				    && port_paths[idx].forwarded_to < least_forwarded_to)
+					least_forwarded_to = port_paths[idx].forwarded_to;
+			}
+			else if (routing_for_lmc
+				 && (port_paths[idx].found_sys_guid
+				     || port_paths[idx].found_node_guid)
+				 && port_paths[idx].path_count == least_paths
+				 && port_paths[idx].forwarded_to < least_forwarded_to) {
+				least_forwarded_to = port_paths[idx].forwarded_to;
+				best_port = port_paths[idx].port_num;
+			}
+
+		}
+	}
+
 	/*
 	   if we are in enhanced routing mode and the best port is not
 	   the local port 0
diff --git a/opensm/osm_ucast_mgr.c b/opensm/osm_ucast_mgr.c
index 4019589..d32eb60 100644
--- a/opensm/osm_ucast_mgr.c
+++ b/opensm/osm_ucast_mgr.c
@@ -255,7 +255,8 @@  static void ucast_mgr_process_port(IN osm_ucast_mgr_t * p_mgr,
 	port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from,
 					 p_mgr->p_subn->ignore_existing_lfts,
 					 p_mgr->p_subn->opt.lmc,
-					 p_mgr->is_dor);
+					 p_mgr->is_dor,
+					 p_mgr->p_subn->opt.port_shifting);
 
 	if (port == OSM_NO_PATH) {
 		/* do not try to overwrite the ppro of non existing port ... */