diff mbox

[PATCHv3] opensm: Add initial congestion control configuration support

Message ID 1343686007.18615.364.camel@auk59.llnl.gov (mailing list archive)
State Accepted
Delegated to: Alex Netes
Headers show

Commit Message

Al Chu July 30, 2012, 10:06 p.m. UTC
This patch adds initial support for congestion control configuration
on a fabric.  Users may configure settings via the Switch Congestion
Setting, CA Congestion Setting, or Congest Control Table MADs.

Signed-off-by: Albert Chu <chu11@llnl.gov>
Signed-off-by: Alex Netes <alexne@mellanox.com>
---
 include/iba/ib_types.h                  |   14 +-
 include/opensm/osm_congestion_control.h |  132 ++++++
 include/opensm/osm_madw.h               |   40 ++
 include/opensm/osm_msgdef.h             |    1 +
 include/opensm/osm_opensm.h             |    2 +
 include/opensm/osm_port.h               |   18 +
 include/opensm/osm_subnet.h             |  158 +++++++
 man/opensm.8.in                         |   11 +
 opensm/Makefile.am                      |    4 +-
 opensm/main.c                           |   16 +
 opensm/osm_congestion_control.c         |  741 +++++++++++++++++++++++++++++++
 opensm/osm_opensm.c                     |   13 +
 opensm/osm_state_mgr.c                  |   13 +
 opensm/osm_subnet.c                     |  470 ++++++++++++++++++++
 14 files changed, 1627 insertions(+), 6 deletions(-)
 create mode 100644 include/opensm/osm_congestion_control.h
 create mode 100644 opensm/osm_congestion_control.c

Comments

Alex Netes July 31, 2012, 6:41 a.m. UTC | #1
Hi Albert,

On 15:06 Mon 30 Jul     , Albert Chu wrote:
> This patch adds initial support for congestion control configuration
> on a fabric.  Users may configure settings via the Switch Congestion
> Setting, CA Congestion Setting, or Congest Control Table MADs.
> 
> Signed-off-by: Albert Chu <chu11@llnl.gov>
> Signed-off-by: Alex Netes <alexne@mellanox.com>
> ---

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/iba/ib_types.h b/include/iba/ib_types.h
index 7b4d1ee..c8d51ca 100644
--- a/include/iba/ib_types.h
+++ b/include/iba/ib_types.h
@@ -11471,11 +11471,12 @@  typedef struct _ib_cong_log {
 *
 * SYNOPSIS
 */
+#define IB_CC_PORT_MASK_DATA_SIZE 32
 #include <complib/cl_packon.h>
 typedef struct _ib_sw_cong_setting {
 	ib_net32_t control_map;
-	uint8_t victim_mask[32];
-	uint8_t credit_mask[32];
+	uint8_t victim_mask[IB_CC_PORT_MASK_DATA_SIZE];
+	uint8_t credit_mask[IB_CC_PORT_MASK_DATA_SIZE];
 	uint8_t threshold_resv;
 	uint8_t packet_size;
 	ib_net16_t cs_threshold_resv;
@@ -11585,7 +11586,8 @@  typedef struct _ib_sw_port_cong_setting_element {
 *
 * SOURCE
 */
-typedef ib_sw_port_cong_setting_element_t ib_sw_port_cong_setting_block_t[32];
+#define IB_CC_SW_PORT_SETTING_ELEMENTS 32
+typedef ib_sw_port_cong_setting_element_t ib_sw_port_cong_setting_block_t[IB_CC_SW_PORT_SETTING_ELEMENTS];
 /**********/
 
 /****s* IBA Base: Types/ib_sw_port_cong_setting_t
@@ -11663,11 +11665,12 @@  typedef struct _ib_ca_cong_entry {
 *
 * SYNOPSIS
 */
+#define IB_CA_CONG_ENTRY_DATA_SIZE 16
 #include <complib/cl_packon.h>
 typedef struct _ib_ca_cong_setting {
 	ib_net16_t port_control;
 	ib_net16_t control_map;
-	ib_ca_cong_entry_t entry_list[16];
+	ib_ca_cong_entry_t entry_list[IB_CA_CONG_ENTRY_DATA_SIZE];
 } PACK_SUFFIX ib_ca_cong_setting_t;
 #include <complib/cl_packoff.h>
 /*
@@ -11726,11 +11729,12 @@  typedef struct _ib_cc_tbl_entry {
 *
 * SYNOPSIS
 */
+#define IB_CC_TBL_ENTRY_LIST_MAX 64
 #include <complib/cl_packon.h>
 typedef struct _ib_cc_tbl {
 	ib_net16_t ccti_limit;
 	ib_net16_t resv;
-	ib_cc_tbl_entry_t entry_list[64];
+	ib_cc_tbl_entry_t entry_list[IB_CC_TBL_ENTRY_LIST_MAX];
 } PACK_SUFFIX ib_cc_tbl_t;
 #include <complib/cl_packoff.h>
 /*
diff --git a/include/opensm/osm_congestion_control.h b/include/opensm/osm_congestion_control.h
new file mode 100644
index 0000000..94e4ffb
--- /dev/null
+++ b/include/opensm/osm_congestion_control.h
@@ -0,0 +1,132 @@ 
+/*
+ * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ * Copyright (c) 2012 Lawrence Livermore National Lab.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/*
+ * Abstract:
+ *    OSM Congestion Control types and prototypes
+ *
+ * Author:
+ *    Albert Chu, LLNL
+ */
+
+#ifndef OSM_CONGESTION_CONTROL_H
+#define OSM_CONGESTION_CONTROL_H
+
+#include <iba/ib_types.h>
+#include <complib/cl_types_osd.h>
+#include <complib/cl_dispatcher.h>
+#include <opensm/osm_subnet.h>
+#include <opensm/osm_log.h>
+#include <opensm/osm_sm.h>
+#include <opensm/osm_opensm.h>
+#include <opensm/osm_base.h>
+
+/****s* OpenSM: Base/OSM_DEFAULT_CC_KEY
+ * NAME
+ *       OSM_DEFAULT_CC_KEY
+ *
+ * DESCRIPTION
+ *       Congestion Control Key used by OpenSM.
+ *
+ * SYNOPSIS
+ */
+#define OSM_DEFAULT_CC_KEY 0
+
+#define OSM_CC_DEFAULT_MAX_OUTSTANDING_QUERIES 500
+
+/****s* OpenSM: CongestionControl/osm_congestion_control_t
+*  This object should be treated as opaque and should
+*  be manipulated only through the provided functions.
+*/
+typedef struct osm_congestion_control {
+	struct osm_opensm *osm;
+	osm_subn_t *subn;
+	osm_sm_t *sm;
+	osm_log_t *log;
+	osm_mad_pool_t *mad_pool;
+	atomic32_t trans_id;
+	osm_vendor_t *vendor;
+	osm_bind_handle_t bind_handle;
+	cl_disp_reg_handle_t cc_disp_h;
+	ib_net64_t port_guid;
+	atomic32_t outstanding_mads;
+	atomic32_t outstanding_mads_on_wire;
+	cl_qlist_t mad_queue;
+	cl_spinlock_t mad_queue_lock;
+	cl_event_t cc_poller_wakeup;
+	cl_event_t outstanding_mads_done_event;
+	cl_event_t sig_mads_on_wire_continue;
+	cl_thread_t cc_poller;
+	osm_thread_state_t thread_state;
+	ib_sw_cong_setting_t sw_cong_setting;
+	ib_ca_cong_setting_t ca_cong_setting;
+	ib_cc_tbl_t cc_tbl[OSM_CCT_ENTRY_MAD_BLOCKS];
+	unsigned int cc_tbl_mads;
+} osm_congestion_control_t;
+/*
+* FIELDS
+*       subn
+*             Subnet object for this subnet.
+*
+*       log
+*             Pointer to the log object.
+*
+*       mad_pool
+*             Pointer to the MAD pool.
+*
+*       osm
+*             Pointer to the OpenSM object.
+*********/
+
+struct osm_opensm;
+
+int osm_congestion_control_setup(struct osm_opensm *osm);
+
+int osm_congestion_control_wait_pending_transactions(struct osm_opensm *osm);
+
+ib_api_status_t osm_congestion_control_init(osm_congestion_control_t * p_cc,
+					    struct osm_opensm *osm,
+					    const osm_subn_opt_t * p_opt);
+
+ib_api_status_t osm_congestion_control_bind(osm_congestion_control_t * p_cc,
+					    ib_net64_t port_guid);
+
+void osm_congestion_control_shutdown(osm_congestion_control_t * p_cc);
+
+void osm_congestion_control_destroy(osm_congestion_control_t * p_cc);
+
+
+#endif				/* ifndef OSM_CONGESTION_CONTROL_H */
diff --git a/include/opensm/osm_madw.h b/include/opensm/osm_madw.h
index 195f468..afc3680 100644
--- a/include/opensm/osm_madw.h
+++ b/include/opensm/osm_madw.h
@@ -340,6 +340,19 @@  typedef struct osm_perfmgr_context {
 } osm_perfmgr_context_t;
 /*********/
 
+/****s* OpenSM: MAD Wrapper/osm_cc_context_t
+* DESCRIPTION
+*	Context for Congestion Control MADs
+*/
+typedef struct osm_cc_context {
+	ib_net64_t node_guid;
+	ib_net64_t port_guid;
+	uint8_t port;
+	uint8_t mad_method;	/* was this a get or a set */
+	ib_net32_t attr_mod;
+} osm_cc_context_t;
+/*********/
+
 #ifndef OSM_VENDOR_INTF_OPENIB
 /****s* OpenSM: MAD Wrapper/osm_arbitrary_context_t
 * NAME
@@ -379,6 +392,7 @@  typedef union _osm_madw_context {
 	osm_pkey_context_t pkey_context;
 	osm_vla_context_t vla_context;
 	osm_perfmgr_context_t perfmgr_context;
+	osm_cc_context_t cc_context;
 #ifndef OSM_VENDOR_INTF_OPENIB
 	osm_arbitrary_context_t arb_context;
 #endif
@@ -612,6 +626,32 @@  static inline ib_perfmgt_mad_t *osm_madw_get_perfmgt_mad_ptr(IN const osm_madw_t
 *	MAD Wrapper object
 *********/
 
+/****f* OpenSM: MAD Wrapper/osm_madw_get_cc_mad_ptr
+* DESCRIPTION
+*	Gets a pointer to the Congestion Control MAD in this MAD wrapper.
+*
+* SYNOPSIS
+*/
+static inline ib_cc_mad_t *osm_madw_get_cc_mad_ptr(IN const osm_madw_t
+						   * p_madw)
+{
+	return ((ib_cc_mad_t *) p_madw->p_mad);
+}
+
+/*
+* PARAMETERS
+*	p_madw
+*		[in] Pointer to an osm_madw_t object.
+*
+* RETURN VALUES
+*	Pointer to the start of the Congestion Control MAD.
+*
+* NOTES
+*
+* SEE ALSO
+*	MAD Wrapper object
+*********/
+
 /****f* OpenSM: MAD Wrapper/osm_madw_get_ni_context_ptr
 * NAME
 *	osm_madw_get_ni_context_ptr
diff --git a/include/opensm/osm_msgdef.h b/include/opensm/osm_msgdef.h
index 0c8af9b..b0d92e0 100644
--- a/include/opensm/osm_msgdef.h
+++ b/include/opensm/osm_msgdef.h
@@ -162,6 +162,7 @@  enum {
 #endif
 	OSM_MSG_MAD_PORT_COUNTERS,
 	OSM_MSG_MAD_MLNX_EXT_PORT_INFO,
+	OSM_MSG_MAD_CC,
 	OSM_MSG_MAX
 };
 
diff --git a/include/opensm/osm_opensm.h b/include/opensm/osm_opensm.h
index 9f2c2fa..dbff4f6 100644
--- a/include/opensm/osm_opensm.h
+++ b/include/opensm/osm_opensm.h
@@ -61,6 +61,7 @@ 
 #include <opensm/osm_subnet.h>
 #include <opensm/osm_mad_pool.h>
 #include <opensm/osm_vl15intf.h>
+#include <opensm/osm_congestion_control.h>
 
 #ifdef __cplusplus
 #  define BEGIN_C_DECLS extern "C" {
@@ -203,6 +204,7 @@  typedef struct osm_opensm {
 #ifdef ENABLE_OSM_PERF_MGR
 	osm_perfmgr_t perfmgr;
 #endif				/* ENABLE_OSM_PERF_MGR */
+	osm_congestion_control_t cc;
 	cl_qlist_t plugin_list;
 	osm_db_t db;
 	osm_mad_pool_t mad_pool;
diff --git a/include/opensm/osm_port.h b/include/opensm/osm_port.h
index 56e9c37..e06483a 100644
--- a/include/opensm/osm_port.h
+++ b/include/opensm/osm_port.h
@@ -119,6 +119,15 @@  typedef struct osm_physp {
 	ib_vl_arb_table_t vl_arb[4];
 	cl_ptr_vector_t slvl_by_port;
 	uint8_t hop_wf;
+	union {
+		struct {
+			ib_sw_cong_setting_t sw_cong_setting;
+		} sw;
+		struct {
+			ib_ca_cong_setting_t ca_cong_setting;
+			ib_cc_tbl_t cc_tbl[OSM_CCT_ENTRY_MAD_BLOCKS];
+		} ca;
+	} cc;
 } osm_physp_t;
 /*
 * FIELDS
@@ -186,6 +195,15 @@  typedef struct osm_physp {
 *	hop_wf
 *		Hop weighting factor to be used in the routing.
 *
+*	sw_cong_setting
+*		Physical port switch congestion settings (switches only)
+*
+*	ca_cong_setting
+*		Physical port ca congestion settings (cas only)
+*
+*	cc_tbl
+*		Physical port ca congestion control table (cas only)
+*
 * SEE ALSO
 *	Port
 *********/
diff --git a/include/opensm/osm_subnet.h b/include/opensm/osm_subnet.h
index 683204b..426a962 100644
--- a/include/opensm/osm_subnet.h
+++ b/include/opensm/osm_subnet.h
@@ -86,6 +86,10 @@  typedef enum _osm_partition_enforce_type_enum {
 	OSM_PARTITION_ENFORCE_TYPE_OFF
 } osm_partition_enforce_type_enum;
 
+/* XXX: not actual max, max we're currently going to support */
+#define OSM_CCT_ENTRY_MAX        128
+#define OSM_CCT_ENTRY_MAD_BLOCKS (OSM_CCT_ENTRY_MAX/64)
+
 struct osm_opensm;
 struct osm_qos_policy;
 
@@ -147,6 +151,91 @@  typedef struct osm_qos_options {
 *
 *********/
 
+/****s* OpenSM: Subnet/osm_cct_entry_t
+* NAME
+*	osm_cct_entry_t
+*
+* DESCRIPTION
+*	Subnet Congestion Control Table entry.  See A10.2.2.1.1 for format details.
+*
+* SYNOPSIS
+*/
+typedef struct osm_cct_entry {
+	uint8_t shift;		/* 2-bit field; packed into bits 15:14 of the CCT entry */
+	uint16_t multiplier;	/* 14-bit field; packed into bits 13:0 of the CCT entry */
+} osm_cct_entry_t;
+/*
+* FIELDS
+*
+*	shift
+*		shift field in CCT entry.  See A10.2.2.1.1.
+*
+*	multiplier
+*		multiplier field in CCT entry.  See A10.2.2.1.1.
+*
+*********/
+
+/****s* OpenSM: Subnet/osm_cacongestion_entry_t
+* NAME
+*	osm_cacongestion_entry_t
+*
+* DESCRIPTION
+*	Subnet CA Congestion entry.  See A10.4.3.8.4 for format details.
+*
+* SYNOPSIS
+*/
+typedef struct osm_cacongestion_entry {
+	ib_net16_t ccti_timer; //Alex: ccti_timer and ccti_increase should be replaced
+	uint8_t ccti_increase;
+	uint8_t trigger_threshold;
+	uint8_t ccti_min;
+} osm_cacongestion_entry_t;
+/*
+* FIELDS
+*
+*	ccti_timer
+*		CCTI Timer
+*
+*	ccti_increase
+*		CCTI Increase
+*
+*	trigger_threshold
+*		CCTI trigger for log message
+*
+*	ccti_min
+*		CCTI Minimum
+*
+*********/
+
+/****s* OpenSM: Subnet/osm_cct_t
+* NAME
+*	osm_cct_t
+*
+* DESCRIPTION
+*	Subnet CongestionControlTable.  See A10.4.3.9 for format details.
+*
+* SYNOPSIS
+*/
+typedef struct osm_cct {
+	osm_cct_entry_t entries[OSM_CCT_ENTRY_MAX];
+	unsigned int entries_len;
+	char *input_str;
+} osm_cct_t;
+/*
+* FIELDS
+*
+*	entries
+*		Entries in CCT
+*
+*	entries_len
+*		Length of entries
+*
+*	input_str
+*		Original str input
+*
+*********/
+
+
 /****s* OpenSM: Subnet/osm_subn_opt_t
 * NAME
 *	osm_subn_opt_t
@@ -244,6 +333,21 @@  typedef struct osm_subn_opt {
 	osm_qos_options_t qos_sw0_options;
 	osm_qos_options_t qos_swe_options;
 	osm_qos_options_t qos_rtr_options;
+	boolean_t congestion_control;
+	ib_net64_t cc_key;
+	uint32_t cc_max_outstanding_mads;
+	ib_net32_t cc_sw_cong_setting_control_map;
+	uint8_t cc_sw_cong_setting_victim_mask[IB_CC_PORT_MASK_DATA_SIZE];
+	uint8_t cc_sw_cong_setting_credit_mask[IB_CC_PORT_MASK_DATA_SIZE];
+	uint8_t cc_sw_cong_setting_threshold;
+	uint8_t cc_sw_cong_setting_packet_size;
+	uint8_t cc_sw_cong_setting_credit_starvation_threshold;
+	osm_cct_entry_t cc_sw_cong_setting_credit_starvation_return_delay;
+	ib_net16_t cc_sw_cong_setting_marking_rate;
+	ib_net16_t cc_ca_cong_setting_port_control;
+	ib_net16_t cc_ca_cong_setting_control_map;
+	osm_cacongestion_entry_t cc_ca_cong_entries[IB_CA_CONG_ENTRY_DATA_SIZE];
+	osm_cct_t cc_cct;
 	boolean_t enable_quirks;
 	boolean_t no_clients_rereg;
 #ifdef ENABLE_OSM_PERF_MGR
@@ -530,6 +634,60 @@  typedef struct osm_subn_opt {
 *	qos_rtr_options
 *		QoS options for router ports
 *
+*	congestion_control
+*		Boolean that specifies whether OpenSM congestion control configuration
+*		should be enabled or not.
+*
+*	cc_key
+*		CCkey to use when configuring congestion control.
+*
+*	cc_max_outstanding_mads
+*		Max number of outstanding CC mads that can be on the wire.
+*
+*	cc_sw_cong_setting_control_map
+*		Congestion Control Switch Congestion Setting Control Map
+*		configuration setting.
+*
+*	cc_sw_cong_setting_victim_mask
+*		Congestion Control Switch Congestion Setting Victim Mask
+*		configuration setting.
+*
+*	cc_sw_cong_setting_credit_mask
+*		Congestion Control Switch Congestion Setting Credit Mask
+*		configuration setting.
+*
+*	cc_sw_cong_setting_threshold
+*		Congestion Control Switch Congestion Setting Threshold
+*		configuration setting.
+*
+*	cc_sw_cong_setting_packet_size
+*		Congestion Control Switch Congestion Setting Packet Size
+*		configuration setting.
+*
+*	cc_sw_cong_setting_credit_starvation_threshold
+*		Congestion Control Switch Congestion Setting Credit Starvation Threshold
+*		configuration setting.
+*
+*	cc_sw_cong_setting_credit_starvation_return_delay
+*		Congestion Control Switch Congestion Setting Credit Starvation Return Delay
+*		configuration setting.
+*
+*	cc_sw_cong_setting_marking_rate
+*		Congestion Control Switch Congestion Setting Marking Rate
+*		configuration setting.
+*
+*	cc_ca_cong_setting_port_control
+*		Congestion Control CA Congestion Setting Port Control
+*
+*	cc_ca_cong_setting_control_map
+*		Congestion Control CA Congestion Setting Control Map
+*
+*	cc_ca_cong_entries
+*		Congestion Control CA Congestion Setting Entries
+*
+*	cc_cct
+*		Congestion Control Table array of entries
+*
 *	enable_quirks
 *		Enable high risk new features and not fully qualified
 *		hardware specific work arounds
diff --git a/man/opensm.8.in b/man/opensm.8.in
index 888d6a6..dd67671 100644
--- a/man/opensm.8.in
+++ b/man/opensm.8.in
@@ -48,6 +48,8 @@  opensm \- InfiniBand subnet manager and administration (SM/SA)
 [\-Z | \-\-part_enforce [both | in | out | off]]
 [\-W | \-\-allow_both_pkeys]
 [\-Q | \-\-qos [\-Y | \-\-qos_policy_file <file name>]]
+[\-\-congestion_control]
+[\-\-cc_key <key>]
 [\-y | \-\-stay_on_fatal]
 [\-B | \-\-daemon]
 [\-I | \-\-inactive]
@@ -369,6 +371,15 @@  name is \fB\%@OPENSM_CONFIG_DIR@/@QOS_POLICY_FILE@\fP. See
 QoS_management_in_OpenSM.txt in opensm doc for more information on
 configuring QoS policy via this file.
 .TP
+\fB\-\-congestion_control\fR
+(EXPERIMENTAL) This option enables congestion control configuration.
+It is disabled by default.  See config file for congestion control configuration options.
+.TP
+\fB\-\-cc_key\fR <key>
+(EXPERIMENTAL) This option configures the CCkey to use when configuring
+congestion control.  Note that this option does not configure a new
+CCkey into switches and CAs.  Defaults to 0.
+.TP
 \fB\-N\fR, \fB\-\-no_part_enforce\fR \fB(DEPRECATED)\fR
 This is a deprecated flag. Please use \fB\-\-part_enforce\fR instead.
 This option disables partition enforcement on switch external ports.
diff --git a/opensm/Makefile.am b/opensm/Makefile.am
index 855042c..7fd6bc6 100644
--- a/opensm/Makefile.am
+++ b/opensm/Makefile.am
@@ -57,7 +57,8 @@  opensm_SOURCES = main.c osm_console_io.c osm_console.c osm_db_files.c \
 		 osm_ucast_dfsssp.c osm_vl15intf.c \
 		 osm_vl_arb_rcv.c st.c osm_perfmgr.c osm_perfmgr_db.c \
 		 osm_event_plugin.c osm_dump.c osm_ucast_cache.c \
-		 osm_qos_parser_y.y osm_qos_parser_l.l osm_qos_policy.c
+		 osm_qos_parser_y.y osm_qos_parser_l.l osm_qos_policy.c \
+		 osm_congestion_control.c
 
 AM_YFLAGS:= -d
 
@@ -102,6 +103,7 @@  opensminclude_HEADERS = \
 	$(srcdir)/../include/opensm/osm_port_profile.h \
 	$(srcdir)/../include/opensm/osm_prefix_route.h \
 	$(srcdir)/../include/opensm/osm_qos_policy.h \
+	$(srcdir)/../include/opensm/osm_congestion_control.h \
 	$(srcdir)/../include/opensm/osm_remote_sm.h \
 	$(srcdir)/../include/opensm/osm_router.h \
 	$(srcdir)/../include/opensm/osm_sa.h \
diff --git a/opensm/main.c b/opensm/main.c
index fca209a..e9a0b4c 100644
--- a/opensm/main.c
+++ b/opensm/main.c
@@ -340,6 +340,11 @@  static void show_usage(void)
 	       "          This option defines the optional QoS policy file.\n"
 	       "          The default name is \'" OSM_DEFAULT_QOS_POLICY_FILE
 	       "\'.\n\n");
+	printf("--congestion_control\n"
+	       "          (EXPERIMENTAL) This option enables congestion control configuration.\n\n");
+	printf("--cc_key <key>\n"
+	       "          (EXPERIMENTAL) This option configures the CCkey to use when configuring\n"
+	       "          congestion control.\n\n");
 	printf("--stay_on_fatal, -y\n"
 	       "          This option will cause SM not to exit on fatal initialization\n"
 	       "          issues: if SM discovers duplicated guids or 12x link with\n"
@@ -614,6 +619,8 @@  int main(int argc, char *argv[])
 		{"allow_both_pkeys", 0, NULL, 'W'},
 		{"qos", 0, NULL, 'Q'},
 		{"qos_policy_file", 1, NULL, 'Y'},
+		{"congestion_control", 0, NULL, 128},
+		{"cc_key", 1, NULL, 129},
 		{"maxsmps", 1, NULL, 'n'},
 		{"console", 1, NULL, 'q'},
 		{"V", 0, NULL, 'V'},
@@ -920,6 +927,15 @@  int main(int argc, char *argv[])
 			printf(" QoS policy file \'%s\'\n", optarg);
 			break;
 
+		case 128:
+			opt.congestion_control = TRUE;
+			break;
+
+		case 129:
+			opt.cc_key = strtoull(optarg, NULL, 0);
+			printf(" CC Key 0x%" PRIx64 "\n", opt.cc_key);
+			break;
+
 		case 'y':
 			opt.exit_on_fatal = FALSE;
 			printf(" Staying on fatal initialization errors\n");
diff --git a/opensm/osm_congestion_control.c b/opensm/osm_congestion_control.c
new file mode 100644
index 0000000..640f3fa
--- /dev/null
+++ b/opensm/osm_congestion_control.c
@@ -0,0 +1,741 @@ 
+/*
+ * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2012 Lawrence Livermore National Lab.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/*
+ * Abstract:
+ *    OSM Congestion Control configuration implementation
+ *
+ * Author:
+ *    Albert Chu, LLNL
+ */
+
+#if HAVE_CONFIG_H
+#  include <config.h>
+#endif				/* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <iba/ib_types.h>
+#include <complib/cl_debug.h>
+#include <opensm/osm_subnet.h>
+#include <opensm/osm_opensm.h>
+#include <opensm/osm_log.h>
+#include <opensm/osm_subnet.h>
+#include <opensm/osm_congestion_control.h>
+
+#define CONGESTION_CONTROL_INITIAL_TID_VALUE 0xbabe
+
+static void cc_mad_post(osm_congestion_control_t *p_cc,
+			osm_madw_t *p_madw,
+			osm_node_t *p_node,
+			osm_physp_t *p_physp,
+			ib_net16_t attr_id,
+			ib_net32_t attr_mod)
+{
+	osm_cc_context_t *p_ctx = &p_madw->context.cc_context;
+	ib_cc_mad_t *p_mad;
+	uint8_t port_num;
+
+	/* Fill in a CC Set MAD in p_madw and queue it on p_cc->mad_queue. */
+	OSM_LOG_ENTER(p_cc->log);
+
+	port_num = osm_physp_get_port_num(p_physp);
+	p_mad = osm_madw_get_cc_mad_ptr(p_madw);
+
+	p_mad->header.base_ver = 1;
+	p_mad->header.mgmt_class = IB_MCLASS_CC;
+	p_mad->header.class_ver = 2;
+	p_mad->header.method = IB_MAD_METHOD_SET;
+	p_mad->header.status = 0;
+	p_mad->header.class_spec = 0;
+	p_mad->header.trans_id =
+		cl_hton64((uint64_t) cl_atomic_inc(&p_cc->trans_id));
+	p_mad->header.attr_id = attr_id;
+	p_mad->header.resv = 0;
+	p_mad->header.attr_mod = attr_mod;
+
+	p_mad->cc_key = p_cc->subn->opt.cc_key;
+	memset(p_mad->log_data, '\0', IB_CC_LOG_DATA_SIZE);
+
+	/* Send to the node's base LID on QP1 and expect a response. */
+	p_madw->mad_addr.dest_lid = osm_node_get_base_lid(p_node, port_num);
+	p_madw->mad_addr.addr_type.gsi.remote_qp = IB_QP1;
+	p_madw->mad_addr.addr_type.gsi.remote_qkey =
+		cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY);
+	p_madw->resp_expected = TRUE;
+	p_madw->fail_msg = CL_DISP_MSGID_NONE;
+
+	p_ctx->node_guid = osm_node_get_node_guid(p_node);
+	p_ctx->port_guid = osm_physp_get_port_guid(p_physp);
+	p_ctx->port = port_num;
+	p_ctx->mad_method = IB_MAD_METHOD_SET;
+	p_ctx->attr_mod = attr_mod;
+
+	/* Hand the wrapper to the queue, then signal cc_poller_wakeup. */
+	cl_spinlock_acquire(&p_cc->mad_queue_lock);
+	cl_atomic_inc(&p_cc->outstanding_mads);
+	cl_qlist_insert_tail(&p_cc->mad_queue, &p_madw->list_item);
+	cl_spinlock_release(&p_cc->mad_queue_lock);
+	cl_event_signal(&p_cc->cc_poller_wakeup);
+
+	OSM_LOG_EXIT(p_cc->log);
+}
+
+static void cc_setup_mad_data(osm_sm_t * p_sm)
+{
+	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
+	osm_subn_opt_t *p_opt = &p_sm->p_subn->opt;
+	uint16_t ccti_limit;
+	int i;
+
+	/* Build p_cc's MAD payloads from p_opt: Switch Congestion Setting */
+	p_cc->sw_cong_setting.control_map = p_opt->cc_sw_cong_setting_control_map;
+
+	memcpy(p_cc->sw_cong_setting.victim_mask,
+	       p_opt->cc_sw_cong_setting_victim_mask,
+	       IB_CC_PORT_MASK_DATA_SIZE);
+
+	memcpy(p_cc->sw_cong_setting.credit_mask,
+	       p_opt->cc_sw_cong_setting_credit_mask,
+	       IB_CC_PORT_MASK_DATA_SIZE);
+
+	/* threshold is 4 bits, stored in the upper nibble of threshold_resv */
+	p_cc->sw_cong_setting.threshold_resv = (p_opt->cc_sw_cong_setting_threshold << 4);
+
+	p_cc->sw_cong_setting.packet_size = p_opt->cc_sw_cong_setting_packet_size;
+
+	/* cs threshold is 4 bits, stored in the top nibble of the 16-bit field */
+	p_cc->sw_cong_setting.cs_threshold_resv =
+		cl_hton16(p_opt->cc_sw_cong_setting_credit_starvation_threshold << 12);
+
+	p_cc->sw_cong_setting.cs_return_delay =
+		cl_hton16(p_opt->cc_sw_cong_setting_credit_starvation_return_delay.shift << 14
+			  | p_opt->cc_sw_cong_setting_credit_starvation_return_delay.multiplier);
+
+	p_cc->sw_cong_setting.marking_rate = p_opt->cc_sw_cong_setting_marking_rate;
+
+	/* CA Congestion Setting */
+	p_cc->ca_cong_setting.port_control = p_opt->cc_ca_cong_setting_port_control;
+	p_cc->ca_cong_setting.control_map = p_opt->cc_ca_cong_setting_control_map;
+
+	for (i = 0; i < IB_CA_CONG_ENTRY_DATA_SIZE; i++) {
+		ib_ca_cong_entry_t *p_entry;
+
+		p_entry = &p_cc->ca_cong_setting.entry_list[i];
+
+		p_entry->ccti_timer = p_opt->cc_ca_cong_entries[i].ccti_timer;
+		p_entry->ccti_increase = p_opt->cc_ca_cong_entries[i].ccti_increase;
+		p_entry->trigger_threshold = p_opt->cc_ca_cong_entries[i].trigger_threshold;
+		p_entry->ccti_min = p_opt->cc_ca_cong_entries[i].ccti_min;
+		p_entry->resv0 = 0;
+		p_entry->resv1 = 0;
+	}
+
+	/* Congestion Control Table */
+
+	/* if no entries, we will always send at least 1 mad to set ccti_limit = 0 */
+	if (!p_opt->cc_cct.entries_len)
+		p_cc->cc_tbl_mads = 1;
+	else {
+		p_cc->cc_tbl_mads = p_opt->cc_cct.entries_len - 1;
+		p_cc->cc_tbl_mads /= IB_CC_TBL_ENTRY_LIST_MAX;
+		p_cc->cc_tbl_mads += 1;
+	}
+
+	CL_ASSERT(p_cc->cc_tbl_mads <= OSM_CCT_ENTRY_MAD_BLOCKS);
+
+	if (!p_opt->cc_cct.entries_len)
+		ccti_limit = 0;
+	else
+		ccti_limit = p_opt->cc_cct.entries_len - 1;
+
+	for (i = 0; i < p_cc->cc_tbl_mads; i++) {
+		int j;
+
+		p_cc->cc_tbl[i].ccti_limit = cl_hton16(ccti_limit);
+		p_cc->cc_tbl[i].resv = 0;
+
+		memset(p_cc->cc_tbl[i].entry_list,
+		       '\0',
+		       sizeof(p_cc->cc_tbl[i].entry_list));
+
+		if (!ccti_limit)
+			break;
+
+		for (j = 0; j < IB_CC_TBL_ENTRY_LIST_MAX; j++) {
+			int k;
+
+			k = (i * IB_CC_TBL_ENTRY_LIST_MAX) + j;
+			p_cc->cc_tbl[i].entry_list[j].shift_multiplier =
+				cl_hton16(p_opt->cc_cct.entries[k].shift << 14
+					  | p_opt->cc_cct.entries[k].multiplier);
+		}
+	}
+}
+
+/* Send SwitchCongestionSetting to a switch unless already up to date. */
+static ib_api_status_t cc_send_sw_cong_setting(osm_sm_t * p_sm,
+					       osm_node_t *p_node)
+{
+	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
+	ib_api_status_t status = IB_SUCCESS;
+	unsigned force_update;
+	osm_physp_t *p_physp;
+	osm_madw_t *p_madw = NULL;
+	ib_cc_mad_t *p_cc_mad = NULL;
+	ib_sw_cong_setting_t *p_sw_cong_setting = NULL;
+
+	OSM_LOG_ENTER(p_sm->p_log);
+
+	/* The switch setting is compared against physical port 0's copy. */
+	p_physp = osm_node_get_physp_ptr(p_node, 0);
+	force_update = p_physp->need_update || p_sm->p_subn->need_update;
+
+	/* Nothing to send when no update is forced and the copies match. */
+	if (!force_update
+	    && !memcmp(&p_cc->sw_cong_setting,
+		       &p_physp->cc.sw.sw_cong_setting,
+		       sizeof(p_cc->sw_cong_setting)))
+		goto Exit;
+
+	p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle,
+				  MAD_BLOCK_SIZE, NULL);
+	if (p_madw == NULL) {
+		OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C101: "
+			"failed to allocate mad\n");
+		status = IB_INSUFFICIENT_MEMORY;
+		goto Exit;
+	}
+
+	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
+	p_sw_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
+	memcpy(p_sw_cong_setting, &p_cc->sw_cong_setting,
+	       sizeof(p_cc->sw_cong_setting));
+	cc_mad_post(p_cc, p_madw, p_node, p_physp,
+		    IB_MAD_ATTR_SW_CONG_SETTING, 0);
+
+Exit:
+	OSM_LOG_EXIT(p_sm->p_log);
+	return status;
+}
+
+/* Send CACongestionSetting to a CA port unless already up to date. */
+static ib_api_status_t cc_send_ca_cong_setting(osm_sm_t * p_sm,
+					       osm_node_t *p_node,
+					       osm_physp_t *p_physp)
+{
+	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
+	ib_api_status_t status = IB_SUCCESS;
+	unsigned force_update;
+	osm_madw_t *p_madw = NULL;
+	ib_cc_mad_t *p_cc_mad = NULL;
+	ib_ca_cong_setting_t *p_ca_cong_setting = NULL;
+
+	OSM_LOG_ENTER(p_sm->p_log);
+
+	force_update = p_physp->need_update || p_sm->p_subn->need_update;
+
+	/* Nothing to send when no update is forced and the copies match. */
+	if (!force_update
+	    && !memcmp(&p_cc->ca_cong_setting,
+		       &p_physp->cc.ca.ca_cong_setting,
+		       sizeof(p_cc->ca_cong_setting)))
+		goto Exit;
+
+	p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle,
+				  MAD_BLOCK_SIZE, NULL);
+	if (p_madw == NULL) {
+		OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C102: "
+			"failed to allocate mad\n");
+		status = IB_INSUFFICIENT_MEMORY;
+		goto Exit;
+	}
+
+	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
+	p_ca_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
+	memcpy(p_ca_cong_setting, &p_cc->ca_cong_setting,
+	       sizeof(p_cc->ca_cong_setting));
+	cc_mad_post(p_cc, p_madw, p_node, p_physp,
+		    IB_MAD_ATTR_CA_CONG_SETTING, 0);
+
+Exit:
+	OSM_LOG_EXIT(p_sm->p_log);
+	return status;
+}
+
+/* Send each CongestionControlTable block that differs from the port's copy. */
+static ib_api_status_t cc_send_cct(osm_sm_t * p_sm,
+				   osm_node_t *p_node,
+				   osm_physp_t *p_physp)
+{
+	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
+	ib_api_status_t status = IB_SUCCESS;
+	unsigned force_update;
+	osm_madw_t *p_madw = NULL;
+	ib_cc_mad_t *p_cc_mad = NULL;
+	ib_cc_tbl_t *p_cc_tbl = NULL;
+	unsigned int index = 0;
+
+	OSM_LOG_ENTER(p_sm->p_log);
+
+	force_update = p_physp->need_update || p_sm->p_subn->need_update;
+
+	for (index = 0; index < p_cc->cc_tbl_mads; index++) {
+		if (!force_update
+		    && !memcmp(&p_cc->cc_tbl[index],
+			       &p_physp->cc.ca.cc_tbl[index],
+			       sizeof(p_cc->cc_tbl[index])))
+			continue;
+
+		p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle,
+					  MAD_BLOCK_SIZE, NULL);
+		if (p_madw == NULL) {
+			OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C103: "
+				"failed to allocate mad\n");
+			status = IB_INSUFFICIENT_MEMORY;
+			goto Exit;
+		}
+
+		p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
+		p_cc_tbl = (ib_cc_tbl_t *)ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
+		memcpy(p_cc_tbl, &p_cc->cc_tbl[index],
+		       sizeof(p_cc->cc_tbl[index]));
+		/* The attribute modifier carries the table block index. */
+		cc_mad_post(p_cc, p_madw, p_node, p_physp,
+			    IB_MAD_ATTR_CC_TBL, cl_hton32(index));
+	}
+
+Exit:
+	OSM_LOG_EXIT(p_sm->p_log);
+	return status;
+}
+
+/* Post CC settings to all switch and CA ports. Returns 0, or -1 on failure. */
+int osm_congestion_control_setup(struct osm_opensm *p_osm)
+{
+	cl_qmap_t *p_tbl;
+	cl_map_item_t *p_next;
+	int ret = 0;
+
+	if (!p_osm->subn.opt.congestion_control)
+		return 0;
+
+	OSM_LOG_ENTER(&p_osm->log);
+
+	/*
+	 * Do nothing unless the most recent routing attempt was successful.
+	 * Leave through Exit so the OSM_LOG_ENTER above stays balanced.
+	 */
+	if (!p_osm->sm.p_subn->p_osm->routing_engine_used)
+		goto Exit;
+
+	cc_setup_mad_data(&p_osm->sm);
+
+	cl_plock_acquire(&p_osm->lock);
+
+	p_tbl = &p_osm->subn.port_guid_tbl;
+	p_next = cl_qmap_head(p_tbl);
+	while (p_next != cl_qmap_end(p_tbl)) {
+		osm_port_t *p_port = (osm_port_t *) p_next;
+		osm_node_t *p_node = p_port->p_node;
+		ib_api_status_t status;
+
+		p_next = cl_qmap_next(p_next);
+
+		if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH) {
+			status = cc_send_sw_cong_setting(&p_osm->sm, p_node);
+			if (status != IB_SUCCESS)
+				ret = -1;
+		} else if (osm_node_get_type(p_node) == IB_NODE_TYPE_CA) {
+			status = cc_send_ca_cong_setting(&p_osm->sm, p_node,
+							 p_port->p_physp);
+			if (status != IB_SUCCESS)
+				ret = -1;
+
+			status = cc_send_cct(&p_osm->sm, p_node,
+					     p_port->p_physp);
+			if (status != IB_SUCCESS)
+				ret = -1;
+		}
+	}
+
+	cl_plock_release(&p_osm->lock);
+
+Exit:
+	OSM_LOG_EXIT(&p_osm->log);
+	return ret;
+}
+
+/*
+ * Block until the congestion control outstanding MAD counter reaches zero
+ * or osm_exit_flag is set, waiting on outstanding_mads_done_event between
+ * checks.
+ *
+ * Returns 0 immediately when congestion control is disabled, otherwise the
+ * value of osm_exit_flag at the time the wait ended.
+ */
+int osm_congestion_control_wait_pending_transactions(struct osm_opensm *p_osm)
+{
+	osm_congestion_control_t *cc = &p_osm->sm.p_subn->p_osm->cc;
+
+	if (!p_osm->subn.opt.congestion_control)
+		return 0;
+
+	/* re-check both conditions after every wakeup */
+	while (cc->outstanding_mads && !osm_exit_flag)
+		cl_event_wait_on(&cc->outstanding_mads_done_event,
+				 EVENT_NO_TIMEOUT,
+				 TRUE);
+
+	return osm_exit_flag;
+}
+
+/*
+ * Account for one completed (answered or failed) congestion control MAD.
+ *
+ * Decrements outstanding_mads and signals outstanding_mads_done_event when
+ * the decrement yields zero, then decrements outstanding_mads_on_wire and
+ * signals sig_mads_on_wire_continue unconditionally.
+ */
+static inline void decrement_outstanding_mads(osm_congestion_control_t *p_cc)
+{
+	if (cl_atomic_dec(&p_cc->outstanding_mads) == 0)
+		cl_event_signal(&p_cc->outstanding_mads_done_event);
+
+	cl_atomic_dec(&p_cc->outstanding_mads_on_wire);
+	cl_event_signal(&p_cc->sig_mads_on_wire_continue);
+}
+
+
+/*
+ * Dispatcher handler for received congestion control MADs.
+ *
+ * context is the osm_congestion_control_t the handler was registered with,
+ * data is the received MAD wrapper.  The port the request was sent to is
+ * looked up by the port GUID stored in the MAD context, and the payload is
+ * cached on that port's physp according to the attribute ID:
+ *   - SwitchCongestionSetting  -> cc.sw.sw_cong_setting
+ *   - CACongestionSetting      -> cc.ca.ca_cong_setting
+ *   - CongestionControlTable   -> cc.ca.cc_tbl[attribute modifier]
+ * Any other attribute is logged as ERR C10A.
+ *
+ * On every path the outstanding MAD counters are decremented and the MAD
+ * wrapper is returned to the pool.
+ *
+ * NOTE(review): the port data is written while holding the lock taken with
+ * cl_plock_acquire() - confirm that this lock mode is sufficient for
+ * writers.
+ */
+static void cc_rcv_mad(void *context, void *data)
+{
+	osm_congestion_control_t *p_cc = context;
+	osm_opensm_t *p_osm = p_cc->osm;
+	osm_madw_t *p_madw = data;
+	ib_cc_mad_t *p_cc_mad;
+	osm_madw_context_t *p_mad_context = &p_madw->context;
+	ib_mad_t *p_mad = osm_madw_get_mad_ptr(p_madw);
+	uint64_t node_guid = p_mad_context->cc_context.node_guid;
+	uint64_t port_guid = p_mad_context->cc_context.port_guid;
+	uint8_t port = p_mad_context->cc_context.port;
+	osm_port_t *p_port;
+
+	OSM_LOG_ENTER(p_cc->log);
+
+	/* note the blank before "port": the GUID must not run into it */
+	OSM_LOG(p_cc->log, OSM_LOG_VERBOSE,
+		"Processing received MAD status 0x%x context 0x%"
+		PRIx64 " port %u\n", p_mad->status, node_guid, port);
+
+	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
+
+	cl_plock_acquire(&p_osm->lock);
+
+	p_port = osm_get_port_by_guid(p_cc->subn, port_guid);
+	if (!p_port) {
+		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C109: "
+			"Port guid not in table 0x%" PRIx64 "\n",
+			   port_guid);
+		cl_plock_release(&p_osm->lock);
+		goto Exit;
+	}
+
+	if (p_cc_mad->header.attr_id == IB_MAD_ATTR_SW_CONG_SETTING) {
+		ib_sw_cong_setting_t *p_sw_cong_setting;
+
+		p_sw_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
+		p_port->p_physp->cc.sw.sw_cong_setting = *p_sw_cong_setting;
+	}
+	else if (p_cc_mad->header.attr_id == IB_MAD_ATTR_CA_CONG_SETTING) {
+		ib_ca_cong_setting_t *p_ca_cong_setting;
+
+		p_ca_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
+		p_port->p_physp->cc.ca.ca_cong_setting = *p_ca_cong_setting;
+	}
+	else if (p_cc_mad->header.attr_id == IB_MAD_ATTR_CC_TBL) {
+		/* the block index is taken from the request's context */
+		ib_net32_t attr_mod = p_mad_context->cc_context.attr_mod;
+		uint32_t index = cl_ntoh32(attr_mod);
+		ib_cc_tbl_t *p_cc_tbl;
+
+		p_cc_tbl = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
+		p_port->p_physp->cc.ca.cc_tbl[index] = *p_cc_tbl;
+	}
+	else
+		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C10A: "
+			"Unexpected MAD attribute received: %u\n",
+			   p_cc_mad->header.attr_id);
+
+	cl_plock_release(&p_osm->lock);
+
+Exit:
+	decrement_outstanding_mads(p_cc);
+	osm_mad_pool_put(p_cc->mad_pool, p_madw);
+	OSM_LOG_EXIT(p_cc->log);
+}
+
+/*
+ * Hand one queued MAD to the vendor layer.
+ *
+ * After a successful send the on-wire counter is incremented; if it then
+ * exceeds the cc_max_outstanding_mads option the caller is blocked on
+ * sig_mads_on_wire_continue.  A failed send is logged as ERR C104.
+ *
+ * NOTE(review): the failure path neither returns the MAD to the pool nor
+ * touches the outstanding MAD counters - confirm that osm_vendor_send()
+ * reports the failure through the send error callback.
+ */
+static void cc_poller_send(osm_congestion_control_t *p_cc,
+			   osm_madw_t *p_madw)
+{
+	osm_subn_opt_t *p_opt = &p_cc->subn->opt;
+	ib_api_status_t status;
+
+	status = osm_vendor_send(p_cc->bind_handle, p_madw, TRUE);
+	if (status == IB_SUCCESS) {
+		cl_atomic_inc(&p_cc->outstanding_mads_on_wire);
+		/* throttle: wait until a completion frees a slot */
+		if (p_cc->outstanding_mads_on_wire >
+		    p_opt->cc_max_outstanding_mads)
+			cl_event_wait_on(&p_cc->sig_mads_on_wire_continue,
+					 EVENT_NO_TIMEOUT,
+					 TRUE);
+	}
+	else {
+		osm_madw_context_t *mad_context = &p_madw->context;
+
+		/* blank before "port" keeps the GUID and port apart */
+		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C104: "
+			"send failed to node 0x%" PRIx64 " port %u\n",
+			mad_context->cc_context.node_guid,
+			mad_context->cc_context.port);
+	}
+}
+
+/*
+ * Thread function of the congestion control poller.
+ *
+ * p_ptr is the osm_congestion_control_t passed to cl_thread_init().  While
+ * thread_state is OSM_THREAD_STATE_RUN the thread takes MADs off mad_queue
+ * one at a time (under mad_queue_lock) and sends them; when the queue is
+ * empty it sleeps on cc_poller_wakeup.
+ */
+static void cc_poller(void *p_ptr)
+{
+	osm_congestion_control_t *p_cc = p_ptr;
+
+	OSM_LOG_ENTER(p_cc->log);
+
+	if (p_cc->thread_state == OSM_THREAD_STATE_NONE)
+		p_cc->thread_state = OSM_THREAD_STATE_RUN;
+
+	while (p_cc->thread_state == OSM_THREAD_STATE_RUN) {
+		osm_madw_t *p_next_madw;
+
+		cl_spinlock_acquire(&p_cc->mad_queue_lock);
+		p_next_madw =
+		    (osm_madw_t *) cl_qlist_remove_head(&p_cc->mad_queue);
+		cl_spinlock_release(&p_cc->mad_queue_lock);
+
+		/* an empty queue hands back its end marker */
+		if (p_next_madw ==
+		    (osm_madw_t *) cl_qlist_end(&p_cc->mad_queue)) {
+			cl_event_wait_on(&p_cc->cc_poller_wakeup,
+					 EVENT_NO_TIMEOUT, TRUE);
+			continue;
+		}
+
+		cc_poller_send(p_cc, p_next_madw);
+	}
+
+	OSM_LOG_EXIT(p_cc->log);
+}
+
+/*
+ * Initialize the congestion control object.
+ *
+ * Zeroes *p_cc, wires it to the owning osm_opensm_t (subnet, SM, log, MAD
+ * pool, vendor), registers cc_rcv_mad with the dispatcher for
+ * OSM_MSG_MAD_CC, sets up the MAD queue with its lock and the three events
+ * used by the poller, and finally starts the "cc poller" thread.
+ *
+ * Returns IB_SUCCESS on success.  Returns IB_ERROR when the dispatcher
+ * registration fails, otherwise the status of the first complib call that
+ * failed.  Nothing already set up is torn down on failure.
+ */
+ib_api_status_t osm_congestion_control_init(osm_congestion_control_t * p_cc,
+					    struct osm_opensm *p_osm,
+					    const osm_subn_opt_t * p_opt)
+{
+	ib_api_status_t status = IB_SUCCESS;
+
+	OSM_LOG_ENTER(&p_osm->log);
+
+	memset(p_cc, 0, sizeof(*p_cc));
+
+	p_cc->osm = p_osm;
+	p_cc->subn = &p_osm->subn;
+	p_cc->sm = &p_osm->sm;
+	p_cc->log = &p_osm->log;
+	p_cc->mad_pool = &p_osm->mad_pool;
+	p_cc->trans_id = CONGESTION_CONTROL_INITIAL_TID_VALUE;
+	p_cc->vendor = p_osm->p_vendor;
+
+	p_cc->cc_disp_h = cl_disp_register(&p_osm->disp, OSM_MSG_MAD_CC,
+					   cc_rcv_mad, p_cc);
+	if (p_cc->cc_disp_h == CL_DISP_INVALID_HANDLE) {
+		/*
+		 * No status code comes back from the registration, so
+		 * set one; otherwise the caller would see IB_SUCCESS.
+		 */
+		status = IB_ERROR;
+		goto Exit;
+	}
+
+	cl_qlist_init(&p_cc->mad_queue);
+
+	status = cl_spinlock_init(&p_cc->mad_queue_lock);
+	if (status != IB_SUCCESS)
+		goto Exit;
+
+	cl_event_construct(&p_cc->cc_poller_wakeup);
+	status = cl_event_init(&p_cc->cc_poller_wakeup, FALSE);
+	if (status != IB_SUCCESS)
+		goto Exit;
+
+	cl_event_construct(&p_cc->outstanding_mads_done_event);
+	status = cl_event_init(&p_cc->outstanding_mads_done_event, FALSE);
+	if (status != IB_SUCCESS)
+		goto Exit;
+
+	cl_event_construct(&p_cc->sig_mads_on_wire_continue);
+	status = cl_event_init(&p_cc->sig_mads_on_wire_continue, FALSE);
+	if (status != IB_SUCCESS)
+		goto Exit;
+
+	p_cc->thread_state = OSM_THREAD_STATE_NONE;
+
+	/* status of the thread start is the function's final status */
+	status = cl_thread_init(&p_cc->cc_poller, cc_poller, p_cc,
+				"cc poller");
+
+Exit:
+	OSM_LOG_EXIT(p_cc->log);
+	return status;
+}
+
+/*
+ * Vendor layer receive callback for congestion control responses.
+ *
+ * Copies the request's context onto the response, returns the request
+ * wrapper to the pool and posts the response to the dispatcher, where
+ * cc_rcv_mad processes it and accounts for the completed MAD.
+ *
+ * If the post fails the dispatcher handler never runs, so the response is
+ * released and the outstanding MAD counters are decremented here instead;
+ * otherwise a waiter on outstanding_mads_done_event would never be woken
+ * for this MAD.
+ */
+static void cc_mad_recv_callback(osm_madw_t * p_madw, void *bind_context,
+				 osm_madw_t * p_req_madw)
+{
+	osm_congestion_control_t *p_cc = bind_context;
+
+	OSM_LOG_ENTER(p_cc->log);
+
+	osm_madw_copy_context(p_madw, p_req_madw);
+	osm_mad_pool_put(p_cc->mad_pool, p_req_madw);
+
+	/*
+	 * On success do not decrement outstanding mads here, the
+	 * dispatcher handler does it.
+	 */
+	if (cl_disp_post(p_cc->cc_disp_h, OSM_MSG_MAD_CC,
+			 p_madw, NULL, NULL) != CL_SUCCESS) {
+		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C105: "
+			"Congestion Control Dispatcher post failed\n");
+		osm_mad_pool_put(p_cc->mad_pool, p_madw);
+		/* nobody else will account for this MAD any more */
+		decrement_outstanding_mads(p_cc);
+	}
+
+	OSM_LOG_EXIT(p_cc->log);
+}
+
+/*
+ * Vendor layer send error callback for congestion control MADs.
+ *
+ * Logs the failed MAD (ERR C106), flags the subnet with
+ * subnet_initialization_error, returns the MAD wrapper to the pool and
+ * decrements the outstanding MAD counters.
+ */
+static void cc_mad_send_err_callback(void *bind_context,
+				     osm_madw_t * p_madw)
+{
+	osm_congestion_control_t *p_cc = bind_context;
+	osm_madw_context_t *p_ctx = &p_madw->context;
+
+	OSM_LOG_ENTER(p_cc->log);
+
+	OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C106: MAD Error (%s): "
+		"attr id = %u LID %u GUID 0x%016" PRIx64 " port %u "
+		"TID 0x%" PRIx64 "\n",
+		ib_get_err_str(p_madw->status),
+		p_madw->p_mad->attr_id,
+		cl_ntoh16(p_madw->mad_addr.dest_lid),
+		(uint64_t) p_ctx->cc_context.node_guid,
+		(uint8_t) p_ctx->cc_context.port,
+		cl_ntoh64(p_madw->p_mad->trans_id));
+
+	p_cc->subn->subnet_initialization_error = TRUE;
+
+	/* release the wrapper first, then account for the completion */
+	osm_mad_pool_put(p_cc->mad_pool, p_madw);
+	decrement_outstanding_mads(p_cc);
+
+	OSM_LOG_EXIT(p_cc->log);
+}
+
+/*
+ * Bind the congestion control object to a local port.
+ *
+ * Registers with the vendor layer for the congestion control management
+ * class (class version 2) on port_guid, as a pure requester, using the
+ * subnet's transaction timeout and retry options.  Responses are delivered
+ * to cc_mad_recv_callback, send errors to cc_mad_send_err_callback.
+ *
+ * Returns IB_SUCCESS, or IB_ERROR when the vendor bind fails.
+ */
+ib_api_status_t osm_congestion_control_bind(osm_congestion_control_t * p_cc,
+					    ib_net64_t port_guid)
+{
+	ib_api_status_t status = IB_SUCCESS;
+	osm_bind_info_t bind_info;
+
+	OSM_LOG_ENTER(p_cc->log);
+
+	p_cc->port_guid = port_guid;
+
+	bind_info.port_guid = port_guid;
+	bind_info.mad_class = IB_MCLASS_CC;
+	bind_info.class_version = 2;
+	bind_info.is_responder = FALSE;
+	bind_info.is_report_processor = FALSE;
+	bind_info.is_trap_processor = FALSE;
+	bind_info.recv_q_size = OSM_SM_DEFAULT_QP1_RCV_SIZE;
+	bind_info.send_q_size = OSM_SM_DEFAULT_QP1_SEND_SIZE;
+	bind_info.timeout = p_cc->subn->opt.transaction_timeout;
+	bind_info.retries = p_cc->subn->opt.transaction_retries;
+
+	OSM_LOG(p_cc->log, OSM_LOG_VERBOSE,
+		"Binding to port GUID 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
+
+	p_cc->bind_handle = osm_vendor_bind(p_cc->vendor, &bind_info,
+					    p_cc->mad_pool,
+					    cc_mad_recv_callback,
+					    cc_mad_send_err_callback, p_cc);
+
+	if (p_cc->bind_handle == OSM_BIND_INVALID_HANDLE) {
+		status = IB_ERROR;
+		OSM_LOG(p_cc->log, OSM_LOG_ERROR,
+			"ERR C107: Vendor specific bind failed (%s)\n",
+			ib_get_err_str(status));
+	}
+
+	OSM_LOG_EXIT(p_cc->log);
+	return status;
+}
+
+/*
+ * Shut down congestion control MAD processing.
+ *
+ * Unregisters the dispatcher handle when the object has a valid bind
+ * handle; otherwise only logs ERR C108.
+ */
+void osm_congestion_control_shutdown(osm_congestion_control_t * p_cc)
+{
+	OSM_LOG_ENTER(p_cc->log);
+
+	if (p_cc->bind_handle != OSM_BIND_INVALID_HANDLE)
+		cl_disp_unregister(p_cc->cc_disp_h);
+	else
+		OSM_LOG(p_cc->log, OSM_LOG_ERROR,
+			"ERR C108: No previous bind\n");
+
+	OSM_LOG_EXIT(p_cc->log);
+}
+
+/*
+ * Tear down the congestion control object.
+ *
+ * Requests the poller thread to leave its loop (thread_state set to
+ * OSM_THREAD_STATE_EXIT and both events it can sleep on signalled),
+ * destroys the thread, returns every MAD still queued to the pool and then
+ * destroys the queue lock and the three events.
+ */
+void osm_congestion_control_destroy(osm_congestion_control_t * p_cc)
+{
+	OSM_LOG_ENTER(p_cc->log);
+
+	p_cc->thread_state = OSM_THREAD_STATE_EXIT;
+
+	/* wake the poller wherever it may be waiting */
+	cl_event_signal(&p_cc->sig_mads_on_wire_continue);
+	cl_event_signal(&p_cc->cc_poller_wakeup);
+
+	cl_thread_destroy(&p_cc->cc_poller);
+
+	/* drain whatever was queued but never sent */
+	cl_spinlock_acquire(&p_cc->mad_queue_lock);
+	for (;;) {
+		osm_madw_t *p_queued;
+
+		if (cl_is_qlist_empty(&p_cc->mad_queue))
+			break;
+
+		p_queued =
+		    (osm_madw_t *) cl_qlist_remove_head(&p_cc->mad_queue);
+		osm_mad_pool_put(p_cc->mad_pool, p_queued);
+	}
+	cl_spinlock_release(&p_cc->mad_queue_lock);
+
+	cl_spinlock_destroy(&p_cc->mad_queue_lock);
+
+	cl_event_destroy(&p_cc->cc_poller_wakeup);
+	cl_event_destroy(&p_cc->outstanding_mads_done_event);
+	cl_event_destroy(&p_cc->sig_mads_on_wire_continue);
+
+	OSM_LOG_EXIT(p_cc->log);
+}
diff --git a/opensm/osm_opensm.c b/opensm/osm_opensm.c
index 429108a..c7328ef 100644
--- a/opensm/osm_opensm.c
+++ b/opensm/osm_opensm.c
@@ -61,6 +61,7 @@ 
 #include <opensm/osm_sm.h>
 #include <opensm/osm_vl15intf.h>
 #include <opensm/osm_event_plugin.h>
+#include <opensm/osm_congestion_control.h>
 
 struct routing_engine_module {
 	const char *name;
@@ -291,6 +292,8 @@  void osm_opensm_destroy(IN osm_opensm_t * p_osm)
 	osm_perfmgr_shutdown(&p_osm->perfmgr);
 #endif				/* ENABLE_OSM_PERF_MGR */
 
+	osm_congestion_control_shutdown(&p_osm->cc);
+
 	/* shut down the SA
 	 * - unbind from QP1 messages
 	 */
@@ -320,6 +323,7 @@  void osm_opensm_destroy(IN osm_opensm_t * p_osm)
 #ifdef ENABLE_OSM_PERF_MGR
 	osm_perfmgr_destroy(&p_osm->perfmgr);
 #endif				/* ENABLE_OSM_PERF_MGR */
+	osm_congestion_control_destroy(&p_osm->cc);
 	osm_db_destroy(&p_osm->db);
 	osm_vl15_destroy(&p_osm->vl15, &p_osm->mad_pool);
 	osm_mad_pool_destroy(&p_osm->mad_pool);
@@ -464,6 +468,11 @@  ib_api_status_t osm_opensm_init(IN osm_opensm_t * p_osm,
 		goto Exit;
 #endif				/* ENABLE_OSM_PERF_MGR */
 
+	status = osm_congestion_control_init(&p_osm->cc,
+					     p_osm, p_opt);
+	if (status != IB_SUCCESS)
+		goto Exit;
+
 	p_osm->no_fallback_routing_engine = FALSE;
 
 	setup_routing_engines(p_osm, p_opt->routing_engine_names);
@@ -497,6 +506,10 @@  ib_api_status_t osm_opensm_bind(IN osm_opensm_t * p_osm, IN ib_net64_t guid)
 		goto Exit;
 #endif				/* ENABLE_OSM_PERF_MGR */
 
+	status = osm_congestion_control_bind(&p_osm->cc, guid);
+	if (status != IB_SUCCESS)
+		goto Exit;
+
 	/* setting IS_SM in capability mask */
 	OSM_LOG(&p_osm->log, OSM_LOG_INFO, "Setting IS_SM on port 0x%016" PRIx64 "\n",
 			cl_ntoh64(guid));
diff --git a/opensm/osm_state_mgr.c b/opensm/osm_state_mgr.c
index 143b744..4d762a3 100644
--- a/opensm/osm_state_mgr.c
+++ b/opensm/osm_state_mgr.c
@@ -66,6 +66,7 @@ 
 #include <vendor/osm_vendor_api.h>
 #include <opensm/osm_inform.h>
 #include <opensm/osm_opensm.h>
+#include <opensm/osm_congestion_control.h>
 
 extern void osm_drop_mgr_process(IN osm_sm_t * sm);
 extern int osm_qos_setup(IN osm_opensm_t * p_osm);
@@ -1156,6 +1157,11 @@  static void do_sweep(osm_sm_t * sm)
 		if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
 			return;
 
+		osm_congestion_control_setup(sm->p_subn->p_osm);
+
+		if (osm_congestion_control_wait_pending_transactions (sm->p_subn->p_osm))
+			return;
+
 		if (!sm->p_subn->subnet_initialization_error) {
 			OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
 					"REROUTE COMPLETE");
@@ -1401,6 +1407,13 @@  repeat_discovery:
 	 * The sweep completed!
 	 */
 
+	/* Now do GSI configuration */
+
+	osm_congestion_control_setup(sm->p_subn->p_osm);
+
+	if (osm_congestion_control_wait_pending_transactions (sm->p_subn->p_osm))
+		return;
+
 	/*
 	 * Send trap 64 on newly discovered endports
 	 */
diff --git a/opensm/osm_subnet.c b/opensm/osm_subnet.c
index ccaa47c..3584caa 100644
--- a/opensm/osm_subnet.c
+++ b/opensm/osm_subnet.c
@@ -72,6 +72,7 @@ 
 #include <opensm/osm_inform.h>
 #include <opensm/osm_console.h>
 #include <opensm/osm_perfmgr.h>
+#include <opensm/osm_congestion_control.h>
 #include <opensm/osm_event_plugin.h>
 #include <opensm/osm_qos_policy.h>
 #include <opensm/osm_service.h>
@@ -300,6 +301,22 @@  static void opts_parse_uint32(IN osm_subn_t *p_subn, IN char *p_key,
 	}
 }
 
+/*
+ * Parse a 32 bit option value that is stored in network byte order.
+ *
+ * p_val_str is converted with strtoul() (base auto-detected).  When the
+ * converted value, in network order, differs from *p_v1, the new value is
+ * logged in host order, pfn (if set) is called with a pointer to the host
+ * order value, and both *p_v1 and *p_v2 receive the network order value.
+ */
+static void opts_parse_net32(IN osm_subn_t *p_subn, IN char *p_key,
+			     IN char *p_val_str, void *p_v1, void *p_v2,
+			     void (*pfn)(osm_subn_t *, void *))
+{
+	uint32_t *p_current = p_v1;
+	uint32_t *p_file = p_v2;
+	uint32_t val = strtoul(p_val_str, NULL, 0);
+
+	/* unchanged option: nothing to log or store */
+	if (cl_hton32(val) == *p_current)
+		return;
+
+	log_config_value(p_key, "%u", val);
+	if (pfn)
+		pfn(p_subn, &val);
+
+	*p_file = cl_hton32(val);
+	*p_current = *p_file;
+}
+
+
 static void opts_parse_int32(IN osm_subn_t *p_subn, IN char *p_key,
 			     IN char *p_val_str, void *p_v1, void *p_v2,
 			     void (*pfn)(osm_subn_t *, void *))
@@ -405,6 +422,274 @@  static void opts_parse_charp(IN osm_subn_t *p_subn, IN char *p_key,
 	}
 }
 
+/*
+ * Parse a hexadecimal string (optional "0x"/"0X" prefix) into a
+ * IB_CC_PORT_MASK_DATA_SIZE byte mask.
+ *
+ * The string is read like one large integer and stored "big endian": the
+ * last two digits end up in the last byte, shorter strings leave the
+ * leading bytes zero.  At most IB_CC_PORT_MASK_DATA_SIZE * 2 digits are
+ * consumed; any digits beyond that are ignored.
+ *
+ * A string with no digits or with a non-hex character is reported and
+ * leaves the option untouched.  When the parsed mask differs from *p_v1
+ * the change is logged, pfn (if set) is called with the new mask, and both
+ * *p_v1 and *p_v2 are overwritten.
+ */
+static void opts_parse_256bit(IN osm_subn_t *p_subn, IN char *p_key,
+			      IN char *p_val_str, void *p_v1, void *p_v2,
+			      void (*pfn)(osm_subn_t *, void *))
+{
+	uint8_t *p_val1 = p_v1, *p_val2 = p_v2;
+	uint8_t val[IB_CC_PORT_MASK_DATA_SIZE] = { 0 };
+	char tmpbuf[3] = { 0 };
+	int numdigits = 0;
+	int startindex;
+	int firstlen;
+	char *strptr = p_val_str;
+	char *ptr;
+	int i;
+
+	if (!strncmp(strptr, "0x", 2) || !strncmp(strptr, "0X", 2))
+		strptr += 2;
+
+	for (ptr = strptr; *ptr; ptr++) {
+		/*
+		 * <ctype.h> classifiers are only defined for values
+		 * representable as unsigned char (or EOF); a plain char
+		 * may be negative.
+		 */
+		if (!isxdigit((unsigned char)*ptr)) {
+			log_report("invalid hex digit in bitmask\n");
+			return;
+		}
+		numdigits++;
+	}
+
+	if (!numdigits) {
+		log_report("invalid length bitmask\n");
+		return;
+	}
+
+	/* max of 2 hex chars per byte */
+	if (numdigits > IB_CC_PORT_MASK_DATA_SIZE * 2)
+		numdigits = IB_CC_PORT_MASK_DATA_SIZE * 2;
+
+	/* index of the first byte that receives digits */
+	startindex = IB_CC_PORT_MASK_DATA_SIZE - ((numdigits - 1) / 2) - 1;
+
+	/* an odd digit count means the first byte has a single digit */
+	firstlen = (numdigits % 2) ? 1 : 2;
+	memcpy(tmpbuf, strptr, firstlen);
+	strptr += firstlen;
+	val[startindex] = (uint8_t) strtoul(tmpbuf, NULL, 16);
+
+	/* every following byte takes exactly two digits */
+	for (i = (startindex + 1); i < IB_CC_PORT_MASK_DATA_SIZE; i++) {
+		memcpy(tmpbuf, strptr, 2);
+		strptr += 2;
+		val[i] = (uint8_t) strtoul(tmpbuf, NULL, 16);
+	}
+
+	if (memcmp(val, p_val1, IB_CC_PORT_MASK_DATA_SIZE)) {
+		log_config_value(p_key, "%s", p_val_str);
+		if (pfn)
+			pfn(p_subn, val);
+		memcpy(p_val1, val, IB_CC_PORT_MASK_DATA_SIZE);
+		memcpy(p_val2, val, IB_CC_PORT_MASK_DATA_SIZE);
+	}
+
+}
+
+/*
+ * Parse a single "shift:multiplier" pair into an osm_cct_entry_t option.
+ *
+ * Only the first 511 characters of p_val_str are looked at.  A value
+ * without a ':' is reported as invalid and ignored.  When the parsed pair
+ * differs from *p_v1 the change is logged, pfn (if set) is called with the
+ * new entry, and both *p_v1 and *p_v2 are updated.
+ */
+static void opts_parse_cct_entry(IN osm_subn_t *p_subn, IN char *p_key,
+				 IN char *p_val_str, void *p_v1, void *p_v2,
+				 void (*pfn)(osm_subn_t *, void *))
+{
+	osm_cct_entry_t *p_cct1 = p_v1;
+	osm_cct_entry_t *p_cct2 = p_v2;
+	osm_cct_entry_t cct;
+	char buf[512] = { 0 };
+	char *p_sep;
+
+	/* buf is zero filled, so the copy is always terminated */
+	strncpy(buf, p_val_str, 511);
+
+	p_sep = strchr(buf, ':');
+	if (!p_sep) {
+		log_report("invalid CCT entry\n");
+		return;
+	}
+
+	/* split in place: buf holds the shift, p_sep + 1 the multiplier */
+	*p_sep = '\0';
+
+	cct.shift = strtoul(buf, NULL, 0);
+	cct.multiplier = strtoul(p_sep + 1, NULL, 0);
+
+	if (cct.shift == p_cct1->shift
+	    && cct.multiplier == p_cct1->multiplier)
+		return;
+
+	log_config_value(p_key, "%s", p_val_str);
+	if (pfn)
+		pfn(p_subn, &cct);
+
+	p_cct2->shift = cct.shift;
+	p_cct1->shift = p_cct2->shift;
+	p_cct2->multiplier = cct.multiplier;
+	p_cct1->multiplier = p_cct2->multiplier;
+}
+
+/*
+ * Parse the congestion control table option: a comma separated list of
+ * "shift:multiplier" pairs, or the "(null)" string to clear the table.
+ *
+ * Nothing happens when p_val_str equals the string the table was last
+ * configured from.  At most OSM_CCT_ENTRY_MAX entries are read; an entry
+ * without a ':' is reported and leaves the current table untouched.
+ *
+ * On success both option copies (*p_v1 and *p_v2) get the new entries and
+ * share one freshly allocated copy of p_val_str as input_str; the strings
+ * they held before are freed.  pfn, if set, is called with the new table,
+ * or with NULL when the table is cleared.
+ */
+static void opts_parse_cc_cct(IN osm_subn_t *p_subn, IN char *p_key,
+			      IN char *p_val_str, void *p_v1, void *p_v2,
+			      void (*pfn)(osm_subn_t *, void *))
+{
+	osm_cct_t *p_val1 = p_v1, *p_val2 = p_v2;
+	const char *current_str = p_val1->input_str ? p_val1->input_str : null_str;
+
+	if (p_val_str && strcmp(p_val_str, current_str)) {
+		osm_cct_t newcct;
+		char *new;
+		unsigned int len = 0;
+		char *lasts;
+		char *tok;
+		char *ptr;
+
+		/* special case the "(null)" string */
+		if (!strcmp(null_str, p_val_str)) {
+			log_config_value(p_key, "%s", p_val_str);
+			if (pfn)
+				pfn(p_subn, NULL);
+			memset(p_val1->entries, '\0', sizeof(p_val1->entries));
+			memset(p_val2->entries, '\0', sizeof(p_val2->entries));
+			p_val1->entries_len = p_val2->entries_len = 0;
+			/*
+			 * Release the old strings before dropping the
+			 * pointers; the two copies may share one string.
+			 */
+			if (p_val1->input_str != p_val2->input_str)
+				free(p_val1->input_str);
+			free(p_val2->input_str);
+			p_val1->input_str = p_val2->input_str = NULL;
+			return;
+		}
+
+		/* scratch copy, strtok_r() writes into it */
+		new = strdup(p_val_str);
+		if (!new) {
+			/* do not mistake a failed allocation for "(null)" */
+			log_report("memory allocation failed\n");
+			return;
+		}
+
+		memset(&newcct, '\0', sizeof(newcct));
+
+		tok = strtok_r(new, ",", &lasts);
+		while (tok && len < OSM_CCT_ENTRY_MAX) {
+
+			if (!(ptr = strchr(tok, ':'))) {
+				log_report("invalid CCT entry\n");
+				free(new);
+				return;
+			}
+			*ptr = '\0';
+			ptr++;
+
+			newcct.entries[len].shift = strtoul(tok, NULL, 0);
+			newcct.entries[len].multiplier = strtoul(ptr, NULL, 0);
+			len++;
+			tok = strtok_r(NULL, ",", &lasts);
+		}
+
+		free(new);
+
+		newcct.entries_len = len;
+		newcct.input_str = strdup(p_val_str);
+
+		log_config_value(p_key, "%s", p_val_str);
+		if (pfn)
+			pfn(p_subn, &newcct);
+		if (p_val1->input_str && p_val1->input_str != p_val2->input_str)
+			free(p_val1->input_str);
+		if (p_val2->input_str)
+			free(p_val2->input_str);
+		memcpy(p_val1->entries, newcct.entries, sizeof(newcct.entries));
+		memcpy(p_val2->entries, newcct.entries, sizeof(newcct.entries));
+		p_val1->entries_len = p_val2->entries_len = newcct.entries_len;
+		p_val1->input_str = p_val2->input_str = newcct.input_str;
+	}
+}
+
+/*
+ * Split a "<SL> <value>" option string (blank or tab separated).
+ *
+ * On success stores the service level in *sl and the offset of the value
+ * within p_val_str in *val_offset, and returns 0.  Returns -1 when the
+ * string is "(null)", cannot be duplicated, has no value after the SL, or
+ * names an SL that is not below IB_CA_CONG_ENTRY_DATA_SIZE.  p_val_str
+ * itself is not modified.
+ */
+static int parse_ca_cong_common(char *p_val_str, uint8_t *sl, unsigned int *val_offset) {
+	char *new, *lasts, *sl_str, *val_str;
+	unsigned long sltmp;
+
+	/* tokenize a scratch copy, strtok_r() writes into its input */
+	new = strcmp(null_str, p_val_str) ? strdup(p_val_str) : NULL;
+	if (!new)
+		return -1;
+
+	sl_str = strtok_r(new, " \t", &lasts);
+	val_str = strtok_r(NULL, " \t", &lasts);
+
+	if (!val_str) {
+		log_report("value must be specified in addition to SL\n");
+		free(new);
+		return -1;
+	}
+
+	/*
+	 * Range check the full strtoul() result: narrowing it to
+	 * uint8_t first would let e.g. "256" pass as SL 0.
+	 */
+	sltmp = strtoul(sl_str, NULL, 0);
+	if (sltmp >= IB_CA_CONG_ENTRY_DATA_SIZE) {
+		log_report("invalid SL specified\n");
+		free(new);
+		return -1;
+	}
+
+	*sl = (uint8_t) sltmp;
+	/* the copy has the same layout, so the offset applies to both */
+	*val_offset = (unsigned int)(val_str - new);
+
+	free(new);
+	return 0;
+}
+
+/*
+ * Parse a "<SL> <value>" option and store the value as the ccti_timer of
+ * the CA congestion entry for that SL, in both option copies.  The value
+ * part is handled by opts_parse_net16(); a malformed string is ignored.
+ */
+static void opts_parse_ccti_timer(IN osm_subn_t *p_subn, IN char *p_key,
+				  IN char *p_val_str, void *p_v1, void *p_v2,
+				  void (*pfn)(osm_subn_t *, void *))
+{
+	osm_cacongestion_entry_t *p_entries1 = p_v1;
+	osm_cacongestion_entry_t *p_entries2 = p_v2;
+	unsigned int offset = 0;
+	uint8_t sl = 0;
+
+	if (parse_ca_cong_common(p_val_str, &sl, &offset) >= 0)
+		opts_parse_net16(p_subn, p_key, &p_val_str[offset],
+				 &p_entries1[sl].ccti_timer,
+				 &p_entries2[sl].ccti_timer,
+				 pfn);
+}
+
+/*
+ * Parse a "<SL> <value>" option and store the value as the ccti_increase
+ * of the CA congestion entry for that SL, in both option copies.  The
+ * value part is handled by opts_parse_uint8(); a malformed string is
+ * ignored.
+ */
+static void opts_parse_ccti_increase(IN osm_subn_t *p_subn, IN char *p_key,
+				     IN char *p_val_str, void *p_v1, void *p_v2,
+				     void (*pfn)(osm_subn_t *, void *))
+{
+	osm_cacongestion_entry_t *p_entries1 = p_v1;
+	osm_cacongestion_entry_t *p_entries2 = p_v2;
+	unsigned int offset = 0;
+	uint8_t sl = 0;
+
+	if (parse_ca_cong_common(p_val_str, &sl, &offset) >= 0)
+		opts_parse_uint8(p_subn, p_key, &p_val_str[offset],
+				 &p_entries1[sl].ccti_increase,
+				 &p_entries2[sl].ccti_increase,
+				 pfn);
+}
+
+/*
+ * Parse a "<SL> <value>" option and store the value as the
+ * trigger_threshold of the CA congestion entry for that SL, in both option
+ * copies.  The value part is handled by opts_parse_uint8(); a malformed
+ * string is ignored.
+ */
+static void opts_parse_trigger_threshold(IN osm_subn_t *p_subn, IN char *p_key,
+					 IN char *p_val_str, void *p_v1, void *p_v2,
+					 void (*pfn)(osm_subn_t *, void *))
+{
+	osm_cacongestion_entry_t *p_entries1 = p_v1;
+	osm_cacongestion_entry_t *p_entries2 = p_v2;
+	unsigned int offset = 0;
+	uint8_t sl = 0;
+
+	if (parse_ca_cong_common(p_val_str, &sl, &offset) >= 0)
+		opts_parse_uint8(p_subn, p_key, &p_val_str[offset],
+				 &p_entries1[sl].trigger_threshold,
+				 &p_entries2[sl].trigger_threshold,
+				 pfn);
+}
+
+/*
+ * Parse a "<SL> <value>" option and store the value as the ccti_min of the
+ * CA congestion entry for that SL, in both option copies.  The value part
+ * is handled by opts_parse_uint8(); a malformed string is ignored.
+ */
+static void opts_parse_ccti_min(IN osm_subn_t *p_subn, IN char *p_key,
+				IN char *p_val_str, void *p_v1, void *p_v2,
+				void (*pfn)(osm_subn_t *, void *))
+{
+	osm_cacongestion_entry_t *p_entries1 = p_v1;
+	osm_cacongestion_entry_t *p_entries2 = p_v2;
+	unsigned int offset = 0;
+	uint8_t sl = 0;
+
+	if (parse_ca_cong_common(p_val_str, &sl, &offset) >= 0)
+		opts_parse_uint8(p_subn, p_key, &p_val_str[offset],
+				 &p_entries1[sl].ccti_min,
+				 &p_entries2[sl].ccti_min,
+				 pfn);
+}
+
 static const opt_rec_t opt_tbl[] = {
 	{ "guid", OPT_OFFSET(guid), opts_parse_net64, NULL, 0 },
 	{ "m_key", OPT_OFFSET(m_key), opts_parse_net64, NULL, 1 },
@@ -524,6 +809,24 @@  static const opt_rec_t opt_tbl[] = {
 	{ "qos_rtr_vlarb_high", OPT_OFFSET(qos_rtr_options.vlarb_high), opts_parse_charp, NULL, 1 },
 	{ "qos_rtr_vlarb_low", OPT_OFFSET(qos_rtr_options.vlarb_low), opts_parse_charp, NULL, 1 },
 	{ "qos_rtr_sl2vl", OPT_OFFSET(qos_rtr_options.sl2vl), opts_parse_charp, NULL, 1 },
+	{ "congestion_control", OPT_OFFSET(congestion_control), opts_parse_boolean, NULL, 1 },
+	{ "cc_key", OPT_OFFSET(cc_key), opts_parse_net64, NULL, 0},
+	{ "cc_max_outstanding_mads", OPT_OFFSET(cc_max_outstanding_mads), opts_parse_uint32, NULL, 0 },
+	{ "cc_sw_cong_setting_control_map", OPT_OFFSET(cc_sw_cong_setting_control_map), opts_parse_net32, NULL, 1},
+	{ "cc_sw_cong_setting_victim_mask", OPT_OFFSET(cc_sw_cong_setting_victim_mask), opts_parse_256bit, NULL, 1},
+	{ "cc_sw_cong_setting_credit_mask", OPT_OFFSET(cc_sw_cong_setting_credit_mask), opts_parse_256bit, NULL, 1},
+	{ "cc_sw_cong_setting_threshold", OPT_OFFSET(cc_sw_cong_setting_threshold), opts_parse_uint8, NULL, 1},
+	{ "cc_sw_cong_setting_packet_size", OPT_OFFSET(cc_sw_cong_setting_packet_size), opts_parse_uint8, NULL, 1},
+	{ "cc_sw_cong_setting_credit_starvation_threshold", OPT_OFFSET(cc_sw_cong_setting_credit_starvation_threshold), opts_parse_uint8, NULL, 1},
+	{ "cc_sw_cong_setting_credit_starvation_return_delay", OPT_OFFSET(cc_sw_cong_setting_credit_starvation_return_delay), opts_parse_cct_entry, NULL, 1},
+	{ "cc_sw_cong_setting_marking_rate", OPT_OFFSET(cc_sw_cong_setting_marking_rate), opts_parse_net16, NULL, 1},
+	{ "cc_ca_cong_setting_port_control", OPT_OFFSET(cc_ca_cong_setting_port_control), opts_parse_net16, NULL, 1},
+	{ "cc_ca_cong_setting_control_map", OPT_OFFSET(cc_ca_cong_setting_control_map), opts_parse_net16, NULL, 1},
+	{ "cc_ca_cong_setting_ccti_timer", OPT_OFFSET(cc_ca_cong_entries), opts_parse_ccti_timer, NULL, 1},
+	{ "cc_ca_cong_setting_ccti_increase", OPT_OFFSET(cc_ca_cong_entries), opts_parse_ccti_increase, NULL, 1},
+	{ "cc_ca_cong_setting_trigger_threshold", OPT_OFFSET(cc_ca_cong_entries), opts_parse_trigger_threshold, NULL, 1},
+	{ "cc_ca_cong_setting_ccti_min", OPT_OFFSET(cc_ca_cong_entries), opts_parse_ccti_min, NULL, 1},
+	{ "cc_cct", OPT_OFFSET(cc_cct), opts_parse_cc_cct, NULL, 1},
 	{ "enable_quirks", OPT_OFFSET(enable_quirks), opts_parse_boolean, NULL, 1 },
 	{ "no_clients_rereg", OPT_OFFSET(no_clients_rereg), opts_parse_boolean, NULL, 1 },
 	{ "prefix_routes_file", OPT_OFFSET(prefix_routes_file), opts_parse_charp, NULL, 0 },
@@ -601,6 +904,7 @@  static void subn_opt_destroy(IN osm_subn_opt_t * p_opt)
 	subn_destroy_qos_options(&p_opt->qos_sw0_options);
 	subn_destroy_qos_options(&p_opt->qos_swe_options);
 	subn_destroy_qos_options(&p_opt->qos_rtr_options);
+	free(p_opt->cc_cct.input_str);
 }
 
 void osm_subn_destroy(IN osm_subn_t * p_subn)
@@ -1033,6 +1337,9 @@  void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt)
 	p_opt->torus_conf_file = strdup(OSM_DEFAULT_TORUS_CONF_FILE);
 	p_opt->do_mesh_analysis = FALSE;
 	p_opt->exit_on_fatal = TRUE;
+	p_opt->congestion_control = FALSE;
+	p_opt->cc_key = OSM_DEFAULT_CC_KEY;
+	p_opt->cc_max_outstanding_mads = OSM_PERFMGR_DEFAULT_MAX_OUTSTANDING_QUERIES;
 	p_opt->enable_quirks = FALSE;
 	p_opt->no_clients_rereg = FALSE;
 	p_opt->prefix_routes_file = strdup(OSM_DEFAULT_PREFIX_ROUTES_FILE);
@@ -1047,6 +1354,8 @@  void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt)
 	subn_init_qos_options(&p_opt->qos_sw0_options, NULL);
 	subn_init_qos_options(&p_opt->qos_swe_options, NULL);
 	subn_init_qos_options(&p_opt->qos_rtr_options, NULL);
+	p_opt->cc_cct.entries_len = 0;
+	p_opt->cc_cct.input_str = NULL;
 }
 
 static char *clean_val(char *val)
@@ -1674,6 +1983,9 @@  int osm_subn_rescan_conf_files(IN osm_subn_t * p_subn)
 
 int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts)
 {
+	int cacongoutputcount = 0;
+	int i;
+
 	fprintf(out,
 		"#\n# DEVICE ATTRIBUTES OPTIONS\n#\n"
 		"# The port GUID on which the OpenSM is running\n"
@@ -2138,6 +2450,164 @@  int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts)
 	fprintf(out, "\n");
 
 	fprintf(out,
+		"#\n# Congestion Control OPTIONS (EXPERIMENTAL)\n#\n\n"
+		"# Enable Congestion Control Configuration\n"
+		"congestion_control %s\n\n"
+		"# CCKey to use when configuring congestion control\n"
+		"# note that this does not configure a new CCkey, only the CCkey to use\n"
+		"cc_key 0x%016" PRIx64 "\n\n"
+		"# Congestion Control Max outstanding MAD\n"
+		"cc_max_outstanding_mads %u\n\n",
+		p_opts->congestion_control ? "TRUE" : "FALSE",
+		cl_ntoh64(p_opts->cc_key),
+		p_opts->cc_max_outstanding_mads);
+
+	fprintf(out,
+		"#\n# Congestion Control SwitchCongestionSetting options\n#\n"
+		"# Control Map - bitmask indicating which of the following attributes are to be used\n"
+		"# bit 0 - victim mask\n"
+		"# bit 1 - credit mask\n"
+		"# bit 2 - threshold + packet size\n"
+		"# bit 3 - credit starvation threshold + return delay valid\n"
+		"# bit 4 - marking rate valid\n"
+		"cc_sw_cong_setting_control_map 0x%X\n\n",
+		cl_ntoh32(p_opts->cc_sw_cong_setting_control_map));
+
+	fprintf(out,
+		"# Victim Mask - 256 bit mask representing switch ports, mark packets with FECN\n"
+		"# whether they are the source or victim of congestion\n"
+		"# bit 0 - port 0 (enhanced port)\n"
+		"# bit 1 - port 1\n"
+		"# ...\n"
+		"# bit 254 - port 254\n"
+		"# bit 255 - reserved\n"
+		"cc_sw_cong_setting_victim_mask 0x");
+
+	for (i = 0; i < IB_CC_PORT_MASK_DATA_SIZE; i++)
+		fprintf(out, "%02X", p_opts->cc_sw_cong_setting_victim_mask[i]);
+	fprintf(out, "\n\n");
+
+	fprintf(out,
+		"# Credit Mask - 256 bit mask representing switch ports to apply credit starvation\n"
+		"# bit 0 - port 0 (enhanced port)\n"
+		"# bit 1 - port 1\n"
+		"# ...\n"
+		"# bit 254 - port 254\n"
+		"# bit 255 - reserved\n"
+		"cc_sw_cong_setting_credit_mask 0x");
+
+	for (i = 0; i < IB_CC_PORT_MASK_DATA_SIZE; i++)
+		fprintf(out, "%02X", p_opts->cc_sw_cong_setting_credit_mask[i]);
+	fprintf(out, "\n\n");
+
+	fprintf(out,
+		"# Threshold - value indicating aggressiveness of congestion marking\n"
+		"# 0x0 - none, 0x1 - loose, ..., 0xF - aggressive\n"
+		"cc_sw_cong_setting_threshold 0x%02X\n\n"
+		"# Packet Size - any packet less than this size will not be marked with a FECN\n"
+		"# units are in credits\n"
+		"cc_sw_cong_setting_packet_size %u\n\n"
+		"# Credit Starvation Threshold - value indicating aggressiveness of credit starvation\n"
+		"# 0x0 - none, 0x1 - loose, ..., 0xF - aggressive\n"
+		"cc_sw_cong_setting_credit_starvation_threshold 0x%02X\n\n"
+		"# Credit Starvation Return Delay - in CCT entry shift:multiplier format, see IB spec\n"
+		"cc_sw_cong_setting_credit_starvation_return_delay %u:%u\n\n"
+		"# Marking Rate - mean number of packets between markings\n"
+		"cc_sw_cong_setting_marking_rate %u\n\n",
+		p_opts->cc_sw_cong_setting_threshold,
+		p_opts->cc_sw_cong_setting_packet_size,
+		p_opts->cc_sw_cong_setting_credit_starvation_threshold,
+		p_opts->cc_sw_cong_setting_credit_starvation_return_delay.shift,
+		p_opts->cc_sw_cong_setting_credit_starvation_return_delay.multiplier,
+		cl_ntoh16(p_opts->cc_sw_cong_setting_marking_rate));
+
+	fprintf(out,
+		"#\n# Congestion Control CA Congestion Setting options\n#\n"
+		"# Port Control\n"
+		"# bit 0 = 0, QP based congestion control\n"
+		"# bit 0 = 1, SL/port based congestion control\n"
+		"cc_ca_cong_setting_port_control 0x%04X\n\n"
+		"# Control Map - 16 bit bitmask indicating which SLs should be configured\n"
+		"cc_ca_cong_setting_control_map 0x%04X\n\n",
+		cl_ntoh16(p_opts->cc_ca_cong_setting_port_control),
+		cl_ntoh16(p_opts->cc_ca_cong_setting_control_map));
+
+	fprintf(out,
+		"#\n# CA Congestion Setting Entries\n#\n"
+		"# Each of congestion control settings below configures the CA Congestion\n"
+		"# Settings for an individual SL.  The SL must be specified before the value.\n"
+		"# These options may be specified multiple times to configure different values\n"
+		"# for different SLs.\n"
+		"#\n"
+		"# ccti timer - when expires decrements 1 from the CCTI\n"
+		"# ccti increase - number to be added to the table index on receipt of a BECN\n"
+		"# trigger threshold - when the ccti is equal to this, an event is logged\n"
+		"# ccti min - the minimum value for the ccti.  This imposes a minimum rate\n"
+		"#            on the injection rate\n\n");
+
+	for (i = 0; i < IB_CA_CONG_ENTRY_DATA_SIZE; i++) {
+		/* Don't output unless one of the settings has been set, there's no need
+		 * to output 16 chunks of this with all defaults of 0 */
+		if (p_opts->cc_ca_cong_entries[i].ccti_timer
+		    || p_opts->cc_ca_cong_entries[i].ccti_increase
+		    || p_opts->cc_ca_cong_entries[i].trigger_threshold
+		    || p_opts->cc_ca_cong_entries[i].ccti_min) {
+			fprintf(out,
+				"# SL = %u\n"
+				"cc_ca_cong_setting_ccti_timer %u %u\n"
+				"cc_ca_cong_setting_ccti_increase %u %u\n"
+				"cc_ca_cong_setting_trigger_threshold %u %u\n"
+				"cc_ca_cong_setting_ccti_min %u %u\n\n",
+				i,
+				i,
+				cl_ntoh16(p_opts->cc_ca_cong_entries[i].ccti_timer),
+				i,
+				p_opts->cc_ca_cong_entries[i].ccti_increase,
+				i,
+				p_opts->cc_ca_cong_entries[i].trigger_threshold,
+				i,
+				p_opts->cc_ca_cong_entries[i].ccti_min);
+			cacongoutputcount++;
+		}
+	}
+
+	/* If by chance all the CA Cong Settings are default, output at least 1 chunk
+         * for illustration */
+	if (!cacongoutputcount)
+		fprintf(out,
+			"# SL = 0\n"
+			"cc_ca_cong_setting_ccti_timer 0 %u\n"
+			"cc_ca_cong_setting_ccti_increase 0 %u\n"
+			"cc_ca_cong_setting_trigger_threshold 0 %u\n"
+			"cc_ca_cong_setting_ccti_min 0 %u\n\n",
+			cl_ntoh16(p_opts->cc_ca_cong_entries[0].ccti_timer),
+			p_opts->cc_ca_cong_entries[0].ccti_increase,
+			p_opts->cc_ca_cong_entries[0].trigger_threshold,
+			p_opts->cc_ca_cong_entries[0].ccti_min);
+
+	fprintf(out,
+		"#\n# Congestion Control Table\n#\n"
+		"# Comma separated list of CCT entries representing CCT.\n"
+		"# Format is shift:multiplier,shift:multiplier,shift:multiplier,...\n"
+		"cc_cct ");
+
+	if (!p_opts->cc_cct.entries_len) {
+		fprintf(out, "%s\n", null_str);
+	}
+	else {
+		fprintf(out, "%u:%u",
+			p_opts->cc_cct.entries[0].shift,
+			p_opts->cc_cct.entries[0].multiplier);
+		for (i = 1; i < p_opts->cc_cct.entries_len; i++) { /* entry 0 was printed above without a leading comma */
+			fprintf(out, ",%u:%u",
+				p_opts->cc_cct.entries[i].shift,
+				p_opts->cc_cct.entries[i].multiplier);
+		}
+		fprintf(out, "\n");
+	}
+	fprintf(out, "\n");
+
+	fprintf(out,
 		"# Prefix routes file name\n"
 		"prefix_routes_file %s\n\n",
 		p_opts->prefix_routes_file);