From patchwork Mon Jul 30 22:06:47 2012 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Al Chu X-Patchwork-Id: 1256691 X-Patchwork-Delegate: alexne@voltaire.com Return-Path: X-Original-To: patchwork-linux-rdma@patchwork.kernel.org Delivered-To: patchwork-process-083081@patchwork1.kernel.org Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by patchwork1.kernel.org (Postfix) with ESMTP id 853873FCC5 for ; Mon, 30 Jul 2012 22:07:11 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755058Ab2G3WGu (ORCPT ); Mon, 30 Jul 2012 18:06:50 -0400 Received: from nspiron-3.llnl.gov ([128.115.41.83]:18051 "EHLO nspiron-3.llnl.gov" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755031Ab2G3WGs (ORCPT ); Mon, 30 Jul 2012 18:06:48 -0400 X-Attachments: Received: from auk59.llnl.gov (HELO [134.9.93.24]) ([134.9.93.24]) by nspiron-3.llnl.gov with ESMTP; 30 Jul 2012 15:06:47 -0700 Subject: [PATCHv3] opensm: Add initial congestion control configuration support From: Albert Chu To: linux-rdma@vger.kernel.org Date: Mon, 30 Jul 2012 15:06:47 -0700 Message-Id: <1343686007.18615.364.camel@auk59.llnl.gov> Mime-Version: 1.0 X-Mailer: Evolution 2.12.3 (2.12.3-19.el5) Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org This patch adds initial support for congestion control configuration on a fabric. Users may configure settings via the Switch Congestion Setting, CA Congestion Setting, or Congestion Control Table MADs. 
Signed-off-by: Albert Chu Signed-off-by: Alex Netes --- include/iba/ib_types.h | 14 +- include/opensm/osm_congestion_control.h | 132 ++++++ include/opensm/osm_madw.h | 40 ++ include/opensm/osm_msgdef.h | 1 + include/opensm/osm_opensm.h | 2 + include/opensm/osm_port.h | 18 + include/opensm/osm_subnet.h | 158 +++++++ man/opensm.8.in | 11 + opensm/Makefile.am | 4 +- opensm/main.c | 16 + opensm/osm_congestion_control.c | 741 +++++++++++++++++++++++++++++++ opensm/osm_opensm.c | 13 + opensm/osm_state_mgr.c | 13 + opensm/osm_subnet.c | 470 ++++++++++++++++++++ 14 files changed, 1627 insertions(+), 6 deletions(-) create mode 100644 include/opensm/osm_congestion_control.h create mode 100644 opensm/osm_congestion_control.c diff --git a/include/iba/ib_types.h b/include/iba/ib_types.h index 7b4d1ee..c8d51ca 100644 --- a/include/iba/ib_types.h +++ b/include/iba/ib_types.h @@ -11471,11 +11471,12 @@ typedef struct _ib_cong_log { * * SYNOPSIS */ +#define IB_CC_PORT_MASK_DATA_SIZE 32 #include typedef struct _ib_sw_cong_setting { ib_net32_t control_map; - uint8_t victim_mask[32]; - uint8_t credit_mask[32]; + uint8_t victim_mask[IB_CC_PORT_MASK_DATA_SIZE]; + uint8_t credit_mask[IB_CC_PORT_MASK_DATA_SIZE]; uint8_t threshold_resv; uint8_t packet_size; ib_net16_t cs_threshold_resv; @@ -11585,7 +11586,8 @@ typedef struct _ib_sw_port_cong_setting_element { * * SOURCE */ -typedef ib_sw_port_cong_setting_element_t ib_sw_port_cong_setting_block_t[32]; +#define IB_CC_SW_PORT_SETTING_ELEMENTS 32 +typedef ib_sw_port_cong_setting_element_t ib_sw_port_cong_setting_block_t[IB_CC_SW_PORT_SETTING_ELEMENTS]; /**********/ /****s* IBA Base: Types/ib_sw_port_cong_setting_t @@ -11663,11 +11665,12 @@ typedef struct _ib_ca_cong_entry { * * SYNOPSIS */ +#define IB_CA_CONG_ENTRY_DATA_SIZE 16 #include typedef struct _ib_ca_cong_setting { ib_net16_t port_control; ib_net16_t control_map; - ib_ca_cong_entry_t entry_list[16]; + ib_ca_cong_entry_t entry_list[IB_CA_CONG_ENTRY_DATA_SIZE]; } PACK_SUFFIX 
ib_ca_cong_setting_t; #include /* @@ -11726,11 +11729,12 @@ typedef struct _ib_cc_tbl_entry { * * SYNOPSIS */ +#define IB_CC_TBL_ENTRY_LIST_MAX 64 #include typedef struct _ib_cc_tbl { ib_net16_t ccti_limit; ib_net16_t resv; - ib_cc_tbl_entry_t entry_list[64]; + ib_cc_tbl_entry_t entry_list[IB_CC_TBL_ENTRY_LIST_MAX]; } PACK_SUFFIX ib_cc_tbl_t; #include /* diff --git a/include/opensm/osm_congestion_control.h b/include/opensm/osm_congestion_control.h new file mode 100644 index 0000000..94e4ffb --- /dev/null +++ b/include/opensm/osm_congestion_control.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2012 Lawrence Livermore National Lab. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* + * Abstract: + * OSM Congestion Control types and prototypes + * + * Author: + * Albert Chu, LLNL + */ + +#ifndef OSM_CONGESTION_CONTROL_H +#define OSM_CONGESTION_CONTROL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +/****s* OpenSM: Base/OSM_DEFAULT_CC_KEY + * NAME + * OSM_DEFAULT_CC_KEY + * + * DESCRIPTION + * Congestion Control Key used by OpenSM. + * + * SYNOPSIS + */ +#define OSM_DEFAULT_CC_KEY 0 + +#define OSM_CC_DEFAULT_MAX_OUTSTANDING_QUERIES 500 + +/****s* OpenSM: CongestionControl/osm_congestion_control_t +* This object should be treated as opaque and should +* be manipulated only through the provided functions. +*/ +typedef struct osm_congestion_control { + struct osm_opensm *osm; + osm_subn_t *subn; + osm_sm_t *sm; + osm_log_t *log; + osm_mad_pool_t *mad_pool; + atomic32_t trans_id; + osm_vendor_t *vendor; + osm_bind_handle_t bind_handle; + cl_disp_reg_handle_t cc_disp_h; + ib_net64_t port_guid; + atomic32_t outstanding_mads; + atomic32_t outstanding_mads_on_wire; + cl_qlist_t mad_queue; + cl_spinlock_t mad_queue_lock; + cl_event_t cc_poller_wakeup; + cl_event_t outstanding_mads_done_event; + cl_event_t sig_mads_on_wire_continue; + cl_thread_t cc_poller; + osm_thread_state_t thread_state; + ib_sw_cong_setting_t sw_cong_setting; + ib_ca_cong_setting_t ca_cong_setting; + ib_cc_tbl_t cc_tbl[OSM_CCT_ENTRY_MAD_BLOCKS]; + unsigned int cc_tbl_mads; +} osm_congestion_control_t; +/* +* FIELDS +* subn +* Subnet object for this subnet. +* +* log +* Pointer to the log object. +* +* mad_pool +* Pointer to the MAD pool. 
+* +* mad_ctrl +* Mad Controller +*********/ + +struct osm_opensm; + +int osm_congestion_control_setup(struct osm_opensm *osm); + +int osm_congestion_control_wait_pending_transactions(struct osm_opensm *osm); + +ib_api_status_t osm_congestion_control_init(osm_congestion_control_t * p_cc, + struct osm_opensm *osm, + const osm_subn_opt_t * p_opt); + +ib_api_status_t osm_congestion_control_bind(osm_congestion_control_t * p_cc, + ib_net64_t port_guid); + +void osm_congestion_control_shutdown(osm_congestion_control_t * p_cc); + +void osm_congestion_control_destroy(osm_congestion_control_t * p_cc); + + +#endif /* ifndef OSM_CONGESTION_CONTROL_H */ diff --git a/include/opensm/osm_madw.h b/include/opensm/osm_madw.h index 195f468..afc3680 100644 --- a/include/opensm/osm_madw.h +++ b/include/opensm/osm_madw.h @@ -340,6 +340,19 @@ typedef struct osm_perfmgr_context { } osm_perfmgr_context_t; /*********/ +/****s* OpenSM: MAD Wrapper/osm_cc_context_t +* DESCRIPTION +* Context for Congestion Control MADs +*/ +typedef struct osm_cc_context { + ib_net64_t node_guid; + ib_net64_t port_guid; + uint8_t port; + uint8_t mad_method; /* was this a get or a set */ + ib_net32_t attr_mod; +} osm_cc_context_t; +/*********/ + #ifndef OSM_VENDOR_INTF_OPENIB /****s* OpenSM: MAD Wrapper/osm_arbitrary_context_t * NAME @@ -379,6 +392,7 @@ typedef union _osm_madw_context { osm_pkey_context_t pkey_context; osm_vla_context_t vla_context; osm_perfmgr_context_t perfmgr_context; + osm_cc_context_t cc_context; #ifndef OSM_VENDOR_INTF_OPENIB osm_arbitrary_context_t arb_context; #endif @@ -612,6 +626,32 @@ static inline ib_perfmgt_mad_t *osm_madw_get_perfmgt_mad_ptr(IN const osm_madw_t * MAD Wrapper object *********/ +/****f* OpenSM: MAD Wrapper/osm_madw_get_cc_mad_ptr +* DESCRIPTION +* Gets a pointer to the Congestion Control MAD in this MAD wrapper. 
+* +* SYNOPSIS +*/ +static inline ib_cc_mad_t *osm_madw_get_cc_mad_ptr(IN const osm_madw_t + * p_madw) +{ + return ((ib_cc_mad_t *) p_madw->p_mad); +} + +/* +* PARAMETERS +* p_madw +* [in] Pointer to an osm_madw_t object. +* +* RETURN VALUES +* Pointer to the start of the Congestion Control MAD. +* +* NOTES +* +* SEE ALSO +* MAD Wrapper object +*********/ + /****f* OpenSM: MAD Wrapper/osm_madw_get_ni_context_ptr * NAME * osm_madw_get_ni_context_ptr diff --git a/include/opensm/osm_msgdef.h b/include/opensm/osm_msgdef.h index 0c8af9b..b0d92e0 100644 --- a/include/opensm/osm_msgdef.h +++ b/include/opensm/osm_msgdef.h @@ -162,6 +162,7 @@ enum { #endif OSM_MSG_MAD_PORT_COUNTERS, OSM_MSG_MAD_MLNX_EXT_PORT_INFO, + OSM_MSG_MAD_CC, OSM_MSG_MAX }; diff --git a/include/opensm/osm_opensm.h b/include/opensm/osm_opensm.h index 9f2c2fa..dbff4f6 100644 --- a/include/opensm/osm_opensm.h +++ b/include/opensm/osm_opensm.h @@ -61,6 +61,7 @@ #include #include #include +#include #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { @@ -203,6 +204,7 @@ typedef struct osm_opensm { #ifdef ENABLE_OSM_PERF_MGR osm_perfmgr_t perfmgr; #endif /* ENABLE_OSM_PERF_MGR */ + osm_congestion_control_t cc; cl_qlist_t plugin_list; osm_db_t db; osm_mad_pool_t mad_pool; diff --git a/include/opensm/osm_port.h b/include/opensm/osm_port.h index 56e9c37..e06483a 100644 --- a/include/opensm/osm_port.h +++ b/include/opensm/osm_port.h @@ -119,6 +119,15 @@ typedef struct osm_physp { ib_vl_arb_table_t vl_arb[4]; cl_ptr_vector_t slvl_by_port; uint8_t hop_wf; + union { + struct { + ib_sw_cong_setting_t sw_cong_setting; + } sw; + struct { + ib_ca_cong_setting_t ca_cong_setting; + ib_cc_tbl_t cc_tbl[OSM_CCT_ENTRY_MAD_BLOCKS]; + } ca; + } cc; } osm_physp_t; /* * FIELDS @@ -186,6 +195,15 @@ typedef struct osm_physp { * hop_wf * Hop weighting factor to be used in the routing. 
* +* sw_cong_setting +* Physical port switch congestion settings (switches only) +* +* ca_cong_setting +* Physical port ca congestion settings (cas only) +* +* cc_tbl +* Physical port ca congestion control table (cas only) +* * SEE ALSO * Port *********/ diff --git a/include/opensm/osm_subnet.h b/include/opensm/osm_subnet.h index 683204b..426a962 100644 --- a/include/opensm/osm_subnet.h +++ b/include/opensm/osm_subnet.h @@ -86,6 +86,10 @@ typedef enum _osm_partition_enforce_type_enum { OSM_PARTITION_ENFORCE_TYPE_OFF } osm_partition_enforce_type_enum; +/* XXX: not actual max, max we're currently going to support */ +#define OSM_CCT_ENTRY_MAX 128 +#define OSM_CCT_ENTRY_MAD_BLOCKS (OSM_CCT_ENTRY_MAX/64) + struct osm_opensm; struct osm_qos_policy; @@ -147,6 +151,91 @@ typedef struct osm_qos_options { * *********/ +/****s* OpenSM: Subnet/osm_cct_entry_t +* NAME +* osm_cct_entry_t +* +* DESCRIPTION +* Subnet Congestion Control Table entry. See A10.2.2.1.1 for format details. +* +* SYNOPSIS +*/ +typedef struct osm_cct_entry { + uint8_t shift; //Alex: shift 2 bits + uint16_t multiplier; //Alex multiplier 14 bits +} osm_cct_entry_t; +/* +* FIELDS +* +* shift +* shift field in CCT entry. See A10.2.2.1.1. +* +* multiplier +* multiplier field in CCT entry. See A10.2.2.1.1. +* +*********/ + +/****s* OpenSM: Subnet/osm_cacongestion_entry_t +* NAME +* osm_cacongestion_entry_t +* +* DESCRIPTION +* Subnet CA Congestion entry. See A10.4.3.8.4 for format details. 
+* +* SYNOPSIS +*/ +typedef struct osm_cacongestion_entry { + ib_net16_t ccti_timer; //Alex: ccti_timer and ccti_increase should be replaced + uint8_t ccti_increase; + uint8_t trigger_threshold; + uint8_t ccti_min; +} osm_cacongestion_entry_t; +/* +* FIELDS +* +* ccti_timer +* CCTI Timer +* +* ccti_increase +* CCTI Increase +* +* trigger_threshold +* CCTI trigger for log message +* +* ccti_min +* CCTI Minimum +* +*********/ + +/****s* OpenSM: Subnet/osm_cct_t +* NAME +* osm_cct_t +* +* DESCRIPTION +* Subnet CongestionControlTable. See A10.4.3.9 for format details. +* +* SYNOPSIS +*/ +typedef struct osm_cct { + osm_cct_entry_t entries[OSM_CCT_ENTRY_MAX]; + unsigned int entries_len; + char *input_str; +} osm_cct_t; +/* +* FIELDS +* +* entries +* Entries in CCT +* +* entries_len +* Length of entries +* +* input_str +* Original str input +* +*********/ + + /****s* OpenSM: Subnet/osm_subn_opt_t * NAME * osm_subn_opt_t @@ -244,6 +333,21 @@ typedef struct osm_subn_opt { osm_qos_options_t qos_sw0_options; osm_qos_options_t qos_swe_options; osm_qos_options_t qos_rtr_options; + boolean_t congestion_control; + ib_net64_t cc_key; + uint32_t cc_max_outstanding_mads; + ib_net32_t cc_sw_cong_setting_control_map; + uint8_t cc_sw_cong_setting_victim_mask[IB_CC_PORT_MASK_DATA_SIZE]; + uint8_t cc_sw_cong_setting_credit_mask[IB_CC_PORT_MASK_DATA_SIZE]; + uint8_t cc_sw_cong_setting_threshold; + uint8_t cc_sw_cong_setting_packet_size; + uint8_t cc_sw_cong_setting_credit_starvation_threshold; + osm_cct_entry_t cc_sw_cong_setting_credit_starvation_return_delay; + ib_net16_t cc_sw_cong_setting_marking_rate; + ib_net16_t cc_ca_cong_setting_port_control; + ib_net16_t cc_ca_cong_setting_control_map; + osm_cacongestion_entry_t cc_ca_cong_entries[IB_CA_CONG_ENTRY_DATA_SIZE]; + osm_cct_t cc_cct; boolean_t enable_quirks; boolean_t no_clients_rereg; #ifdef ENABLE_OSM_PERF_MGR @@ -530,6 +634,60 @@ typedef struct osm_subn_opt { * qos_rtr_options * QoS options for router ports * +* congestion_control 
+* Boolean that specifies whether OpenSM congestion control configuration +* should be on or off. +* +* cc_key +* CCkey to use when configuring congestion control. +* +* cc_max_outstanding_mads +* Max number of outstanding CC mads that can be on the wire. +* +* cc_sw_cong_setting_control_map +* Congestion Control Switch Congestion Setting Control Map +* configuration setting. +* +* cc_sw_cong_setting_victim_mask +* Congestion Control Switch Congestion Setting Victim Mask +* configuration setting. +* +* cc_sw_cong_setting_credit_mask +* Congestion Control Switch Congestion Setting Credit Mask +* configuration setting. +* +* cc_sw_cong_setting_threshold +* Congestion Control Switch Congestion Setting Threshold +* configuration setting. +* +* cc_sw_cong_setting_packet_size +* Congestion Control Switch Congestion Setting Packet Size +* configuration setting. +* +* cc_sw_cong_setting_credit_starvation_threshold +* Congestion Control Switch Congestion Setting Credit Starvation Threshold +* configuration setting. +* +* cc_sw_cong_setting_credit_starvation_return_delay +* Congestion Control Switch Congestion Setting Credit Starvation Return Delay +* configuration setting. +* +* cc_sw_cong_setting_marking_rate +* Congestion Control Switch Congestion Setting Marking Rate +* configuration setting. 
+* +* cc_ca_cong_setting_port_control +* Congestion Control CA Congestion Setting Port Control +* +* cc_ca_cong_setting_control_map +* Congestion Control CA Congestion Setting Control Map +* +* cc_ca_cong_entries +* Congestion Control CA Congestion Setting Entries +* +* cc_cct +* Congestion Control Table array of entries +* * enable_quirks * Enable high risk new features and not fully qualified * hardware specific work arounds diff --git a/man/opensm.8.in b/man/opensm.8.in index 888d6a6..dd67671 100644 --- a/man/opensm.8.in +++ b/man/opensm.8.in @@ -48,6 +48,8 @@ opensm \- InfiniBand subnet manager and administration (SM/SA) [\-Z | \-\-part_enforce [both | in | out | off]] [\-W | \-\-allow_both_pkeys] [\-Q | \-\-qos [\-Y | \-\-qos_policy_file ]] +[\-\-congestion_control] +[\-\-cc_key ] [\-y | \-\-stay_on_fatal] [\-B | \-\-daemon] [\-I | \-\-inactive] @@ -369,6 +371,15 @@ name is \fB\%@OPENSM_CONFIG_DIR@/@QOS_POLICY_FILE@\fP. See QoS_management_in_OpenSM.txt in opensm doc for more information on configuring QoS policy via this file. .TP +\fB\-\-congestion_control\fR +(EXPERIMENTAL) This option enables congestion control configuration. +It is disabled by default. See config file for congestion control +configuration options. +\fB\-\-cc_key\fR +(EXPERIMENTAL) This option configures the CCkey to use when configuring +congestion control. Note that this option does not configure a new +CCkey into switches and CAs. Defaults to 0. +.TP \fB\-N\fR, \fB\-\-no_part_enforce\fR \fB(DEPRECATED)\fR This is a deprecated flag. Please use \fB\-\-part_enforce\fR instead. This option disables partition enforcement on switch external ports. 
diff --git a/opensm/Makefile.am b/opensm/Makefile.am index 855042c..7fd6bc6 100644 --- a/opensm/Makefile.am +++ b/opensm/Makefile.am @@ -57,7 +57,8 @@ opensm_SOURCES = main.c osm_console_io.c osm_console.c osm_db_files.c \ osm_ucast_dfsssp.c osm_vl15intf.c \ osm_vl_arb_rcv.c st.c osm_perfmgr.c osm_perfmgr_db.c \ osm_event_plugin.c osm_dump.c osm_ucast_cache.c \ - osm_qos_parser_y.y osm_qos_parser_l.l osm_qos_policy.c + osm_qos_parser_y.y osm_qos_parser_l.l osm_qos_policy.c \ + osm_congestion_control.c AM_YFLAGS:= -d @@ -102,6 +103,7 @@ opensminclude_HEADERS = \ $(srcdir)/../include/opensm/osm_port_profile.h \ $(srcdir)/../include/opensm/osm_prefix_route.h \ $(srcdir)/../include/opensm/osm_qos_policy.h \ + $(srcdir)/../include/opensm/osm_congestion_control.h \ $(srcdir)/../include/opensm/osm_remote_sm.h \ $(srcdir)/../include/opensm/osm_router.h \ $(srcdir)/../include/opensm/osm_sa.h \ diff --git a/opensm/main.c b/opensm/main.c index fca209a..e9a0b4c 100644 --- a/opensm/main.c +++ b/opensm/main.c @@ -340,6 +340,11 @@ static void show_usage(void) " This option defines the optional QoS policy file.\n" " The default name is \'" OSM_DEFAULT_QOS_POLICY_FILE "\'.\n\n"); + printf("--congestion_control\n" + " (EXPERIMENTAL) This option enables congestion control configuration.\n\n"); + printf("--cc_key \n" + " (EXPERIMENTAL) This option configures the CCkey to use when configuring\n" + " congestion control.\n\n"); printf("--stay_on_fatal, -y\n" " This option will cause SM not to exit on fatal initialization\n" " issues: if SM discovers duplicated guids or 12x link with\n" @@ -614,6 +619,8 @@ int main(int argc, char *argv[]) {"allow_both_pkeys", 0, NULL, 'W'}, {"qos", 0, NULL, 'Q'}, {"qos_policy_file", 1, NULL, 'Y'}, + {"congestion_control", 0, NULL, 128}, + {"cc_key", 1, NULL, 129}, {"maxsmps", 1, NULL, 'n'}, {"console", 1, NULL, 'q'}, {"V", 0, NULL, 'V'}, @@ -920,6 +927,15 @@ int main(int argc, char *argv[]) printf(" QoS policy file \'%s\'\n", optarg); break; + case 128: + 
opt.congestion_control = TRUE; + break; + + case 129: + opt.cc_key = strtoull(optarg, NULL, 0); + printf(" CC Key 0x%" PRIx64 "\n", opt.cc_key); + break; + case 'y': opt.exit_on_fatal = FALSE; printf(" Staying on fatal initialization errors\n"); diff --git a/opensm/osm_congestion_control.c b/opensm/osm_congestion_control.c new file mode 100644 index 0000000..640f3fa --- /dev/null +++ b/opensm/osm_congestion_control.c @@ -0,0 +1,741 @@ +/* + * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2012 Lawrence Livermore National Lab. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +/* + * Abstract: + * OSM Congestion Control configuration implementation + * + * Author: + * Albert Chu, LLNL + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define CONGESTION_CONTROL_INITIAL_TID_VALUE 0xbabe + +static void cc_mad_post(osm_congestion_control_t *p_cc, + osm_madw_t *p_madw, + osm_node_t *p_node, + osm_physp_t *p_physp, + ib_net16_t attr_id, + ib_net32_t attr_mod) +{ + osm_subn_opt_t *p_opt = &p_cc->subn->opt; + ib_cc_mad_t *p_cc_mad; + uint8_t port; + + OSM_LOG_ENTER(p_cc->log); + + port = osm_physp_get_port_num(p_physp); + + p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw); + + p_cc_mad->header.base_ver = 1; + p_cc_mad->header.mgmt_class = IB_MCLASS_CC; + p_cc_mad->header.class_ver = 2; + p_cc_mad->header.method = IB_MAD_METHOD_SET; + p_cc_mad->header.status = 0; + p_cc_mad->header.class_spec = 0; + p_cc_mad->header.trans_id = + cl_hton64((uint64_t) cl_atomic_inc(&p_cc->trans_id)); + p_cc_mad->header.attr_id = attr_id; + p_cc_mad->header.resv = 0; + p_cc_mad->header.attr_mod = attr_mod; + + p_cc_mad->cc_key = p_opt->cc_key; + + memset(p_cc_mad->log_data, '\0', IB_CC_LOG_DATA_SIZE); + + p_madw->mad_addr.dest_lid = osm_node_get_base_lid(p_node, port); + p_madw->mad_addr.addr_type.gsi.remote_qp = IB_QP1; + p_madw->mad_addr.addr_type.gsi.remote_qkey = + cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY); + p_madw->resp_expected = TRUE; + p_madw->fail_msg = CL_DISP_MSGID_NONE; + + p_madw->context.cc_context.node_guid = osm_node_get_node_guid(p_node); + p_madw->context.cc_context.port_guid = osm_physp_get_port_guid(p_physp); + p_madw->context.cc_context.port = port; + p_madw->context.cc_context.mad_method = IB_MAD_METHOD_SET; + p_madw->context.cc_context.attr_mod = attr_mod; + + cl_spinlock_acquire(&p_cc->mad_queue_lock); + cl_atomic_inc(&p_cc->outstanding_mads); + cl_qlist_insert_tail(&p_cc->mad_queue, &p_madw->list_item); + 
cl_spinlock_release(&p_cc->mad_queue_lock); + + cl_event_signal(&p_cc->cc_poller_wakeup); + + OSM_LOG_EXIT(p_cc->log); +} + +static void cc_setup_mad_data(osm_sm_t * p_sm) +{ + osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc; + osm_subn_opt_t *p_opt = &p_sm->p_subn->opt; + uint16_t ccti_limit; + int i; + + /* Switch Congestion Setting */ + p_cc->sw_cong_setting.control_map = p_opt->cc_sw_cong_setting_control_map; + + memcpy(p_cc->sw_cong_setting.victim_mask, + p_opt->cc_sw_cong_setting_victim_mask, + IB_CC_PORT_MASK_DATA_SIZE); + + memcpy(p_cc->sw_cong_setting.credit_mask, + p_opt->cc_sw_cong_setting_credit_mask, + IB_CC_PORT_MASK_DATA_SIZE); + + /* threshold is 4 bits, takes up upper nibble of byte */ + p_cc->sw_cong_setting.threshold_resv = (p_opt->cc_sw_cong_setting_threshold << 4); + + p_cc->sw_cong_setting.packet_size = p_opt->cc_sw_cong_setting_packet_size; + + /* cs threshold is 4 bits, takes up upper nibble of short */ + p_cc->sw_cong_setting.cs_threshold_resv = + cl_hton16(p_opt->cc_sw_cong_setting_credit_starvation_threshold << 12); + + p_cc->sw_cong_setting.cs_return_delay = + cl_hton16(p_opt->cc_sw_cong_setting_credit_starvation_return_delay.shift << 14 + | p_opt->cc_sw_cong_setting_credit_starvation_return_delay.multiplier); + + p_cc->sw_cong_setting.marking_rate = p_opt->cc_sw_cong_setting_marking_rate; + + /* CA Congestion Setting */ + p_cc->ca_cong_setting.port_control = p_opt->cc_ca_cong_setting_port_control; + p_cc->ca_cong_setting.control_map = p_opt->cc_ca_cong_setting_control_map; + + for (i = 0; i < IB_CA_CONG_ENTRY_DATA_SIZE; i++) { + ib_ca_cong_entry_t *p_entry; + + p_entry = &p_cc->ca_cong_setting.entry_list[i]; + + p_entry->ccti_timer = p_opt->cc_ca_cong_entries[i].ccti_timer; + p_entry->ccti_increase = p_opt->cc_ca_cong_entries[i].ccti_increase; + p_entry->trigger_threshold = p_opt->cc_ca_cong_entries[i].trigger_threshold; + p_entry->ccti_min = p_opt->cc_ca_cong_entries[i].ccti_min; + p_entry->resv0 = 0; + p_entry->resv1 = 0; + 
} + + /* Congestion Control Table */ + + /* if no entries, we will always send atleast 1 mad to set ccti_limit = 0 */ + if (!p_opt->cc_cct.entries_len) + p_cc->cc_tbl_mads = 1; + else { + p_cc->cc_tbl_mads = p_opt->cc_cct.entries_len - 1; + p_cc->cc_tbl_mads /= IB_CC_TBL_ENTRY_LIST_MAX; + p_cc->cc_tbl_mads += 1; + } + + CL_ASSERT(p_cc->cc_tbl_mads <= OSM_CCT_ENTRY_MAD_BLOCKS); + + if (!p_opt->cc_cct.entries_len) + ccti_limit = 0; + else + ccti_limit = p_opt->cc_cct.entries_len - 1; + + for (i = 0; i < p_cc->cc_tbl_mads; i++) { + int j; + + p_cc->cc_tbl[i].ccti_limit = cl_hton16(ccti_limit); + p_cc->cc_tbl[i].resv = 0; + + memset(p_cc->cc_tbl[i].entry_list, + '\0', + sizeof(p_cc->cc_tbl[i].entry_list)); + + if (!ccti_limit) + break; + + for (j = 0; j < IB_CC_TBL_ENTRY_LIST_MAX; j++) { + int k; + + k = (i * IB_CC_TBL_ENTRY_LIST_MAX) + j; + p_cc->cc_tbl[i].entry_list[j].shift_multiplier = + cl_hton16(p_opt->cc_cct.entries[k].shift << 14 + | p_opt->cc_cct.entries[k].multiplier); + } + } +} + +static ib_api_status_t cc_send_sw_cong_setting(osm_sm_t * p_sm, + osm_node_t *p_node) +{ + osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc; + unsigned force_update; + osm_physp_t *p_physp; + osm_madw_t *p_madw = NULL; + ib_cc_mad_t *p_cc_mad = NULL; + ib_sw_cong_setting_t *p_sw_cong_setting = NULL; + + OSM_LOG_ENTER(p_sm->p_log); + + p_physp = osm_node_get_physp_ptr(p_node, 0); + + force_update = p_physp->need_update || p_sm->p_subn->need_update; + + if (!force_update + && !memcmp(&p_cc->sw_cong_setting, + &p_physp->cc.sw.sw_cong_setting, + sizeof(p_cc->sw_cong_setting))) + return IB_SUCCESS; + + p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle, + MAD_BLOCK_SIZE, NULL); + if (p_madw == NULL) { + OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C101: " + "failed to allocate mad\n"); + return IB_INSUFFICIENT_MEMORY; + } + + p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw); + + p_sw_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad); + + memcpy(p_sw_cong_setting, + 
&p_cc->sw_cong_setting, + sizeof(p_cc->sw_cong_setting)); + + cc_mad_post(p_cc, p_madw, p_node, p_physp, + IB_MAD_ATTR_SW_CONG_SETTING, 0); + + OSM_LOG_EXIT(p_sm->p_log); + + return IB_SUCCESS; +} + +static ib_api_status_t cc_send_ca_cong_setting(osm_sm_t * p_sm, + osm_node_t *p_node, + osm_physp_t *p_physp) +{ + osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc; + unsigned force_update; + osm_madw_t *p_madw = NULL; + ib_cc_mad_t *p_cc_mad = NULL; + ib_ca_cong_setting_t *p_ca_cong_setting = NULL; + + OSM_LOG_ENTER(p_sm->p_log); + + force_update = p_physp->need_update || p_sm->p_subn->need_update; + + if (!force_update + && !memcmp(&p_cc->ca_cong_setting, + &p_physp->cc.ca.ca_cong_setting, + sizeof(p_cc->ca_cong_setting))) + return IB_SUCCESS; + + p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle, + MAD_BLOCK_SIZE, NULL); + if (p_madw == NULL) { + OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C102: " + "failed to allocate mad\n"); + return IB_INSUFFICIENT_MEMORY; + } + + p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw); + + p_ca_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad); + + memcpy(p_ca_cong_setting, + &p_cc->ca_cong_setting, + sizeof(p_cc->ca_cong_setting)); + + cc_mad_post(p_cc, p_madw, p_node, p_physp, + IB_MAD_ATTR_CA_CONG_SETTING, 0); + + OSM_LOG_EXIT(p_sm->p_log); + + return IB_SUCCESS; +} + +static ib_api_status_t cc_send_cct(osm_sm_t * p_sm, + osm_node_t *p_node, + osm_physp_t *p_physp) +{ + osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc; + unsigned force_update; + osm_madw_t *p_madw = NULL; + ib_cc_mad_t *p_cc_mad = NULL; + ib_cc_tbl_t *p_cc_tbl = NULL; + unsigned int index = 0; + + OSM_LOG_ENTER(p_sm->p_log); + + force_update = p_physp->need_update || p_sm->p_subn->need_update; + + for (index = 0; index < p_cc->cc_tbl_mads; index++) { + if (!force_update + && !memcmp(&p_cc->cc_tbl[index], + &p_physp->cc.ca.cc_tbl[index], + sizeof(p_cc->cc_tbl[index]))) + continue; + + p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle, 
+ MAD_BLOCK_SIZE, NULL); + if (p_madw == NULL) { + OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C103: " + "failed to allocate mad\n"); + return IB_INSUFFICIENT_MEMORY; + } + + p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw); + + p_cc_tbl = (ib_cc_tbl_t *)ib_cc_mad_get_mgt_data_ptr(p_cc_mad); + + memcpy(p_cc_tbl, + &p_cc->cc_tbl[index], + sizeof(p_cc->cc_tbl[index])); + + cc_mad_post(p_cc, p_madw, p_node, p_physp, + IB_MAD_ATTR_CC_TBL, cl_hton32(index)); + } + + OSM_LOG_EXIT(p_sm->p_log); + + return IB_SUCCESS; +} + +int osm_congestion_control_setup(struct osm_opensm *p_osm) +{ + cl_qmap_t *p_tbl; + cl_map_item_t *p_next; + int ret = 0; + + if (!p_osm->subn.opt.congestion_control) + return 0; + + OSM_LOG_ENTER(&p_osm->log); + + /* + * Do nothing unless the most recent routing attempt was successful. + */ + if (!p_osm->sm.p_subn->p_osm->routing_engine_used) + return 0; + + cc_setup_mad_data(&p_osm->sm); + + cl_plock_acquire(&p_osm->lock); + + p_tbl = &p_osm->subn.port_guid_tbl; + p_next = cl_qmap_head(p_tbl); + while (p_next != cl_qmap_end(p_tbl)) { + osm_port_t *p_port = (osm_port_t *) p_next; + osm_node_t *p_node = p_port->p_node; + ib_api_status_t status; + + p_next = cl_qmap_next(p_next); + + if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH) { + status = cc_send_sw_cong_setting(&p_osm->sm, p_node); + if (status != IB_SUCCESS) + ret = -1; + } else if (osm_node_get_type(p_node) == IB_NODE_TYPE_CA) { + status = cc_send_ca_cong_setting(&p_osm->sm, + p_node, + p_port->p_physp); + if (status != IB_SUCCESS) + ret = -1; + + status = cc_send_cct(&p_osm->sm, + p_node, + p_port->p_physp); + if (status != IB_SUCCESS) + ret = -1; + } + } + + cl_plock_release(&p_osm->lock); + + OSM_LOG_EXIT(&p_osm->log); + + return ret; +} + +int osm_congestion_control_wait_pending_transactions(struct osm_opensm *p_osm) +{ + osm_congestion_control_t *cc = &p_osm->sm.p_subn->p_osm->cc; + + if (!p_osm->subn.opt.congestion_control) + return 0; + + while (1) { + unsigned count = cc->outstanding_mads; + 
if (!count || osm_exit_flag) + break; + cl_event_wait_on(&cc->outstanding_mads_done_event, + EVENT_NO_TIMEOUT, + TRUE); + } + + return osm_exit_flag; +} + /* Account for one finished MAD: signal outstanding_mads_done_event when outstanding_mads reaches zero, then release one on-wire slot and signal sig_mads_on_wire_continue. */ +static inline void decrement_outstanding_mads(osm_congestion_control_t *p_cc) +{ + uint32_t outstanding; + + outstanding = cl_atomic_dec(&p_cc->outstanding_mads); + if (!outstanding) + cl_event_signal(&p_cc->outstanding_mads_done_event); + + cl_atomic_dec(&p_cc->outstanding_mads_on_wire); + cl_event_signal(&p_cc->sig_mads_on_wire_continue); +} + + /* Handler for a received CC MAD (context is the osm_congestion_control_t, data the MAD wrapper): copies the returned attribute into the matching port's physp cc state, then drops the outstanding count and returns the MAD to the pool. */ +static void cc_rcv_mad(void *context, void *data) +{ + osm_congestion_control_t *p_cc = context; + osm_opensm_t *p_osm = p_cc->osm; + osm_madw_t *p_madw = data; + ib_cc_mad_t *p_cc_mad; + osm_madw_context_t *p_mad_context = &p_madw->context; + ib_mad_t *p_mad = osm_madw_get_mad_ptr(p_madw); + uint64_t node_guid = p_mad_context->cc_context.node_guid; + uint64_t port_guid = p_mad_context->cc_context.port_guid; + uint8_t port = p_mad_context->cc_context.port; + osm_port_t *p_port; + + OSM_LOG_ENTER(p_cc->log); + + OSM_LOG(p_cc->log, OSM_LOG_VERBOSE, + "Processing received MAD status 0x%x context 0x%" + PRIx64 " port %u\n", p_mad->status, node_guid, port); + + p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw); + /* p_osm->lock is held for the port lookup and the update below */ + cl_plock_acquire(&p_osm->lock); + + p_port = osm_get_port_by_guid(p_cc->subn, port_guid); + if (!p_port) { + OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C109: " + "Port guid not in table 0x%" PRIx64 "\n", + port_guid); + cl_plock_release(&p_osm->lock); + goto Exit; + } + /* store the returned attribute according to its attribute id */ + if (p_cc_mad->header.attr_id == IB_MAD_ATTR_SW_CONG_SETTING) { + ib_sw_cong_setting_t *p_sw_cong_setting; + + p_sw_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad); + p_port->p_physp->cc.sw.sw_cong_setting = *p_sw_cong_setting; + } + else if (p_cc_mad->header.attr_id == IB_MAD_ATTR_CA_CONG_SETTING) { + ib_ca_cong_setting_t *p_ca_cong_setting; + + p_ca_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad); + p_port->p_physp->cc.ca.ca_cong_setting = *p_ca_cong_setting; + } + else if 
(p_cc_mad->header.attr_id == IB_MAD_ATTR_CC_TBL) { + ib_net32_t attr_mod = p_mad_context->cc_context.attr_mod; + uint32_t index = cl_ntoh32(attr_mod); + ib_cc_tbl_t *p_cc_tbl; + /* the table block index comes from the attr_mod stored in the MAD context */ + p_cc_tbl = ib_cc_mad_get_mgt_data_ptr(p_cc_mad); + p_port->p_physp->cc.ca.cc_tbl[index] = *p_cc_tbl; + } + else + OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C10A: " + "Unexpected MAD attribute received: %u\n", + p_cc_mad->header.attr_id); + + cl_plock_release(&p_osm->lock); + +Exit: + decrement_outstanding_mads(p_cc); + osm_mad_pool_put(p_cc->mad_pool, p_madw); + OSM_LOG_EXIT(p_cc->log); +} + /* Send one MAD through the bind handle. On success count it as on the wire and, if more than cc_max_outstanding_mads are in flight, wait on sig_mads_on_wire_continue. A failed send is only logged (ERR C104). */ +static void cc_poller_send(osm_congestion_control_t *p_cc, + osm_madw_t *p_madw) +{ + osm_subn_opt_t *p_opt = &p_cc->subn->opt; + ib_api_status_t status; + + status = osm_vendor_send(p_cc->bind_handle, p_madw, TRUE); + if (status == IB_SUCCESS) { + cl_atomic_inc(&p_cc->outstanding_mads_on_wire); + if (p_cc->outstanding_mads_on_wire > + p_opt->cc_max_outstanding_mads) + cl_event_wait_on(&p_cc->sig_mads_on_wire_continue, + EVENT_NO_TIMEOUT, + TRUE); + } + else { + osm_madw_context_t *mad_context = &p_madw->context; + + OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C104: " + "send failed to node 0x%" PRIx64 " port %u\n", + mad_context->cc_context.node_guid, + mad_context->cc_context.port); + } +} + /* Poller thread body: while thread_state is RUN, pop the head of mad_queue under mad_queue_lock and send it, or wait on cc_poller_wakeup when the queue is empty. */ +static void cc_poller(void *p_ptr) +{ + osm_congestion_control_t *p_cc = p_ptr; + osm_madw_t *p_madw; + + OSM_LOG_ENTER(p_cc->log); + + if (p_cc->thread_state == OSM_THREAD_STATE_NONE) + p_cc->thread_state = OSM_THREAD_STATE_RUN; + + while (p_cc->thread_state == OSM_THREAD_STATE_RUN) { + cl_spinlock_acquire(&p_cc->mad_queue_lock); + + p_madw = (osm_madw_t *) cl_qlist_remove_head(&p_cc->mad_queue); + + cl_spinlock_release(&p_cc->mad_queue_lock); + /* removing from an empty list yields the list end marker */ + if (p_madw != (osm_madw_t *) cl_qlist_end(&p_cc->mad_queue)) + cc_poller_send(p_cc, p_madw); + else + cl_event_wait_on(&p_cc->cc_poller_wakeup, + EVENT_NO_TIMEOUT, TRUE); + } + + OSM_LOG_EXIT(p_cc->log); +} + +ib_api_status_t 
osm_congestion_control_init(osm_congestion_control_t * p_cc, + struct osm_opensm *p_osm, + const osm_subn_opt_t * p_opt) +{ + ib_api_status_t status = IB_SUCCESS; + /* Zeroes *p_cc, points it at p_osm's subnet, SM, log, MAD pool and vendor, registers cc_rcv_mad with the dispatcher, initializes the queue, lock and events, and starts the poller thread. Returns IB_SUCCESS or the status of the first step that failed. */ + OSM_LOG_ENTER(&p_osm->log); + + memset(p_cc, 0, sizeof(*p_cc)); + + p_cc->osm = p_osm; + p_cc->subn = &p_osm->subn; + p_cc->sm = &p_osm->sm; + p_cc->log = &p_osm->log; + p_cc->mad_pool = &p_osm->mad_pool; + p_cc->trans_id = CONGESTION_CONTROL_INITIAL_TID_VALUE; + p_cc->vendor = p_osm->p_vendor; + + p_cc->cc_disp_h = cl_disp_register(&p_osm->disp, OSM_MSG_MAD_CC, + cc_rcv_mad, p_cc); + if (p_cc->cc_disp_h == CL_DISP_INVALID_HANDLE) { + /* status is still IB_SUCCESS here; report the failed registration */ + status = IB_ERROR; + goto Exit; + } + + cl_qlist_init(&p_cc->mad_queue); + + status = cl_spinlock_init(&p_cc->mad_queue_lock); + if (status != IB_SUCCESS) + goto Exit; + + cl_event_construct(&p_cc->cc_poller_wakeup); + status = cl_event_init(&p_cc->cc_poller_wakeup, FALSE); + if (status != IB_SUCCESS) + goto Exit; + + cl_event_construct(&p_cc->outstanding_mads_done_event); + status = cl_event_init(&p_cc->outstanding_mads_done_event, FALSE); + if (status != IB_SUCCESS) + goto Exit; + + cl_event_construct(&p_cc->sig_mads_on_wire_continue); + status = cl_event_init(&p_cc->sig_mads_on_wire_continue, FALSE); + if (status != IB_SUCCESS) + goto Exit; + + p_cc->thread_state = OSM_THREAD_STATE_NONE; + + status = cl_thread_init(&p_cc->cc_poller, cc_poller, p_cc, + "cc poller"); + if (status != IB_SUCCESS) + goto Exit; + + status = IB_SUCCESS; +Exit: + OSM_LOG_EXIT(p_cc->log); + return status; +} + /* Vendor receive callback: copies the request's context onto the response, returns the request MAD to the pool and posts the response to the dispatcher as OSM_MSG_MAD_CC. */ +static void cc_mad_recv_callback(osm_madw_t * p_madw, void *bind_context, + osm_madw_t * p_req_madw) +{ + osm_congestion_control_t *p_cc = bind_context; + + OSM_LOG_ENTER(p_cc->log); + + osm_madw_copy_context(p_madw, p_req_madw); + osm_mad_pool_put(p_cc->mad_pool, p_req_madw); + + /* Do not decrement outstanding mads here, do it in the dispatcher */ + + if (cl_disp_post(p_cc->cc_disp_h, OSM_MSG_MAD_CC, + p_madw, NULL, NULL) != CL_SUCCESS) { + OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C105: " + "Congestion Control 
Dispatcher post failed\n"); + osm_mad_pool_put(p_cc->mad_pool, p_madw); + /* the dispatcher handler will never see this MAD, so release its outstanding count here */ + decrement_outstanding_mads(p_cc); + } + + OSM_LOG_EXIT(p_cc->log); +} + /* Vendor send-error callback: logs ERR C106, flags subnet_initialization_error, returns the MAD to the pool and releases its outstanding count. */ +static void cc_mad_send_err_callback(void *bind_context, + osm_madw_t * p_madw) +{ + osm_congestion_control_t *p_cc = bind_context; + osm_madw_context_t *p_madw_context = &p_madw->context; + uint64_t node_guid = p_madw_context->cc_context.node_guid; + uint8_t port = p_madw_context->cc_context.port; + + OSM_LOG_ENTER(p_cc->log); + + OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C106: MAD Error (%s): " + "attr id = %u LID %u GUID 0x%016" PRIx64 " port %u " + "TID 0x%" PRIx64 "\n", + ib_get_err_str(p_madw->status), + p_madw->p_mad->attr_id, + cl_ntoh16(p_madw->mad_addr.dest_lid), + node_guid, + port, + cl_ntoh64(p_madw->p_mad->trans_id)); + + p_cc->subn->subnet_initialization_error = TRUE; + + osm_mad_pool_put(p_cc->mad_pool, p_madw); + + decrement_outstanding_mads(p_cc); + + OSM_LOG_EXIT(p_cc->log); +} + /* Bind to the IB_MCLASS_CC management class on port_guid with the two callbacks above; returns IB_ERROR if the vendor layer gives back an invalid handle. */ +ib_api_status_t osm_congestion_control_bind(osm_congestion_control_t * p_cc, + ib_net64_t port_guid) +{ + osm_bind_info_t bind_info; + ib_api_status_t status = IB_SUCCESS; + + OSM_LOG_ENTER(p_cc->log); + + bind_info.port_guid = p_cc->port_guid = port_guid; + bind_info.mad_class = IB_MCLASS_CC; + bind_info.class_version = 2; + bind_info.is_responder = FALSE; + bind_info.is_report_processor = FALSE; + bind_info.is_trap_processor = FALSE; + bind_info.recv_q_size = OSM_SM_DEFAULT_QP1_RCV_SIZE; + bind_info.send_q_size = OSM_SM_DEFAULT_QP1_SEND_SIZE; + bind_info.timeout = p_cc->subn->opt.transaction_timeout; + bind_info.retries = p_cc->subn->opt.transaction_retries; + + OSM_LOG(p_cc->log, OSM_LOG_VERBOSE, + "Binding to port GUID 0x%" PRIx64 "\n", cl_ntoh64(port_guid)); + + p_cc->bind_handle = osm_vendor_bind(p_cc->vendor, &bind_info, + p_cc->mad_pool, + cc_mad_recv_callback, + cc_mad_send_err_callback, p_cc); + + if (p_cc->bind_handle == OSM_BIND_INVALID_HANDLE) { + status = IB_ERROR; + OSM_LOG(p_cc->log, OSM_LOG_ERROR, + "ERR C107: Vendor specific bind failed (%s)\n", 
+ ib_get_err_str(status)); + goto Exit; + } + +Exit: + OSM_LOG_EXIT(p_cc->log); + return status; +} + /* Unregister the CC dispatcher handle. If no bind handle was obtained, log ERR C108 and skip the unregister. */ +void osm_congestion_control_shutdown(osm_congestion_control_t * p_cc) +{ + OSM_LOG_ENTER(p_cc->log); + if (p_cc->bind_handle == OSM_BIND_INVALID_HANDLE) { + OSM_LOG(p_cc->log, OSM_LOG_ERROR, + "ERR C108: No previous bind\n"); + goto Exit; + } + cl_disp_unregister(p_cc->cc_disp_h); +Exit: + OSM_LOG_EXIT(p_cc->log); +} + /* Stop the poller thread, return MADs still on mad_queue to the pool, then destroy the queue lock and the three events. */ +void osm_congestion_control_destroy(osm_congestion_control_t * p_cc) +{ + osm_madw_t *p_madw; + + OSM_LOG_ENTER(p_cc->log); + /* the cc_poller loop only runs while thread_state is RUN */ + p_cc->thread_state = OSM_THREAD_STATE_EXIT; + /* signal both events the poller can be waiting on before destroying the thread */ + cl_event_signal(&p_cc->sig_mads_on_wire_continue); + cl_event_signal(&p_cc->cc_poller_wakeup); + + cl_thread_destroy(&p_cc->cc_poller); + /* drain MADs that were queued but never handed to cc_poller_send */ + cl_spinlock_acquire(&p_cc->mad_queue_lock); + + while (!cl_is_qlist_empty(&p_cc->mad_queue)) { + p_madw = (osm_madw_t *) cl_qlist_remove_head(&p_cc->mad_queue); + osm_mad_pool_put(p_cc->mad_pool, p_madw); + } + + cl_spinlock_release(&p_cc->mad_queue_lock); + + cl_spinlock_destroy(&p_cc->mad_queue_lock); + + cl_event_destroy(&p_cc->cc_poller_wakeup); + cl_event_destroy(&p_cc->outstanding_mads_done_event); + cl_event_destroy(&p_cc->sig_mads_on_wire_continue); + + OSM_LOG_EXIT(p_cc->log); +} diff --git a/opensm/osm_opensm.c b/opensm/osm_opensm.c index 429108a..c7328ef 100644 --- a/opensm/osm_opensm.c +++ b/opensm/osm_opensm.c @@ -61,6 +61,7 @@ #include #include #include +#include struct routing_engine_module { const char *name; @@ -291,6 +292,8 @@ void osm_opensm_destroy(IN osm_opensm_t * p_osm) osm_perfmgr_shutdown(&p_osm->perfmgr); #endif /* ENABLE_OSM_PERF_MGR */ + osm_congestion_control_shutdown(&p_osm->cc); + /* shut down the SA * - unbind from QP1 messages */ @@ -320,6 +323,7 @@ void osm_opensm_destroy(IN osm_opensm_t * p_osm) #ifdef ENABLE_OSM_PERF_MGR osm_perfmgr_destroy(&p_osm->perfmgr); #endif /* ENABLE_OSM_PERF_MGR */ + osm_congestion_control_destroy(&p_osm->cc); osm_db_destroy(&p_osm->db); osm_vl15_destroy(&p_osm->vl15, 
&p_osm->mad_pool); osm_mad_pool_destroy(&p_osm->mad_pool); @@ -464,6 +468,11 @@ ib_api_status_t osm_opensm_init(IN osm_opensm_t * p_osm, goto Exit; #endif /* ENABLE_OSM_PERF_MGR */ + status = osm_congestion_control_init(&p_osm->cc, + p_osm, p_opt); + if (status != IB_SUCCESS) + goto Exit; + p_osm->no_fallback_routing_engine = FALSE; setup_routing_engines(p_osm, p_opt->routing_engine_names); @@ -497,6 +506,10 @@ ib_api_status_t osm_opensm_bind(IN osm_opensm_t * p_osm, IN ib_net64_t guid) goto Exit; #endif /* ENABLE_OSM_PERF_MGR */ + status = osm_congestion_control_bind(&p_osm->cc, guid); + if (status != IB_SUCCESS) + goto Exit; + /* setting IS_SM in capability mask */ OSM_LOG(&p_osm->log, OSM_LOG_INFO, "Setting IS_SM on port 0x%016" PRIx64 "\n", cl_ntoh64(guid)); diff --git a/opensm/osm_state_mgr.c b/opensm/osm_state_mgr.c index 143b744..4d762a3 100644 --- a/opensm/osm_state_mgr.c +++ b/opensm/osm_state_mgr.c @@ -66,6 +66,7 @@ #include #include #include +#include extern void osm_drop_mgr_process(IN osm_sm_t * sm); extern int osm_qos_setup(IN osm_opensm_t * p_osm); @@ -1156,6 +1157,11 @@ static void do_sweep(osm_sm_t * sm) if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats)) return; + osm_congestion_control_setup(sm->p_subn->p_osm); + + if (osm_congestion_control_wait_pending_transactions (sm->p_subn->p_osm)) + return; + if (!sm->p_subn->subnet_initialization_error) { OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, "REROUTE COMPLETE"); @@ -1401,6 +1407,13 @@ repeat_discovery: * The sweep completed! 
*/ + + /* Now do GSI configuration */ + + osm_congestion_control_setup(sm->p_subn->p_osm); + + if (osm_congestion_control_wait_pending_transactions (sm->p_subn->p_osm)) + return; + /* * Send trap 64 on newly discovered endports */ diff --git a/opensm/osm_subnet.c b/opensm/osm_subnet.c index ccaa47c..3584caa 100644 --- a/opensm/osm_subnet.c +++ b/opensm/osm_subnet.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -300,6 +301,22 @@ static void opts_parse_uint32(IN osm_subn_t *p_subn, IN char *p_key, } } +static void opts_parse_net32(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + uint32_t *p_val1 = p_v1, *p_val2 = p_v2; + uint32_t val = strtoul(p_val_str, NULL, 0); + /* the option is stored via cl_hton32; log, notify pfn and update both copies only when the parsed value differs */ + if (cl_hton32(val) != *p_val1) { + log_config_value(p_key, "%u", val); + if (pfn) + pfn(p_subn, &val); + *p_val1 = *p_val2 = cl_hton32(val); + } +} + + static void opts_parse_int32(IN osm_subn_t *p_subn, IN char *p_key, IN char *p_val_str, void *p_v1, void *p_v2, void (*pfn)(osm_subn_t *, void *)) @@ -405,6 +422,274 @@ static void opts_parse_charp(IN osm_subn_t *p_subn, IN char *p_key, } } +static void opts_parse_256bit(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + uint8_t *p_val1 = p_v1, *p_val2 = p_v2; + uint8_t val[IB_CC_PORT_MASK_DATA_SIZE] = { 0 }; + char tmpbuf[3] = { 0 }; + uint8_t tmpint; + int numdigits = 0; + int startindex; + char *strptr = p_val_str; + char *ptr; + int i; + + /* parse like it's hypothetically a 256 bit integer code + * + * store "big endian" + */ + + if (!strncmp(strptr, "0x", 2) || !strncmp(strptr, "0X", 2)) + strptr+=2; + /* count the digits; <ctype.h> functions need an unsigned char value, so cast before testing */ + for (ptr = strptr; *ptr; ptr++) { + if (!isxdigit((unsigned char)*ptr)) { + log_report("invalid hex digit in bitmask\n"); + return; + } + numdigits++; + } + + if (!numdigits) { + log_report("invalid length bitmask\n"); + return; + } + + /* max of 2 hex chars per byte */ + if 
(numdigits > IB_CC_PORT_MASK_DATA_SIZE * 2) + numdigits = IB_CC_PORT_MASK_DATA_SIZE * 2; + /* right-align the digits so the last byte pair lands in the last element of val */ + startindex = IB_CC_PORT_MASK_DATA_SIZE - ((numdigits - 1) / 2) - 1; + /* an odd digit count means the leading byte is written from a single hex digit */ + if (numdigits % 2) { + memcpy(tmpbuf, strptr, 1); + strptr += 1; + } + else { + memcpy(tmpbuf, strptr, 2); + strptr += 2; + } + + tmpint = strtoul(tmpbuf, NULL, 16); + val[startindex] = tmpint; + /* remaining bytes are always two digits each */ + for (i = (startindex + 1); i < IB_CC_PORT_MASK_DATA_SIZE; i++) { + memcpy(tmpbuf, strptr, 2); + strptr += 2; + tmpint = strtoul(tmpbuf, NULL, 16); + val[i] = tmpint; + } + /* log, notify pfn and update both copies only when the mask changed */ + if (memcmp(val, p_val1, IB_CC_PORT_MASK_DATA_SIZE)) { + log_config_value(p_key, "%s", p_val_str); + if (pfn) + pfn(p_subn, val); + memcpy(p_val1, val, IB_CC_PORT_MASK_DATA_SIZE); + memcpy(p_val2, val, IB_CC_PORT_MASK_DATA_SIZE); + } + +} + /* Parse a single "shift:multiplier" value into an osm_cct_entry_t option; reports an error and leaves the option unchanged when the colon is missing. */ +static void opts_parse_cct_entry(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + osm_cct_entry_t *p_cct1 = p_v1, *p_cct2 = p_v2; + osm_cct_entry_t cct; + char buf[512] = { 0 }; + char *ptr; + /* work on a bounded copy; buf is zero-filled and at most 511 bytes are copied, so it stays terminated */ + strncpy(buf, p_val_str, 511); + + if (!(ptr = strchr(buf, ':'))) { + log_report("invalid CCT entry\n"); + return; + } + /* split at the colon: buf holds the shift, ptr the multiplier */ + *ptr = '\0'; + ptr++; + + cct.shift = strtoul(buf, NULL, 0); + cct.multiplier = strtoul(ptr, NULL, 0); + + if (cct.shift != p_cct1->shift + || cct.multiplier != p_cct1->multiplier) { + log_config_value(p_key, "%s", p_val_str); + if (pfn) + pfn(p_subn, &cct); + p_cct1->shift = p_cct2->shift = cct.shift; + p_cct1->multiplier = p_cct2->multiplier = cct.multiplier; + } +} + /* Parse a comma separated list of shift:multiplier pairs into an osm_cct_t option; the null_str value clears the table. */ +static void opts_parse_cc_cct(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + osm_cct_t *p_val1 = p_v1, *p_val2 = p_v2; + const char *current_str = p_val1->input_str ? 
p_val1->input_str : null_str; + /* nothing to do when the value text is identical to the stored input string */ + if (p_val_str && strcmp(p_val_str, current_str)) { + osm_cct_t newcct; + char *new; + unsigned int len = 0; + char *lasts; + char *tok; + char *ptr; + + /* special case the "(null)" string */ + new = strcmp(null_str, p_val_str) ? strdup(p_val_str) : NULL; + + if (!new) { + log_config_value(p_key, "%s", p_val_str); + if (pfn) + pfn(p_subn, NULL); + /* release the previous input string (both options may share one allocation) before clearing the pointers */ + if (p_val1->input_str && p_val1->input_str != p_val2->input_str) + free(p_val1->input_str); + free(p_val2->input_str); + memset(p_val1->entries, '\0', sizeof(p_val1->entries)); + memset(p_val2->entries, '\0', sizeof(p_val2->entries)); + p_val1->entries_len = p_val2->entries_len = 0; + p_val1->input_str = p_val2->input_str = NULL; + return; + } + + memset(&newcct, '\0', sizeof(newcct)); + /* tokenize the scratch copy; at most OSM_CCT_ENTRY_MAX entries are taken */ + tok = strtok_r(new, ",", &lasts); + while (tok && len < OSM_CCT_ENTRY_MAX) { + + if (!(ptr = strchr(tok, ':'))) { + log_report("invalid CCT entry\n"); + free(new); + return; + } + *ptr = '\0'; + ptr++; + + newcct.entries[len].shift = strtoul(tok, NULL, 0); + newcct.entries[len].multiplier = strtoul(ptr, NULL, 0); + len++; + tok = strtok_r(NULL, ",", &lasts); + } + + free(new); + + newcct.entries_len = len; + newcct.input_str = strdup(p_val_str); + + log_config_value(p_key, "%s", p_val_str); + if (pfn) + pfn(p_subn, &newcct); + if (p_val1->input_str && p_val1->input_str != p_val2->input_str) + free(p_val1->input_str); + if (p_val2->input_str) + free(p_val2->input_str); + memcpy(p_val1->entries, newcct.entries, sizeof(newcct.entries)); + memcpy(p_val2->entries, newcct.entries, sizeof(newcct.entries)); + p_val1->entries_len = p_val2->entries_len = newcct.entries_len; + p_val1->input_str = p_val2->input_str = newcct.input_str; + } +} + /* Split "<SL> <value>" on spaces/tabs. On success stores the SL in *sl and the offset of the value text within p_val_str in *val_offset and returns 0; returns -1 for null_str, an allocation failure, a missing value, or an SL >= IB_CA_CONG_ENTRY_DATA_SIZE. */ +static int parse_ca_cong_common(char *p_val_str, uint8_t *sl, unsigned int *val_offset) { + char *new, *lasts, *sl_str, *val_str; + unsigned long sltmp; + + new = strcmp(null_str, p_val_str) ? 
strdup(p_val_str) : NULL; + if (!new) + return -1; + + sl_str = strtok_r(new, " \t", &lasts); + val_str = strtok_r(NULL, " \t", &lasts); + + if (!val_str) { + log_report("value must be specified in addition to SL\n"); + free(new); + return -1; + } + /* range-check the full strtoul result before narrowing, so values such as 256 are rejected instead of wrapping to a valid SL */ + sltmp = strtoul(sl_str, NULL, 0); + if (sltmp >= IB_CA_CONG_ENTRY_DATA_SIZE) { + log_report("invalid SL specified\n"); + free(new); + return -1; + } + /* offsets in the scratch copy are the same as in p_val_str */ + *sl = (uint8_t)sltmp; + *val_offset = (unsigned int)(val_str - new); + + free(new); + return 0; +} + /* "<SL> <value>": parse value as a net16 ccti_timer for entry SL of the CA congestion entry array. */ +static void opts_parse_ccti_timer(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + osm_cacongestion_entry_t *p_val1 = p_v1, *p_val2 = p_v2; + unsigned int val_offset = 0; + uint8_t sl = 0; + + if (parse_ca_cong_common(p_val_str, &sl, &val_offset) < 0) + return; + + opts_parse_net16(p_subn, p_key, p_val_str + val_offset, + &p_val1[sl].ccti_timer, + &p_val2[sl].ccti_timer, + pfn); +} + /* "<SL> <value>": parse value as a uint8 ccti_increase for entry SL. */ +static void opts_parse_ccti_increase(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + osm_cacongestion_entry_t *p_val1 = p_v1, *p_val2 = p_v2; + unsigned int val_offset = 0; + uint8_t sl = 0; + + if (parse_ca_cong_common(p_val_str, &sl, &val_offset) < 0) + return; + + opts_parse_uint8(p_subn, p_key, p_val_str + val_offset, + &p_val1[sl].ccti_increase, + &p_val2[sl].ccti_increase, + pfn); +} + /* "<SL> <value>": parse value as a uint8 trigger_threshold for entry SL. */ +static void opts_parse_trigger_threshold(IN osm_subn_t *p_subn, IN char *p_key, + IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + osm_cacongestion_entry_t *p_val1 = p_v1, *p_val2 = p_v2; + unsigned int val_offset = 0; + uint8_t sl = 0; + + if (parse_ca_cong_common(p_val_str, &sl, &val_offset) < 0) + return; + + opts_parse_uint8(p_subn, p_key, p_val_str + val_offset, + &p_val1[sl].trigger_threshold, + &p_val2[sl].trigger_threshold, + pfn); +} + +static void opts_parse_ccti_min(IN osm_subn_t *p_subn, IN char *p_key, + 
IN char *p_val_str, void *p_v1, void *p_v2, + void (*pfn)(osm_subn_t *, void *)) +{ + osm_cacongestion_entry_t *p_val1 = p_v1, *p_val2 = p_v2; + unsigned int val_offset = 0; + uint8_t sl = 0; + + if (parse_ca_cong_common(p_val_str, &sl, &val_offset) < 0) + return; + + opts_parse_uint8(p_subn, p_key, p_val_str + val_offset, + &p_val1[sl].ccti_min, + &p_val2[sl].ccti_min, + pfn); +} + static const opt_rec_t opt_tbl[] = { { "guid", OPT_OFFSET(guid), opts_parse_net64, NULL, 0 }, { "m_key", OPT_OFFSET(m_key), opts_parse_net64, NULL, 1 }, @@ -524,6 +809,24 @@ static const opt_rec_t opt_tbl[] = { { "qos_rtr_vlarb_high", OPT_OFFSET(qos_rtr_options.vlarb_high), opts_parse_charp, NULL, 1 }, { "qos_rtr_vlarb_low", OPT_OFFSET(qos_rtr_options.vlarb_low), opts_parse_charp, NULL, 1 }, { "qos_rtr_sl2vl", OPT_OFFSET(qos_rtr_options.sl2vl), opts_parse_charp, NULL, 1 }, + { "congestion_control", OPT_OFFSET(congestion_control), opts_parse_boolean, NULL, 1 }, + { "cc_key", OPT_OFFSET(cc_key), opts_parse_net64, NULL, 0}, + { "cc_max_outstanding_mads", OPT_OFFSET(cc_max_outstanding_mads), opts_parse_uint32, NULL, 0 }, + { "cc_sw_cong_setting_control_map", OPT_OFFSET(cc_sw_cong_setting_control_map), opts_parse_net32, NULL, 1}, + { "cc_sw_cong_setting_victim_mask", OPT_OFFSET(cc_sw_cong_setting_victim_mask), opts_parse_256bit, NULL, 1}, + { "cc_sw_cong_setting_credit_mask", OPT_OFFSET(cc_sw_cong_setting_credit_mask), opts_parse_256bit, NULL, 1}, + { "cc_sw_cong_setting_threshold", OPT_OFFSET(cc_sw_cong_setting_threshold), opts_parse_uint8, NULL, 1}, + { "cc_sw_cong_setting_packet_size", OPT_OFFSET(cc_sw_cong_setting_packet_size), opts_parse_uint8, NULL, 1}, + { "cc_sw_cong_setting_credit_starvation_threshold", OPT_OFFSET(cc_sw_cong_setting_credit_starvation_threshold), opts_parse_uint8, NULL, 1}, + { "cc_sw_cong_setting_credit_starvation_return_delay", OPT_OFFSET(cc_sw_cong_setting_credit_starvation_return_delay), opts_parse_cct_entry, NULL, 1}, + { "cc_sw_cong_setting_marking_rate", 
OPT_OFFSET(cc_sw_cong_setting_marking_rate), opts_parse_net16, NULL, 1}, + { "cc_ca_cong_setting_port_control", OPT_OFFSET(cc_ca_cong_setting_port_control), opts_parse_net16, NULL, 1}, + { "cc_ca_cong_setting_control_map", OPT_OFFSET(cc_ca_cong_setting_control_map), opts_parse_net16, NULL, 1}, + { "cc_ca_cong_setting_ccti_timer", OPT_OFFSET(cc_ca_cong_entries), opts_parse_ccti_timer, NULL, 1}, + { "cc_ca_cong_setting_ccti_increase", OPT_OFFSET(cc_ca_cong_entries), opts_parse_ccti_increase, NULL, 1}, + { "cc_ca_cong_setting_trigger_threshold", OPT_OFFSET(cc_ca_cong_entries), opts_parse_trigger_threshold, NULL, 1}, + { "cc_ca_cong_setting_ccti_min", OPT_OFFSET(cc_ca_cong_entries), opts_parse_ccti_min, NULL, 1}, + { "cc_cct", OPT_OFFSET(cc_cct), opts_parse_cc_cct, NULL, 1}, { "enable_quirks", OPT_OFFSET(enable_quirks), opts_parse_boolean, NULL, 1 }, { "no_clients_rereg", OPT_OFFSET(no_clients_rereg), opts_parse_boolean, NULL, 1 }, { "prefix_routes_file", OPT_OFFSET(prefix_routes_file), opts_parse_charp, NULL, 0 }, @@ -601,6 +904,7 @@ static void subn_opt_destroy(IN osm_subn_opt_t * p_opt) subn_destroy_qos_options(&p_opt->qos_sw0_options); subn_destroy_qos_options(&p_opt->qos_swe_options); subn_destroy_qos_options(&p_opt->qos_rtr_options); + free(p_opt->cc_cct.input_str); } void osm_subn_destroy(IN osm_subn_t * p_subn) @@ -1033,6 +1337,9 @@ void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt) p_opt->torus_conf_file = strdup(OSM_DEFAULT_TORUS_CONF_FILE); p_opt->do_mesh_analysis = FALSE; p_opt->exit_on_fatal = TRUE; + p_opt->congestion_control = FALSE; + p_opt->cc_key = OSM_DEFAULT_CC_KEY; + p_opt->cc_max_outstanding_mads = OSM_PERFMGR_DEFAULT_MAX_OUTSTANDING_QUERIES; p_opt->enable_quirks = FALSE; p_opt->no_clients_rereg = FALSE; p_opt->prefix_routes_file = strdup(OSM_DEFAULT_PREFIX_ROUTES_FILE); @@ -1047,6 +1354,8 @@ void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt) subn_init_qos_options(&p_opt->qos_sw0_options, NULL); 
subn_init_qos_options(&p_opt->qos_swe_options, NULL); subn_init_qos_options(&p_opt->qos_rtr_options, NULL); + p_opt->cc_cct.entries_len = 0; + p_opt->cc_cct.input_str = NULL; } static char *clean_val(char *val) @@ -1674,6 +1983,9 @@ int osm_subn_rescan_conf_files(IN osm_subn_t * p_subn) int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts) { + int cacongoutputcount = 0; + int i; + fprintf(out, "#\n# DEVICE ATTRIBUTES OPTIONS\n#\n" "# The port GUID on which the OpenSM is running\n" @@ -2138,6 +2450,164 @@ int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts) fprintf(out, "\n"); fprintf(out, + "#\n# Congestion Control OPTIONS (EXPERIMENTAL)\n#\n\n" + "# Enable Congestion Control Configuration\n" + "congestion_control %s\n\n" + "# CCKey to use when configuring congestion control\n" + "# note that this does not configure a new CCkey, only the CCkey to use\n" + "cc_key 0x%016" PRIx64 "\n\n" + "# Congestion Control Max outstanding MAD\n" + "cc_max_outstanding_mads %u\n\n", + p_opts->congestion_control ? 
"TRUE" : "FALSE", + cl_ntoh64(p_opts->cc_key), + p_opts->cc_max_outstanding_mads); + + fprintf(out, + "#\n# Congestion Control SwitchCongestionSetting options\n#\n" + "# Control Map - bitmask indicating which of the following attributes are to be used\n" + "# bit 0 - victim mask\n" + "# bit 1 - credit mask\n" + "# bit 2 - threshold + packet size\n" + "# bit 3 - credit starvation threshold + return delay valid\n" + "# bit 4 - marking rate valid\n" + "cc_sw_cong_setting_control_map 0x%X\n\n", + cl_ntoh32(p_opts->cc_sw_cong_setting_control_map)); + + fprintf(out, + "# Victim Mask - 256 bit mask representing switch ports, mark packets with FECN\n" + "# whether they are the source or victim of congestion\n" + "# bit 0 - port 0 (enhanced port)\n" + "# bit 1 - port 1\n" + "# ...\n" + "# bit 254 - port 254\n" + "# bit 255 - reserved\n" + "cc_sw_cong_setting_victim_mask 0x"); + + for (i = 0; i < IB_CC_PORT_MASK_DATA_SIZE; i++) + fprintf(out, "%02X", p_opts->cc_sw_cong_setting_victim_mask[i]); + fprintf(out, "\n\n"); + + fprintf(out, + "# Credit Mask - 256 bit mask representing switch ports to apply credit starvation\n" + "# bit 0 - port 0 (enhanced port)\n" + "# bit 1 - port 1\n" + "# ...\n" + "# bit 254 - port 254\n" + "# bit 255 - reserved\n" + "cc_sw_cong_setting_credit_mask 0x"); + + for (i = 0; i < IB_CC_PORT_MASK_DATA_SIZE; i++) + fprintf(out, "%02X", p_opts->cc_sw_cong_setting_credit_mask[i]); + fprintf(out, "\n\n"); + + fprintf(out, + "# Threshold - value indicating aggressiveness of congestion marking\n" + "# 0x0 - none, 0x1 - loose, ..., 0xF - aggressive\n" + "cc_sw_cong_setting_threshold 0x%02X\n\n" + "# Packet Size - any packet less than this size will not be marked with a FECN\n" + "# units are in credits\n" + "cc_sw_cong_setting_packet_size %u\n\n" + "# Credit Starvation Threshold - value indicating aggressiveness of credit starvation\n" + "# 0x0 - none, 0x1 - loose, ..., 0xF - aggressive\n" + "cc_sw_cong_setting_credit_starvation_threshold 0x%02X\n\n" + "# 
Credit Starvation Return Delay - in CCT entry shift:multiplier format, see IB spec\n" + "cc_sw_cong_setting_credit_starvation_return_delay %u:%u\n\n" + "# Marking Rate - mean number of packets between markings\n" + "cc_sw_cong_setting_marking_rate %u\n\n", + p_opts->cc_sw_cong_setting_threshold, + p_opts->cc_sw_cong_setting_packet_size, + p_opts->cc_sw_cong_setting_credit_starvation_threshold, + p_opts->cc_sw_cong_setting_credit_starvation_return_delay.shift, + p_opts->cc_sw_cong_setting_credit_starvation_return_delay.multiplier, + cl_ntoh16(p_opts->cc_sw_cong_setting_marking_rate)); + + fprintf(out, + "#\n# Congestion Control CA Congestion Setting options\n#\n" + "# Port Control\n" + "# bit 0 = 0, QP based congestion control\n" + "# bit 0 = 1, SL/port based congestion control\n" + "cc_ca_cong_setting_port_control 0x%04X\n\n" + "# Control Map - 16 bit bitmask indicating which SLs should be configured\n" + "cc_ca_cong_setting_control_map 0x%04X\n\n", + cl_ntoh16(p_opts->cc_ca_cong_setting_port_control), + cl_ntoh16(p_opts->cc_ca_cong_setting_control_map)); + + fprintf(out, + "#\n# CA Congestion Setting Entries\n#\n" + "# Each of congestion control settings below configures the CA Congestion\n" + "# Settings for an individual SL. The SL must be specified before the value.\n" + "# These options may be specified multiple times to configure different values\n" + "# for different SLs.\n" + "#\n" + "# ccti timer - when expires decrements 1 from the CCTI\n" + "# ccti increase - number to be added to the table index on receipt of a BECN\n" + "# trigger threshold - when the ccti is equal to this, an event is logged\n" + "# ccti min - the minimum value for the ccti. 
This imposes a minimum rate\n" + "# on the injection rate\n\n"); + /* one chunk per SL that has at least one non-zero setting */ + for (i = 0; i < IB_CA_CONG_ENTRY_DATA_SIZE; i++) { + /* Don't output unless one of the settings has been set, there's no need + * to output 16 chunks of this with all defaults of 0 */ + if (p_opts->cc_ca_cong_entries[i].ccti_timer + || p_opts->cc_ca_cong_entries[i].ccti_increase + || p_opts->cc_ca_cong_entries[i].trigger_threshold + || p_opts->cc_ca_cong_entries[i].ccti_min) { + fprintf(out, + "# SL = %u\n" + "cc_ca_cong_setting_ccti_timer %u %u\n" + "cc_ca_cong_setting_ccti_increase %u %u\n" + "cc_ca_cong_setting_trigger_threshold %u %u\n" + "cc_ca_cong_setting_ccti_min %u %u\n\n", + i, + i, + cl_ntoh16(p_opts->cc_ca_cong_entries[i].ccti_timer), + i, + p_opts->cc_ca_cong_entries[i].ccti_increase, + i, + p_opts->cc_ca_cong_entries[i].trigger_threshold, + i, + p_opts->cc_ca_cong_entries[i].ccti_min); + cacongoutputcount++; + } + } + + /* If by chance all the CA Cong Settings are default, output at least 1 chunk + * for illustration */ + if (!cacongoutputcount) + fprintf(out, + "# SL = 0\n" + "cc_ca_cong_setting_ccti_timer 0 %u\n" + "cc_ca_cong_setting_ccti_increase 0 %u\n" + "cc_ca_cong_setting_trigger_threshold 0 %u\n" + "cc_ca_cong_setting_ccti_min 0 %u\n\n", + cl_ntoh16(p_opts->cc_ca_cong_entries[0].ccti_timer), + p_opts->cc_ca_cong_entries[0].ccti_increase, + p_opts->cc_ca_cong_entries[0].trigger_threshold, + p_opts->cc_ca_cong_entries[0].ccti_min); + + fprintf(out, + "#\n# Congestion Control Table\n#\n" + "# Comma separated list of CCT entries representing CCT.\n" + "# Format is shift:multiplier,shift:multiplier,shift:multiplier,...\n" + "cc_cct "); + /* first entry is written without a separator; entries 1..entries_len-1 follow, each prefixed by a comma */ + if (!p_opts->cc_cct.entries_len) { + fprintf(out, "%s\n", null_str); + } + else { + fprintf(out, "%u:%u", + p_opts->cc_cct.entries[0].shift, + p_opts->cc_cct.entries[0].multiplier); + for (i = 1; i < p_opts->cc_cct.entries_len; i++) { + fprintf(out, ",%u:%u", + p_opts->cc_cct.entries[i].shift, + p_opts->cc_cct.entries[i].multiplier); + } + 
fprintf(out, "\n"); + } + fprintf(out, "\n"); + + fprintf(out, "# Prefix routes file name\n" "prefix_routes_file %s\n\n", p_opts->prefix_routes_file);