From patchwork Thu Jun 17 17:28:49 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hal Rosenstock X-Patchwork-Id: 106718 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o5HHWEvO022811 for ; Thu, 17 Jun 2010 17:32:53 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932996Ab0FQRcN (ORCPT ); Thu, 17 Jun 2010 13:32:13 -0400 Received: from qmta10.westchester.pa.mail.comcast.net ([76.96.62.17]:37113 "EHLO qmta10.westchester.pa.mail.comcast.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1760500Ab0FQRcM (ORCPT ); Thu, 17 Jun 2010 13:32:12 -0400 Received: from omta15.westchester.pa.mail.comcast.net ([76.96.62.87]) by qmta10.westchester.pa.mail.comcast.net with comcast id X39b1e0021swQuc5A5YCad; Thu, 17 Jun 2010 17:32:12 +0000 Received: from hal.comcast.net ([75.69.247.31]) by omta15.westchester.pa.mail.comcast.net with comcast id X5YB1e00N0hNrtn3b5YBWm; Thu, 17 Jun 2010 17:32:12 +0000 Received: from hal.comcast.net (localhost.localdomain [127.0.0.1]) by hal.comcast.net (8.14.3/8.14.3) with ESMTP id o5HHVHcB017288; Thu, 17 Jun 2010 13:31:22 -0400 Received: (from hnrose@localhost) by hal.comcast.net (8.14.3/8.14.3/Submit) id o5HHSnZB017249; Thu, 17 Jun 2010 13:28:49 -0400 Date: Thu, 17 Jun 2010 13:28:49 -0400 From: Hal Rosenstock To: sashak@voltaire.com Cc: linux-rdma@vger.kernel.org Subject: [PATCHv5 1/2][RESEND] opensm/PerfMgr: Better redirection support Message-ID: <20100617172849.GA17245@comcast.net> MIME-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.5.19 (2009-01-05) Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Thu, 17 Jun 2010 17:32:53 +0000 (UTC) diff --git a/opensm/include/opensm/osm_perfmgr.h b/opensm/include/opensm/osm_perfmgr.h index c26c141..34925e8 100644 --- a/opensm/include/opensm/osm_perfmgr.h +++ b/opensm/include/opensm/osm_perfmgr.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2007 The Regents of the University of California. * Copyright (c) 2007-2009 Voltaire, Inc. All rights reserved. - * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2009,2010 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -90,11 +90,17 @@ typedef enum { PERFMGR_SWEEP_SUSPENDED } osm_perfmgr_sweep_state_t; -/* Redirection information */ -typedef struct redir { - ib_net16_t redir_lid; - ib_net32_t redir_qp; -} redir_t; +typedef struct monitored_port { + uint16_t pkey_ix; + ib_net16_t orig_lid; + boolean_t redirection; + boolean_t valid; + /* Redirection fields from ClassPortInfo */ + ib_gid_t gid; + ib_net16_t lid; + ib_net16_t pkey; + ib_net32_t qp; +} monitored_port_t; /* Node to store information about nodes being monitored */ typedef struct monitored_node { @@ -104,7 +110,7 @@ typedef struct monitored_node { boolean_t esp0; char *name; uint32_t num_ports; - redir_t redir_port[1]; /* redirection on a per port basis */ + monitored_port_t port[1]; } monitored_node_t; struct osm_opensm; @@ -134,6 +140,8 @@ typedef struct osm_perfmgr { uint32_t max_outstanding_queries; cl_qmap_t monitored_map; /* map the nodes being tracked */ monitored_node_t *remove_list; + ib_net64_t port_guid; + int16_t local_port; } osm_perfmgr_t; /* * FIELDS diff --git a/opensm/opensm/osm_perfmgr.c b/opensm/opensm/osm_perfmgr.c index 398b463..fccf9d6 100644 --- a/opensm/opensm/osm_perfmgr.c +++ b/opensm/opensm/osm_perfmgr.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007 The Regents of the University of California. * Copyright (c) 2007-2009 Voltaire, Inc. All rights reserved. - * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2009,2010 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -64,6 +64,7 @@ #include #include #include +#include #define PERFMGR_INITIAL_TID_VALUE 0xcafe @@ -194,6 +195,7 @@ static void perfmgr_mad_send_err_callback(void *bind_context, uint8_t port = context->perfmgr_context.port; cl_map_item_t *p_node; monitored_node_t *p_mon_node; + ib_net16_t orig_lid; OSM_LOG_ENTER(pm->log); @@ -225,9 +227,11 @@ static void perfmgr_mad_send_err_callback(void *bind_context, p_mon_node->num_ports); goto Exit; } - /* Clear redirection info */ - p_mon_node->redir_port[port].redir_lid = 0; - p_mon_node->redir_port[port].redir_qp = 0; + /* Clear redirection info for this port except orig_lid */ + orig_lid = p_mon_node->port[port].orig_lid; + memset(&p_mon_node->port[port], 0, sizeof(monitored_port_t)); + p_mon_node->port[port].orig_lid = orig_lid; + p_mon_node->port[port].valid = TRUE; cl_plock_release(&pm->osm->lock); } @@ -256,7 +260,7 @@ ib_api_status_t osm_perfmgr_bind(osm_perfmgr_t * pm, ib_net64_t port_guid) goto Exit; } - bind_info.port_guid = port_guid; + bind_info.port_guid = pm->port_guid = port_guid; bind_info.mad_class = IB_MCLASS_PERF; bind_info.class_version = 1; bind_info.is_responder = FALSE; @@ -309,24 +313,14 @@ static ib_net32_t get_qp(monitored_node_t * mon_node, uint8_t port) ib_net32_t qp = IB_QP1; if (mon_node && mon_node->num_ports && port < mon_node->num_ports && - mon_node->redir_port[port].redir_lid && - mon_node->redir_port[port].redir_qp) - qp = mon_node->redir_port[port].redir_qp; + mon_node->port[port].redirection && mon_node->port[port].qp) + qp = mon_node->port[port].qp; return qp; } -/********************************************************************** - * Given a node, a port, and an optional monitored node, - * return the appropriate lid to query that port - **********************************************************************/ -static ib_net16_t get_lid(osm_node_t * p_node, uint8_t port, - monitored_node_t * mon_node) +static ib_net16_t get_base_lid(osm_node_t * p_node, uint8_t port) { - if (mon_node && mon_node->num_ports && port < mon_node->num_ports && - mon_node->redir_port[port].redir_lid) - return mon_node->redir_port[port].redir_lid; - switch (p_node->node_info.node_type) { case IB_NODE_TYPE_CA: case IB_NODE_TYPE_ROUTER: @@ -339,12 +333,26 @@ static ib_net16_t get_lid(osm_node_t * p_node, uint8_t port, } /********************************************************************** + * Given a node, a port, and an optional monitored node, + * return the lid appropriate to query that port + **********************************************************************/ +static ib_net16_t get_lid(osm_node_t * p_node, uint8_t port, + monitored_node_t * mon_node) +{ + if (mon_node && mon_node->num_ports && port < mon_node->num_ports && + mon_node->port[port].lid) + return mon_node->port[port].lid; + + return get_base_lid(p_node, port); +} + +/********************************************************************** * Form and send the Port Counters MAD for a single port. **********************************************************************/ static ib_api_status_t perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr, ib_net16_t dest_lid, - ib_net32_t dest_qp, uint8_t port, - uint8_t mad_method, + ib_net32_t dest_qp, uint16_t pkey_ix, + uint8_t port, uint8_t mad_method, osm_madw_context_t * p_context) { ib_api_status_t status = IB_SUCCESS; @@ -383,8 +391,7 @@ static ib_api_status_t perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr, p_madw->mad_addr.addr_type.gsi.remote_qp = dest_qp; p_madw->mad_addr.addr_type.gsi.remote_qkey = cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY); - /* FIXME what about other partitions */ - p_madw->mad_addr.addr_type.gsi.pkey_ix = 0; + p_madw->mad_addr.addr_type.gsi.pkey_ix = pkey_ix; p_madw->mad_addr.addr_type.gsi.service_level = 0; p_madw->mad_addr.addr_type.gsi.global_route = FALSE; p_madw->resp_expected = TRUE; @@ -420,6 +427,7 @@ static void collect_guids(cl_map_item_t * p_map_item, void *context) osm_perfmgr_t *pm = (osm_perfmgr_t *) context; monitored_node_t *mon_node = NULL; uint32_t num_ports; + int port; OSM_LOG_ENTER(pm->log); @@ -428,7 +436,7 @@ static void collect_guids(cl_map_item_t * p_map_item, void *context) /* if not already in map add it */ num_ports = osm_node_get_num_physp(node); mon_node = malloc(sizeof(*mon_node) + - sizeof(redir_t) * num_ports); + sizeof(monitored_port_t) * num_ports); if (!mon_node) { OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C06: " "malloc failed: not handling node %s" @@ -437,7 +445,7 @@ static void collect_guids(cl_map_item_t * p_map_item, void *context) goto Exit; } memset(mon_node, 0, - sizeof(*mon_node) + sizeof(redir_t) * num_ports); + sizeof(*mon_node) + sizeof(monitored_port_t) * num_ports); mon_node->guid = node_guid; mon_node->name = strdup(node->print_desc); mon_node->num_ports = num_ports; @@ -445,6 +453,11 @@ static void collect_guids(cl_map_item_t * p_map_item, void *context) mon_node->esp0 = (node->sw && ib_switch_info_is_enhanced_port0(&node->sw-> switch_info)); + for (port = mon_node->esp0 ? 0 : 1; port < num_ports; port++) { + mon_node->port[port].orig_lid = get_base_lid(node, port); + mon_node->port[port].valid = TRUE; + } + cl_qmap_insert(&pm->monitored_map, node_guid, (cl_map_item_t *) mon_node); } @@ -501,6 +514,9 @@ static void perfmgr_query_counters(cl_map_item_t * p_map_item, void *context) if (!osm_node_get_physp_ptr(node, port)) continue; + if (!mon_node->port[port].valid) + continue; + lid = get_lid(node, port, mon_node); if (lid == 0) { OSM_LOG(pm->log, OSM_LOG_DEBUG, "WARN: node 0x%" PRIx64 @@ -521,8 +537,10 @@ static void perfmgr_query_counters(cl_map_item_t * p_map_item, void *context) OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Getting stats for node 0x%" PRIx64 " port %d (lid %u) (%s)\n", node_guid, port, cl_ntoh16(lid), node->print_desc); - status = perfmgr_send_pc_mad(pm, lid, remote_qp, port, - IB_MAD_METHOD_GET, &mad_context); + status = perfmgr_send_pc_mad(pm, lid, remote_qp, + mon_node->port[port].pkey_ix, + port, IB_MAD_METHOD_GET, + &mad_context); if (status != IB_SUCCESS) OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C09: " "Failed to issue port counter query for node 0x%" @@ -769,6 +787,24 @@ void osm_perfmgr_process(osm_perfmgr_t * pm) pm->subn->sm_state == IB_SMINFO_STATE_NOTACTIVE) perfmgr_discovery(pm->subn->p_osm); + /* if redirection enabled, determine local port */ + if (pm->subn->opt.perfmgr_redir && pm->local_port == -1) { + osm_node_t *p_node; + osm_port_t *p_port; + + CL_PLOCK_ACQUIRE(pm->sm->p_lock); + p_port = osm_get_port_by_guid(pm->subn, pm->port_guid); + if (p_port) { + p_node = p_port->p_node; + CL_ASSERT(p_node); + pm->local_port = + ib_node_info_get_local_port_num(&p_node->node_info); + } else + OSM_LOG(pm->log, OSM_LOG_ERROR, + "ERR 4C87: No PerfMgr port object\n"); + CL_PLOCK_RELEASE(pm->sm->p_lock); + } + #if ENABLE_OSM_PERF_MGR_PROFILE gettimeofday(&before, NULL); #endif @@ -932,8 +968,8 @@ static int counter_overflow_32(ib_net32_t val) * MAD to the port. **********************************************************************/ static void perfmgr_check_overflow(osm_perfmgr_t * pm, - monitored_node_t * mon_node, uint8_t port, - ib_port_counters_t * pc) + monitored_node_t * mon_node, int16_t pkey_ix, + uint8_t port, ib_port_counters_t * pc) { osm_madw_context_t mad_context; ib_api_status_t status; @@ -960,6 +996,9 @@ static void perfmgr_check_overflow(osm_perfmgr_t * pm, osm_node_t *p_node = NULL; ib_net16_t lid = 0; + if (!mon_node->port[port].valid) + goto Exit; + osm_log(pm->log, OSM_LOG_VERBOSE, "PerfMgr: Counter overflow: %s (0x%" PRIx64 ") port %d; clearing counters\n", @@ -984,8 +1023,9 @@ static void perfmgr_check_overflow(osm_perfmgr_t * pm, mad_context.perfmgr_context.port = port; mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET; /* clear port counters */ - status = perfmgr_send_pc_mad(pm, lid, remote_qp, port, - IB_MAD_METHOD_SET, &mad_context); + status = perfmgr_send_pc_mad(pm, lid, remote_qp, pkey_ix, + port, IB_MAD_METHOD_SET, + &mad_context); if (status != IB_SUCCESS) OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C11: " "Failed to send clear counters MAD for %s (0x%" @@ -1043,6 +1083,64 @@ static void perfmgr_log_events(osm_perfmgr_t * pm, time_diff, mon_node->name, mon_node->guid, port); } +static int16_t validate_redir_pkey(osm_perfmgr_t *pm, ib_net16_t pkey) +{ + int16_t pkey_ix = -1; + osm_port_t *p_port; + osm_pkey_tbl_t *p_pkey_tbl; + ib_net16_t *p_orig_pkey; + uint16_t block; + uint8_t index; + + OSM_LOG_ENTER(pm->log); + + CL_PLOCK_ACQUIRE(pm->sm->p_lock); + p_port = osm_get_port_by_guid(pm->subn, pm->port_guid); + if (!p_port) { + CL_PLOCK_RELEASE(pm->sm->p_lock); + OSM_LOG(pm->log, OSM_LOG_ERROR, + "ERR 4C1E: No PerfMgr port object\n"); + goto Exit; + } + if (p_port->p_physp && osm_physp_is_valid(p_port->p_physp)) { + p_pkey_tbl = &p_port->p_physp->pkeys; + if (!p_pkey_tbl) { + CL_PLOCK_RELEASE(pm->sm->p_lock); + OSM_LOG(pm->log, OSM_LOG_VERBOSE, + "No PKey table found for PerfMgr port\n"); + goto Exit; + } + p_orig_pkey = cl_map_get(&p_pkey_tbl->keys, + ib_pkey_get_base(pkey)); + if (!p_orig_pkey) { + CL_PLOCK_RELEASE(pm->sm->p_lock); + OSM_LOG(pm->log, OSM_LOG_VERBOSE, + "PKey 0x%x not found for PerfMgr port\n", + cl_ntoh16(pkey)); + goto Exit; + } + if (osm_pkey_tbl_get_block_and_idx(p_pkey_tbl, p_orig_pkey, + &block, &index) == IB_SUCCESS) { + CL_PLOCK_RELEASE(pm->sm->p_lock); + pkey_ix = block * IB_NUM_PKEY_ELEMENTS_IN_BLOCK + index; + } else { + CL_PLOCK_RELEASE(pm->sm->p_lock); + OSM_LOG(pm->log, OSM_LOG_ERROR, + "ERR 0x4C1F: Failed to obtain P_Key 0x%04x " + "block and index for PerfMgr port\n", + cl_ntoh16(pkey)); + } + } else { + CL_PLOCK_RELEASE(pm->sm->p_lock); + OSM_LOG(pm->log, OSM_LOG_ERROR, + "ERR 4C20: Local PerfMgt port physp invalid\n"); + } + +Exit: + OSM_LOG_EXIT(pm->log); + return pkey_ix; +} + /********************************************************************** * The dispatcher uses a thread pool which will call this function when * there is a thread available to process the mad received on the wire. @@ -1061,6 +1159,8 @@ static void pc_recv_process(void *context, void *data) perfmgr_db_data_cnt_reading_t data_reading; cl_map_item_t *p_node; monitored_node_t *p_mon_node; + int16_t pkey_ix = 0; + boolean_t valid = TRUE; OSM_LOG_ENTER(pm->log); @@ -1084,7 +1184,8 @@ static void pc_recv_process(void *context, void *data) p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO); /* Response could also be redirection (IBM eHCA PMA does this) */ - if (p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) { + if (p_mad->status & IB_MAD_STATUS_REDIRECT && + p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) { char gid_str[INET6_ADDRSTRLEN]; ib_class_port_info_t *cpi = (ib_class_port_info_t *) & @@ -1097,17 +1198,46 @@ static void pc_recv_process(void *context, void *data) inet_ntop(AF_INET6, cpi->redir_gid.raw, gid_str, sizeof gid_str), cl_ntoh32(cpi->redir_qp)); - /* LID or GID redirection ? */ - /* For GID redirection, need to get PathRecord from SA */ + if (!pm->subn->opt.perfmgr_redir) { + OSM_LOG(pm->log, OSM_LOG_VERBOSE, + "Redirection requested but disabled\n"); + valid = FALSE; + } + + /* valid redirection ? */ if (cpi->redir_lid == 0) { + if (!ib_gid_is_notzero(&cpi->redir_gid)) { + OSM_LOG(pm->log, OSM_LOG_VERBOSE, + "Invalid redirection " + "(both redirect LID and GID are zero)\n"); + valid = FALSE; + } + } + if (cpi->redir_qp == 0) { + OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Invalid RedirectQP\n"); + valid = FALSE; + } + if (cpi->redir_pkey == 0) { + OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Invalid RedirectP_Key\n"); + valid = FALSE; + } + if (cpi->redir_qkey != IB_QP1_WELL_KNOWN_Q_KEY) { + OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Invalid RedirectQ_Key\n"); + valid = FALSE; + } + + pkey_ix = validate_redir_pkey(pm, cpi->redir_pkey); + if (pkey_ix == -1) { OSM_LOG(pm->log, OSM_LOG_VERBOSE, - "GID redirection not currently implemented!\n"); - goto Exit; + "Index for Pkey 0x%x not found\n", + cl_ntoh16(cpi->redir_pkey)); + valid = FALSE; } - if (!pm->subn->opt.perfmgr_redir) { - OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: " - "redirection requested but disabled\n"); + if (cpi->redir_lid == 0) { + /* GID redirection: get PathRecord information */ + OSM_LOG(pm->log, OSM_LOG_VERBOSE, + "GID redirection not currently supported\n"); goto Exit; } @@ -1122,13 +1252,24 @@ static void pc_recv_process(void *context, void *data) p_mon_node->num_ports); goto Exit; } - p_mon_node->redir_port[port].redir_lid = cpi->redir_lid; - p_mon_node->redir_port[port].redir_qp = cpi->redir_qp; + p_mon_node->port[port].redirection = TRUE; + p_mon_node->port[port].valid = valid; + memcpy(&p_mon_node->port[port].gid, &cpi->redir_gid, + sizeof(ib_gid_t)); + p_mon_node->port[port].lid = cpi->redir_lid; + p_mon_node->port[port].qp = cpi->redir_qp; + p_mon_node->port[port].pkey = cpi->redir_pkey; + if (pkey_ix != -1) + p_mon_node->port[port].pkey_ix = pkey_ix; cl_plock_release(&pm->osm->lock); + if (!valid) + goto Exit; + /* Finally, reissue the query to the redirected location */ status = perfmgr_send_pc_mad(pm, cpi->redir_lid, cpi->redir_qp, - port, mad_context->perfmgr_context. + pkey_ix, port, + mad_context->perfmgr_context. mad_method, mad_context); if (status != IB_SUCCESS) OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C14: " @@ -1163,7 +1304,7 @@ static void pc_recv_process(void *context, void *data) perfmgr_db_clear_prev_dc(pm->db, node_guid, port); } - perfmgr_check_overflow(pm, p_mon_node, port, wire_read); + perfmgr_check_overflow(pm, p_mon_node, pkey_ix, port, wire_read); #if ENABLE_OSM_PERF_MGR_PROFILE do { @@ -1208,6 +1349,7 @@ ib_api_status_t osm_perfmgr_init(osm_perfmgr_t * pm, osm_opensm_t * osm, pm->sweep_time_s = p_opt->perfmgr_sweep_time_s; pm->max_outstanding_queries = p_opt->perfmgr_max_outstanding_queries; pm->osm = osm; + pm->local_port = -1; status = cl_timer_init(&pm->sweep_timer, perfmgr_sweep, pm); if (status != IB_SUCCESS)