From patchwork Tue Sep 25 01:07:15 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: NeilBrown X-Patchwork-Id: 10613201 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id B1589157B for ; Tue, 25 Sep 2018 01:12:43 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id B37C22A052 for ; Tue, 25 Sep 2018 01:12:43 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id A79352A05D; Tue, 25 Sep 2018 01:12:43 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-2.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_NONE autolearn=ham version=3.3.1 Received: from pdx1-mailman02.dreamhost.com (pdx1-mailman02.dreamhost.com [64.90.62.194]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id EA7D42A052 for ; Tue, 25 Sep 2018 01:12:42 +0000 (UTC) Received: from pdx1-mailman02.dreamhost.com (localhost [IPv6:::1]) by pdx1-mailman02.dreamhost.com (Postfix) with ESMTP id 8DCA84C4255; Mon, 24 Sep 2018 18:12:42 -0700 (PDT) X-Original-To: lustre-devel@lists.lustre.org Delivered-To: lustre-devel-lustre.org@pdx1-mailman02.dreamhost.com Received: from mx1.suse.de (mx2.suse.de [195.135.220.15]) by pdx1-mailman02.dreamhost.com (Postfix) with ESMTP id EF0204C4210 for ; Mon, 24 Sep 2018 18:12:39 -0700 (PDT) X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id 180DFB034; Tue, 25 Sep 2018 01:12:39 +0000 (UTC) From: NeilBrown To: Oleg Drokin , Doug Oucharek , James Simmons , Andreas Dilger Date: Tue, 25 Sep 
2018 11:07:15 +1000 Message-ID: <153783763587.32103.5037367646271689437.stgit@noble> In-Reply-To: <153783752960.32103.8394391715843917125.stgit@noble> References: <153783752960.32103.8394391715843917125.stgit@noble> User-Agent: StGit/0.17.1-dirty MIME-Version: 1.0 Subject: [lustre-devel] [PATCH 26/34] LU-7734 lnet: Routing fixes part 2 X-BeenThere: lustre-devel@lists.lustre.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: "For discussing Lustre software development." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: Lustre Development List Errors-To: lustre-devel-bounces@lists.lustre.org Sender: "lustre-devel" X-Virus-Scanned: ClamAV using ClamSMTP From: Amir Shehata Fix lnet_select_pathway() to handle the routing cases correctly. The following general cases are handled: . Non-MR directly connected . Non-MR not directly connected . MR directly connected . MR not directly connected . No gateway . Gateway is non-MR . Gateway is MR Signed-off-by: Amir Shehata Change-Id: If2d16b797b94421e78a9f2a254a250a440f8b244 Reviewed-on: http://review.whamcloud.com/21167 Signed-off-by: NeilBrown --- drivers/staging/lustre/lnet/lnet/lib-move.c | 214 ++++++++++++++++++--------- drivers/staging/lustre/lnet/lnet/peer.c | 29 +++- 2 files changed, 167 insertions(+), 76 deletions(-) diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c index 58521b014ef3..12bc80d060e9 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-move.c +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c @@ -1145,6 +1145,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, __u32 seq; int cpt, cpt2, rc; bool routing; + bool routing2; bool ni_is_pref; bool preferred; int best_credits; @@ -1168,6 +1169,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, best_gw = NULL; local_net = NULL; routing = false; + routing2 = false; seq = lnet_get_dlc_seq_locked(); @@ -1201,7 +1203,7 @@ lnet_select_pathway(lnet_nid_t src_nid, 
lnet_nid_t dst_nid, } /* - * STEP 1: first jab at determineing best_ni + * STEP 1: first jab at determining best_ni * if src_nid is explicitly specified, then best_ni is already * pre-determiend for us. Otherwise we need to select the best * one to use later on @@ -1215,17 +1217,122 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, libcfs_nid2str(src_nid)); return -EINVAL; } + } + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK || + !peer->lp_multi_rail) { + /* + * for replies we want to respond on the same peer_ni we + * received the message on if possible. If not, then pick + * a peer_ni to send to + * + * if the peer is non-multi-rail then you want to send to + * the dst_nid provided as well. + * + * It is expected to find the lpni using dst_nid, since we + * created it earlier. + */ + best_lpni = lnet_find_peer_ni_locked(dst_nid); + if (best_lpni) + lnet_peer_ni_decref_locked(best_lpni); - if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) { + if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) { + /* + * this lpni is not on a local network so we need + * to route this reply. + */ + best_gw = lnet_find_route_locked(NULL, + best_lpni->lpni_nid, + rtr_nid); + if (best_gw) { + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, + * so change the peer to the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; + + /* + * if the router is not multi-rail + * then use the best_gw found to send + * the message to + */ + if (!peer->lp_multi_rail) + best_lpni = best_gw; + else + best_lpni = NULL; + + routing = true; + } else { + best_lpni = NULL; + } + } else if (!best_lpni) { lnet_net_unlock(cpt); - LCONSOLE_WARN("No route to %s via from %s\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); + CERROR("unable to send msg_type %d to originating %s. 
Destination NID not in DB\n", + msg->msg_type, libcfs_nid2str(dst_nid)); return -EINVAL; } - goto pick_peer; } + /* + * if the peer is not MR capable, then we should always send to it + * using the first NI in the NET we determined. + */ + if (!peer->lp_multi_rail) { + if (!best_lpni) { + lnet_net_unlock(cpt); + CERROR("no route to %s\n", + libcfs_nid2str(dst_nid)); + return -EHOSTUNREACH; + } + + /* best ni could be set because src_nid was provided */ + if (!best_ni) { + best_ni = lnet_net2ni_locked( + best_lpni->lpni_net->net_id, cpt); + if (!best_ni) { + lnet_net_unlock(cpt); + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + } + } + + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(best_ni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = best_ni; + lnet_net_unlock(cpt); + + return LNET_CREDIT_OK; + } + + /* + * if we already found a best_ni because src_nid is specified and + * best_lpni because we are replying to a message then just send + * the message + */ + if (best_ni && best_lpni) + goto send; + + /* + * If we already found a best_ni because src_nid is specified then + * pick the peer then send the message + */ + if (best_ni) + goto pick_peer; + /* * Decide whether we need to route to peer_ni. 
* Get the local net that I need to be on to be able to directly @@ -1242,7 +1349,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, continue; local_net = lnet_get_net_locked(peer_net->lpn_net_id); - if (!local_net) { + if (!local_net && !routing) { struct lnet_peer_ni *net_gw; /* * go through each peer_ni on that peer_net and @@ -1263,14 +1370,11 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, if (!best_gw) { best_gw = net_gw; - best_lpni = lpni; } else { rc = lnet_compare_peers(net_gw, best_gw); - if (rc > 0) { + if (rc > 0) best_gw = net_gw; - best_lpni = lpni; - } } } @@ -1279,9 +1383,9 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, local_net = lnet_get_net_locked (LNET_NIDNET(best_gw->lpni_nid)); - routing = true; + routing2 = true; } else { - routing = false; + routing2 = false; best_gw = NULL; } @@ -1342,12 +1446,17 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, } } - /* - * if the peer is not MR capable, then we should always send to it - * using the first NI in the NET we determined. - */ - if (!peer->lp_multi_rail && local_net) - best_ni = lnet_net2ni_locked(local_net->net_id, cpt); + if (routing2) { + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. 
+ */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; + } if (!best_ni) { lnet_net_unlock(cpt); @@ -1363,43 +1472,11 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, */ best_ni->ni_seq++; - if (routing) - goto send; - pick_peer: - if (best_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_ni_addref_locked(best_ni, cpt); - msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); - msg->msg_target.nid = best_ni->ni_nid; - lnet_msg_commit(msg, cpt); - msg->msg_txni = best_ni; - lnet_net_unlock(cpt); - - return LNET_CREDIT_OK; - } - - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) { - /* - * for replies we want to respond on the same peer_ni we - * received the message on if possible. If not, then pick - * a peer_ni to send to - */ - best_lpni = lnet_find_peer_ni_locked(dst_nid); - if (best_lpni) { - lnet_peer_ni_decref_locked(best_lpni); - goto send; - } else { - CDEBUG(D_NET, - "unable to send msg_type %d to originating %s\n", - msg->msg_type, - libcfs_nid2str(dst_nid)); - } - } - + /* + * At this point the best_ni is on a local network on which + * the peer has a peer_ni as well + */ peer_net = lnet_peer_get_net_locked(peer, best_ni->ni_net->net_id); /* @@ -1429,13 +1506,16 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, libcfs_nid2str(best_gw->lpni_nid), lnet_msgtyp2str(msg->msg_type), msg->msg_len); - best_lpni = lnet_find_peer_ni_locked(dst_nid); - LASSERT(best_lpni); - lnet_peer_ni_decref_locked(best_lpni); - - routing = true; - - goto send; + routing2 = true; + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. 
+ */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { /* * this peer_net is unhealthy but we still have an opportunity @@ -1459,6 +1539,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, lpni = NULL; best_lpni_credits = INT_MIN; preferred = false; + best_lpni = NULL; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* * if this peer ni is not healthy just skip it, no point in @@ -1513,19 +1594,14 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, } send: + routing = routing || routing2; + /* * Increment sequence number of the peer selected so that we * pick the next one in Round Robin. */ best_lpni->lpni_seq++; - /* - * When routing the best gateway found acts as the best peer - * NI to send to. - */ - if (routing) - best_lpni = best_gw; - /* * grab a reference on the peer_ni so it sticks around even if * we need to drop and relock the lnet_net_lock below. diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c index 9cecfb49db87..d757f4df1f39 100644 --- a/drivers/staging/lustre/lnet/lnet/peer.c +++ b/drivers/staging/lustre/lnet/lnet/peer.c @@ -225,11 +225,18 @@ lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) } /* called with lnet_net_lock LNET_LOCK_EX held */ -static void +static int lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) { struct lnet_peer_table *ptable = NULL; + /* don't remove a peer_ni if it's also a gateway */ + if (lpni->lpni_rtr_refcount > 0) { + CERROR("Peer NI %s is a gateway. Can not delete it\n", + libcfs_nid2str(lpni->lpni_nid)); + return -EBUSY; + } + lnet_peer_remove_from_remote_list(lpni); /* remove peer ni from the hash list. 
*/ @@ -260,6 +267,8 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) /* decrement reference on peer */ lnet_peer_ni_decref_locked(lpni); + + return 0; } void lnet_peer_uninit(void) @@ -313,17 +322,22 @@ lnet_peer_tables_create(void) return 0; } -static void +static int lnet_peer_del_locked(struct lnet_peer *peer) { struct lnet_peer_ni *lpni = NULL, *lpni2; + int rc = 0, rc2 = 0; lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); while (lpni) { lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); - lnet_peer_ni_del_locked(lpni); + rc = lnet_peer_ni_del_locked(lpni); + if (rc != 0) + rc2 = rc; lpni = lpni2; } + + return rc2; } static void @@ -899,6 +913,7 @@ lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid) lnet_nid_t local_nid; struct lnet_peer *peer; struct lnet_peer_ni *lpni; + int rc; if (key_nid == LNET_NID_ANY) return -EINVAL; @@ -919,17 +934,17 @@ lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid) * entire peer */ lnet_net_lock(LNET_LOCK_EX); - lnet_peer_del_locked(peer); + rc = lnet_peer_del_locked(peer); lnet_net_unlock(LNET_LOCK_EX); - return 0; + return rc; } lnet_net_lock(LNET_LOCK_EX); - lnet_peer_ni_del_locked(lpni); + rc = lnet_peer_ni_del_locked(lpni); lnet_net_unlock(LNET_LOCK_EX); - return 0; + return rc; } void