From patchwork Fri Sep 7 00:49:31 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: NeilBrown X-Patchwork-Id: 10591359 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 2F580921 for ; Fri, 7 Sep 2018 00:53:35 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 1F6BD2B06C for ; Fri, 7 Sep 2018 00:53:35 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 12C182B17D; Fri, 7 Sep 2018 00:53:35 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-2.3 required=2.0 tests=BAYES_00,FUZZY_AMBIEN, MAILING_LIST_MULTI,RCVD_IN_DNSWL_NONE autolearn=no version=3.3.1 Received: from pdx1-mailman02.dreamhost.com (pdx1-mailman02.dreamhost.com [64.90.62.194]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 93A7E2B184 for ; Fri, 7 Sep 2018 00:53:34 +0000 (UTC) Received: from pdx1-mailman02.dreamhost.com (localhost [IPv6:::1]) by pdx1-mailman02.dreamhost.com (Postfix) with ESMTP id 2B56A4E31DC; Thu, 6 Sep 2018 17:53:34 -0700 (PDT) X-Original-To: lustre-devel@lists.lustre.org Delivered-To: lustre-devel-lustre.org@pdx1-mailman02.dreamhost.com Received: from mx1.suse.de (mx2.suse.de [195.135.220.15]) by pdx1-mailman02.dreamhost.com (Postfix) with ESMTP id 7DB884E318F for ; Thu, 6 Sep 2018 17:53:32 -0700 (PDT) X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id C5CF2AED7; Fri, 7 Sep 2018 00:53:31 +0000 (UTC) From: NeilBrown To: Oleg Drokin , Doug Oucharek , James Simmons , Andreas Dilger Date: Fri, 07 
Sep 2018 10:49:31 +1000 Message-ID: <153628137183.8267.14166864803956204561.stgit@noble> In-Reply-To: <153628058697.8267.6056114844033479774.stgit@noble> References: <153628058697.8267.6056114844033479774.stgit@noble> User-Agent: StGit/0.17.1-dirty MIME-Version: 1.0 Subject: [lustre-devel] [PATCH 15/34] lnet: extend zombie handling to nets and nis X-BeenThere: lustre-devel@lists.lustre.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: "For discussing Lustre software development." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: Lustre Development List Errors-To: lustre-devel-bounces@lists.lustre.org Sender: "lustre-devel" X-Virus-Scanned: ClamAV using ClamSMTP A zombie lnet_ni is now attached to its lnet_net rather than to the global the_lnet. Zombie lnet_nets are attached to the_lnet. For some reason, we don't drop the refcount on the lnd before shutting it down now. This is part of 8cbb8cd3e771e7f7e0f99cafc19fad32770dc015 LU-7734 lnet: Multi-Rail local NI split Signed-off-by: NeilBrown Reviewed-by: Doug Oucharek k --- .../staging/lustre/include/linux/lnet/lib-types.h | 9 ++- drivers/staging/lustre/lnet/lnet/api-ni.c | 65 ++++++++++---------- drivers/staging/lustre/lnet/lnet/config.c | 3 + 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h index 22957d142cc0..1d372672e2de 100644 --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -284,6 +284,9 @@ struct lnet_net { struct lnet_lnd *net_lnd; /* list of NIs on this net */ struct list_head net_ni_list; + + /* dying LND instances */ + struct list_head net_ni_zombie; }; struct lnet_ni { @@ -653,11 +656,11 @@ struct lnet { /* LND instances */ struct list_head ln_nets; /* NIs bond on specific CPT(s) */ - struct list_head ln_nis_cpt; - /* dying LND instances */ - struct list_head ln_nis_zombie; +
struct list_head ln_nis_cpt; /* the loopback NI */ struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c index c3c568e63342..18d111cb826b 100644 --- a/drivers/staging/lustre/lnet/lnet/api-ni.c +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -539,7 +539,6 @@ lnet_prepare(lnet_pid_t requested_pid) INIT_LIST_HEAD(&the_lnet.ln_test_peers); INIT_LIST_HEAD(&the_lnet.ln_nets); INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); - INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); INIT_LIST_HEAD(&the_lnet.ln_routers); INIT_LIST_HEAD(&the_lnet.ln_drop_rules); INIT_LIST_HEAD(&the_lnet.ln_delay_rules); @@ -618,7 +617,6 @@ lnet_unprepare(void) LASSERT(list_empty(&the_lnet.ln_test_peers)); LASSERT(list_empty(&the_lnet.ln_nets)); LASSERT(list_empty(&the_lnet.ln_nis_cpt)); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); lnet_portals_destroy(); @@ -1095,34 +1093,35 @@ lnet_ni_unlink_locked(struct lnet_ni *ni) /* move it to zombie list and nobody can find it anymore */ LASSERT(!list_empty(&ni->ni_netlist)); - list_move(&ni->ni_netlist, &the_lnet.ln_nis_zombie); + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); lnet_ni_decref_locked(ni, 0); } static void -lnet_clear_zombies_nis_locked(void) +lnet_clear_zombies_nis_locked(struct lnet_net *net) { int i; int islo; struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; /* - * Now wait for the NI's I just nuked to show up on ln_zombie_nis - * and shut them down in guaranteed thread context + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context */ i = 2; - while (!list_empty(&the_lnet.ln_nis_zombie)) { + while (!list_empty(zombie_list)) { int *ref; int j; - ni = list_entry(the_lnet.ln_nis_zombie.next, + ni = list_entry(zombie_list->next, struct lnet_ni, 
ni_netlist); list_del_init(&ni->ni_netlist); cfs_percpt_for_each(ref, j, ni->ni_refs) { if (!*ref) continue; /* still busy, add it back to zombie list */ - list_add(&ni->ni_netlist, &the_lnet.ln_nis_zombie); + list_add(&ni->ni_netlist, zombie_list); break; } @@ -1138,18 +1137,13 @@ lnet_clear_zombies_nis_locked(void) continue; } - ni->ni_net->net_lnd->lnd_refcount--; lnet_net_unlock(LNET_LOCK_EX); islo = ni->ni_net->net_lnd->lnd_type == LOLND; LASSERT(!in_interrupt()); - ni->ni_net->net_lnd->lnd_shutdown(ni); + net->net_lnd->lnd_shutdown(ni); - /* - * can't deref lnd anymore now; it might have unregistered - * itself... - */ if (!islo) CDEBUG(D_LNI, "Removed LNI %s\n", libcfs_nid2str(ni->ni_nid)); @@ -1162,9 +1156,11 @@ lnet_clear_zombies_nis_locked(void) } static void -lnet_shutdown_lndnis(void) +lnet_shutdown_lndnet(struct lnet_net *net); + +static void +lnet_shutdown_lndnets(void) { - struct lnet_ni *ni; int i; struct lnet_net *net; @@ -1173,30 +1169,35 @@ lnet_shutdown_lndnis(void) /* All quiet on the API front */ LASSERT(!the_lnet.ln_shutdown); LASSERT(!the_lnet.ln_refcount); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_shutdown = 1; /* flag shutdown */ - /* Unlink NIs from the global table */ while (!list_empty(&the_lnet.ln_nets)) { + /* + * move the nets to the zombie list to avoid them being + * picked up for new work. LONET is also included in the + * Nets that will be moved to the zombie list + */ net = list_entry(the_lnet.ln_nets.next, struct lnet_net, net_list); - while (!list_empty(&net->net_ni_list)) { - ni = list_entry(net->net_ni_list.next, - struct lnet_ni, ni_netlist); - lnet_ni_unlink_locked(ni); - } + list_move(&net->net_list, &the_lnet.ln_net_zombie); } - /* Drop the cached loopback NI. */ + /* Drop the cached loopback Net. 
*/ if (the_lnet.ln_loni) { lnet_ni_decref_locked(the_lnet.ln_loni, 0); the_lnet.ln_loni = NULL; } - lnet_net_unlock(LNET_LOCK_EX); + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + /* * Clear lazy portals and drop delayed messages which hold refs * on their lnet_msg::msg_rxpeer @@ -1211,8 +1212,6 @@ lnet_shutdown_lndnis(void) lnet_peer_tables_cleanup(NULL); lnet_net_lock(LNET_LOCK_EX); - - lnet_clear_zombies_nis_locked(); the_lnet.ln_shutdown = 0; lnet_net_unlock(LNET_LOCK_EX); } @@ -1222,6 +1221,7 @@ static void lnet_shutdown_lndni(struct lnet_ni *ni) { int i; + struct lnet_net *net = ni->ni_net; lnet_net_lock(LNET_LOCK_EX); lnet_ni_unlink_locked(ni); @@ -1235,7 +1235,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni) lnet_peer_tables_cleanup(ni); lnet_net_lock(LNET_LOCK_EX); - lnet_clear_zombies_nis_locked(); + lnet_clear_zombies_nis_locked(net); lnet_net_unlock(LNET_LOCK_EX); } @@ -1445,7 +1445,7 @@ lnet_startup_lndnets(struct list_head *netlist) return ni_count; failed: - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); return rc; } @@ -1492,6 +1492,7 @@ int lnet_lib_init(void) the_lnet.ln_refcount = 0; LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); @@ -1656,7 +1657,7 @@ LNetNIInit(lnet_pid_t requested_pid) if (!the_lnet.ln_nis_from_mod_params) lnet_destroy_routes(); err_shutdown_lndnis: - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); err_empty_list: lnet_unprepare(); LASSERT(rc < 0); @@ -1703,7 +1704,7 @@ LNetNIFini(void) lnet_acceptor_stop(); lnet_destroy_routes(); - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); lnet_unprepare(); } diff --git a/drivers/staging/lustre/lnet/lnet/config.c 
b/drivers/staging/lustre/lnet/lnet/config.c index 380a3fb1caba..2588d67fea1b 100644 --- a/drivers/staging/lustre/lnet/lnet/config.c +++ b/drivers/staging/lustre/lnet/lnet/config.c @@ -279,6 +279,8 @@ lnet_net_free(struct lnet_net *net) struct list_head *tmp, *tmp2; struct lnet_ni *ni; + LASSERT(list_empty(&net->net_ni_zombie)); + /* delete any nis which have been started. */ list_for_each_safe(tmp, tmp2, &net->net_ni_list) { ni = list_entry(tmp, struct lnet_ni, ni_netlist); @@ -312,6 +314,7 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list) INIT_LIST_HEAD(&net->net_list); INIT_LIST_HEAD(&net->net_ni_list); + INIT_LIST_HEAD(&net->net_ni_zombie); net->net_id = net_id;