Message ID | 153628137183.8267.14166864803956204561.stgit@noble (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Beginning of multi-rail support for drivers/staging/lustre | expand |
Which refcount line are you referring to? The call to lnet_ni_unlink_locked()? Reviewed-by: Doug Oucharek <dougso@me.com> Doug On 9/6/18, 5:53 PM, "NeilBrown" <neilb@suse.com> wrote: A zombie lnet_ni is now attached to the lnet_net rather than the global the_lnet. The zombie lnet_net are attached to the_lnet. For some reason, we don't drop the refcount on the lnd before shutting it down now. This is part of 8cbb8cd3e771e7f7e0f99cafc19fad32770dc015 LU-7734 lnet: Multi-Rail local NI split Signed-off-by: NeilBrown <neilb@suse.com> --- .../staging/lustre/include/linux/lnet/lib-types.h | 9 ++- drivers/staging/lustre/lnet/lnet/api-ni.c | 65 ++++++++++---------- drivers/staging/lustre/lnet/lnet/config.c | 3 + 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h index 22957d142cc0..1d372672e2de 100644 --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -284,6 +284,9 @@ struct lnet_net { struct lnet_lnd *net_lnd; /* list of NIs on this net */ struct list_head net_ni_list; + + /* dying LND instances */ + struct list_head net_ni_zombie; }; struct lnet_ni { @@ -653,11 +656,11 @@ struct lnet { /* LND instances */ struct list_head ln_nets; /* NIs bond on specific CPT(s) */ - struct list_head ln_nis_cpt; - /* dying LND instances */ - struct list_head ln_nis_zombie; + struct list_head ln_nis_cpt; /* the loopback NI */ struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c index c3c568e63342..18d111cb826b 100644 --- a/drivers/staging/lustre/lnet/lnet/api-ni.c +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -539,7 +539,6 @@ lnet_prepare(lnet_pid_t requested_pid) 
INIT_LIST_HEAD(&the_lnet.ln_test_peers); INIT_LIST_HEAD(&the_lnet.ln_nets); INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); - INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); INIT_LIST_HEAD(&the_lnet.ln_routers); INIT_LIST_HEAD(&the_lnet.ln_drop_rules); INIT_LIST_HEAD(&the_lnet.ln_delay_rules); @@ -618,7 +617,6 @@ lnet_unprepare(void) LASSERT(list_empty(&the_lnet.ln_test_peers)); LASSERT(list_empty(&the_lnet.ln_nets)); LASSERT(list_empty(&the_lnet.ln_nis_cpt)); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); lnet_portals_destroy(); @@ -1095,34 +1093,35 @@ lnet_ni_unlink_locked(struct lnet_ni *ni) /* move it to zombie list and nobody can find it anymore */ LASSERT(!list_empty(&ni->ni_netlist)); - list_move(&ni->ni_netlist, &the_lnet.ln_nis_zombie); + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); lnet_ni_decref_locked(ni, 0); } static void -lnet_clear_zombies_nis_locked(void) +lnet_clear_zombies_nis_locked(struct lnet_net *net) { int i; int islo; struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; /* - * Now wait for the NI's I just nuked to show up on ln_zombie_nis - * and shut them down in guaranteed thread context + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context */ i = 2; - while (!list_empty(&the_lnet.ln_nis_zombie)) { + while (!list_empty(zombie_list)) { int *ref; int j; - ni = list_entry(the_lnet.ln_nis_zombie.next, + ni = list_entry(zombie_list->next, struct lnet_ni, ni_netlist); list_del_init(&ni->ni_netlist); cfs_percpt_for_each(ref, j, ni->ni_refs) { if (!*ref) continue; /* still busy, add it back to zombie list */ - list_add(&ni->ni_netlist, &the_lnet.ln_nis_zombie); + list_add(&ni->ni_netlist, zombie_list); break; } @@ -1138,18 +1137,13 @@ lnet_clear_zombies_nis_locked(void) continue; } - ni->ni_net->net_lnd->lnd_refcount--; lnet_net_unlock(LNET_LOCK_EX); islo = ni->ni_net->net_lnd->lnd_type == LOLND; LASSERT(!in_interrupt()); - ni->ni_net->net_lnd->lnd_shutdown(ni); + 
net->net_lnd->lnd_shutdown(ni); - /* - * can't deref lnd anymore now; it might have unregistered - * itself... - */ if (!islo) CDEBUG(D_LNI, "Removed LNI %s\n", libcfs_nid2str(ni->ni_nid)); @@ -1162,9 +1156,11 @@ lnet_clear_zombies_nis_locked(void) } static void -lnet_shutdown_lndnis(void) +lnet_shutdown_lndnet(struct lnet_net *net); + +static void +lnet_shutdown_lndnets(void) { - struct lnet_ni *ni; int i; struct lnet_net *net; @@ -1173,30 +1169,35 @@ lnet_shutdown_lndnis(void) /* All quiet on the API front */ LASSERT(!the_lnet.ln_shutdown); LASSERT(!the_lnet.ln_refcount); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_shutdown = 1; /* flag shutdown */ - /* Unlink NIs from the global table */ while (!list_empty(&the_lnet.ln_nets)) { + /* + * move the nets to the zombie list to avoid them being + * picked up for new work. LONET is also included in the + * Nets that will be moved to the zombie list + */ net = list_entry(the_lnet.ln_nets.next, struct lnet_net, net_list); - while (!list_empty(&net->net_ni_list)) { - ni = list_entry(net->net_ni_list.next, - struct lnet_ni, ni_netlist); - lnet_ni_unlink_locked(ni); - } + list_move(&net->net_list, &the_lnet.ln_net_zombie); } - /* Drop the cached loopback NI. */ + /* Drop the cached loopback Net. 
*/ if (the_lnet.ln_loni) { lnet_ni_decref_locked(the_lnet.ln_loni, 0); the_lnet.ln_loni = NULL; } - lnet_net_unlock(LNET_LOCK_EX); + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + /* * Clear lazy portals and drop delayed messages which hold refs * on their lnet_msg::msg_rxpeer @@ -1211,8 +1212,6 @@ lnet_shutdown_lndnis(void) lnet_peer_tables_cleanup(NULL); lnet_net_lock(LNET_LOCK_EX); - - lnet_clear_zombies_nis_locked(); the_lnet.ln_shutdown = 0; lnet_net_unlock(LNET_LOCK_EX); } @@ -1222,6 +1221,7 @@ static void lnet_shutdown_lndni(struct lnet_ni *ni) { int i; + struct lnet_net *net = ni->ni_net; lnet_net_lock(LNET_LOCK_EX); lnet_ni_unlink_locked(ni); @@ -1235,7 +1235,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni) lnet_peer_tables_cleanup(ni); lnet_net_lock(LNET_LOCK_EX); - lnet_clear_zombies_nis_locked(); + lnet_clear_zombies_nis_locked(net); lnet_net_unlock(LNET_LOCK_EX); } @@ -1445,7 +1445,7 @@ lnet_startup_lndnets(struct list_head *netlist) return ni_count; failed: - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); return rc; } @@ -1492,6 +1492,7 @@ int lnet_lib_init(void) the_lnet.ln_refcount = 0; LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); @@ -1656,7 +1657,7 @@ LNetNIInit(lnet_pid_t requested_pid) if (!the_lnet.ln_nis_from_mod_params) lnet_destroy_routes(); err_shutdown_lndnis: - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); err_empty_list: lnet_unprepare(); LASSERT(rc < 0); @@ -1703,7 +1704,7 @@ LNetNIFini(void) lnet_acceptor_stop(); lnet_destroy_routes(); - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); lnet_unprepare(); } diff --git a/drivers/staging/lustre/lnet/lnet/config.c 
b/drivers/staging/lustre/lnet/lnet/config.c index 380a3fb1caba..2588d67fea1b 100644 --- a/drivers/staging/lustre/lnet/lnet/config.c +++ b/drivers/staging/lustre/lnet/lnet/config.c @@ -279,6 +279,8 @@ lnet_net_free(struct lnet_net *net) struct list_head *tmp, *tmp2; struct lnet_ni *ni; + LASSERT(list_empty(&net->net_ni_zombie)); + /* delete any nis which have been started. */ list_for_each_safe(tmp, tmp2, &net->net_ni_list) { ni = list_entry(tmp, struct lnet_ni, ni_netlist); @@ -312,6 +314,7 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list) INIT_LIST_HEAD(&net->net_list); INIT_LIST_HEAD(&net->net_ni_list); + INIT_LIST_HEAD(&net->net_ni_zombie); net->net_id = net_id;
On Wed, Sep 12 2018, Doug Oucharek wrote: > Which refcount line are you referring to? The call to > lnet_ni_unlink_locked()? Line 1141 in lnet_clear_zombies_nis_locked(). > - ni->ni_net->net_lnd->lnd_refcount--; Thanks, NeilBrown > > Reviewed-by: Doug Oucharek <dougso@me.com> > > Doug > > On 9/6/18, 5:53 PM, "NeilBrown" <neilb@suse.com> wrote: > > A zombie lnet_ni is now attached to the lnet_net rather than the > global the_lnet. The zombie lnet_net are attached to the_lnet. > > For some reason, we don't drop the refcount on the lnd before shutting > it down now. > > This is part of > 8cbb8cd3e771e7f7e0f99cafc19fad32770dc015 > LU-7734 lnet: Multi-Rail local NI split > > Signed-off-by: NeilBrown <neilb@suse.com> > --- > .../staging/lustre/include/linux/lnet/lib-types.h | 9 ++- > drivers/staging/lustre/lnet/lnet/api-ni.c | 65 ++++++++++---------- > drivers/staging/lustre/lnet/lnet/config.c | 3 + > 3 files changed, 42 insertions(+), 35 deletions(-) > > diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h > index 22957d142cc0..1d372672e2de 100644 > --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h > +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h > @@ -284,6 +284,9 @@ struct lnet_net { > struct lnet_lnd *net_lnd; > /* list of NIs on this net */ > struct list_head net_ni_list; > + > + /* dying LND instances */ > + struct list_head net_ni_zombie; > }; > > struct lnet_ni { > @@ -653,11 +656,11 @@ struct lnet { > /* LND instances */ > struct list_head ln_nets; > /* NIs bond on specific CPT(s) */ > - struct list_head ln_nis_cpt; > - /* dying LND instances */ > - struct list_head ln_nis_zombie; > + struct list_head ln_nis_cpt; > /* the loopback NI */ > struct lnet_ni *ln_loni; > + /* network zombie list */ > + struct list_head ln_net_zombie; > > /* remote networks with routes to them */ > struct list_head *ln_remote_nets_hash; > diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c 
b/drivers/staging/lustre/lnet/lnet/api-ni.c > index c3c568e63342..18d111cb826b 100644 > --- a/drivers/staging/lustre/lnet/lnet/api-ni.c > +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c > @@ -539,7 +539,6 @@ lnet_prepare(lnet_pid_t requested_pid) > INIT_LIST_HEAD(&the_lnet.ln_test_peers); > INIT_LIST_HEAD(&the_lnet.ln_nets); > INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); > - INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); > INIT_LIST_HEAD(&the_lnet.ln_routers); > INIT_LIST_HEAD(&the_lnet.ln_drop_rules); > INIT_LIST_HEAD(&the_lnet.ln_delay_rules); > @@ -618,7 +617,6 @@ lnet_unprepare(void) > LASSERT(list_empty(&the_lnet.ln_test_peers)); > LASSERT(list_empty(&the_lnet.ln_nets)); > LASSERT(list_empty(&the_lnet.ln_nis_cpt)); > - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); > > lnet_portals_destroy(); > > @@ -1095,34 +1093,35 @@ lnet_ni_unlink_locked(struct lnet_ni *ni) > > /* move it to zombie list and nobody can find it anymore */ > LASSERT(!list_empty(&ni->ni_netlist)); > - list_move(&ni->ni_netlist, &the_lnet.ln_nis_zombie); > + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); > lnet_ni_decref_locked(ni, 0); > } > > static void > -lnet_clear_zombies_nis_locked(void) > +lnet_clear_zombies_nis_locked(struct lnet_net *net) > { > int i; > int islo; > struct lnet_ni *ni; > + struct list_head *zombie_list = &net->net_ni_zombie; > > /* > - * Now wait for the NI's I just nuked to show up on ln_zombie_nis > - * and shut them down in guaranteed thread context > + * Now wait for the NIs I just nuked to show up on the zombie > + * list and shut them down in guaranteed thread context > */ > i = 2; > - while (!list_empty(&the_lnet.ln_nis_zombie)) { > + while (!list_empty(zombie_list)) { > int *ref; > int j; > > - ni = list_entry(the_lnet.ln_nis_zombie.next, > + ni = list_entry(zombie_list->next, > struct lnet_ni, ni_netlist); > list_del_init(&ni->ni_netlist); > cfs_percpt_for_each(ref, j, ni->ni_refs) { > if (!*ref) > continue; > /* still busy, add it back to zombie list */ > - 
list_add(&ni->ni_netlist, &the_lnet.ln_nis_zombie); > + list_add(&ni->ni_netlist, zombie_list); > break; > } > > @@ -1138,18 +1137,13 @@ lnet_clear_zombies_nis_locked(void) > continue; > } > > - ni->ni_net->net_lnd->lnd_refcount--; > lnet_net_unlock(LNET_LOCK_EX); > > islo = ni->ni_net->net_lnd->lnd_type == LOLND; > > LASSERT(!in_interrupt()); > - ni->ni_net->net_lnd->lnd_shutdown(ni); > + net->net_lnd->lnd_shutdown(ni); > > - /* > - * can't deref lnd anymore now; it might have unregistered > - * itself... > - */ > if (!islo) > CDEBUG(D_LNI, "Removed LNI %s\n", > libcfs_nid2str(ni->ni_nid)); > @@ -1162,9 +1156,11 @@ lnet_clear_zombies_nis_locked(void) > } > > static void > -lnet_shutdown_lndnis(void) > +lnet_shutdown_lndnet(struct lnet_net *net); > + > +static void > +lnet_shutdown_lndnets(void) > { > - struct lnet_ni *ni; > int i; > struct lnet_net *net; > > @@ -1173,30 +1169,35 @@ lnet_shutdown_lndnis(void) > /* All quiet on the API front */ > LASSERT(!the_lnet.ln_shutdown); > LASSERT(!the_lnet.ln_refcount); > - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); > > lnet_net_lock(LNET_LOCK_EX); > the_lnet.ln_shutdown = 1; /* flag shutdown */ > > - /* Unlink NIs from the global table */ > while (!list_empty(&the_lnet.ln_nets)) { > + /* > + * move the nets to the zombie list to avoid them being > + * picked up for new work. LONET is also included in the > + * Nets that will be moved to the zombie list > + */ > net = list_entry(the_lnet.ln_nets.next, > struct lnet_net, net_list); > - while (!list_empty(&net->net_ni_list)) { > - ni = list_entry(net->net_ni_list.next, > - struct lnet_ni, ni_netlist); > - lnet_ni_unlink_locked(ni); > - } > + list_move(&net->net_list, &the_lnet.ln_net_zombie); > } > > - /* Drop the cached loopback NI. */ > + /* Drop the cached loopback Net. 
*/ > if (the_lnet.ln_loni) { > lnet_ni_decref_locked(the_lnet.ln_loni, 0); > the_lnet.ln_loni = NULL; > } > - > lnet_net_unlock(LNET_LOCK_EX); > > + /* iterate through the net zombie list and delete each net */ > + while (!list_empty(&the_lnet.ln_net_zombie)) { > + net = list_entry(the_lnet.ln_net_zombie.next, > + struct lnet_net, net_list); > + lnet_shutdown_lndnet(net); > + } > + > /* > * Clear lazy portals and drop delayed messages which hold refs > * on their lnet_msg::msg_rxpeer > @@ -1211,8 +1212,6 @@ lnet_shutdown_lndnis(void) > lnet_peer_tables_cleanup(NULL); > > lnet_net_lock(LNET_LOCK_EX); > - > - lnet_clear_zombies_nis_locked(); > the_lnet.ln_shutdown = 0; > lnet_net_unlock(LNET_LOCK_EX); > } > @@ -1222,6 +1221,7 @@ static void > lnet_shutdown_lndni(struct lnet_ni *ni) > { > int i; > + struct lnet_net *net = ni->ni_net; > > lnet_net_lock(LNET_LOCK_EX); > lnet_ni_unlink_locked(ni); > @@ -1235,7 +1235,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni) > lnet_peer_tables_cleanup(ni); > > lnet_net_lock(LNET_LOCK_EX); > - lnet_clear_zombies_nis_locked(); > + lnet_clear_zombies_nis_locked(net); > lnet_net_unlock(LNET_LOCK_EX); > } > > @@ -1445,7 +1445,7 @@ lnet_startup_lndnets(struct list_head *netlist) > > return ni_count; > failed: > - lnet_shutdown_lndnis(); > + lnet_shutdown_lndnets(); > > return rc; > } > @@ -1492,6 +1492,7 @@ int lnet_lib_init(void) > the_lnet.ln_refcount = 0; > LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); > INIT_LIST_HEAD(&the_lnet.ln_lnds); > + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); > INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); > INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); > > @@ -1656,7 +1657,7 @@ LNetNIInit(lnet_pid_t requested_pid) > if (!the_lnet.ln_nis_from_mod_params) > lnet_destroy_routes(); > err_shutdown_lndnis: > - lnet_shutdown_lndnis(); > + lnet_shutdown_lndnets(); > err_empty_list: > lnet_unprepare(); > LASSERT(rc < 0); > @@ -1703,7 +1704,7 @@ LNetNIFini(void) > > lnet_acceptor_stop(); > lnet_destroy_routes(); > - 
lnet_shutdown_lndnis(); > + lnet_shutdown_lndnets(); > lnet_unprepare(); > } > > diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c > index 380a3fb1caba..2588d67fea1b 100644 > --- a/drivers/staging/lustre/lnet/lnet/config.c > +++ b/drivers/staging/lustre/lnet/lnet/config.c > @@ -279,6 +279,8 @@ lnet_net_free(struct lnet_net *net) > struct list_head *tmp, *tmp2; > struct lnet_ni *ni; > > + LASSERT(list_empty(&net->net_ni_zombie)); > + > /* delete any nis which have been started. */ > list_for_each_safe(tmp, tmp2, &net->net_ni_list) { > ni = list_entry(tmp, struct lnet_ni, ni_netlist); > @@ -312,6 +314,7 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list) > > INIT_LIST_HEAD(&net->net_list); > INIT_LIST_HEAD(&net->net_ni_list); > + INIT_LIST_HEAD(&net->net_ni_zombie); > > net->net_id = net_id; > > > >
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h index 22957d142cc0..1d372672e2de 100644 --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -284,6 +284,9 @@ struct lnet_net { struct lnet_lnd *net_lnd; /* list of NIs on this net */ struct list_head net_ni_list; + + /* dying LND instances */ + struct list_head net_ni_zombie; }; struct lnet_ni { @@ -653,11 +656,11 @@ struct lnet { /* LND instances */ struct list_head ln_nets; /* NIs bond on specific CPT(s) */ - struct list_head ln_nis_cpt; - /* dying LND instances */ - struct list_head ln_nis_zombie; + struct list_head ln_nis_cpt; /* the loopback NI */ struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c index c3c568e63342..18d111cb826b 100644 --- a/drivers/staging/lustre/lnet/lnet/api-ni.c +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -539,7 +539,6 @@ lnet_prepare(lnet_pid_t requested_pid) INIT_LIST_HEAD(&the_lnet.ln_test_peers); INIT_LIST_HEAD(&the_lnet.ln_nets); INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); - INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); INIT_LIST_HEAD(&the_lnet.ln_routers); INIT_LIST_HEAD(&the_lnet.ln_drop_rules); INIT_LIST_HEAD(&the_lnet.ln_delay_rules); @@ -618,7 +617,6 @@ lnet_unprepare(void) LASSERT(list_empty(&the_lnet.ln_test_peers)); LASSERT(list_empty(&the_lnet.ln_nets)); LASSERT(list_empty(&the_lnet.ln_nis_cpt)); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); lnet_portals_destroy(); @@ -1095,34 +1093,35 @@ lnet_ni_unlink_locked(struct lnet_ni *ni) /* move it to zombie list and nobody can find it anymore */ LASSERT(!list_empty(&ni->ni_netlist)); - list_move(&ni->ni_netlist, &the_lnet.ln_nis_zombie); + list_move(&ni->ni_netlist, 
&ni->ni_net->net_ni_zombie); lnet_ni_decref_locked(ni, 0); } static void -lnet_clear_zombies_nis_locked(void) +lnet_clear_zombies_nis_locked(struct lnet_net *net) { int i; int islo; struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; /* - * Now wait for the NI's I just nuked to show up on ln_zombie_nis - * and shut them down in guaranteed thread context + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context */ i = 2; - while (!list_empty(&the_lnet.ln_nis_zombie)) { + while (!list_empty(zombie_list)) { int *ref; int j; - ni = list_entry(the_lnet.ln_nis_zombie.next, + ni = list_entry(zombie_list->next, struct lnet_ni, ni_netlist); list_del_init(&ni->ni_netlist); cfs_percpt_for_each(ref, j, ni->ni_refs) { if (!*ref) continue; /* still busy, add it back to zombie list */ - list_add(&ni->ni_netlist, &the_lnet.ln_nis_zombie); + list_add(&ni->ni_netlist, zombie_list); break; } @@ -1138,18 +1137,13 @@ lnet_clear_zombies_nis_locked(void) continue; } - ni->ni_net->net_lnd->lnd_refcount--; lnet_net_unlock(LNET_LOCK_EX); islo = ni->ni_net->net_lnd->lnd_type == LOLND; LASSERT(!in_interrupt()); - ni->ni_net->net_lnd->lnd_shutdown(ni); + net->net_lnd->lnd_shutdown(ni); - /* - * can't deref lnd anymore now; it might have unregistered - * itself... 
- */ if (!islo) CDEBUG(D_LNI, "Removed LNI %s\n", libcfs_nid2str(ni->ni_nid)); @@ -1162,9 +1156,11 @@ lnet_clear_zombies_nis_locked(void) } static void -lnet_shutdown_lndnis(void) +lnet_shutdown_lndnet(struct lnet_net *net); + +static void +lnet_shutdown_lndnets(void) { - struct lnet_ni *ni; int i; struct lnet_net *net; @@ -1173,30 +1169,35 @@ lnet_shutdown_lndnis(void) /* All quiet on the API front */ LASSERT(!the_lnet.ln_shutdown); LASSERT(!the_lnet.ln_refcount); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_shutdown = 1; /* flag shutdown */ - /* Unlink NIs from the global table */ while (!list_empty(&the_lnet.ln_nets)) { + /* + * move the nets to the zombie list to avoid them being + * picked up for new work. LONET is also included in the + * Nets that will be moved to the zombie list + */ net = list_entry(the_lnet.ln_nets.next, struct lnet_net, net_list); - while (!list_empty(&net->net_ni_list)) { - ni = list_entry(net->net_ni_list.next, - struct lnet_ni, ni_netlist); - lnet_ni_unlink_locked(ni); - } + list_move(&net->net_list, &the_lnet.ln_net_zombie); } - /* Drop the cached loopback NI. */ + /* Drop the cached loopback Net. 
*/ if (the_lnet.ln_loni) { lnet_ni_decref_locked(the_lnet.ln_loni, 0); the_lnet.ln_loni = NULL; } - lnet_net_unlock(LNET_LOCK_EX); + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + /* * Clear lazy portals and drop delayed messages which hold refs * on their lnet_msg::msg_rxpeer @@ -1211,8 +1212,6 @@ lnet_shutdown_lndnis(void) lnet_peer_tables_cleanup(NULL); lnet_net_lock(LNET_LOCK_EX); - - lnet_clear_zombies_nis_locked(); the_lnet.ln_shutdown = 0; lnet_net_unlock(LNET_LOCK_EX); } @@ -1222,6 +1221,7 @@ static void lnet_shutdown_lndni(struct lnet_ni *ni) { int i; + struct lnet_net *net = ni->ni_net; lnet_net_lock(LNET_LOCK_EX); lnet_ni_unlink_locked(ni); @@ -1235,7 +1235,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni) lnet_peer_tables_cleanup(ni); lnet_net_lock(LNET_LOCK_EX); - lnet_clear_zombies_nis_locked(); + lnet_clear_zombies_nis_locked(net); lnet_net_unlock(LNET_LOCK_EX); } @@ -1445,7 +1445,7 @@ lnet_startup_lndnets(struct list_head *netlist) return ni_count; failed: - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); return rc; } @@ -1492,6 +1492,7 @@ int lnet_lib_init(void) the_lnet.ln_refcount = 0; LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); @@ -1656,7 +1657,7 @@ LNetNIInit(lnet_pid_t requested_pid) if (!the_lnet.ln_nis_from_mod_params) lnet_destroy_routes(); err_shutdown_lndnis: - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); err_empty_list: lnet_unprepare(); LASSERT(rc < 0); @@ -1703,7 +1704,7 @@ LNetNIFini(void) lnet_acceptor_stop(); lnet_destroy_routes(); - lnet_shutdown_lndnis(); + lnet_shutdown_lndnets(); lnet_unprepare(); } diff --git a/drivers/staging/lustre/lnet/lnet/config.c 
b/drivers/staging/lustre/lnet/lnet/config.c index 380a3fb1caba..2588d67fea1b 100644 --- a/drivers/staging/lustre/lnet/lnet/config.c +++ b/drivers/staging/lustre/lnet/lnet/config.c @@ -279,6 +279,8 @@ lnet_net_free(struct lnet_net *net) struct list_head *tmp, *tmp2; struct lnet_ni *ni; + LASSERT(list_empty(&net->net_ni_zombie)); + /* delete any nis which have been started. */ list_for_each_safe(tmp, tmp2, &net->net_ni_list) { ni = list_entry(tmp, struct lnet_ni, ni_netlist); @@ -312,6 +314,7 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list) INIT_LIST_HEAD(&net->net_list); INIT_LIST_HEAD(&net->net_ni_list); + INIT_LIST_HEAD(&net->net_ni_zombie); net->net_id = net_id;
A zombie lnet_ni is now attached to its lnet_net rather than to the global the_lnet. Zombie lnet_nets are in turn attached to the_lnet. For some reason, we no longer drop the refcount on the lnd before shutting it down. This is part of 8cbb8cd3e771e7f7e0f99cafc19fad32770dc015 LU-7734 lnet: Multi-Rail local NI split Signed-off-by: NeilBrown <neilb@suse.com> --- .../staging/lustre/include/linux/lnet/lib-types.h | 9 ++- drivers/staging/lustre/lnet/lnet/api-ni.c | 65 ++++++++++---------- drivers/staging/lustre/lnet/lnet/config.c | 3 + 3 files changed, 42 insertions(+), 35 deletions(-)