Message ID | 20250331231730.2857655-3-bmarzins@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | fix issue of multipathd not tracking device | expand |
On Mon, 2025-03-31 at 19:17 -0400, Benjamin Marzinski wrote: > When a new device is added by the multipath command, multipathd may > know > of other paths that cannot be added to the device because they are > currently offline. Instead of ignoring these paths, multipathd will > now > re-add them when they come back online. To do this, it multipathd > needs > a new path variable add_when_online, to track devices that could not > be > added to an existing multipath device because they were offline. > These > paths are handled along with the other uninitialized paths. > > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> > --- > libmultipath/print.c | 5 +++- > libmultipath/structs.h | 1 + > libmultipath/structs_vec.c | 5 ++++ > multipathd/main.c | 59 > ++++++++++++++++++++++++++++++++++++-- > multipathd/multipathd.8.in | 5 ++-- > 5 files changed, 70 insertions(+), 5 deletions(-) > > diff --git a/libmultipath/print.c b/libmultipath/print.c > index 00c03ace..019ae56f 100644 > --- a/libmultipath/print.c > +++ b/libmultipath/print.c > @@ -658,8 +658,11 @@ snprint_path_serial (struct strbuf *buff, const > struct path * pp) > static int > snprint_path_mpp (struct strbuf *buff, const struct path * pp) > { > - if (!pp->mpp) > + if (!pp->mpp) { > + if (pp->add_when_online) > + return append_strbuf_str(buff, "[offline]"); > return append_strbuf_str(buff, "[orphan]"); > + } > if (!pp->mpp->alias) > return append_strbuf_str(buff, "[unknown]"); > return snprint_str(buff, pp->mpp->alias); > diff --git a/libmultipath/structs.h b/libmultipath/structs.h > index 28de9a7f..39d1c71c 100644 > --- a/libmultipath/structs.h > +++ b/libmultipath/structs.h > @@ -413,6 +413,7 @@ struct path { > int eh_deadline; > enum check_path_states is_checked; > bool can_use_env_uid; > + bool add_when_online; > unsigned int checker_timeout; > /* configlet pointers */ > vector hwe; > diff --git a/libmultipath/structs_vec.c b/libmultipath/structs_vec.c > index f6407e12..663c9053 100644 > --- a/libmultipath/structs_vec.c > +++ b/libmultipath/structs_vec.c > @@ -389,6 +389,9 @@ static void orphan_paths(vector pathvec, struct > multipath *mpp, const char *reas > free_path(pp); > } else > orphan_path(pp, reason); > + } else if (pp->add_when_online && > + strncmp(mpp->wwid, pp->wwid, WWID_SIZE) > == 0) { > + pp->add_when_online = false; > } > } > } > @@ -595,6 +598,8 @@ void sync_paths(struct multipath *mpp, vector > pathvec) > found = 0; > vector_foreach_slot(mpp->pg, pgp, j) { > if (find_slot(pgp->paths, (void *)pp) != -1) > { > + if (pp->add_when_online) > + pp->add_when_online = false; > found = 1; > break; > } > diff --git a/multipathd/main.c b/multipathd/main.c > index 7aaae773..9aa5a2fa 100644 > --- a/multipathd/main.c > +++ b/multipathd/main.c > @@ -644,11 +644,45 @@ pr_register_active_paths(struct multipath *mpp) > } > } > > +static void > +save_offline_paths(const struct multipath *mpp, vector > offline_paths) > +{ > + unsigned int i, j; > + struct path *pp; > + struct pathgroup *pgp; > + > + vector_foreach_slot (mpp->pg, pgp, i) > + vector_foreach_slot (pgp->paths, pp, j) > + if (pp->initialized == INIT_OK && > + pp->sysfs_state == PATH_DOWN) > + /* ignore failures storing the > paths. */ > + store_path(offline_paths, pp); > +} > + > +static void > +handle_orphaned_offline_paths(vector offline_paths) > +{ > + unsigned int i; > + struct path *pp; > + > + vector_foreach_slot (offline_paths, pp, i) > + if (pp->mpp == NULL) > + pp->add_when_online = true; > +} > + > +static void > +cleanup_reset_vec(struct vector_s **v) > +{ > + vector_reset(*v); > +} > + > static int > update_map (struct multipath *mpp, struct vectors *vecs, int > new_map) > { > int retries = 3; > char *params __attribute__((cleanup(cleanup_charp))) = NULL; > + struct vector_s offline_paths_vec = { .allocated = 0 }; > + vector offline_paths > __attribute__((cleanup(cleanup_reset_vec))) = &offline_paths_vec; > > retry: > condlog(4, "%s: updating new map", mpp->alias); > @@ -685,6 +719,9 @@ fail: > return 1; > } > > + if (new_map && retries < 0) > + save_offline_paths(mpp, offline_paths); > + > if (setup_multipath(vecs, mpp)) > return 1; > > @@ -695,6 +732,9 @@ fail: > if (mpp->prflag == PRFLAG_SET) > pr_register_active_paths(mpp); > > + if (VECTOR_SIZE(offline_paths) != 0) > + handle_orphaned_offline_paths(offline_paths); > + > if (retries < 0) > condlog(0, "%s: failed reload in new map update", > mpp->alias); > return 0; > @@ -2793,7 +2833,8 @@ check_uninitialized_path(struct path * pp, > unsigned int ticks) > struct config *conf; > > if (pp->initialized != INIT_NEW && pp->initialized != > INIT_FAILED && > - pp->initialized != INIT_MISSING_UDEV) > + pp->initialized != INIT_MISSING_UDEV && > + !(pp->initialized == INIT_OK && pp->add_when_online)) > return CHECK_PATH_SKIPPED; > > if (pp->tick) > @@ -2849,7 +2890,8 @@ update_uninitialized_path(struct vectors * > vecs, struct path * pp) > struct config *conf; > > if (pp->initialized != INIT_NEW && pp->initialized != > INIT_FAILED && > - pp->initialized != INIT_MISSING_UDEV) > + pp->initialized != INIT_MISSING_UDEV && > + !(pp->initialized == INIT_OK && pp->add_when_online)) > return CHECK_PATH_SKIPPED; > > newstate = get_new_state(pp); > @@ -2875,6 +2917,19 @@ update_uninitialized_path(struct vectors * > vecs, struct path * pp) > free_path(pp); > return CHECK_PATH_REMOVED; > } > + } else if (pp->initialized == INIT_OK && pp->add_when_online > && > + (newstate == PATH_UP || newstate == PATH_GHOST)) > { > + pp->initialized = INIT_OK; > + if (pp->recheck_wwid == RECHECK_WWID_ON && I wonder if we should always check the WWID here. After all, this path has never been part of the map, and it was offline when the map was created (IOW, the WWID couldn't be checked at that point in time). Can we rely on the stored WWID? Other than that, LGTM. Regards, Martin > + check_path_wwid_change(pp)) { > + condlog(0, "%s: path wwid change detected. > Removing", > + pp->dev); > + return handle_path_wwid_change(pp, vecs)? > + CHECK_PATH_REMOVED : > + CHECK_PATH_SKIPPED; > + } > + ev_add_path(pp, vecs, 1); > + pp->tick = 1; > } > return CHECK_PATH_CHECKED; > } > diff --git a/multipathd/multipathd.8.in b/multipathd/multipathd.8.in > index 43f87bf8..a91acff1 100644 > --- a/multipathd/multipathd.8.in > +++ b/multipathd/multipathd.8.in > @@ -595,8 +595,9 @@ The device serial number. > The device marginal state, either \fImarginal\fR or \fInormal\fR. > .TP > .B %m > -The multipath device that this device is a path of, or > \fI[orphan]\fR if > -it is not part of any multipath device. > +The multipath device that this device is a path of, or > \fI[offline]\fR > +if this device could not be added to a device because is is offline > or > +\fI[orphan]\fR if it is not part of any multipath device. > .TP > .B %N > The host World Wide Node Name (WWNN) of the device, if any.
On Wed, Apr 02, 2025 at 06:42:34PM +0200, Martin Wilck wrote: > On Mon, 2025-03-31 at 19:17 -0400, Benjamin Marzinski wrote: > > When a new device is added by the multipath command, multipathd may > > know > > of other paths that cannot be added to the device because they are > > currently offline. Instead of ignoring these paths, multipathd will > > now > > re-add them when they come back online. To do this, it multipathd > > needs > > a new path variable add_when_online, to track devices that could not > > be > > added to an existing multipath device because they were offline. > > These > > paths are handled along with the other uninitialized paths. > > > > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> > > --- > > libmultipath/print.c | 5 +++- > > libmultipath/structs.h | 1 + > > libmultipath/structs_vec.c | 5 ++++ > > multipathd/main.c | 59 > > ++++++++++++++++++++++++++++++++++++-- > > multipathd/multipathd.8.in | 5 ++-- > > 5 files changed, 70 insertions(+), 5 deletions(-) > > > > diff --git a/libmultipath/print.c b/libmultipath/print.c > > index 00c03ace..019ae56f 100644 > > --- a/libmultipath/print.c > > +++ b/libmultipath/print.c > > @@ -658,8 +658,11 @@ snprint_path_serial (struct strbuf *buff, const > > struct path * pp) > > static int > > snprint_path_mpp (struct strbuf *buff, const struct path * pp) > > { > > - if (!pp->mpp) > > + if (!pp->mpp) { > > + if (pp->add_when_online) > > + return append_strbuf_str(buff, "[offline]"); > > return append_strbuf_str(buff, "[orphan]"); > > + } > > if (!pp->mpp->alias) > > return append_strbuf_str(buff, "[unknown]"); > > return snprint_str(buff, pp->mpp->alias); > > diff --git a/libmultipath/structs.h b/libmultipath/structs.h > > index 28de9a7f..39d1c71c 100644 > > --- a/libmultipath/structs.h > > +++ b/libmultipath/structs.h > > @@ -413,6 +413,7 @@ struct path { > > int eh_deadline; > > enum check_path_states is_checked; > > bool can_use_env_uid; > > + bool add_when_online; > > unsigned int checker_timeout; > > /* configlet pointers */ > > vector hwe; > > diff --git a/libmultipath/structs_vec.c b/libmultipath/structs_vec.c > > index f6407e12..663c9053 100644 > > --- a/libmultipath/structs_vec.c > > +++ b/libmultipath/structs_vec.c > > @@ -389,6 +389,9 @@ static void orphan_paths(vector pathvec, struct > > multipath *mpp, const char *reas > > free_path(pp); > > } else > > orphan_path(pp, reason); > > + } else if (pp->add_when_online && > > + strncmp(mpp->wwid, pp->wwid, WWID_SIZE) > > == 0) { > > + pp->add_when_online = false; > > } > > } > > } > > @@ -595,6 +598,8 @@ void sync_paths(struct multipath *mpp, vector > > pathvec) > > found = 0; > > vector_foreach_slot(mpp->pg, pgp, j) { > > if (find_slot(pgp->paths, (void *)pp) != -1) > > { > > + if (pp->add_when_online) > > + pp->add_when_online = false; > > found = 1; > > break; > > } > > diff --git a/multipathd/main.c b/multipathd/main.c > > index 7aaae773..9aa5a2fa 100644 > > --- a/multipathd/main.c > > +++ b/multipathd/main.c > > @@ -644,11 +644,45 @@ pr_register_active_paths(struct multipath *mpp) > > } > > } > > > > +static void > > +save_offline_paths(const struct multipath *mpp, vector > > offline_paths) > > +{ > > + unsigned int i, j; > > + struct path *pp; > > + struct pathgroup *pgp; > > + > > + vector_foreach_slot (mpp->pg, pgp, i) > > + vector_foreach_slot (pgp->paths, pp, j) > > + if (pp->initialized == INIT_OK && > > + pp->sysfs_state == PATH_DOWN) > > + /* ignore failures storing the > > paths. */ > > + store_path(offline_paths, pp); > > +} > > + > > +static void > > +handle_orphaned_offline_paths(vector offline_paths) > > +{ > > + unsigned int i; > > + struct path *pp; > > + > > + vector_foreach_slot (offline_paths, pp, i) > > + if (pp->mpp == NULL) > > + pp->add_when_online = true; > > +} > > + > > +static void > > +cleanup_reset_vec(struct vector_s **v) > > +{ > > + vector_reset(*v); > > +} > > + > > static int > > update_map (struct multipath *mpp, struct vectors *vecs, int > > new_map) > > { > > int retries = 3; > > char *params __attribute__((cleanup(cleanup_charp))) = NULL; > > + struct vector_s offline_paths_vec = { .allocated = 0 }; > > + vector offline_paths > > __attribute__((cleanup(cleanup_reset_vec))) = &offline_paths_vec; > > > > retry: > > condlog(4, "%s: updating new map", mpp->alias); > > @@ -685,6 +719,9 @@ fail: > > return 1; > > } > > > > + if (new_map && retries < 0) > > + save_offline_paths(mpp, offline_paths); > > + > > if (setup_multipath(vecs, mpp)) > > return 1; > > > > @@ -695,6 +732,9 @@ fail: > > if (mpp->prflag == PRFLAG_SET) > > pr_register_active_paths(mpp); > > > > + if (VECTOR_SIZE(offline_paths) != 0) > > + handle_orphaned_offline_paths(offline_paths); > > + > > if (retries < 0) > > condlog(0, "%s: failed reload in new map update", > > mpp->alias); > > return 0; > > @@ -2793,7 +2833,8 @@ check_uninitialized_path(struct path * pp, > > unsigned int ticks) > > struct config *conf; > > > > if (pp->initialized != INIT_NEW && pp->initialized != > > INIT_FAILED && > > - pp->initialized != INIT_MISSING_UDEV) > > + pp->initialized != INIT_MISSING_UDEV && > > + !(pp->initialized == INIT_OK && pp->add_when_online)) > > return CHECK_PATH_SKIPPED; > > > > if (pp->tick) > > @@ -2849,7 +2890,8 @@ update_uninitialized_path(struct vectors * > > vecs, struct path * pp) > > struct config *conf; > > > > if (pp->initialized != INIT_NEW && pp->initialized != > > INIT_FAILED && > > - pp->initialized != INIT_MISSING_UDEV) > > + pp->initialized != INIT_MISSING_UDEV && > > + !(pp->initialized == INIT_OK && pp->add_when_online)) > > return CHECK_PATH_SKIPPED; > > > > newstate = get_new_state(pp); > > @@ -2875,6 +2917,19 @@ update_uninitialized_path(struct vectors * > > vecs, struct path * pp) > > free_path(pp); > > return CHECK_PATH_REMOVED; > > } > > + } else if (pp->initialized == INIT_OK && pp->add_when_online > > && > > + (newstate == PATH_UP || newstate == PATH_GHOST)) > > { > > + pp->initialized = INIT_OK; > > + if (pp->recheck_wwid == RECHECK_WWID_ON && > > I wonder if we should always check the WWID here. After all, this path > has never been part of the map, and it was offline when the map was > created (IOW, the WWID couldn't be checked at that point in time). > Can we rely on the stored WWID? That make sense. I also noticed a typo in my documentation changes. I'll send a new patch. -Ben > Other than that, LGTM. > > Regards, > Martin > > > + check_path_wwid_change(pp)) { > > + condlog(0, "%s: path wwid change detected. > > Removing", > > + pp->dev); > > + return handle_path_wwid_change(pp, vecs)? > > + CHECK_PATH_REMOVED : > > + CHECK_PATH_SKIPPED; > > + } > > + ev_add_path(pp, vecs, 1); > > + pp->tick = 1; > > } > > return CHECK_PATH_CHECKED; > > } > > diff --git a/multipathd/multipathd.8.in b/multipathd/multipathd.8.in > > index 43f87bf8..a91acff1 100644 > > --- a/multipathd/multipathd.8.in > > +++ b/multipathd/multipathd.8.in > > @@ -595,8 +595,9 @@ The device serial number. > > The device marginal state, either \fImarginal\fR or \fInormal\fR. > > .TP > > .B %m > > -The multipath device that this device is a path of, or > > \fI[orphan]\fR if > > -it is not part of any multipath device. > > +The multipath device that this device is a path of, or > > \fI[offline]\fR > > +if this device could not be added to a device because is is offline > > or > > +\fI[orphan]\fR if it is not part of any multipath device. > > .TP > > .B %N > > The host World Wide Node Name (WWNN) of the device, if any.
diff --git a/libmultipath/print.c b/libmultipath/print.c index 00c03ace..019ae56f 100644 --- a/libmultipath/print.c +++ b/libmultipath/print.c @@ -658,8 +658,11 @@ snprint_path_serial (struct strbuf *buff, const struct path * pp) static int snprint_path_mpp (struct strbuf *buff, const struct path * pp) { - if (!pp->mpp) + if (!pp->mpp) { + if (pp->add_when_online) + return append_strbuf_str(buff, "[offline]"); return append_strbuf_str(buff, "[orphan]"); + } if (!pp->mpp->alias) return append_strbuf_str(buff, "[unknown]"); return snprint_str(buff, pp->mpp->alias); diff --git a/libmultipath/structs.h b/libmultipath/structs.h index 28de9a7f..39d1c71c 100644 --- a/libmultipath/structs.h +++ b/libmultipath/structs.h @@ -413,6 +413,7 @@ struct path { int eh_deadline; enum check_path_states is_checked; bool can_use_env_uid; + bool add_when_online; unsigned int checker_timeout; /* configlet pointers */ vector hwe; diff --git a/libmultipath/structs_vec.c b/libmultipath/structs_vec.c index f6407e12..663c9053 100644 --- a/libmultipath/structs_vec.c +++ b/libmultipath/structs_vec.c @@ -389,6 +389,9 @@ static void orphan_paths(vector pathvec, struct multipath *mpp, const char *reas free_path(pp); } else orphan_path(pp, reason); + } else if (pp->add_when_online && + strncmp(mpp->wwid, pp->wwid, WWID_SIZE) == 0) { + pp->add_when_online = false; } } } @@ -595,6 +598,8 @@ void sync_paths(struct multipath *mpp, vector pathvec) found = 0; vector_foreach_slot(mpp->pg, pgp, j) { if (find_slot(pgp->paths, (void *)pp) != -1) { + if (pp->add_when_online) + pp->add_when_online = false; found = 1; break; } diff --git a/multipathd/main.c b/multipathd/main.c index 7aaae773..9aa5a2fa 100644 --- a/multipathd/main.c +++ b/multipathd/main.c @@ -644,11 +644,45 @@ pr_register_active_paths(struct multipath *mpp) } } +static void +save_offline_paths(const struct multipath *mpp, vector offline_paths) +{ + unsigned int i, j; + struct path *pp; + struct pathgroup *pgp; + + vector_foreach_slot (mpp->pg, pgp, i) + vector_foreach_slot (pgp->paths, pp, j) + if (pp->initialized == INIT_OK && + pp->sysfs_state == PATH_DOWN) + /* ignore failures storing the paths. */ + store_path(offline_paths, pp); +} + +static void +handle_orphaned_offline_paths(vector offline_paths) +{ + unsigned int i; + struct path *pp; + + vector_foreach_slot (offline_paths, pp, i) + if (pp->mpp == NULL) + pp->add_when_online = true; +} + +static void +cleanup_reset_vec(struct vector_s **v) +{ + vector_reset(*v); +} + static int update_map (struct multipath *mpp, struct vectors *vecs, int new_map) { int retries = 3; char *params __attribute__((cleanup(cleanup_charp))) = NULL; + struct vector_s offline_paths_vec = { .allocated = 0 }; + vector offline_paths __attribute__((cleanup(cleanup_reset_vec))) = &offline_paths_vec; retry: condlog(4, "%s: updating new map", mpp->alias); @@ -685,6 +719,9 @@ fail: return 1; } + if (new_map && retries < 0) + save_offline_paths(mpp, offline_paths); + if (setup_multipath(vecs, mpp)) return 1; @@ -695,6 +732,9 @@ fail: if (mpp->prflag == PRFLAG_SET) pr_register_active_paths(mpp); + if (VECTOR_SIZE(offline_paths) != 0) + handle_orphaned_offline_paths(offline_paths); + if (retries < 0) condlog(0, "%s: failed reload in new map update", mpp->alias); return 0; @@ -2793,7 +2833,8 @@ check_uninitialized_path(struct path * pp, unsigned int ticks) struct config *conf; if (pp->initialized != INIT_NEW && pp->initialized != INIT_FAILED && - pp->initialized != INIT_MISSING_UDEV) + pp->initialized != INIT_MISSING_UDEV && + !(pp->initialized == INIT_OK && pp->add_when_online)) return CHECK_PATH_SKIPPED; if (pp->tick) @@ -2849,7 +2890,8 @@ update_uninitialized_path(struct vectors * vecs, struct path * pp) struct config *conf; if (pp->initialized != INIT_NEW && pp->initialized != INIT_FAILED && - pp->initialized != INIT_MISSING_UDEV) + pp->initialized != INIT_MISSING_UDEV && + !(pp->initialized == INIT_OK && pp->add_when_online)) return CHECK_PATH_SKIPPED; newstate = get_new_state(pp); @@ -2875,6 +2917,19 @@ update_uninitialized_path(struct vectors * vecs, struct path * pp) free_path(pp); return CHECK_PATH_REMOVED; } + } else if (pp->initialized == INIT_OK && pp->add_when_online && + (newstate == PATH_UP || newstate == PATH_GHOST)) { + pp->initialized = INIT_OK; + if (pp->recheck_wwid == RECHECK_WWID_ON && + check_path_wwid_change(pp)) { + condlog(0, "%s: path wwid change detected. Removing", + pp->dev); + return handle_path_wwid_change(pp, vecs)? + CHECK_PATH_REMOVED : + CHECK_PATH_SKIPPED; + } + ev_add_path(pp, vecs, 1); + pp->tick = 1; } return CHECK_PATH_CHECKED; } diff --git a/multipathd/multipathd.8.in b/multipathd/multipathd.8.in index 43f87bf8..a91acff1 100644 --- a/multipathd/multipathd.8.in +++ b/multipathd/multipathd.8.in @@ -595,8 +595,9 @@ The device serial number. The device marginal state, either \fImarginal\fR or \fInormal\fR. .TP .B %m -The multipath device that this device is a path of, or \fI[orphan]\fR if -it is not part of any multipath device. +The multipath device that this device is a path of, or \fI[offline]\fR +if this device could not be added to a device because is is offline or +\fI[orphan]\fR if it is not part of any multipath device. .TP .B %N The host World Wide Node Name (WWNN) of the device, if any.
When a new device is added by the multipath command, multipathd may know of other paths that cannot be added to the device because they are currently offline. Instead of ignoring these paths, multipathd will now re-add them when they come back online. To do this, it multipathd needs a new path variable add_when_online, to track devices that could not be added to an existing multipath device because they were offline. These paths are handled along with the other uninitialized paths. Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> --- libmultipath/print.c | 5 +++- libmultipath/structs.h | 1 + libmultipath/structs_vec.c | 5 ++++ multipathd/main.c | 59 ++++++++++++++++++++++++++++++++++++-- multipathd/multipathd.8.in | 5 ++-- 5 files changed, 70 insertions(+), 5 deletions(-)