Message ID | 20241206233617.382200-4-mwilck@suse.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Delegated to: | Benjamin Marzinski |
Headers | show |
Series | multipathd: More map reload handling, and checkerloop work | expand |
On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > We previously didn't allow map removal inside the checker loop. But > with the late updates to the checkerloop code, it should be safe to orphan > paths and delete maps even in this situation. We remove such maps everywhere > else in the code already, whenever refresh_multipath() or setup_multipath() > is called. I don't think that this is safe. It's possible that the multipath device has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get silently removed from the pathvec if we remove the map here. This will mess up our iteration through the pathvec in update_paths(). Perhaps a better idea would be to set mpp->sync_ticks to 0 if update_multipath_strings() fails in do_sync_mpp(). This would force a refresh by sync_mpp() at the start of the next loop in checkerloop(), where it can safely remove the multipath device. -Ben > Signed-off-by: Martin Wilck <mwilck@suse.com> > --- > multipathd/main.c | 43 ++++++++++++++++++++----------------------- > 1 file changed, 20 insertions(+), 23 deletions(-) > > diff --git a/multipathd/main.c b/multipathd/main.c > index 4a28fbb..131dab6 100644 > --- a/multipathd/main.c > +++ b/multipathd/main.c > @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp) > return newstate; > } > > -static void > -do_sync_mpp(struct vectors * vecs, struct multipath *mpp) > +/* Returns true if the mpp was deleted */ > +static int > +do_sync_mpp(struct vectors *vecs, struct multipath *mpp) > { > - int i, ret; > - struct path *pp; > + int ret; > + > + ret = refresh_multipath(vecs, mpp); > + if (ret) > + return ret; > > - ret = update_multipath_strings(mpp, vecs->pathvec); > - if (ret != DMP_OK) { > - condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ? 
> - "device not found" : > - "couldn't synchronize with kernel state"); > - vector_foreach_slot (mpp->paths, pp, i) > - pp->dmstate = PSTATE_UNDEF; > - return; > - } > set_no_path_retry(mpp); > + return 0; > } > > -static void > +static int > sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks) > { > if (mpp->sync_tick) > mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks : > mpp->sync_tick; > if (mpp->sync_tick) > - return; > + return 0; > > - do_sync_mpp(vecs, mpp); > + return do_sync_mpp(vecs, mpp); > } > > static int > @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp) > return handle_path_wwid_change(pp, vecs)? CHECK_PATH_REMOVED : > CHECK_PATH_SKIPPED; > } > - if (pp->mpp->synced_count == 0) { > - do_sync_mpp(vecs, pp->mpp); > + if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp)) > /* if update_multipath_strings orphaned the path, quit early */ > - if (!pp->mpp) > - return CHECK_PATH_SKIPPED; > - } > + return CHECK_PATH_SKIPPED; > + > if ((newstate != PATH_UP && newstate != PATH_GHOST && > newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) { > /* If path state become failed again cancel path delay state */ > @@ -3018,8 +3012,11 @@ checkerloop (void *ap) > mpp->synced_count = 0; > if (checker_state == CHECKER_STARTING) { > vector_foreach_slot(vecs->mpvec, mpp, i) { > - sync_mpp(vecs, mpp, ticks); > - mpp->prio_update = PRIO_UPDATE_NONE; > + if (sync_mpp(vecs, mpp, ticks)) > + /* map deleted */ > + i--; > + else > + mpp->prio_update = PRIO_UPDATE_NONE; > } > vector_foreach_slot(vecs->pathvec, pp, i) > pp->is_checked = CHECK_PATH_UNCHECKED; > -- > 2.47.0
On Tue, Dec 10, 2024 at 02:02:32PM -0500, Benjamin Marzinski wrote: > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > We previously didn't allow map removal inside the checker loop. But > > with the late updates to the checkerloop code, it should be safe to orphan > > paths and delete maps even in this situation. We remove such maps everywhere > > else in the code already, whenever refresh_multipath() or setup_multipath() > > is called. > > I don't think that this is safe. It's possible that the multipath device > has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get > silently removed from the pathvec if we remove the map here. This will > mess up our iteration through the pathvec in update_paths(). Perhaps a > better idea would be to set mpp->sync_ticks to 0 if > update_multipath_strings() fails in do_sync_mpp(). This would force a > refresh by sync_mpp() at the start of the next loop in checkerloop(), > where it can safely remove the multipath device. If we go this route we should probably rename do_sync_map() to sync_map() and rename sync_map() to something like check_refresh_map(), since it would now be calling refresh_map() instead of do_sync_map(). 
> > -Ben > > > Signed-off-by: Martin Wilck <mwilck@suse.com> > > --- > > multipathd/main.c | 43 ++++++++++++++++++++----------------------- > > 1 file changed, 20 insertions(+), 23 deletions(-) > > > > diff --git a/multipathd/main.c b/multipathd/main.c > > index 4a28fbb..131dab6 100644 > > --- a/multipathd/main.c > > +++ b/multipathd/main.c > > @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp) > > return newstate; > > } > > > > -static void > > -do_sync_mpp(struct vectors * vecs, struct multipath *mpp) > > +/* Returns true if the mpp was deleted */ > > +static int > > +do_sync_mpp(struct vectors *vecs, struct multipath *mpp) > > { > > - int i, ret; > > - struct path *pp; > > + int ret; > > + > > + ret = refresh_multipath(vecs, mpp); > > + if (ret) > > + return ret; > > > > - ret = update_multipath_strings(mpp, vecs->pathvec); > > - if (ret != DMP_OK) { > > - condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ? > > - "device not found" : > > - "couldn't synchronize with kernel state"); > > - vector_foreach_slot (mpp->paths, pp, i) > > - pp->dmstate = PSTATE_UNDEF; > > - return; > > - } > > set_no_path_retry(mpp); > > + return 0; > > } > > > > -static void > > +static int > > sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks) > > { > > if (mpp->sync_tick) > > mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks : > > mpp->sync_tick; > > if (mpp->sync_tick) > > - return; > > + return 0; > > > > - do_sync_mpp(vecs, mpp); > > + return do_sync_mpp(vecs, mpp); > > } > > > > static int > > @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp) > > return handle_path_wwid_change(pp, vecs)? 
CHECK_PATH_REMOVED : > > CHECK_PATH_SKIPPED; > > } > > - if (pp->mpp->synced_count == 0) { > > - do_sync_mpp(vecs, pp->mpp); > > + if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp)) > > /* if update_multipath_strings orphaned the path, quit early */ > > - if (!pp->mpp) > > - return CHECK_PATH_SKIPPED; > > - } > > + return CHECK_PATH_SKIPPED; > > + > > if ((newstate != PATH_UP && newstate != PATH_GHOST && > > newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) { > > /* If path state become failed again cancel path delay state */ > > @@ -3018,8 +3012,11 @@ checkerloop (void *ap) > > mpp->synced_count = 0; > > if (checker_state == CHECKER_STARTING) { > > vector_foreach_slot(vecs->mpvec, mpp, i) { > > - sync_mpp(vecs, mpp, ticks); > > - mpp->prio_update = PRIO_UPDATE_NONE; > > + if (sync_mpp(vecs, mpp, ticks)) > > + /* map deleted */ > > + i--; > > + else > > + mpp->prio_update = PRIO_UPDATE_NONE; > > } > > vector_foreach_slot(vecs->pathvec, pp, i) > > pp->is_checked = CHECK_PATH_UNCHECKED; > > -- > > 2.47.0
On Tue, 2024-12-10 at 14:02 -0500, Benjamin Marzinski wrote: > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > We previously didn't allow map removal inside the checker loop. But > > with the late updates to the checkerloop code, it should be safe to > > orphan > > paths and delete maps even in this situation. We remove such maps > > everywhere > > else in the code already, whenever refresh_multipath() or > > setup_multipath() > > is called. > > I don't think that this is safe. It's possible that the multipath > device > has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get > silently removed from the pathvec if we remove the map here. This > will > mess up our iteration through the pathvec in update_paths(). Hm. You're right. But that applies to the current code in 0.11.0 PR as well, because we'd call do_sync_mpp() update_multipath_strings() sync_paths() check_removed_paths() vector_del_slot(pathvec, i--); Or am I missing something? It seems to me that the only safe way to handle this is to refrain from deleting paths from the pathvec anywhere deep in the call stack. Even if we can avoid this situation now by moving the sync towards the end of the checker loop, I believe that in the long run we need to fix these traps in our code, because it's just so easy to get this wrong. I wonder if we need yet another path state, of if we could simply set these entries in the pathvec to NULL. That sounds crazy, but it might actually be doable. Not 0.11.0 material, though. > Perhaps a > better idea would be to set mpp->sync_ticks to 0 if > update_multipath_strings() fails in do_sync_mpp(). This would force a > refresh by sync_mpp() at the start of the next loop in checkerloop(), > where it can safely remove the multipath device. I like the idea of your other post to move the sync to the CHECKER_FINISHED state. Thanks, Martin
On Tue, Dec 10, 2024 at 10:05:14PM +0100, Martin Wilck wrote: > On Tue, 2024-12-10 at 14:02 -0500, Benjamin Marzinski wrote: > > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > > We previously didn't allow map removal inside the checker loop. But > > > with the late updates to the checkerloop code, it should be safe to > > > orphan > > > paths and delete maps even in this situation. We remove such maps > > > everywhere > > > else in the code already, whenever refresh_multipath() or > > > setup_multipath() > > > is called. > > > > I don't think that this is safe. It's possible that the multipath > > device > > has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get > > silently removed from the pathvec if we remove the map here. This > > will > > mess up our iteration through the pathvec in update_paths(). > > Hm. You're right. But that applies to the current code in 0.11.0 PR as > well, because we'd call > > do_sync_mpp() > update_multipath_strings() > sync_paths() > check_removed_paths() > vector_del_slot(pathvec, i--); > > Or am I missing something? Nope. You're right. Nuts. > It seems to me that the only safe way to handle this is to refrain from > deleting paths from the pathvec anywhere deep in the call stack. Even > if we can avoid this situation now by moving the sync towards the end > of the checker loop, I believe that in the long run we need to fix > these traps in our code, because it's just so easy to get this wrong. > > I wonder if we need yet another path state, or if we could simply set > these entries in the pathvec to NULL. That sounds crazy, but it might > actually be doable. Not 0.11.0 material, though. I think we could just not call check_removed_paths() in sync_paths(). 
Instead we would call check_removed_paths() in refresh_multipath(), so we did it after loads and in update_multipath. I'm pretty sure that should be fine. If the device table changed and removed a path so that we can free it, either we reloaded the device, and we will call setup_multipath() after the reload, or something external did, and multipathd will see an event for that and call setup_multipath() via update_multipath(). Does that make sense? > > Perhaps a > > better idea would be to set mpp->sync_ticks to 0 if > > update_multipath_strings() fails in do_sync_mpp(). This would force a > > refresh by sync_mpp() at the start of the next loop in checkerloop(), > > where it can safely remove the multipath device. > > I like the idea of your other post to move the sync to the > CHECKER_FINISHED state. > > Thanks, > Martin >
On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > We previously didn't allow map removal inside the checker loop. But > with the late updates to the checkerloop code, it should be safe to orphan > paths and delete maps even in this situation. We remove such maps everywhere > else in the code already, whenever refresh_multipath() or setup_multipath() > is called. Actually, thinking about this more, what do we get by proactively deleting the multipath device if something goes wrong in the checker? If we successfully reload a device, but can't sync it with the kernel, that's one thing, But that was triggered by a change in the device, and we know that when we reloaded the device, device-mapper was working. I'm leery of possibly deleting the map because of a transient device-mapper issue. I'm not sure if on a check that we do repeatedly, we should delete the device on an error. We haven't in the past, and as far as I know, it doesn't cause problems. Without a benefit to doing this, I'm not sure it makes sense. -Ben > > Signed-off-by: Martin Wilck <mwilck@suse.com> > --- > multipathd/main.c | 43 ++++++++++++++++++++----------------------- > 1 file changed, 20 insertions(+), 23 deletions(-) > > diff --git a/multipathd/main.c b/multipathd/main.c > index 4a28fbb..131dab6 100644 > --- a/multipathd/main.c > +++ b/multipathd/main.c > @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp) > return newstate; > } > > -static void > -do_sync_mpp(struct vectors * vecs, struct multipath *mpp) > +/* Returns true if the mpp was deleted */ > +static int > +do_sync_mpp(struct vectors *vecs, struct multipath *mpp) > { > - int i, ret; > - struct path *pp; > + int ret; > + > + ret = refresh_multipath(vecs, mpp); > + if (ret) > + return ret; > > - ret = update_multipath_strings(mpp, vecs->pathvec); > - if (ret != DMP_OK) { > - condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ? 
> - "device not found" : > - "couldn't synchronize with kernel state"); > - vector_foreach_slot (mpp->paths, pp, i) > - pp->dmstate = PSTATE_UNDEF; > - return; > - } > set_no_path_retry(mpp); > + return 0; > } > > -static void > +static int > sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks) > { > if (mpp->sync_tick) > mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks : > mpp->sync_tick; > if (mpp->sync_tick) > - return; > + return 0; > > - do_sync_mpp(vecs, mpp); > + return do_sync_mpp(vecs, mpp); > } > > static int > @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp) > return handle_path_wwid_change(pp, vecs)? CHECK_PATH_REMOVED : > CHECK_PATH_SKIPPED; > } > - if (pp->mpp->synced_count == 0) { > - do_sync_mpp(vecs, pp->mpp); > + if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp)) > /* if update_multipath_strings orphaned the path, quit early */ > - if (!pp->mpp) > - return CHECK_PATH_SKIPPED; > - } > + return CHECK_PATH_SKIPPED; > + > if ((newstate != PATH_UP && newstate != PATH_GHOST && > newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) { > /* If path state become failed again cancel path delay state */ > @@ -3018,8 +3012,11 @@ checkerloop (void *ap) > mpp->synced_count = 0; > if (checker_state == CHECKER_STARTING) { > vector_foreach_slot(vecs->mpvec, mpp, i) { > - sync_mpp(vecs, mpp, ticks); > - mpp->prio_update = PRIO_UPDATE_NONE; > + if (sync_mpp(vecs, mpp, ticks)) > + /* map deleted */ > + i--; > + else > + mpp->prio_update = PRIO_UPDATE_NONE; > } > vector_foreach_slot(vecs->pathvec, pp, i) > pp->is_checked = CHECK_PATH_UNCHECKED; > -- > 2.47.0
On Tue, 2024-12-10 at 18:30 -0500, Benjamin Marzinski wrote: > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > We previously didn't allow map removal inside the checker loop. But > > with the late updates to the checkerloop code, it should be safe to > > orphan > > paths and delete maps even in this situation. We remove such maps > > everywhere > > else in the code already, whenever refresh_multipath() or > > setup_multipath() > > is called. > > Actually, thinking about this more, what do we get by proactively > deleting the multipath device if something goes wrong in the checker? > If > we successfully reload a device, but can't sync it with the kernel, > that's one thing, But that was triggered by a change in the device, > and > we know that when we reloaded the device, device-mapper was working. > I'm > leery of possibly deleting the map because of a transient device- > mapper > issue. I'm not sure if on a check that we do repeatedly, we should > delete the device on an error. We haven't in the past, and as far as > I > know, it doesn't cause problems. I don't disagree. But the same can be said for basically all call chains where setup_multipath() is called for an existing map. I was just following the pattern that we use e.g. in ev_add_path(), or in update_mpp_prio(). Why would we treat the checker and path addition differently in this respect? If we look at this pragmatically (assuming that multipathd gets the parameters right), the most probable reason for a map reload failure is failure to open a path device in bdev_open(), either because the device doesn't exist, or because it's busy or otherwise unavailable. If this happens in ev_add_path(), the likely reason is that the path just added was busy, and the smartest action upon such a failure would probably be to just undo that addition. We currently don't do that; we remove the entire map, which is questionable, as you state correctly. In the checker, this can't happen. 
Obviously, no other process can grab a path device while the device mapper is holding it, so -EBUSY won't occur if we reload an existing map. Even device deletion doesn't cause failure on reload. It is possible to delete a SCSI device while it's mapped, and execute a table reload / suspend / resume cycle on the map while referencing the deleted device. The kernel keeps holding the reference to the deleted device, and will simply mark it as failed. This holds also if the mapped paths are re-grouped or re- ordered in the table. Failure occurs only if we temporarily remove the device from the map and re-add it, because as soon as the device is removed from the map's dm table, its refcount drops to zero, and it's gone for good. IOW, reloading a map with a table containing only already-mapped devices will never fail, except in extreme situations like kernel OOM. Thus, AFAICS, the only relevant scenario where a reloading would fail is trying to add a path device that was not previously mapped, and that's either busy (perhaps in another map) or has been deleted, IOW only when we reload after calling adopt_paths(). This is where we could improve. If we fail to reload after adopting new paths, we could fall back to the existing table, and perhaps try to add paths one by one. Again, this is post-0.11 material. OTOH, practially impossible is not totally impossible, so we need to be prepared to map reload failure either way. IMO the best thing we can do in this case is to keep using the kernel's map, and retry reloading later. The only critical situation is WWID change of path devices. We must try to fix this situation ASAP when we detect it. I'm unsure what the best action is if a reload fails in that situation, though (other than failing the path, as we already do). Martin
On Wed, Dec 11, 2024 at 01:06:46PM +0100, Martin Wilck wrote: > On Tue, 2024-12-10 at 18:30 -0500, Benjamin Marzinski wrote: > > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > > We previously didn't allow map removal inside the checker loop. But > > > with the late updates to the checkerloop code, it should be safe to > > > orphan > > > paths and delete maps even in this situation. We remove such maps > > > everywhere > > > else in the code already, whenever refresh_multipath() or > > > setup_multipath() > > > is called. > > > > Actually, thinking about this more, what do we get by proactively > > deleting the multipath device if something goes wrong in the checker? > > If > > we successfully reload a device, but can't sync it with the kernel, > > that's one thing, But that was triggered by a change in the device, > > and > > we know that when we reloaded the device, device-mapper was working. > > I'm > > leery of possibly deleting the map because of a transient device- > > mapper > > issue. I'm not sure if on a check that we do repeatedly, we should > > delete the device on an error. We haven't in the past, and as far as > > I > > know, it doesn't cause problems. > > I don't disagree. But the same can be said for basically all call > chains where setup_multipath() is called for an existing map. I was > just following the pattern that we use e.g. in ev_add_path(), or in > update_mpp_prio(). Why would we treat the checker and path addition > differently in this respect? I'm confused here. ev_add_path() doesn't remove the device if the reload fails. If a reload fails, the table should stay the same. That's why I said that in other cases where we delete the device, we know that when we just reloaded the device, device-mapper was working. Looking at the code, that isn't really true. After failed reloads, we still call setup_multipath to update our state, and we will delete the device if that fails. 
> If we look at this pragmatically (assuming that multipathd gets the > parameters right), the most probable reason for a map reload failure is > failure to open a path device in bdev_open(), either because the device > doesn't exist, or because it's busy or otherwise unavailable. If this > happens in ev_add_path(), the likely reason is that the path just added > was busy, and the smartest action upon such a failure would probably be > to just undo that addition. We currently don't do that; we remove the > entire map, which is questionable, as you state correctly. This is why we call setup_multipath after failed reloads, to make sure multipathd's view of the multipath device resyncs with the kernel's, which hasn't changed from what it was before the reload failed. > In the checker, this can't happen. Obviously, no other process can grab > a path device while the device mapper is holding it, so -EBUSY won't > occur if we reload an existing map. Even device deletion doesn't cause > failure on reload. It is possible to delete a SCSI device while it's > mapped, and execute a table reload / suspend / resume cycle on the map > while referencing the deleted device. The kernel keeps holding the > reference to the deleted device, and will simply mark it as > failed. This holds also if the mapped paths are re-grouped or re- > ordered in the table. Failure occurs only if we temporarily remove the > device from the map and re-add it, because as soon as the device is > removed from the map's dm table, its refcount drops to zero, and it's > gone for good. > > IOW, reloading a map with a table containing only already-mapped > devices will never fail, except in extreme situations like kernel OOM. Maybe I should clarify my position a bit. I am fine with reloading the device in the checkerloop if something has changed. 
This obviously does run a very small risk of something going wrong and a device getting removed unnecessarily, but we know that we need to reload the device, so we should. What I would rather avoid is reloading the device because we failed to get it's state in do_sync_mpp(). We don't do this because we know that something has changed. We do this as a safety measure to deal with corner cases where our state doesn't match the kernel's and we didn't get an event. Double checking this each time we check a path in a device saves having to catch all these corner cases elsewhere. But it's almost always completely unnecessary, and we're doing it on every multipath device every couple of seconds, unlike reloading a device, which is rare. > Thus, AFAICS, the only relevant scenario where a reloading would fail > is trying to add a path device that was not previously mapped, and > that's either busy (perhaps in another map) or has been deleted, IOW > only when we reload after calling adopt_paths(). This is where we could > improve. If we fail to reload after adopting new paths, we could fall > back to the existing table, and perhaps try to add paths one by one. > Again, this is post-0.11 material. > > OTOH, practially impossible is not totally impossible, so we need to be > prepared to map reload failure either way. IMO the best thing we can do > in this case is to keep using the kernel's map, and retry reloading > later. I'm not actually worried about the kernel so much as libdevmapper. It is not designed for multi-threaded processes, and that has bitten us in the past. For intance, it's why we don't delete devices in dmevent_loop() on libdevmapper errors. dm_get_events() just waits and retries if getting the device list fails, and for each device, it calls dm_is_mpath and will only delete a device on DM_IS_MPATH_NO, which is what I suggested for the cleanup function. 
I'm pretty sure we've handled all of the known issues here, with fixes like: 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a mutex") 34e01d2f ("multipath-tools: don't call dm_lib_release() any more") I'd rather not risk having missed some issue that could cause a temporary error in a function that we call every couple of seconds (almost always unnecessarily). -Ben > The only critical situation is WWID change of path devices. We must try > to fix this situation ASAP when we detect it. I'm unsure what the best > action is if a reload fails in that situation, though (other than > failing the path, as we already do). > > Martin
On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote: > On Wed, Dec 11, 2024 at 01:06:46PM +0100, Martin Wilck wrote: > > On Tue, 2024-12-10 at 18:30 -0500, Benjamin Marzinski wrote: > > > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > > > We previously didn't allow map removal inside the checker loop. > > > > But > > > > with the late updates to the checkerloop code, it should be > > > > safe to > > > > orphan > > > > paths and delete maps even in this situation. We remove such > > > > maps > > > > everywhere > > > > else in the code already, whenever refresh_multipath() or > > > > setup_multipath() > > > > is called. > > > > > > Actually, thinking about this more, what do we get by proactively > > > deleting the multipath device if something goes wrong in the > > > checker? > > > If > > > we successfully reload a device, but can't sync it with the > > > kernel, > > > that's one thing, But that was triggered by a change in the > > > device, > > > and > > > we know that when we reloaded the device, device-mapper was > > > working. > > > I'm > > > leery of possibly deleting the map because of a transient device- > > > mapper > > > issue. I'm not sure if on a check that we do repeatedly, we > > > should > > > delete the device on an error. We haven't in the past, and as > > > far as > > > I > > > know, it doesn't cause problems. > > > > I don't disagree. But the same can be said for basically all call > > chains where setup_multipath() is called for an existing map. I was > > just following the pattern that we use e.g. in ev_add_path(), or in > > update_mpp_prio(). Why would we treat the checker and path addition > > differently in this respect? > > I'm confused here. Well, I was writing confused things. My thinking was going in circles about the removal of paths and maps, and I didn't properly distinguish between map reloading and updating the state from the kernel. Sorry. > ev_add_path() doesn't remove the device if the reload > fails. 
If a reload fails, the table should stay the same. That's why > I > said that in other cases where we delete the device, we know that > when > we just reloaded the device, device-mapper was working. Looking at > the > code, that isn't really true. After failed reloads, we still call > setup_multipath to update our state, and we will delete the device if > that fails. > This is why we call setup_multipath after failed reloads, to make > sure > multipathd's view of the multipath device resyncs with the kernel's, > which hasn't changed from what it was before the reload failed. Right. > > In the checker, this can't happen. Obviously, no other process can > > grab > > a path device while the device mapper is holding it, so -EBUSY > > won't > > occur if we reload an existing map. Even device deletion doesn't > > cause > > failure on reload. It is possible to delete a SCSI device while > > it's > > mapped, and execute a table reload / suspend / resume cycle on the > > map > > while referencing the deleted device. The kernel keeps holding the > > reference to the deleted device, and will simply mark it as > > failed. This holds also if the mapped paths are re-grouped or re- > > ordered in the table. Failure occurs only if we temporarily remove > > the > > device from the map and re-add it, because as soon as the device is > > removed from the map's dm table, its refcount drops to zero, and > > it's > > gone for good. > > > > IOW, reloading a map with a table containing only already-mapped > > devices will never fail, except in extreme situations like kernel > > OOM. > > Maybe I should clarify my position a bit. I am fine with reloading > the > device in the checkerloop if something has changed. This obviously > does run a very small risk of something going wrong and a device > getting > removed unnecessarily, but we know that we need to reload the device, > so > we should. 
> > What I would rather avoid is reloading the device because we failed > to > get its state in do_sync_mpp(). FTR, in my v4 patchset, I won't try to do that any more. > I'm not actually worried about the kernel so much as libdevmapper. It > is > not designed for multi-threaded processes, and that has bitten us in > the > past. For instance, it's why we don't delete devices in dmevent_loop() > on > libdevmapper errors. dm_get_events() just waits and retries if > getting > the device list fails, and for each device, it calls dm_is_mpath and > will only delete a device on DM_IS_MPATH_NO, which is what I > suggested > for the cleanup function. > > I'm pretty sure we've handled all of the known issues here, with > fixes > like: > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a > mutex") > 34e01d2f ("multipath-tools: don't call dm_lib_release() any more") > > I'd rather not risk having missed some issue that could cause a > temporary error in a function that we call every couple of seconds > (almost always unnecessarily). Ok, getting it. I thought that an error in DM_TABLE_STATUS must almost necessarily mean -ENXIO (from the kernel pov), which would mean that some external entity removed the device, and that we should act as if someone had used the "remove map" CLI command. But I didn't think about libdevmapper. Martin
On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote: > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote: > > > > I'm not actually worried about the kernel so much as libdevmapper. > > It > > is > > not designed for multi-threaded processes, and that has bitten us > > in > > the > > past. For instance, it's why we don't delete devices in > > dmevent_loop() > > on > > libdevmapper errors. dm_get_events() just waits and retries if > > getting > > the device list fails, and for each device, it calls dm_is_mpath > > and > > will only delete a device on DM_IS_MPATH_NO, which is what I > > suggested > > for the cleanup function. > > > > I'm pretty sure we've handled all of the known issues here, with > > fixes > > like: > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a > > mutex") > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any more") > > > > I'd rather not risk having missed some issue that could cause a > > temporary error in a function that we call every couple of seconds > > (almost always unnecessarily). > > Ok, getting it. I thought that an error in DM_TABLE_STATUS must > almost > necessarily mean -ENXIO (from the kernel pov), which would mean that > some external entity removed the device, and that we should act as if > someone had used the "remove map" CLI command. But I didn't think > about > libdevmapper. But will libdevmapper return ENXIO if it's somehow internally confused? I don't think so. I believe that if we see this error code, removing the map is the right thing to do. I consider adding a patch on top of the v4 series that does this. If you reject it, fine :-) Regards, Martin
On Tue, 2024-12-10 at 17:49 -0500, Benjamin Marzinski wrote: > On Tue, Dec 10, 2024 at 10:05:14PM +0100, Martin Wilck wrote: > > On Tue, 2024-12-10 at 14:02 -0500, Benjamin Marzinski wrote: > > > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote: > > > > We previously didn't allow map removal inside the checker loop. > > > > But > > > > with the late updates to the checkerloop code, it should be > > > > safe to > > > > orphan > > > > paths and delete maps even in this situation. We remove such > > > > maps > > > > everywhere > > > > else in the code already, whenever refresh_multipath() or > > > > setup_multipath() > > > > is called. > > > > > > I don't think that this is safe. It's possible that the multipath > > > device > > > has paths in the INIT_REMOVED or INIT_PARTIAL state. These will > > > get > > > silently removed from the pathvec if we remove the map here. This > > > will > > > mess up our iteration through the pathvec in update_paths(). > > > > Hm. You're right. But that applies to the current code in 0.11.0 PR > > as > > well, because we'd call > > > > do_sync_mpp() > > update_multipath_strings() > > sync_paths() > > check_removed_paths() > > vector_del_slot(pathvec, i--); > > > > Or am I missing something? > > Nope. Your right. Nuts. > > > It seems to me that the only safe way to handle this is to refrain > > from > > deleting paths from the pathvec anywhere deep in the call stack. > > Even > > if we can avoid this situation now by moving the sync towards the > > end > > of the checker loop, I believe that in the long run we need to fix > > these traps in our code, because it's just so easy to get this > > wrong. > > > > I wonder if we need yet another path state, of if we could simply > > set > > these entries in the pathvec to NULL. That sounds crazy, but it > > might > > actually be doable. Not 0.11.0 material, though. > > I think we could just not call check_removed_paths() in sync_paths(). 
> We > would still orphan all the paths that were no longer part of the > multipath device, and set pp->mpp for all the paths that are part of > the > device just like before, but we wouldn't delete the paths from > pathvec there. Instead we would call check_removed_paths() in > refresh_multipath(), so we did it after loads and in > update_multipath. > > I'm pretty sure that should be fine. If the device table changed and > removed a path so that we can free it, either we reloaded the device, > and we will call setup_multipath() after the reload, or something > external did, and multipathd will see an event for that and call > setup_multipath() via update_multipath(). > > Does that make sense? I think we'll be fine with my upcoming patch set, which will call reload_and_sync_map() only from checker_finished(). I don't change sync_paths() in this set so far. I'm a little concerned about refresh_multipath() and reload_and_sync_map() being called from various CLI functions. But I won't start digging into that now. Maps may get removed in CLI calls, so what. Thanks, Martin
On Wed, Dec 11, 2024 at 09:33:40PM +0100, Martin Wilck wrote: > On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote: > > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote: > > > > > > > I'm not actually worried about the kernel so much as libdevmapper. > > > It > > > is > > > not designed for multi-threaded processes, and that has bitten us > > > in > > > the > > > past. For intance, it's why we don't delete devices in > > > dmevent_loop() > > > on > > > libdevmapper errors. dm_get_events() just waits and retries if > > > getting > > > the device list fails, and for each device, it calls dm_is_mpath > > > and > > > will only delete a device on DM_IS_MPATH_NO, which is what I > > > suggested > > > for the cleanup function. > > > > > > I'm pretty sure we've handled all of the known issues here, with > > > fixes > > > like: > > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a > > > mutex") > > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any more") > > > > > > I'd rather not risk having missed some issue that could cause a > > > temporary error in a function that we call every couple of seconds > > > (almost always unnecessarily). > > > > Ok, getting it. I thought that an error in DM_TABLE_STATUS must > > almost > > neccessarily mean -ENXIO (from the kernel pov), which would mean that > > some external entity removed the device, and that we should act as if > > someone had used the "remove map" CLI command. But I didn't think > > about > > libdevmapper. > > But will libdevmapper return ENXIO if it's somehow interally confused? > I don't think so. I believe that if we see this error code, removing > the map is the right thing to do. I don't think that shouldn't ever happen. https://github.com/lvmteam/lvm2/blob/928b8e9c6eaf871b3405b91c64eac5ea854f2572/device_mapper/ioctl/libdm-iface.c#L2100 If libdevmapper gets an ENXIO from the kernel, it ends up setting dmi.exists to 0 instead of returning the error. 
-Ben > I consider adding a patch on top of the v4 series that does this. > If you reject it, fine :-) > > Regards, > Martin
On Thu, 2024-12-12 at 12:12 -0500, Benjamin Marzinski wrote: > On Wed, Dec 11, 2024 at 09:33:40PM +0100, Martin Wilck wrote: > > On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote: > > > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote: > > > > > > > > > > I'm not actually worried about the kernel so much as > > > > libdevmapper. > > > > It > > > > is > > > > not designed for multi-threaded processes, and that has bitten > > > > us > > > > in > > > > the > > > > past. For intance, it's why we don't delete devices in > > > > dmevent_loop() > > > > on > > > > libdevmapper errors. dm_get_events() just waits and retries if > > > > getting > > > > the device list fails, and for each device, it calls > > > > dm_is_mpath > > > > and > > > > will only delete a device on DM_IS_MPATH_NO, which is what I > > > > suggested > > > > for the cleanup function. > > > > > > > > I'm pretty sure we've handled all of the known issues here, > > > > with > > > > fixes > > > > like: > > > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a > > > > mutex") > > > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any > > > > more") > > > > > > > > I'd rather not risk having missed some issue that could cause a > > > > temporary error in a function that we call every couple of > > > > seconds > > > > (almost always unnecessarily). > > > > > > Ok, getting it. I thought that an error in DM_TABLE_STATUS must > > > almost > > > neccessarily mean -ENXIO (from the kernel pov), which would mean > > > that > > > some external entity removed the device, and that we should act > > > as if > > > someone had used the "remove map" CLI command. But I didn't think > > > about > > > libdevmapper. > > > > But will libdevmapper return ENXIO if it's somehow interally > > confused? > > I don't think so. I believe that if we see this error code, > > removing > > the map is the right thing to do. > > I don't think that shouldn't ever happen. 
> > https://github.com/lvmteam/lvm2/blob/928b8e9c6eaf871b3405b91c64eac5ea854f2572/device_mapper/ioctl/libdm-iface.c#L2100 > > If libdevmapper gets an ENXIO from the kernel, it ends up setting > dmi.exists to 0 instead of returning the error. I meant the kernel ioctl return code. Sorry for being unclear. >From the point of view of libmultipath, it doesn't matter. libmp_mapinfo tests for ENXIO and dmi.exists, and returns DMP_NOT_FOUND in both cases. Martin
On Thu, Dec 12, 2024 at 06:18:40PM +0100, Martin Wilck wrote: > On Thu, 2024-12-12 at 12:12 -0500, Benjamin Marzinski wrote: > > On Wed, Dec 11, 2024 at 09:33:40PM +0100, Martin Wilck wrote: > > > On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote: > > > > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote: > > > > > > > > > > > > > I'm not actually worried about the kernel so much as > > > > > libdevmapper. > > > > > It > > > > > is > > > > > not designed for multi-threaded processes, and that has bitten > > > > > us > > > > > in > > > > > the > > > > > past. For intance, it's why we don't delete devices in > > > > > dmevent_loop() > > > > > on > > > > > libdevmapper errors. dm_get_events() just waits and retries if > > > > > getting > > > > > the device list fails, and for each device, it calls > > > > > dm_is_mpath > > > > > and > > > > > will only delete a device on DM_IS_MPATH_NO, which is what I > > > > > suggested > > > > > for the cleanup function. > > > > > > > > > > I'm pretty sure we've handled all of the known issues here, > > > > > with > > > > > fixes > > > > > like: > > > > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a > > > > > mutex") > > > > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any > > > > > more") > > > > > > > > > > I'd rather not risk having missed some issue that could cause a > > > > > temporary error in a function that we call every couple of > > > > > seconds > > > > > (almost always unnecessarily). > > > > > > > > Ok, getting it. I thought that an error in DM_TABLE_STATUS must > > > > almost > > > > neccessarily mean -ENXIO (from the kernel pov), which would mean > > > > that > > > > some external entity removed the device, and that we should act > > > > as if > > > > someone had used the "remove map" CLI command. But I didn't think > > > > about > > > > libdevmapper. > > > > > > But will libdevmapper return ENXIO if it's somehow interally > > > confused? > > > I don't think so. 
I believe that if we see this error code, > > > removing > > > the map is the right thing to do. > > > > I don't think that shouldn't ever happen. > > > > https://github.com/lvmteam/lvm2/blob/928b8e9c6eaf871b3405b91c64eac5ea854f2572/device_mapper/ioctl/libdm-iface.c#L2100 > > > > If libdevmapper gets an ENXIO from the kernel, it ends up setting > > dmi.exists to 0 instead of returning the error. > > I meant the kernel ioctl return code. Sorry for being unclear. > > >From the the point of view of libmultipath, it doesn't matter. > libmp_mapinfo tests for ENXIO and dmi.exists, and returns DMP_NOT_FOUND > in both cases. Sure, that makes sense then. If libdevmapper told us the device doesn't exist, we can safely remove it. -Ben > Martin
diff --git a/multipathd/main.c b/multipathd/main.c index 4a28fbb..131dab6 100644 --- a/multipathd/main.c +++ b/multipathd/main.c @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp) return newstate; } -static void -do_sync_mpp(struct vectors * vecs, struct multipath *mpp) +/* Returns true if the mpp was deleted */ +static int +do_sync_mpp(struct vectors *vecs, struct multipath *mpp) { - int i, ret; - struct path *pp; + int ret; + + ret = refresh_multipath(vecs, mpp); + if (ret) + return ret; - ret = update_multipath_strings(mpp, vecs->pathvec); - if (ret != DMP_OK) { - condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ? - "device not found" : - "couldn't synchronize with kernel state"); - vector_foreach_slot (mpp->paths, pp, i) - pp->dmstate = PSTATE_UNDEF; - return; - } set_no_path_retry(mpp); + return 0; } -static void +static int sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks) { if (mpp->sync_tick) mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks : mpp->sync_tick; if (mpp->sync_tick) - return; + return 0; - do_sync_mpp(vecs, mpp); + return do_sync_mpp(vecs, mpp); } static int @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp) return handle_path_wwid_change(pp, vecs)? 
CHECK_PATH_REMOVED : CHECK_PATH_SKIPPED; } - if (pp->mpp->synced_count == 0) { - do_sync_mpp(vecs, pp->mpp); + if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp)) /* if update_multipath_strings orphaned the path, quit early */ - if (!pp->mpp) - return CHECK_PATH_SKIPPED; - } + return CHECK_PATH_SKIPPED; + if ((newstate != PATH_UP && newstate != PATH_GHOST && newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) { /* If path state become failed again cancel path delay state */ @@ -3018,8 +3012,11 @@ checkerloop (void *ap) mpp->synced_count = 0; if (checker_state == CHECKER_STARTING) { vector_foreach_slot(vecs->mpvec, mpp, i) { - sync_mpp(vecs, mpp, ticks); - mpp->prio_update = PRIO_UPDATE_NONE; + if (sync_mpp(vecs, mpp, ticks)) + /* map deleted */ + i--; + else + mpp->prio_update = PRIO_UPDATE_NONE; } vector_foreach_slot(vecs->pathvec, pp, i) pp->is_checked = CHECK_PATH_UNCHECKED;
We previously didn't allow map removal inside the checker loop. But with the late updates to the checkerloop code, it should be safe to orphan paths and delete maps even in this situation. We remove such maps everywhere else in the code already, whenever refresh_multipath() or setup_multipath() is called. Signed-off-by: Martin Wilck <mwilck@suse.com> --- multipathd/main.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-)