diff mbox series

[03/13] multipathd: allow map removal in do_sync_mpp()

Message ID 20241206233617.382200-4-mwilck@suse.com (mailing list archive)
State Not Applicable, archived
Delegated to: Benjamin Marzinski
Headers show
Series multipathd: More map reload handling, and checkerloop work | expand

Commit Message

Martin Wilck Dec. 6, 2024, 11:36 p.m. UTC
We previously didn't allow map removal inside the checker loop. But
with the late updates to the checkerloop code, it should be safe to orphan
paths and delete maps even in this situation. We remove such maps everywhere
else in the code already, whenever refresh_multipath() or setup_multipath()
is called.

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 multipathd/main.c | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

Comments

Benjamin Marzinski Dec. 10, 2024, 7:02 p.m. UTC | #1
On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> We previously didn't allow map removal inside the checker loop. But
> with the late updates to the checkerloop code, it should be safe to orphan
> paths and delete maps even in this situation. We remove such maps everywhere
> else in the code already, whenever refresh_multipath() or setup_multipath()
> is called.

I don't think that this is safe. It's possible that the multipath device
has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get
silently removed from the pathvec if we remove the map here. This will
mess up our iteration through the pathvec in update_paths(). Perhaps a
better idea would be to set mpp->sync_tick to 0 if
update_multipath_strings() fails in do_sync_mpp(). This would force a
refresh by sync_mpp() at the start of the next loop in checkerloop(),
where it can safely remove the multipath device.

-Ben
 
> Signed-off-by: Martin Wilck <mwilck@suse.com>
> ---
>  multipathd/main.c | 43 ++++++++++++++++++++-----------------------
>  1 file changed, 20 insertions(+), 23 deletions(-)
> 
> diff --git a/multipathd/main.c b/multipathd/main.c
> index 4a28fbb..131dab6 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp)
>  	return newstate;
>  }
>  
> -static void
> -do_sync_mpp(struct vectors * vecs, struct multipath *mpp)
> +/* Returns true if the mpp was deleted */
> +static int
> +do_sync_mpp(struct vectors *vecs, struct multipath *mpp)
>  {
> -	int i, ret;
> -	struct path *pp;
> +	int ret;
> +
> +	ret = refresh_multipath(vecs, mpp);
> +	if (ret)
> +		return ret;
>  
> -	ret = update_multipath_strings(mpp, vecs->pathvec);
> -	if (ret != DMP_OK) {
> -		condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ?
> -			"device not found" :
> -			"couldn't synchronize with kernel state");
> -		vector_foreach_slot (mpp->paths, pp, i)
> -			pp->dmstate = PSTATE_UNDEF;
> -		return;
> -	}
>  	set_no_path_retry(mpp);
> +	return 0;
>  }
>  
> -static void
> +static int
>  sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks)
>  {
>  	if (mpp->sync_tick)
>  		mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks :
>  				  mpp->sync_tick;
>  	if (mpp->sync_tick)
> -		return;
> +		return 0;
>  
> -	do_sync_mpp(vecs, mpp);
> +	return do_sync_mpp(vecs, mpp);
>  }
>  
>  static int
> @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp)
>  		return handle_path_wwid_change(pp, vecs)? CHECK_PATH_REMOVED :
>  							  CHECK_PATH_SKIPPED;
>  	}
> -	if (pp->mpp->synced_count == 0) {
> -		do_sync_mpp(vecs, pp->mpp);
> +	if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp))
>  		/* if update_multipath_strings orphaned the path, quit early */
> -		if (!pp->mpp)
> -			return CHECK_PATH_SKIPPED;
> -	}
> +		return CHECK_PATH_SKIPPED;
> +
>  	if ((newstate != PATH_UP && newstate != PATH_GHOST &&
>  	     newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) {
>  		/* If path state become failed again cancel path delay state */
> @@ -3018,8 +3012,11 @@ checkerloop (void *ap)
>  				mpp->synced_count = 0;
>  			if (checker_state == CHECKER_STARTING) {
>  				vector_foreach_slot(vecs->mpvec, mpp, i) {
> -					sync_mpp(vecs, mpp, ticks);
> -					mpp->prio_update = PRIO_UPDATE_NONE;
> +					if (sync_mpp(vecs, mpp, ticks))
> +						/* map deleted */
> +						i--;
> +					else
> +						mpp->prio_update = PRIO_UPDATE_NONE;
>  				}
>  				vector_foreach_slot(vecs->pathvec, pp, i)
>  					pp->is_checked = CHECK_PATH_UNCHECKED;
> -- 
> 2.47.0
Benjamin Marzinski Dec. 10, 2024, 7:44 p.m. UTC | #2
On Tue, Dec 10, 2024 at 02:02:32PM -0500, Benjamin Marzinski wrote:
> On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > We previously didn't allow map removal inside the checker loop. But
> > with the late updates to the checkerloop code, it should be safe to orphan
> > paths and delete maps even in this situation. We remove such maps everywhere
> > else in the code already, whenever refresh_multipath() or setup_multipath()
> > is called.
> 
> I don't think that this is safe. It's possible that the multipath device
> has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get
> silently removed from the pathvec if we remove the map here. This will
> mess up our iteration through the pathvec in update_paths(). Perhaps a
> better idea would be to set mpp->sync_ticks to 0 if
> update_multipath_strings() fails in do_sync_mpp(). This would force a
> refresh by sync_mpp() at the start of the next loop in checkerloop(),
> where it can safely remove the multipath device.

If we go this route we should probably rename do_sync_mpp() to
sync_mpp() and rename sync_mpp() to something like check_refresh_map(),
since it would now be calling refresh_multipath() instead of do_sync_mpp().

> 
> -Ben
>  
> > Signed-off-by: Martin Wilck <mwilck@suse.com>
> > ---
> >  multipathd/main.c | 43 ++++++++++++++++++++-----------------------
> >  1 file changed, 20 insertions(+), 23 deletions(-)
> > 
> > diff --git a/multipathd/main.c b/multipathd/main.c
> > index 4a28fbb..131dab6 100644
> > --- a/multipathd/main.c
> > +++ b/multipathd/main.c
> > @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp)
> >  	return newstate;
> >  }
> >  
> > -static void
> > -do_sync_mpp(struct vectors * vecs, struct multipath *mpp)
> > +/* Returns true if the mpp was deleted */
> > +static int
> > +do_sync_mpp(struct vectors *vecs, struct multipath *mpp)
> >  {
> > -	int i, ret;
> > -	struct path *pp;
> > +	int ret;
> > +
> > +	ret = refresh_multipath(vecs, mpp);
> > +	if (ret)
> > +		return ret;
> >  
> > -	ret = update_multipath_strings(mpp, vecs->pathvec);
> > -	if (ret != DMP_OK) {
> > -		condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ?
> > -			"device not found" :
> > -			"couldn't synchronize with kernel state");
> > -		vector_foreach_slot (mpp->paths, pp, i)
> > -			pp->dmstate = PSTATE_UNDEF;
> > -		return;
> > -	}
> >  	set_no_path_retry(mpp);
> > +	return 0;
> >  }
> >  
> > -static void
> > +static int
> >  sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks)
> >  {
> >  	if (mpp->sync_tick)
> >  		mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks :
> >  				  mpp->sync_tick;
> >  	if (mpp->sync_tick)
> > -		return;
> > +		return 0;
> >  
> > -	do_sync_mpp(vecs, mpp);
> > +	return do_sync_mpp(vecs, mpp);
> >  }
> >  
> >  static int
> > @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp)
> >  		return handle_path_wwid_change(pp, vecs)? CHECK_PATH_REMOVED :
> >  							  CHECK_PATH_SKIPPED;
> >  	}
> > -	if (pp->mpp->synced_count == 0) {
> > -		do_sync_mpp(vecs, pp->mpp);
> > +	if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp))
> >  		/* if update_multipath_strings orphaned the path, quit early */
> > -		if (!pp->mpp)
> > -			return CHECK_PATH_SKIPPED;
> > -	}
> > +		return CHECK_PATH_SKIPPED;
> > +
> >  	if ((newstate != PATH_UP && newstate != PATH_GHOST &&
> >  	     newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) {
> >  		/* If path state become failed again cancel path delay state */
> > @@ -3018,8 +3012,11 @@ checkerloop (void *ap)
> >  				mpp->synced_count = 0;
> >  			if (checker_state == CHECKER_STARTING) {
> >  				vector_foreach_slot(vecs->mpvec, mpp, i) {
> > -					sync_mpp(vecs, mpp, ticks);
> > -					mpp->prio_update = PRIO_UPDATE_NONE;
> > +					if (sync_mpp(vecs, mpp, ticks))
> > +						/* map deleted */
> > +						i--;
> > +					else
> > +						mpp->prio_update = PRIO_UPDATE_NONE;
> >  				}
> >  				vector_foreach_slot(vecs->pathvec, pp, i)
> >  					pp->is_checked = CHECK_PATH_UNCHECKED;
> > -- 
> > 2.47.0
Martin Wilck Dec. 10, 2024, 9:05 p.m. UTC | #3
On Tue, 2024-12-10 at 14:02 -0500, Benjamin Marzinski wrote:
> On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > We previously didn't allow map removal inside the checker loop. But
> > with the late updates to the checkerloop code, it should be safe to
> > orphan
> > paths and delete maps even in this situation. We remove such maps
> > everywhere
> > else in the code already, whenever refresh_multipath() or
> > setup_multipath()
> > is called.
> 
> I don't think that this is safe. It's possible that the multipath
> device
> has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get
> silently removed from the pathvec if we remove the map here. This
> will
> mess up our iteration through the pathvec in update_paths(). 

Hm. You're right. But that applies to the current code in 0.11.0 PR as
well, because we'd call 

   do_sync_mpp()
      update_multipath_strings() 
         sync_paths()
            check_removed_paths()
               vector_del_slot(pathvec, i--);

Or am I missing something?

It seems to me that the only safe way to handle this is to refrain from
deleting paths from the pathvec anywhere deep in the call stack. Even
if we can avoid this situation now by moving the sync towards the end
of the checker loop, I believe that in the long run we need to fix
these traps in our code, because it's just so easy to get this wrong.

I wonder if we need yet another path state, or if we could simply set
these entries in the pathvec to NULL. That sounds crazy, but it might
actually be doable. Not 0.11.0 material, though.

> Perhaps a
> better idea would be to set mpp->sync_ticks to 0 if
> update_multipath_strings() fails in do_sync_mpp(). This would force a
> refresh by sync_mpp() at the start of the next loop in checkerloop(),
> where it can safely remove the multipath device.

I like the idea of your other post to move the sync to the
CHECKER_FINISHED state.

Thanks,
Martin
Benjamin Marzinski Dec. 10, 2024, 10:49 p.m. UTC | #4
On Tue, Dec 10, 2024 at 10:05:14PM +0100, Martin Wilck wrote:
> On Tue, 2024-12-10 at 14:02 -0500, Benjamin Marzinski wrote:
> > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > > We previously didn't allow map removal inside the checker loop. But
> > > with the late updates to the checkerloop code, it should be safe to
> > > orphan
> > > paths and delete maps even in this situation. We remove such maps
> > > everywhere
> > > else in the code already, whenever refresh_multipath() or
> > > setup_multipath()
> > > is called.
> > 
> > I don't think that this is safe. It's possible that the multipath
> > device
> > has paths in the INIT_REMOVED or INIT_PARTIAL state. These will get
> > silently removed from the pathvec if we remove the map here. This
> > will
> > mess up our iteration through the pathvec in update_paths(). 
> 
> Hm. You're right. But that applies to the current code in 0.11.0 PR as
> well, because we'd call 
> 
>    do_sync_mpp()
>       update_multipath_strings() 
>          sync_paths()
>             check_removed_paths()
>                vector_del_slot(pathvec, i--);
> 
> Or am I missing something?

Nope. You're right. Nuts.
 
> It seems to me that the only safe way to handle this is to refrain from
> deleting paths from the pathvec anywhere deep in the call stack. Even
> if we can avoid this situation now by moving the sync towards the end
> of the checker loop, I believe that in the long run we need to fix
> these traps in our code, because it's just so easy to get this wrong.
> 
> I wonder if we need yet another path state, of if we could simply set
> these entries in the pathvec to NULL. That sounds crazy, but it might
> actually be doable. Not 0.11.0 material, though.

I think we could just not call check_removed_paths() in sync_paths(). We
would still orphan all the paths that were no longer part of the
multipath device, and set pp->mpp for all the paths that are part of the
device just like before, but we wouldn't delete the paths from
pathvec there. Instead we would call check_removed_paths() in
refresh_multipath(), so we did it after loads and in update_multipath.

I'm pretty sure that should be fine. If the device table changed and
removed a path so that we can free it, either we reloaded the device,
and we will call setup_multipath() after the reload, or something
external did, and multipathd will see an event for that and call
setup_multipath() via update_multipath().

Does that make sense?

> > Perhaps a
> > better idea would be to set mpp->sync_ticks to 0 if
> > update_multipath_strings() fails in do_sync_mpp(). This would force a
> > refresh by sync_mpp() at the start of the next loop in checkerloop(),
> > where it can safely remove the multipath device.
> 
> I like the idea of your other post to move the sync to the
> CHECKER_FINISHED state.
> 
> Thanks,
> Martin
>
Benjamin Marzinski Dec. 10, 2024, 11:30 p.m. UTC | #5
On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> We previously didn't allow map removal inside the checker loop. But
> with the late updates to the checkerloop code, it should be safe to orphan
> paths and delete maps even in this situation. We remove such maps everywhere
> else in the code already, whenever refresh_multipath() or setup_multipath()
> is called.

Actually, thinking about this more, what do we get by proactively
deleting the multipath device if something goes wrong in the checker? If
we successfully reload a device, but can't sync it with the kernel,
that's one thing. But that was triggered by a change in the device, and
we know that when we reloaded the device, device-mapper was working. I'm
leery of possibly deleting the map because of a transient device-mapper
issue.  I'm not sure if on a check that we do repeatedly, we should
delete the device on an error.  We haven't in the past, and as far as I
know, it doesn't cause problems.  

Without a benefit to doing this, I'm not sure it makes sense.

-Ben

> 
> Signed-off-by: Martin Wilck <mwilck@suse.com>
> ---
>  multipathd/main.c | 43 ++++++++++++++++++++-----------------------
>  1 file changed, 20 insertions(+), 23 deletions(-)
> 
> diff --git a/multipathd/main.c b/multipathd/main.c
> index 4a28fbb..131dab6 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -2446,34 +2446,30 @@ get_new_state(struct path *pp)
>  	return newstate;
>  }
>  
> -static void
> -do_sync_mpp(struct vectors * vecs, struct multipath *mpp)
> +/* Returns true if the mpp was deleted */
> +static int
> +do_sync_mpp(struct vectors *vecs, struct multipath *mpp)
>  {
> -	int i, ret;
> -	struct path *pp;
> +	int ret;
> +
> +	ret = refresh_multipath(vecs, mpp);
> +	if (ret)
> +		return ret;
>  
> -	ret = update_multipath_strings(mpp, vecs->pathvec);
> -	if (ret != DMP_OK) {
> -		condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ?
> -			"device not found" :
> -			"couldn't synchronize with kernel state");
> -		vector_foreach_slot (mpp->paths, pp, i)
> -			pp->dmstate = PSTATE_UNDEF;
> -		return;
> -	}
>  	set_no_path_retry(mpp);
> +	return 0;
>  }
>  
> -static void
> +static int
>  sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks)
>  {
>  	if (mpp->sync_tick)
>  		mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks :
>  				  mpp->sync_tick;
>  	if (mpp->sync_tick)
> -		return;
> +		return 0;
>  
> -	do_sync_mpp(vecs, mpp);
> +	return do_sync_mpp(vecs, mpp);
>  }
>  
>  static int
> @@ -2513,12 +2509,10 @@ update_path_state (struct vectors * vecs, struct path * pp)
>  		return handle_path_wwid_change(pp, vecs)? CHECK_PATH_REMOVED :
>  							  CHECK_PATH_SKIPPED;
>  	}
> -	if (pp->mpp->synced_count == 0) {
> -		do_sync_mpp(vecs, pp->mpp);
> +	if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp))
>  		/* if update_multipath_strings orphaned the path, quit early */
> -		if (!pp->mpp)
> -			return CHECK_PATH_SKIPPED;
> -	}
> +		return CHECK_PATH_SKIPPED;
> +
>  	if ((newstate != PATH_UP && newstate != PATH_GHOST &&
>  	     newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) {
>  		/* If path state become failed again cancel path delay state */
> @@ -3018,8 +3012,11 @@ checkerloop (void *ap)
>  				mpp->synced_count = 0;
>  			if (checker_state == CHECKER_STARTING) {
>  				vector_foreach_slot(vecs->mpvec, mpp, i) {
> -					sync_mpp(vecs, mpp, ticks);
> -					mpp->prio_update = PRIO_UPDATE_NONE;
> +					if (sync_mpp(vecs, mpp, ticks))
> +						/* map deleted */
> +						i--;
> +					else
> +						mpp->prio_update = PRIO_UPDATE_NONE;
>  				}
>  				vector_foreach_slot(vecs->pathvec, pp, i)
>  					pp->is_checked = CHECK_PATH_UNCHECKED;
> -- 
> 2.47.0
Martin Wilck Dec. 11, 2024, 12:06 p.m. UTC | #6
On Tue, 2024-12-10 at 18:30 -0500, Benjamin Marzinski wrote:
> On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > We previously didn't allow map removal inside the checker loop. But
> > with the late updates to the checkerloop code, it should be safe to
> > orphan
> > paths and delete maps even in this situation. We remove such maps
> > everywhere
> > else in the code already, whenever refresh_multipath() or
> > setup_multipath()
> > is called.
> 
> Actually, thinking about this more, what do we get by proactively
> deleting the multipath device if something goes wrong in the checker?
> If
> we successfully reload a device, but can't sync it with the kernel,
> that's one thing, But that was triggered by a change in the device,
> and
> we know that when we reloaded the device, device-mapper was working.
> I'm
> leery of possibly deleting the map because of a transient device-
> mapper
> issue.  I'm not sure if on a check that we do repeatedly, we should
> delete the device on an error.  We haven't in the past, and as far as
> I
> know, it doesn't cause problems.  

I don't disagree. But the same can be said for basically all call
chains where setup_multipath() is called for an existing map. I was
just following the pattern that we use e.g. in ev_add_path(), or in
update_mpp_prio(). Why would we treat the checker and path addition
differently in this respect?

If we look at this pragmatically (assuming that multipathd gets the
parameters right), the most probable reason for a map reload failure is
failure to open a path device in bdev_open(), either because the device
doesn't exist, or because it's busy or otherwise unavailable. If this
happens in ev_add_path(), the likely reason is that the path just added
was busy, and the smartest action upon such a failure would probably be
to just undo that addition. We currently don't do that; we remove the
entire map, which is questionable, as you state correctly.

In the checker, this can't happen. Obviously, no other process can grab
a path device while the device mapper is holding it, so -EBUSY won't
occur if we reload an existing map. Even device deletion doesn't cause
failure on reload. It is possible to delete a SCSI device while it's
mapped, and execute a table reload / suspend / resume cycle on the map
while referencing the deleted device. The kernel keeps holding the
reference to the deleted device, and will simply mark it as
failed. This holds also if the mapped paths are re-grouped or re-
ordered in the table. Failure occurs only if we temporarily remove the
device from the map and re-add it, because as soon as the device is
removed from the map's dm table, its refcount drops to zero, and it's
gone for good.

IOW, reloading a map with a table containing only already-mapped
devices will never fail, except in extreme situations like kernel OOM.

Thus, AFAICS, the only relevant scenario where a reload would fail
is trying to add a path device that was not previously mapped, and
that's either busy (perhaps in another map) or has been deleted, IOW
only when we reload after calling adopt_paths(). This is where we could
improve. If we fail to reload after adopting new paths, we could fall
back to the existing table, and perhaps try to add paths one by one.
Again, this is post-0.11 material.

OTOH, practically impossible is not totally impossible, so we need to be
prepared for map reload failure either way. IMO the best thing we can do
in this case is to keep using the kernel's map, and retry reloading
later. 

The only critical situation is WWID change of path devices. We must try
to fix this situation ASAP when we detect it. I'm unsure what the best
action is if a reload fails in that situation, though (other than
failing the path, as we already do).

Martin
Benjamin Marzinski Dec. 11, 2024, 5:09 p.m. UTC | #7
On Wed, Dec 11, 2024 at 01:06:46PM +0100, Martin Wilck wrote:
> On Tue, 2024-12-10 at 18:30 -0500, Benjamin Marzinski wrote:
> > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > > We previously didn't allow map removal inside the checker loop. But
> > > with the late updates to the checkerloop code, it should be safe to
> > > orphan
> > > paths and delete maps even in this situation. We remove such maps
> > > everywhere
> > > else in the code already, whenever refresh_multipath() or
> > > setup_multipath()
> > > is called.
> > 
> > Actually, thinking about this more, what do we get by proactively
> > deleting the multipath device if something goes wrong in the checker?
> > If
> > we successfully reload a device, but can't sync it with the kernel,
> > that's one thing, But that was triggered by a change in the device,
> > and
> > we know that when we reloaded the device, device-mapper was working.
> > I'm
> > leery of possibly deleting the map because of a transient device-
> > mapper
> > issue.  I'm not sure if on a check that we do repeatedly, we should
> > delete the device on an error.  We haven't in the past, and as far as
> > I
> > know, it doesn't cause problems.  
> 
> I don't disagree. But the same can be said for basically all call
> chains where setup_multipath() is called for an existing map. I was
> just following the pattern that we use e.g. in ev_add_path(), or in
> update_mpp_prio(). Why would we treat the checker and path addition
> differently in this respect?

I'm confused here. ev_add_path() doesn't remove the device if the reload
fails. If a reload fails, the table should stay the same. That's why I
said that in other cases where we delete the device, we know that when
we just reloaded the device, device-mapper was working. Looking at the
code, that isn't really true. After failed reloads, we still call
setup_multipath to update our state, and we will delete the device if
that fails.
 
> If we look at this pragmatically (assuming that multipathd gets the
> parameters right), the most probable reason for a map reload failure is
> failure to open a path device in bdev_open(), either because the device
> doesn't exist, or because it's busy or otherwise unavailable. If this
> happens in ev_add_path(), the likely reason is that the path just added
> was busy, and the smartest action upon such a failure would probably be
> to just undo that addition. We currently don't do that; we remove the
> entire map, which is questionable, as you state correctly.

This is why we call setup_multipath after failed reloads, to make sure
multipathd's view of the multipath device resyncs with the kernel's,
which hasn't changed from what it was before the reload failed.

> In the checker, this can't happen. Obviously, no other process can grab
> a path device while the device mapper is holding it, so -EBUSY won't
> occur if we reload an existing map. Even device deletion doesn't cause
> failure on reload. It is possible to delete a SCSI device while it's
> mapped, and execute a table reload / suspend / resume cycle on the map
> while referencing the deleted device. The kernel keeps holding the
> reference to the deleted device, and will simply mark it as
> failed. This holds also if the mapped paths are re-grouped or re-
> ordered in the table. Failure occurs only if we temporarily remove the
> device from the map and re-add it, because as soon as the device is
> removed from the map's dm table, its refcount drops to zero, and it's
> gone for good.
> 
> IOW, reloading a map with a table containing only already-mapped
> devices will never fail, except in extreme situations like kernel OOM.

Maybe I should clarify my position a bit. I am fine with reloading the
device in the checkerloop if something has changed. This obviously
does run a very small risk of something going wrong and a device getting
removed unnecessarily, but we know that we need to reload the device, so
we should.

What I would rather avoid is reloading the device because we failed to
get its state in do_sync_mpp(). We don't do this because we know that
something has changed. We do this as a safety measure to deal with
corner cases where our state doesn't match the kernel's and we didn't
get an event. Double checking this each time we check a path in a
device saves having to catch all these corner cases elsewhere. But it's
almost always completely unnecessary, and we're doing it on every
multipath device every couple of seconds, unlike reloading a device,
which is rare.

> Thus, AFAICS, the only relevant scenario where a reloading would fail
> is trying to add a path device that was not previously mapped, and
> that's either busy (perhaps in another map) or has been deleted, IOW
> only when we reload after calling adopt_paths(). This is where we could
> improve. If we fail to reload after adopting new paths, we could fall
> back to the existing table, and perhaps try to add paths one by one.
> Again, this is post-0.11 material.
> 
> OTOH, practially impossible is not totally impossible, so we need to be
> prepared to map reload failure either way. IMO the best thing we can do
> in this case is to keep using the kernel's map, and retry reloading
> later. 

I'm not actually worried about the kernel so much as libdevmapper. It is
not designed for multi-threaded processes, and that has bitten us in the
past. For instance, it's why we don't delete devices in dmevent_loop() on
libdevmapper errors. dm_get_events() just waits and retries if getting
the device list fails, and for each device, it calls dm_is_mpath and
will only delete a device on DM_IS_MPATH_NO, which is what I suggested
for the cleanup function.

I'm pretty sure we've handled all of the known issues here, with fixes
like:
02d4bf07 ("libmultipath: protect racy libdevmapper calls with a mutex")
34e01d2f ("multipath-tools: don't call dm_lib_release() any more")

I'd rather not risk having missed some issue that could cause a
temporary error in a function that we call every couple of seconds
(almost always unnecessarily).

-Ben

> The only critical situation is WWID change of path devices. We must try
> to fix this situation ASAP when we detect it. I'm unsure what the best
> action is if a reload fails in that situation, though (other than
> failing the path, as we already do).
> 
> Martin
Martin Wilck Dec. 11, 2024, 8:20 p.m. UTC | #8
On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote:
> On Wed, Dec 11, 2024 at 01:06:46PM +0100, Martin Wilck wrote:
> > On Tue, 2024-12-10 at 18:30 -0500, Benjamin Marzinski wrote:
> > > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > > > We previously didn't allow map removal inside the checker loop.
> > > > But
> > > > with the late updates to the checkerloop code, it should be
> > > > safe to
> > > > orphan
> > > > paths and delete maps even in this situation. We remove such
> > > > maps
> > > > everywhere
> > > > else in the code already, whenever refresh_multipath() or
> > > > setup_multipath()
> > > > is called.
> > > 
> > > Actually, thinking about this more, what do we get by proactively
> > > deleting the multipath device if something goes wrong in the
> > > checker?
> > > If
> > > we successfully reload a device, but can't sync it with the
> > > kernel,
> > > that's one thing, But that was triggered by a change in the
> > > device,
> > > and
> > > we know that when we reloaded the device, device-mapper was
> > > working.
> > > I'm
> > > leery of possibly deleting the map because of a transient device-
> > > mapper
> > > issue.  I'm not sure if on a check that we do repeatedly, we
> > > should
> > > delete the device on an error.  We haven't in the past, and as
> > > far as
> > > I
> > > know, it doesn't cause problems.  
> > 
> > I don't disagree. But the same can be said for basically all call
> > chains where setup_multipath() is called for an existing map. I was
> > just following the pattern that we use e.g. in ev_add_path(), or in
> > update_mpp_prio(). Why would we treat the checker and path addition
> > differently in this respect?
> 
> I'm confused here. 

Well, I was writing confused things. My thinking was going in circles
about the removal of paths and maps, and I didn't properly distinguish
between map reloading and updating the state from the kernel.

Sorry.

> ev_add_path() doesn't remove the device if the reload
> fails. If a reload fails, the table should stay the same. That's why
> I
> said that in other cases where we delete the device, we know that
> when
> we just reloaded the device, device-mapper was working. Looking at
> the
> code, that isn't really true. After failed reloads, we still call
> setup_multipath to update our state, and we will delete the device if
> that fails.


> This is why we call setup_multipath after failed reloads, to make
> sure
> multipathd's view of the multipath device resyncs with the kernel's,
> which hasn't changed from what it was before the reload failed.

Right.

> > In the checker, this can't happen. Obviously, no other process can
> > grab
> > a path device while the device mapper is holding it, so -EBUSY
> > won't
> > occur if we reload an existing map. Even device deletion doesn't
> > cause
> > failure on reload. It is possible to delete a SCSI device while
> > it's
> > mapped, and execute a table reload / suspend / resume cycle on the
> > map
> > while referencing the deleted device. The kernel keeps holding the
> > reference to the deleted device, and will simply mark it as
> > failed. This holds also if the mapped paths are re-grouped or re-
> > ordered in the table. Failure occurs only if we temporarily remove
> > the
> > device from the map and re-add it, because as soon as the device is
> > removed from the map's dm table, its refcount drops to zero, and
> > it's
> > gone for good.
> > 
> > IOW, reloading a map with a table containing only already-mapped
> > devices will never fail, except in extreme situations like kernel
> > OOM.
> 
> Maybe I should clarify my position a bit. I am fine with reloading
> the
> device in the checkerloop if something has changed. This obviously
> does run a very small risk of something going wrong and a device
> getting
> removed unnecessarily, but we know that we need to reload the device,
> so
> we should.
> 
> What I would rather avoid is reloading the device because we failed
> to
> get it's state in do_sync_mpp().

FTR, in my v4 patchset, I won't try to do that any more.

> I'm not actually worried about the kernel so much as libdevmapper. It
> is
> not designed for multi-threaded processes, and that has bitten us in
> the
> past. For intance, it's why we don't delete devices in dmevent_loop()
> on
> libdevmapper errors. dm_get_events() just waits and retries if
> getting
> the device list fails, and for each device, it calls dm_is_mpath and
> will only delete a device on DM_IS_MPATH_NO, which is what I
> suggested
> for the cleanup function.
> 
> I'm pretty sure we've handled all of the known issues here, with
> fixes
> like:
> 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a
> mutex")
> 34e01d2f ("multipath-tools: don't call dm_lib_release() any more")
> 
> I'd rather not risk having missed some issue that could cause a
> temporary error in a function that we call every couple of seconds
> (almost always unnecessarily).

Ok, getting it. I thought that an error in DM_TABLE_STATUS must almost
necessarily mean -ENXIO (from the kernel pov), which would mean that
some external entity removed the device, and that we should act as if
someone had used the "remove map" CLI command. But I didn't think about
libdevmapper.

Martin
Martin Wilck Dec. 11, 2024, 8:33 p.m. UTC | #9
On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote:
> On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote:
> 
> 
> > I'm not actually worried about the kernel so much as libdevmapper.
> > It
> > is
> > not designed for multi-threaded processes, and that has bitten us
> > in
> > the
> > past. For intance, it's why we don't delete devices in
> > dmevent_loop()
> > on
> > libdevmapper errors. dm_get_events() just waits and retries if
> > getting
> > the device list fails, and for each device, it calls dm_is_mpath
> > and
> > will only delete a device on DM_IS_MPATH_NO, which is what I
> > suggested
> > for the cleanup function.
> > 
> > I'm pretty sure we've handled all of the known issues here, with
> > fixes
> > like:
> > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a
> > mutex")
> > 34e01d2f ("multipath-tools: don't call dm_lib_release() any more")
> > 
> > I'd rather not risk having missed some issue that could cause a
> > temporary error in a function that we call every couple of seconds
> > (almost always unnecessarily).
> 
> Ok, getting it. I thought that an error in DM_TABLE_STATUS must
> almost
> neccessarily mean -ENXIO (from the kernel pov), which would mean that
> some external entity removed the device, and that we should act as if
> someone had used the "remove map" CLI command. But I didn't think
> about
> libdevmapper.

But will libdevmapper return ENXIO if it's somehow internally confused?
I don't think so. I believe that if we see this error code, removing
the map is the right thing to do.

I consider adding a patch on top of the v4 series that does this. 
If you reject it, fine :-)

Regards,
Martin
Martin Wilck Dec. 11, 2024, 8:48 p.m. UTC | #10
On Tue, 2024-12-10 at 17:49 -0500, Benjamin Marzinski wrote:
> On Tue, Dec 10, 2024 at 10:05:14PM +0100, Martin Wilck wrote:
> > On Tue, 2024-12-10 at 14:02 -0500, Benjamin Marzinski wrote:
> > > On Sat, Dec 07, 2024 at 12:36:07AM +0100, Martin Wilck wrote:
> > > > We previously didn't allow map removal inside the checker loop.
> > > > But
> > > > with the late updates to the checkerloop code, it should be
> > > > safe to
> > > > orphan
> > > > paths and delete maps even in this situation. We remove such
> > > > maps
> > > > everywhere
> > > > else in the code already, whenever refresh_multipath() or
> > > > setup_multipath()
> > > > is called.
> > > 
> > > I don't think that this is safe. It's possible that the multipath
> > > device
> > > has paths in the INIT_REMOVED or INIT_PARTIAL state. These will
> > > get
> > > silently removed from the pathvec if we remove the map here. This
> > > will
> > > mess up our iteration through the pathvec in update_paths(). 
> > 
> > Hm. You're right. But that applies to the current code in 0.11.0 PR
> > as
> > well, because we'd call 
> > 
> >    do_sync_mpp()
> >       update_multipath_strings() 
> >          sync_paths()
> >             check_removed_paths()
> >                vector_del_slot(pathvec, i--);
> > 
> > Or am I missing something?
> 
> Nope. You're right. Nuts.
>  
> > It seems to me that the only safe way to handle this is to refrain
> > from
> > deleting paths from the pathvec anywhere deep in the call stack.
> > Even
> > if we can avoid this situation now by moving the sync towards the
> > end
> > of the checker loop, I believe that in the long run we need to fix
> > these traps in our code, because it's just so easy to get this
> > wrong.
> > 
> > I wonder if we need yet another path state, or if we could simply
> > set
> > these entries in the pathvec to NULL. That sounds crazy, but it
> > might
> > actually be doable. Not 0.11.0 material, though.
> 
> I think we could just not call check_removed_paths() in sync_paths().
> We
> would still orphan all the paths that were no longer part of the
> multipath device, and set pp->mpp for all the paths that are part of
> the
> device just like before, but we wouldn't delete the paths from
> pathvec there. Instead we would call check_removed_paths() in
> refresh_multipath(), so we did it after loads and in
> update_multipath.
> 
> I'm pretty sure that should be fine. If the device table changed and
> removed a path so that we can free it, either we reloaded the device,
> and we will call setup_multipath() after the reload, or something
> external did, and multipathd will see an event for that and call
> setup_multipath() via update_multipath().
> 
> Does that make sense?

I think we'll be fine with my upcoming patch set, which will call
reload_and_sync_map() only from checker_finished(). I don't change
sync_paths() in this set so far.

I'm a little concerned about refresh_multipath() and
reload_and_sync_map() being called from various CLI functions. But I
won't start digging into that now. Maps may get removed in CLI calls,
so what.

Thanks,
Martin
Benjamin Marzinski Dec. 12, 2024, 5:12 p.m. UTC | #11
On Wed, Dec 11, 2024 at 09:33:40PM +0100, Martin Wilck wrote:
> On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote:
> > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote:
> > 
> > 
> > > I'm not actually worried about the kernel so much as libdevmapper.
> > > It
> > > is
> > > not designed for multi-threaded processes, and that has bitten us
> > > in
> > > the
> > > past. For intance, it's why we don't delete devices in
> > > dmevent_loop()
> > > on
> > > libdevmapper errors. dm_get_events() just waits and retries if
> > > getting
> > > the device list fails, and for each device, it calls dm_is_mpath
> > > and
> > > will only delete a device on DM_IS_MPATH_NO, which is what I
> > > suggested
> > > for the cleanup function.
> > > 
> > > I'm pretty sure we've handled all of the known issues here, with
> > > fixes
> > > like:
> > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a
> > > mutex")
> > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any more")
> > > 
> > > I'd rather not risk having missed some issue that could cause a
> > > temporary error in a function that we call every couple of seconds
> > > (almost always unnecessarily).
> > 
> > Ok, getting it. I thought that an error in DM_TABLE_STATUS must
> > almost
> > neccessarily mean -ENXIO (from the kernel pov), which would mean that
> > some external entity removed the device, and that we should act as if
> > someone had used the "remove map" CLI command. But I didn't think
> > about
> > libdevmapper.
> 
> But will libdevmapper return ENXIO if it's somehow interally confused?
> I don't think so. I believe that if we see this error code, removing
> the map is the right thing to do.

I don't think that should ever happen.

https://github.com/lvmteam/lvm2/blob/928b8e9c6eaf871b3405b91c64eac5ea854f2572/device_mapper/ioctl/libdm-iface.c#L2100

If libdevmapper gets an ENXIO from the kernel, it ends up setting
dmi.exists to 0 instead of returning the error.

-Ben
 
> I consider adding a patch on top of the v4 series that does this. 
> If you reject it, fine :-)
> 
> Regards,
> Martin
Martin Wilck Dec. 12, 2024, 5:18 p.m. UTC | #12
On Thu, 2024-12-12 at 12:12 -0500, Benjamin Marzinski wrote:
> On Wed, Dec 11, 2024 at 09:33:40PM +0100, Martin Wilck wrote:
> > On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote:
> > > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote:
> > > 
> > > 
> > > > I'm not actually worried about the kernel so much as
> > > > libdevmapper.
> > > > It
> > > > is
> > > > not designed for multi-threaded processes, and that has bitten
> > > > us
> > > > in
> > > > the
> > > > past. For intance, it's why we don't delete devices in
> > > > dmevent_loop()
> > > > on
> > > > libdevmapper errors. dm_get_events() just waits and retries if
> > > > getting
> > > > the device list fails, and for each device, it calls
> > > > dm_is_mpath
> > > > and
> > > > will only delete a device on DM_IS_MPATH_NO, which is what I
> > > > suggested
> > > > for the cleanup function.
> > > > 
> > > > I'm pretty sure we've handled all of the known issues here,
> > > > with
> > > > fixes
> > > > like:
> > > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a
> > > > mutex")
> > > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any
> > > > more")
> > > > 
> > > > I'd rather not risk having missed some issue that could cause a
> > > > temporary error in a function that we call every couple of
> > > > seconds
> > > > (almost always unnecessarily).
> > > 
> > > Ok, getting it. I thought that an error in DM_TABLE_STATUS must
> > > almost
> > > neccessarily mean -ENXIO (from the kernel pov), which would mean
> > > that
> > > some external entity removed the device, and that we should act
> > > as if
> > > someone had used the "remove map" CLI command. But I didn't think
> > > about
> > > libdevmapper.
> > 
> > But will libdevmapper return ENXIO if it's somehow interally
> > confused?
> > I don't think so. I believe that if we see this error code,
> > removing
> > the map is the right thing to do.
> 
> I don't think that shouldn't ever happen.
> 
> https://github.com/lvmteam/lvm2/blob/928b8e9c6eaf871b3405b91c64eac5ea854f2572/device_mapper/ioctl/libdm-iface.c#L2100
> 
> If libdevmapper gets an ENXIO from the kernel, it ends up setting
> dmi.exists to 0 instead of returning the error.

I meant the kernel ioctl return code. Sorry for being unclear.

From the point of view of libmultipath, it doesn't matter.
libmp_mapinfo tests for ENXIO and dmi.exists, and returns DMP_NOT_FOUND
in both cases.

Martin
Benjamin Marzinski Dec. 12, 2024, 5:50 p.m. UTC | #13
On Thu, Dec 12, 2024 at 06:18:40PM +0100, Martin Wilck wrote:
> On Thu, 2024-12-12 at 12:12 -0500, Benjamin Marzinski wrote:
> > On Wed, Dec 11, 2024 at 09:33:40PM +0100, Martin Wilck wrote:
> > > On Wed, 2024-12-11 at 21:20 +0100, Martin Wilck wrote:
> > > > On Wed, 2024-12-11 at 12:09 -0500, Benjamin Marzinski wrote:
> > > > 
> > > > 
> > > > > I'm not actually worried about the kernel so much as
> > > > > libdevmapper.
> > > > > It
> > > > > is
> > > > > not designed for multi-threaded processes, and that has bitten
> > > > > us
> > > > > in
> > > > > the
> > > > > past. For intance, it's why we don't delete devices in
> > > > > dmevent_loop()
> > > > > on
> > > > > libdevmapper errors. dm_get_events() just waits and retries if
> > > > > getting
> > > > > the device list fails, and for each device, it calls
> > > > > dm_is_mpath
> > > > > and
> > > > > will only delete a device on DM_IS_MPATH_NO, which is what I
> > > > > suggested
> > > > > for the cleanup function.
> > > > > 
> > > > > I'm pretty sure we've handled all of the known issues here,
> > > > > with
> > > > > fixes
> > > > > like:
> > > > > 02d4bf07 ("libmultipath: protect racy libdevmapper calls with a
> > > > > mutex")
> > > > > 34e01d2f ("multipath-tools: don't call dm_lib_release() any
> > > > > more")
> > > > > 
> > > > > I'd rather not risk having missed some issue that could cause a
> > > > > temporary error in a function that we call every couple of
> > > > > seconds
> > > > > (almost always unnecessarily).
> > > > 
> > > > Ok, getting it. I thought that an error in DM_TABLE_STATUS must
> > > > almost
> > > > neccessarily mean -ENXIO (from the kernel pov), which would mean
> > > > that
> > > > some external entity removed the device, and that we should act
> > > > as if
> > > > someone had used the "remove map" CLI command. But I didn't think
> > > > about
> > > > libdevmapper.
> > > 
> > > But will libdevmapper return ENXIO if it's somehow interally
> > > confused?
> > > I don't think so. I believe that if we see this error code,
> > > removing
> > > the map is the right thing to do.
> > 
> > I don't think that shouldn't ever happen.
> > 
> > https://github.com/lvmteam/lvm2/blob/928b8e9c6eaf871b3405b91c64eac5ea854f2572/device_mapper/ioctl/libdm-iface.c#L2100
> > 
> > If libdevmapper gets an ENXIO from the kernel, it ends up setting
> > dmi.exists to 0 instead of returning the error.
> 
> I meant the kernel ioctl return code. Sorry for being unclear.
> 
> > From the point of view of libmultipath, it doesn't matter.
> libmp_mapinfo tests for ENXIO and dmi.exists, and returns DMP_NOT_FOUND
> in both cases.

Sure, that makes sense then. If libdevmapper told us the device doesn't
exist, we can safely remove it.

-Ben
 
> Martin
diff mbox series

Patch

diff --git a/multipathd/main.c b/multipathd/main.c
index 4a28fbb..131dab6 100644
--- a/multipathd/main.c
+++ b/multipathd/main.c
@@ -2446,34 +2446,30 @@  get_new_state(struct path *pp)
 	return newstate;
 }
 
-static void
-do_sync_mpp(struct vectors * vecs, struct multipath *mpp)
+/* Returns true if the mpp was deleted */
+static int
+do_sync_mpp(struct vectors *vecs, struct multipath *mpp)
 {
-	int i, ret;
-	struct path *pp;
+	int ret;
+
+	ret = refresh_multipath(vecs, mpp);
+	if (ret)
+		return ret;
 
-	ret = update_multipath_strings(mpp, vecs->pathvec);
-	if (ret != DMP_OK) {
-		condlog(1, "%s: %s", mpp->alias, ret == DMP_NOT_FOUND ?
-			"device not found" :
-			"couldn't synchronize with kernel state");
-		vector_foreach_slot (mpp->paths, pp, i)
-			pp->dmstate = PSTATE_UNDEF;
-		return;
-	}
 	set_no_path_retry(mpp);
+	return 0;
 }
 
-static void
+static int
 sync_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int ticks)
 {
 	if (mpp->sync_tick)
 		mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks :
 				  mpp->sync_tick;
 	if (mpp->sync_tick)
-		return;
+		return 0;
 
-	do_sync_mpp(vecs, mpp);
+	return do_sync_mpp(vecs, mpp);
 }
 
 static int
@@ -2513,12 +2509,10 @@  update_path_state (struct vectors * vecs, struct path * pp)
 		return handle_path_wwid_change(pp, vecs)? CHECK_PATH_REMOVED :
 							  CHECK_PATH_SKIPPED;
 	}
-	if (pp->mpp->synced_count == 0) {
-		do_sync_mpp(vecs, pp->mpp);
+	if (pp->mpp->synced_count == 0 && do_sync_mpp(vecs, pp->mpp))
 		/* if update_multipath_strings orphaned the path, quit early */
-		if (!pp->mpp)
-			return CHECK_PATH_SKIPPED;
-	}
+		return CHECK_PATH_SKIPPED;
+
 	if ((newstate != PATH_UP && newstate != PATH_GHOST &&
 	     newstate != PATH_PENDING) && (pp->state == PATH_DELAYED)) {
 		/* If path state become failed again cancel path delay state */
@@ -3018,8 +3012,11 @@  checkerloop (void *ap)
 				mpp->synced_count = 0;
 			if (checker_state == CHECKER_STARTING) {
 				vector_foreach_slot(vecs->mpvec, mpp, i) {
-					sync_mpp(vecs, mpp, ticks);
-					mpp->prio_update = PRIO_UPDATE_NONE;
+					if (sync_mpp(vecs, mpp, ticks))
+						/* map deleted */
+						i--;
+					else
+						mpp->prio_update = PRIO_UPDATE_NONE;
 				}
 				vector_foreach_slot(vecs->pathvec, pp, i)
 					pp->is_checked = CHECK_PATH_UNCHECKED;