
[V2,mlx5-next,14/14] vfio/mlx5: Use its own PCI reset_done error handler

Message ID: 20211019105838.227569-15-yishaih@nvidia.com (mailing list archive)
State: Superseded
Delegated to: Bjorn Helgaas
Series: Add mlx5 live migration driver

Commit Message

Yishai Hadas Oct. 19, 2021, 10:58 a.m. UTC
Register its own handler for pci_error_handlers.reset_done and update
state accordingly.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/vfio/pci/mlx5/main.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

Comments

Alex Williamson Oct. 19, 2021, 6:55 p.m. UTC | #1
On Tue, 19 Oct 2021 13:58:38 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> Register its own handler for pci_error_handlers.reset_done and update
> state accordingly.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/pci/mlx5/main.c | 33 ++++++++++++++++++++++++++++++++-
>  1 file changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
> index 621b7fc60544..b759c6e153b6 100644
> --- a/drivers/vfio/pci/mlx5/main.c
> +++ b/drivers/vfio/pci/mlx5/main.c
> @@ -58,6 +58,7 @@ struct mlx5vf_pci_core_device {
>  	/* protect migartion state */
>  	struct mutex state_mutex;
>  	struct mlx5vf_pci_migration_info vmig;
> +	struct work_struct work;
>  };
>  
>  static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev)
> @@ -615,6 +616,27 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
>  	.match = vfio_pci_core_match,
>  };
>  
> +static void mlx5vf_reset_work_handler(struct work_struct *work)
> +{
> +	struct mlx5vf_pci_core_device *mvdev =
> +		container_of(work, struct mlx5vf_pci_core_device, work);
> +
> +	mutex_lock(&mvdev->state_mutex);
> +	mlx5vf_reset_mig_state(mvdev);

I see this calls mlx5vf_reset_vhca_state() but how does that unfreeze
and unquiesce the device as necessary to get back to _RUNNING?

> +	mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
> +	mutex_unlock(&mvdev->state_mutex);
> +}
> +
> +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
> +
> +	if (!mvdev->migrate_cap)
> +		return;
> +
> +	schedule_work(&mvdev->work);

This seems troublesome, how long does userspace poll the device state
after reset to get back to _RUNNING?  Seems we at least need a
flush_work() call when userspace reads the device state.  Thanks,

Alex

> +}
> +
>  static int mlx5vf_pci_probe(struct pci_dev *pdev,
>  			    const struct pci_device_id *id)
>  {
> @@ -634,6 +656,8 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev,
>  			if (MLX5_CAP_GEN(mdev, migration)) {
>  				mvdev->migrate_cap = 1;
>  				mutex_init(&mvdev->state_mutex);
> +				INIT_WORK(&mvdev->work,
> +					  mlx5vf_reset_work_handler);
>  			}
>  			mlx5_vf_put_core_dev(mdev);
>  		}
> @@ -656,6 +680,8 @@ static void mlx5vf_pci_remove(struct pci_dev *pdev)
>  {
>  	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
>  
> +	if (mvdev->migrate_cap)
> +		cancel_work_sync(&mvdev->work);
>  	vfio_pci_core_unregister_device(&mvdev->core_device);
>  	vfio_pci_core_uninit_device(&mvdev->core_device);
>  	kfree(mvdev);
> @@ -668,12 +694,17 @@ static const struct pci_device_id mlx5vf_pci_table[] = {
>  
>  MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
>  
> +const struct pci_error_handlers mlx5vf_err_handlers = {
> +	.reset_done = mlx5vf_pci_aer_reset_done,
> +	.error_detected = vfio_pci_aer_err_detected,
> +};
> +
>  static struct pci_driver mlx5vf_pci_driver = {
>  	.name = KBUILD_MODNAME,
>  	.id_table = mlx5vf_pci_table,
>  	.probe = mlx5vf_pci_probe,
>  	.remove = mlx5vf_pci_remove,
> -	.err_handler = &vfio_pci_core_err_handlers,
> +	.err_handler = &mlx5vf_err_handlers,
>  };
>  
>  static void __exit mlx5vf_pci_cleanup(void)
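(A minimal sketch of the flush_work() approach Alex suggests above, using a
hypothetical helper that is not part of this series; only the placement of
the flush relative to the state read matters:)

/* Illustrative only: flush any pending reset work before userspace
 * observes the migration state, so the transition back to _RUNNING is
 * deterministic rather than racing against the workqueue.
 */
static u32 mlx5vf_get_device_state(struct mlx5vf_pci_core_device *mvdev)
{
	u32 state;

	flush_work(&mvdev->work);	/* wait for mlx5vf_reset_work_handler() */

	mutex_lock(&mvdev->state_mutex);
	state = mvdev->vmig.vfio_dev_state;
	mutex_unlock(&mvdev->state_mutex);

	return state;
}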
Jason Gunthorpe Oct. 19, 2021, 7:10 p.m. UTC | #2
On Tue, Oct 19, 2021 at 12:55:13PM -0600, Alex Williamson wrote:

> > +static void mlx5vf_reset_work_handler(struct work_struct *work)
> > +{
> > +	struct mlx5vf_pci_core_device *mvdev =
> > +		container_of(work, struct mlx5vf_pci_core_device, work);
> > +
> > +	mutex_lock(&mvdev->state_mutex);
> > +	mlx5vf_reset_mig_state(mvdev);
> 
> I see this calls mlx5vf_reset_vhca_state() but how does that unfreeze
> and unquiesce the device as necessary to get back to _RUNNING?

FLR of the function does it.

Same flow as if userspace attaches the vfio migration, freezes the
device then closes the FD. The FLR puts everything in the device right
and the next open will see a functional, unfrozen, blank device.

> > +	mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
> > +	mutex_unlock(&mvdev->state_mutex);
> > +}
> > +
> > +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
> > +{
> > +	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
> > +
> > +	if (!mvdev->migrate_cap)
> > +		return;
> > +
> > +	schedule_work(&mvdev->work);
> 
> This seems troublesome, how long does userspace poll the device state
> after reset to get back to _RUNNING?  Seems we at least need a
> flush_work() call when userspace reads the device state.  Thanks,

The locking is very troubled here because the higher VFIO layers are
holding locks across reset and using those same locks with the mm_lock

The delay is a good point :(

The other algorithm that can rescue this is to defer the cleanup work
to the mutex unlock, whichever context happens to get to it:

reset_done:
   spin_lock(spin)
   defered_reset = true;
   if (!mutex_trylock(&state_mutex)) 
      spin_unlock(spin)
      return
   spin_unlock(spin)

   state_mutex_unlock()

state_mutex_unlock:
 again:
   spin_lock(spin)
   if (defered_reset)
      spin_unlock()
      do_post_reset;
      goto again;
   mutex_unlock(state_mutex);
   spin_unlock()

and call state_mutex_unlock() in all unlock cases.

It is a much more complicated algorithm than the work.
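A minimal C sketch of this deferred-reset pattern, replacing the
schedule_work() approach from the patch; the reset_lock spinlock and
deferred_reset flag are hypothetical additions to
struct mlx5vf_pci_core_device, and unlike the pseudocode above this sketch
also clears the flag before re-checking so the loop terminates:

static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		/* Post-reset cleanup runs outside the spinlock but still
		 * under state_mutex.
		 */
		mlx5vf_reset_mig_state(mvdev);
		mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);

	if (!mvdev->migrate_cap)
		return;

	/* Cannot take state_mutex unconditionally here: the VFIO core may
	 * hold locks across reset that nest with mm_lock (ABBA risk), so
	 * either finish the reset now or defer it to whichever context
	 * next calls mlx5vf_state_mutex_unlock().
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}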

Yishai, this should also have had a comment explaining why this is
needed, as nobody is going to guess that an ABBA deadlock on mm_lock is
the reason.

Jason
Yishai Hadas Oct. 20, 2021, 8:46 a.m. UTC | #3
On 10/19/2021 10:10 PM, Jason Gunthorpe wrote:
> On Tue, Oct 19, 2021 at 12:55:13PM -0600, Alex Williamson wrote:
>
>>> +static void mlx5vf_reset_work_handler(struct work_struct *work)
>>> +{
>>> +	struct mlx5vf_pci_core_device *mvdev =
>>> +		container_of(work, struct mlx5vf_pci_core_device, work);
>>> +
>>> +	mutex_lock(&mvdev->state_mutex);
>>> +	mlx5vf_reset_mig_state(mvdev);
>> I see this calls mlx5vf_reset_vhca_state() but how does that unfreeze
>> and unquiesce the device as necessary to get back to _RUNNING?
> FLR of the function does it.
>
> Same flow as if userspace attaches the vfio migration, freezes the
> device then closes the FD. The FLR puts everything in the device right
> and the next open will see a functional, unfrozen, blank device.


Right

>
>>> +	mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
>>> +	mutex_unlock(&mvdev->state_mutex);
>>> +}
>>> +
>>> +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
>>> +{
>>> +	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
>>> +
>>> +	if (!mvdev->migrate_cap)
>>> +		return;
>>> +
>>> +	schedule_work(&mvdev->work);
>> This seems troublesome, how long does userspace poll the device state
>> after reset to get back to _RUNNING?  Seems we at least need a
>> flush_work() call when userspace reads the device state.  Thanks,
> The locking is very troubled here because the higher VFIO layers are
> holding locks across reset and using those same locks with the mm_lock
>
> The delay is a good point :(


What is the expectation for a reasonable delay? We may expect this
system WQ to run only short tasks and to be very responsive.

See 
https://elixir.bootlin.com/linux/v5.15-rc6/source/include/linux/workqueue.h#L355

>
> The other algorithm that can rescue this is to defer the cleanup work
> to the mutex unlock, which ever context happens to get to it:
>
> reset_done:
>     spin_lock(spin)
>     defered_reset = true;
>     if (!mutex_trylock(&state_mutex))
>        spin_unlock(spin)
>        return
>     spin_unlock(spin)
>
>     state_mutex_unlock()
>
> state_mutex_unlock:
>   again:
>     spin_lock(spin)
>     if (defered_reset)
>        spin_unlock()
>        do_post_reset;
>        goto again;
>     mutex_unlock(state_mutex);
>     spin_unlock()
>
> and call state_mutex_unlock() in all unlock cases.
>
> It is a much more complicated algorithm than the work.


Right, it seems much more complicated compared to the current code.

In addition, I would need to get into the details of the above
algorithm; I'm not sure that I got all of them.

>
> Yishai this should also have had a comment explaining why this is
> needed as nobody is going to guess a ABBA deadlock on mm_lock is the
> reason.

Sure, this can be done.

Are we fine to start/stay with the current simple approach that I wrote
and tested so far?

Yishai
Jason Gunthorpe Oct. 20, 2021, 4:46 p.m. UTC | #4
On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:

> What is the expectation for a reasonable delay ? we may expect this system
> WQ to run only short tasks and be very responsive.

If the expectation is that qemu will see the error return and then turn
around and issue an FLR followed by another state operation, then it
does seem strange that there would be a delay.

On the other hand, this doesn't seem that useful. If qemu tries to
migrate and the device fails then the migration operation is toast and
possibly the device is wrecked. It can't really issue a FLR without
coordinating with the VM, and it cannot resume the VM as the device is
now irrecoverably messed up.

If we look at this from a RAS perspective, what would be useful here
is a way for qemu to request fail-safe migration data. This must
always be available and cannot fail.

When the failsafe is loaded into the device it would trigger the
device's built-in RAS features to co-ordinate with the VM driver and
recover. Perhaps qemu would also have to inject an AER or something.

Basically instead of the device starting in an "empty ready to use
state" it would start in a "failure detected, needs recovery" state.

Not hitless, but preserves overall availability vs a failed migration
== VM crash.

That said, it is just a thought, and I don't know if anyone has put
any resources into what to do if migration operations fail right now.

But failure is possible, i.e. the physical device could have crashed and
perhaps the migration is to move the VMs off the broken HW. In this
scenario all the migration operations will time out and fail in the
driver.

However, since the guest VM could issue a FLR at any time, we really
shouldn't have this kind of operation floating around in the
background. Things must be made deterministic for qemu.

E.g. if qemu gets a guest request for FLR during the pre-copy stage, it
really should abort the pre-copy, issue the FLR and then restart the
migration. I think it is unreasonable to ask a device to be able to
maintain pre-copy across FLR.

To make this work, the restarting of the migration must not race with
scheduled work wiping out all the state.

So, regrettably, something is needed here.

Ideally more of this logic would be in shared code, but I'm not sure I
have a good feeling for what that should look like at this point.
Something to attempt once there are a few more implementations.

For instance, the if predicate ladder I mentioned in the last email
should be shared code, not driver code, as it is fundamental ABI.

Jason
Alex Williamson Oct. 20, 2021, 5:45 p.m. UTC | #5
On Wed, 20 Oct 2021 13:46:29 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:
> 
> > What is the expectation for a reasonable delay ? we may expect this system
> > WQ to run only short tasks and be very responsive.  
> 
> If the expectation is that qemu will see the error return and the turn
> around and issue FLR followed by another state operation then it does
> seem strange that there would be a delay.
> 
> On the other hand, this doesn't seem that useful. If qemu tries to
> migrate and the device fails then the migration operation is toast and
> possibly the device is wrecked. It can't really issue a FLR without
> coordinating with the VM, and it cannot resume the VM as the device is
> now irrecoverably messed up.
> 
> If we look at this from a RAS perspective would would be useful here
> is a way for qemu to request a fail safe migration data. This must
> always be available and cannot fail.
> 
> When the failsafe is loaded into the device it would trigger the
> device's built-in RAS features to co-ordinate with the VM driver and
> recover. Perhaps qemu would also have to inject an AER or something.
> 
> Basically instead of the device starting in an "empty ready to use
> state" it would start in a "failure detected, needs recovery" state.

The "fail-safe recovery state" is essentially the reset state of the
device.  If a device enters an error state during migration, I would
think the ultimate recovery procedure would be to abort the migration,
send an AER to the VM, whereby the guest would trigger a reset, and
the RAS capabilities of the guest would handle failing over to a
multipath device, ejecting the failing device, etc.

However, regardless of the migration recovery strategy, userspace needs
a means to get the device back into an initial state in a deterministic
way without closing and re-opening the device (or polling for an
arbitrary length of time).  That's the minimum viable product here.
Thanks,

Alex
Jason Gunthorpe Oct. 20, 2021, 6:57 p.m. UTC | #6
On Wed, Oct 20, 2021 at 11:45:14AM -0600, Alex Williamson wrote:
> On Wed, 20 Oct 2021 13:46:29 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:
> > 
> > > What is the expectation for a reasonable delay ? we may expect this system
> > > WQ to run only short tasks and be very responsive.  
> > 
> > If the expectation is that qemu will see the error return and the turn
> > around and issue FLR followed by another state operation then it does
> > seem strange that there would be a delay.
> > 
> > On the other hand, this doesn't seem that useful. If qemu tries to
> > migrate and the device fails then the migration operation is toast and
> > possibly the device is wrecked. It can't really issue a FLR without
> > coordinating with the VM, and it cannot resume the VM as the device is
> > now irrecoverably messed up.
> > 
> > If we look at this from a RAS perspective would would be useful here
> > is a way for qemu to request a fail safe migration data. This must
> > always be available and cannot fail.
> > 
> > When the failsafe is loaded into the device it would trigger the
> > device's built-in RAS features to co-ordinate with the VM driver and
> > recover. Perhaps qemu would also have to inject an AER or something.
> > 
> > Basically instead of the device starting in an "empty ready to use
> > state" it would start in a "failure detected, needs recovery" state.
> 
> The "fail-safe recovery state" is essentially the reset state of the
> device.

This is only the case if qemu does work to isolate the recently FLR'd
device from the VM until the VM acknowledges that it understands it is
FLR'd.

At least it would have to remove it from CPU access and the IOMMU, as
though the memory enable bit was cleared.

Is it reasonable to do this using just qemu, AER and no device
support?

> If a device enters an error state during migration, I would
> think the ultimate recovery procedure would be to abort the migration,
> send an AER to the VM, whereby the guest would trigger a reset, and
> the RAS capabilities of the guest would handle failing over to a
> multipath device, ejecting the failing device, etc.

Yes, this is my thinking, except I would not abort the migration but
continue on to the new hypervisor and then do the RAS recovery with
the new device.

Jason
Alex Williamson Oct. 20, 2021, 9:38 p.m. UTC | #7
On Wed, 20 Oct 2021 15:57:21 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 20, 2021 at 11:45:14AM -0600, Alex Williamson wrote:
> > On Wed, 20 Oct 2021 13:46:29 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:
> > >   
> > > > What is the expectation for a reasonable delay ? we may expect this system
> > > > WQ to run only short tasks and be very responsive.    
> > > 
> > > If the expectation is that qemu will see the error return and the turn
> > > around and issue FLR followed by another state operation then it does
> > > seem strange that there would be a delay.
> > > 
> > > On the other hand, this doesn't seem that useful. If qemu tries to
> > > migrate and the device fails then the migration operation is toast and
> > > possibly the device is wrecked. It can't really issue a FLR without
> > > coordinating with the VM, and it cannot resume the VM as the device is
> > > now irrecoverably messed up.
> > > 
> > > If we look at this from a RAS perspective would would be useful here
> > > is a way for qemu to request a fail safe migration data. This must
> > > always be available and cannot fail.
> > > 
> > > When the failsafe is loaded into the device it would trigger the
> > > device's built-in RAS features to co-ordinate with the VM driver and
> > > recover. Perhaps qemu would also have to inject an AER or something.
> > > 
> > > Basically instead of the device starting in an "empty ready to use
> > > state" it would start in a "failure detected, needs recovery" state.  
> > 
> > The "fail-safe recovery state" is essentially the reset state of the
> > device.  
> 
> This is only the case if qemu does work to isolate the recently FLR'd
> device from the VM until the VM acknowledges that it understands it is
> FLR'd.
> 
> At least it would have to remove it from CPU access and the IOMMU, as
> though the memory enable bit was cleared.
> 
> Is it reasonable to do this using just qemu, AER and no device
> support?

I suspect yes, worst case could be a surprise hot-remove or DPC event,
but IIRC Linux will reset a device on a fatal AER error regardless of
the driver.

> > If a device enters an error state during migration, I would
> > think the ultimate recovery procedure would be to abort the migration,
> > send an AER to the VM, whereby the guest would trigger a reset, and
> > the RAS capabilities of the guest would handle failing over to a
> > multipath device, ejecting the failing device, etc.  
> 
> Yes, this is my thinking, except I would not abort the migration but
> continue on to the new hypervisor and then do the RAS recovery with
> the new device.

Potentially a valid option: QEMU might optionally insert a subsection in
the migration stream to indicate the device failed during the migration
process.  The option might also allow migrating devices that don't
support migration, i.e. the recovery process on the target is the same.
This is essentially a policy decision and I think QEMU probably leans
more towards failing the migration and letting a management tool
decide on the next course of action.  Thanks,

Alex
Yishai Hadas Oct. 21, 2021, 10:39 a.m. UTC | #8
On 10/20/2021 8:45 PM, Alex Williamson wrote:
> However, regardless of the migration recovery strategy, userspace needs
> a means to get the device back into an initial state in a deterministic
> way without closing and re-opening the device (or polling for an
> arbitrary length of time).  That's the minimum viable product here.
> Thanks,
>
> Alex
>

OK, in V3 I'll use the algorithm that Jason suggested in this mail
thread so that things are done in a deterministic way without using a WQ.

Thanks,
Yishai

Patch

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 621b7fc60544..b759c6e153b6 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -58,6 +58,7 @@  struct mlx5vf_pci_core_device {
 	/* protect migartion state */
 	struct mutex state_mutex;
 	struct mlx5vf_pci_migration_info vmig;
+	struct work_struct work;
 };
 
 static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev)
@@ -615,6 +616,27 @@  static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.match = vfio_pci_core_match,
 };
 
+static void mlx5vf_reset_work_handler(struct work_struct *work)
+{
+	struct mlx5vf_pci_core_device *mvdev =
+		container_of(work, struct mlx5vf_pci_core_device, work);
+
+	mutex_lock(&mvdev->state_mutex);
+	mlx5vf_reset_mig_state(mvdev);
+	mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
+	mutex_unlock(&mvdev->state_mutex);
+}
+
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
+
+	if (!mvdev->migrate_cap)
+		return;
+
+	schedule_work(&mvdev->work);
+}
+
 static int mlx5vf_pci_probe(struct pci_dev *pdev,
 			    const struct pci_device_id *id)
 {
@@ -634,6 +656,8 @@  static int mlx5vf_pci_probe(struct pci_dev *pdev,
 			if (MLX5_CAP_GEN(mdev, migration)) {
 				mvdev->migrate_cap = 1;
 				mutex_init(&mvdev->state_mutex);
+				INIT_WORK(&mvdev->work,
+					  mlx5vf_reset_work_handler);
 			}
 			mlx5_vf_put_core_dev(mdev);
 		}
@@ -656,6 +680,8 @@  static void mlx5vf_pci_remove(struct pci_dev *pdev)
 {
 	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
 
+	if (mvdev->migrate_cap)
+		cancel_work_sync(&mvdev->work);
 	vfio_pci_core_unregister_device(&mvdev->core_device);
 	vfio_pci_core_uninit_device(&mvdev->core_device);
 	kfree(mvdev);
@@ -668,12 +694,17 @@  static const struct pci_device_id mlx5vf_pci_table[] = {
 
 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
 
+const struct pci_error_handlers mlx5vf_err_handlers = {
+	.reset_done = mlx5vf_pci_aer_reset_done,
+	.error_detected = vfio_pci_aer_err_detected,
+};
+
 static struct pci_driver mlx5vf_pci_driver = {
 	.name = KBUILD_MODNAME,
 	.id_table = mlx5vf_pci_table,
 	.probe = mlx5vf_pci_probe,
 	.remove = mlx5vf_pci_remove,
-	.err_handler = &vfio_pci_core_err_handlers,
+	.err_handler = &mlx5vf_err_handlers,
 };
 
 static void __exit mlx5vf_pci_cleanup(void)