[V7,mlx5-next,14/15] vfio/mlx5: Use its own PCI reset_done error handler

Message ID 20220207172216.206415-15-yishaih@nvidia.com
State Superseded
Series Add mlx5 live migration driver and v2 migration protocol

Commit Message

Yishai Hadas Feb. 7, 2022, 5:22 p.m. UTC
Register its own handler for pci_error_handlers.reset_done and update
state accordingly.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/pci/mlx5/main.c | 57 ++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

Comments

Alex Williamson Feb. 9, 2022, 12:08 a.m. UTC | #1
On Mon, 7 Feb 2022 19:22:15 +0200
Yishai Hadas <yishaih@nvidia.com> wrote:

> Register its own handler for pci_error_handlers.reset_done and update
> state accordingly.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/vfio/pci/mlx5/main.c | 57 ++++++++++++++++++++++++++++++++++--
>  1 file changed, 55 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
> index acd205bcff70..63a889210ef3 100644
> --- a/drivers/vfio/pci/mlx5/main.c
> +++ b/drivers/vfio/pci/mlx5/main.c
> @@ -28,9 +28,12 @@
>  struct mlx5vf_pci_core_device {
>  	struct vfio_pci_core_device core_device;
>  	u8 migrate_cap:1;
> +	u8 deferred_reset:1;
>  	/* protect migration state */
>  	struct mutex state_mutex;
>  	enum vfio_device_mig_state mig_state;
> +	/* protect the reset_done flow */
> +	spinlock_t reset_lock;
>  	u16 vhca_id;
>  	struct mlx5_vf_migration_file *resuming_migf;
>  	struct mlx5_vf_migration_file *saving_migf;
> @@ -437,6 +440,25 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
>  	return ERR_PTR(-EINVAL);
>  }
>  
> +/*
> + * This function is called in all state_mutex unlock cases to
> + * handle a 'deferred_reset' if exists.
> + */
> +static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
> +{
> +again:
> +	spin_lock(&mvdev->reset_lock);
> +	if (mvdev->deferred_reset) {
> +		mvdev->deferred_reset = false;
> +		spin_unlock(&mvdev->reset_lock);
> +		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> +		mlx5vf_disable_fds(mvdev);
> +		goto again;
> +	}
> +	mutex_unlock(&mvdev->state_mutex);
> +	spin_unlock(&mvdev->reset_lock);
> +}
> +
>  static struct file *
>  mlx5vf_pci_set_device_state(struct vfio_device *vdev,
>  			    enum vfio_device_mig_state new_state)
> @@ -465,7 +487,7 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev,
>  			break;
>  		}
>  	}
> -	mutex_unlock(&mvdev->state_mutex);
> +	mlx5vf_state_mutex_unlock(mvdev);
>  	return res;
>  }
>  
> @@ -477,10 +499,34 @@ static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
>  
>  	mutex_lock(&mvdev->state_mutex);
>  	*curr_state = mvdev->mig_state;
> -	mutex_unlock(&mvdev->state_mutex);
> +	mlx5vf_state_mutex_unlock(mvdev);
>  	return 0;

I still can't see why it wouldn't be both fairly trivial to implement
and a usability improvement if the unlock wrapper returned -EAGAIN on a
deferred reset, so we could avoid returning a stale state to the user
and a dead fd in the former case.  Thanks,

Alex
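
For illustration, a minimal sketch of the variant Alex describes: the unlock
helper reporting whether a deferred reset was consumed so callers could return
-EAGAIN instead of stale state. This is hypothetical and not part of the posted
patch; the names simply mirror the driver.

static int mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
	int ret = 0;

again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		/* whatever the caller observed under state_mutex is now stale */
		ret = -EAGAIN;
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	/* propagate -EAGAIN if a reset raced with this query */
	return mlx5vf_state_mutex_unlock(mvdev);
}
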
Jason Gunthorpe Feb. 9, 2022, 2:39 a.m. UTC | #2
On Tue, Feb 08, 2022 at 05:08:01PM -0700, Alex Williamson wrote:
> > @@ -477,10 +499,34 @@ static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
> >  
> >  	mutex_lock(&mvdev->state_mutex);
> >  	*curr_state = mvdev->mig_state;
> > -	mutex_unlock(&mvdev->state_mutex);
> > +	mlx5vf_state_mutex_unlock(mvdev);
> >  	return 0;
> 
> I still can't see why it wouldn't be both fairly trivial to implement
> and a usability improvement if the unlock wrapper returned -EAGAIN on a
> deferred reset, so we could avoid returning a stale state to the user
> and a dead fd in the former case.  Thanks,

It simply is not useful - again, we always resolve this race that
should never happen as though the two events happened consecutively,
which is what would normally happen if we could use a simple mutex. We
do not need to add any more complexity to deal with this already
troublesome thing.

Jason
Alex Williamson Feb. 10, 2022, 4:48 p.m. UTC | #3
On Tue, 8 Feb 2022 22:39:18 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Feb 08, 2022 at 05:08:01PM -0700, Alex Williamson wrote:
> > > @@ -477,10 +499,34 @@ static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
> > >  
> > >  	mutex_lock(&mvdev->state_mutex);
> > >  	*curr_state = mvdev->mig_state;
> > > -	mutex_unlock(&mvdev->state_mutex);
> > > +	mlx5vf_state_mutex_unlock(mvdev);
> > >  	return 0;  
> > 
> > I still can't see why it wouldn't be both fairly trivial to implement
> > and a usability improvement if the unlock wrapper returned -EAGAIN on a
> > deferred reset, so we could avoid returning a stale state to the user
> > and a dead fd in the former case.  Thanks,
> 
> It simply is not useful - again, we always resolve this race that
> should never happen as though the two events happened consecutively,
> which is what would normally happen if we could use a simple mutex. We
> do not need to add any more complexity to deal with this already
> troublesome thing.

So walk me through how this works with QEMU; it's easy to hand-wave a
userspace race and move on, but a device reset can be triggered by guest
behavior while migration is supposed to be transparent to the guest.
These are essentially asynchronous threads where we're either imposing a
synchronization point or requiring lots of double-checking in userspace
of whether the device actually entered the state we think it did and
whether the returned FD is usable.

Specifically, I suspect we can trigger this race if the VM reboots as
we're initiating a migration in the STOP_COPY phase, but that's maybe
less interesting if we expect the VM to be halted before the device
state is stepped.  More interesting might be how a PRE_COPY transition
works relative to asynchronous VM resets triggering device resets.  Are
we serializing all access to reset vs this DEVICE_FEATURE op or are we
resorting to double checking the device state, and how do we plan to
re-initiate migration states if a VM reset occurs during migration?
Thanks,

Alex
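
As a point of reference for the double-checking concern, here is an
illustrative userspace sketch (not QEMU code) of querying the v2 migration
state through VFIO_DEVICE_FEATURE, assuming the uAPI added earlier in this
series is available via <linux/vfio.h>; buffer handling and error paths are
simplified.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Query the current migration state of an open vfio device fd. */
static int query_mig_state(int device_fd, uint32_t *state)
{
	/* uint64_t backing keeps the feature payload suitably aligned */
	uint64_t buf[(sizeof(struct vfio_device_feature) +
		      sizeof(struct vfio_device_feature_mig_state) +
		      sizeof(uint64_t) - 1) / sizeof(uint64_t)];
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(*feature) + sizeof(*mig);
	feature->flags = VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;		/* errno set by the kernel */

	*state = mig->device_state;	/* e.g. VFIO_DEVICE_STATE_RUNNING */
	return 0;
}
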
Jason Gunthorpe Feb. 10, 2022, 5:27 p.m. UTC | #4
On Thu, Feb 10, 2022 at 09:48:11AM -0700, Alex Williamson wrote:

> Specifically, I suspect we can trigger this race if the VM reboots as
> we're initiating a migration in the STOP_COPY phase, but that's maybe
> less interesting if we expect the VM to be halted before the device
> state is stepped.  

Yes, STOP_COPY drivers like mlx5/acc are fine here inherently.

We have already restricted what device touches are allowed in
STOP_COPY, and this must include reset too. Neither of the two drivers
posted can tolerate a reset during the serialization step.

mlx5 will fail the STOP_COPY FW command and I guess acc will 'tear'
its register reads and produce a corrupted state.

> More interesting might be how a PRE_COPY transition works relative
> to asynchronous VM resets triggering device resets.  Are we
> serializing all access to reset vs this DEVICE_FEATURE op or are we
> resorting to double checking the device state, and how do we plan to
> re-initiate migration states if a VM reset occurs during migration?
> Thanks,

The device will be in PRE_COPY with vCPUs running. An async reset will
be triggered in the guest, so the device returns to RUNNING and the
data_fds immediately return an errno.

There are three ways qemu can observe this:

 1) it is actively using the data_fds, so it immediately gets an
    error and propagates it up, aborting the migration,
    e.g. it is doing read(), poll(), io_uring, etc.

 2) it is done with the PRE_COPY phase of the data_fd and is moving
    toward STOP_COPY.
    In this case the vCPU is halted and the SET_STATE to STOP_COPY
    will execute, without any race, either:
      PRE_COPY -> STOP_COPY (data_fd == -1)
      RUNNING -> STOP_COPY (data_fd != -1)
    The expected data_fd is detected in the WIP qemu patch; however, it
    mishandles the error, which we will fix.

 3) it is aborting the PRE_COPY migration, closing the data_fd and
    doing SET_STATE to RUNNING, in which case it doesn't know the
    device was reset. close() succeeds and SET_STATE RUNNING -> RUNNING
    is a nop.

Today's qemu must abort the migration at this point and fully restart
it because it has no mechanism to serialize a 'discard all of this
device's PRE_COPY state up to here' tag.

Some future qemu could learn to do this and then the receiver would
discard already sent device state - by triggering reset and a new
RUNNING -> RESUMING on the receiving device. In this case qemu would
have a choice of:
  abort the entire migration
  restart just this device back to PRE_COPY
  stop the vCPUs and use STOP_COPY

In any case, qemu fully detects this race as a natural part of its
operations and knows with certainty, when it commands the transition to
STOP_COPY with vCPUs halted, whether the preceding PRE_COPY state is
correct or not.

It is interesting you bring this up; I'm not sure this worked properly
with v1. It seems we have solved it, inadvertently even, by using the
basic logic of the FSM and FD.

Jason
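
To make case (1) in the list above concrete, a hypothetical userspace fragment
(not taken from QEMU) that drains the PRE_COPY data_fd; once the device is
reset the read fails and the error is propagated up so the migration can be
aborted. The helper and callback names are made up for illustration.

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

/*
 * Copy pre-copy migration data from data_fd into the migration stream.
 * send_chunk() stands in for whatever the VMM uses to forward the bytes.
 * Returns 0 when the fd reports no more data, a negative errno otherwise.
 */
static int drain_precopy(int data_fd, void *buf, size_t len,
			 int (*send_chunk)(const void *data, size_t n))
{
	for (;;) {
		ssize_t n = read(data_fd, buf, len);

		if (n == 0)
			return 0;	/* end of the currently available data */
		if (n < 0) {
			if (errno == EINTR)
				continue;
			/* e.g. the device was reset: abort the migration */
			return -errno;
		}
		if (send_chunk(buf, n))
			return -EIO;	/* migration stream error */
	}
}
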

Patch

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index acd205bcff70..63a889210ef3 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -28,9 +28,12 @@ 
 struct mlx5vf_pci_core_device {
 	struct vfio_pci_core_device core_device;
 	u8 migrate_cap:1;
+	u8 deferred_reset:1;
 	/* protect migration state */
 	struct mutex state_mutex;
 	enum vfio_device_mig_state mig_state;
+	/* protect the reset_done flow */
+	spinlock_t reset_lock;
 	u16 vhca_id;
 	struct mlx5_vf_migration_file *resuming_migf;
 	struct mlx5_vf_migration_file *saving_migf;
@@ -437,6 +440,25 @@  mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 	return ERR_PTR(-EINVAL);
 }
 
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
+{
+again:
+	spin_lock(&mvdev->reset_lock);
+	if (mvdev->deferred_reset) {
+		mvdev->deferred_reset = false;
+		spin_unlock(&mvdev->reset_lock);
+		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		mlx5vf_disable_fds(mvdev);
+		goto again;
+	}
+	mutex_unlock(&mvdev->state_mutex);
+	spin_unlock(&mvdev->reset_lock);
+}
+
 static struct file *
 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
 			    enum vfio_device_mig_state new_state)
@@ -465,7 +487,7 @@  mlx5vf_pci_set_device_state(struct vfio_device *vdev,
 			break;
 		}
 	}
-	mutex_unlock(&mvdev->state_mutex);
+	mlx5vf_state_mutex_unlock(mvdev);
 	return res;
 }
 
@@ -477,10 +499,34 @@  static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
 
 	mutex_lock(&mvdev->state_mutex);
 	*curr_state = mvdev->mig_state;
-	mutex_unlock(&mvdev->state_mutex);
+	mlx5vf_state_mutex_unlock(mvdev);
 	return 0;
 }
 
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
+
+	if (!mvdev->migrate_cap)
+		return;
+
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken already we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&mvdev->reset_lock);
+	mvdev->deferred_reset = true;
+	if (!mutex_trylock(&mvdev->state_mutex)) {
+		spin_unlock(&mvdev->reset_lock);
+		return;
+	}
+	spin_unlock(&mvdev->reset_lock);
+	mlx5vf_state_mutex_unlock(mvdev);
+}
+
 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
 {
 	struct mlx5vf_pci_core_device *mvdev = container_of(
@@ -562,6 +608,7 @@  static int mlx5vf_pci_probe(struct pci_dev *pdev,
 					VFIO_MIGRATION_STOP_COPY |
 					VFIO_MIGRATION_P2P;
 				mutex_init(&mvdev->state_mutex);
+				spin_lock_init(&mvdev->reset_lock);
 			}
 			mlx5_vf_put_core_dev(mdev);
 		}
@@ -596,11 +643,17 @@  static const struct pci_device_id mlx5vf_pci_table[] = {
 
 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
 
+static const struct pci_error_handlers mlx5vf_err_handlers = {
+	.reset_done = mlx5vf_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
 static struct pci_driver mlx5vf_pci_driver = {
 	.name = KBUILD_MODNAME,
 	.id_table = mlx5vf_pci_table,
 	.probe = mlx5vf_pci_probe,
 	.remove = mlx5vf_pci_remove,
+	.err_handler = &mlx5vf_err_handlers,
 };
 
 static void __exit mlx5vf_pci_cleanup(void)