[V4,mlx5-next,13/13] vfio/mlx5: Use its own PCI reset_done error handler

Message ID: 20211026090605.91646-14-yishaih@nvidia.com
State: New, archived
Series: Add mlx5 live migration driver

Commit Message

Yishai Hadas Oct. 26, 2021, 9:06 a.m. UTC
Register its own handler for pci_error_handlers.reset_done and update
state accordingly.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/vfio/pci/mlx5/main.c | 54 ++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

Comments

Alex Williamson Oct. 26, 2021, 11:16 p.m. UTC | #1
On Tue, 26 Oct 2021 12:06:05 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> Register its own handler for pci_error_handlers.reset_done and update
> state accordingly.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/pci/mlx5/main.c | 54 ++++++++++++++++++++++++++++++++++--
>  1 file changed, 52 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
> index 4b21b388dcc5..c157f540d384 100644
> --- a/drivers/vfio/pci/mlx5/main.c
> +++ b/drivers/vfio/pci/mlx5/main.c
> @@ -55,8 +55,11 @@ struct mlx5vf_pci_migration_info {
>  struct mlx5vf_pci_core_device {
>  	struct vfio_pci_core_device core_device;
>  	u8 migrate_cap:1;
> +	u8 defered_reset:1;

s/defered/deferred/ throughout

>  	/* protect migration state */
>  	struct mutex state_mutex;
> +	/* protect the reset_done flow */
> +	spinlock_t reset_lock;
>  	struct mlx5vf_pci_migration_info vmig;
>  };
>  
> @@ -471,6 +474,47 @@ mlx5vf_pci_migration_data_rw(struct mlx5vf_pci_core_device *mvdev,
>  	return count;
>  }
>  
> +/* This function is called in all state_mutex unlock cases to
> + * handle a 'defered_reset' if exists.
> + */

I refrained from noting it elsewhere, but we're not in net/ or
drivers/net/ here, but we're using their multi-line comment style.  Are
we using the strong relation to a driver that does belong there as
justification for the style here?
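
For reference, a minimal illustration of the two styles in question (not
taken from the patch):

  /* net/ and drivers/net/ style: the text starts
   * on the opening line of the comment block.
   */

  /*
   * Preferred style elsewhere in the kernel: the
   * opening line of the block is left empty.
   */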

> +static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
> +{
> +again:
> +	spin_lock(&mvdev->reset_lock);
> +	if (mvdev->defered_reset) {
> +		mvdev->defered_reset = false;
> +		spin_unlock(&mvdev->reset_lock);
> +		mlx5vf_reset_mig_state(mvdev);
> +		mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
> +		goto again;
> +	}
> +	mutex_unlock(&mvdev->state_mutex);
> +	spin_unlock(&mvdev->reset_lock);
> +}
> +
> +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
> +
> +	if (!mvdev->migrate_cap)
> +		return;
> +
> +	/* As the higher VFIO layers are holding locks across reset and using
> +	 * those same locks with the mm_lock we need to prevent ABBA deadlock
> +	 * with the state_mutex and mm_lock.
> +	 * In case the state_mutex was taken alreday we differ the cleanup work

s/alreday/already/  s/differ/defer/ 

> +	 * to the unlock flow of the other running context.
> +	 */
> +	spin_lock(&mvdev->reset_lock);
> +	mvdev->defered_reset = true;
> +	if (!mutex_trylock(&mvdev->state_mutex)) {
> +		spin_unlock(&mvdev->reset_lock);
> +		return;
> +	}
> +	spin_unlock(&mvdev->reset_lock);
> +	mlx5vf_state_mutex_unlock(mvdev);
> +}
> +
>  static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
>  				 char __user *buf, size_t count, loff_t *ppos,
>  				 bool iswrite)
> @@ -539,7 +583,7 @@ static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
>  	}
>  
>  end:
> -	mutex_unlock(&mvdev->state_mutex);
> +	mlx5vf_state_mutex_unlock(mvdev);

I'm a little lost here, if the operation was to read the device_state
and mvdev->vmig.vfio_dev_state was error, that's already been copied to
the user buffer, so the user continues to see the error state for the
first read of device_state after reset if they encounter this race?
Thanks,

Alex

>  	return ret;
>  }
>  
> @@ -634,6 +678,7 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev,
>  			if (MLX5_CAP_GEN(mdev, migration)) {
>  				mvdev->migrate_cap = 1;
>  				mutex_init(&mvdev->state_mutex);
> +				spin_lock_init(&mvdev->reset_lock);
>  			}
>  			mlx5_vf_put_core_dev(mdev);
>  		}
> @@ -668,12 +713,17 @@ static const struct pci_device_id mlx5vf_pci_table[] = {
>  
>  MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
>  
> +const struct pci_error_handlers mlx5vf_err_handlers = {
> +	.reset_done = mlx5vf_pci_aer_reset_done,
> +	.error_detected = vfio_pci_aer_err_detected,
> +};
> +
>  static struct pci_driver mlx5vf_pci_driver = {
>  	.name = KBUILD_MODNAME,
>  	.id_table = mlx5vf_pci_table,
>  	.probe = mlx5vf_pci_probe,
>  	.remove = mlx5vf_pci_remove,
> -	.err_handler = &vfio_pci_core_err_handlers,
> +	.err_handler = &mlx5vf_err_handlers,
>  };
>  
>  static void __exit mlx5vf_pci_cleanup(void)
Jason Gunthorpe Oct. 26, 2021, 11:50 p.m. UTC | #2
On Tue, Oct 26, 2021 at 05:16:44PM -0600, Alex Williamson wrote:
> > @@ -471,6 +474,47 @@ mlx5vf_pci_migration_data_rw(struct mlx5vf_pci_core_device *mvdev,
> >  	return count;
> >  }
> >  
> > +/* This function is called in all state_mutex unlock cases to
> > + * handle a 'defered_reset' if exists.
> > + */
> 
> I refrained from noting it elsewhere, but we're not in net/ or
> drivers/net/ here, but we're using their multi-line comment style.  Are
> we using the strong relation to a driver that does belong there as
> justification for the style here?

I think it is an oversight, tell Yishai you prefer the other format in
drivers/vfio and it can be fixed

> > @@ -539,7 +583,7 @@ static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
> >  	}
> >  
> >  end:
> > -	mutex_unlock(&mvdev->state_mutex);
> > +	mlx5vf_state_mutex_unlock(mvdev);
> 
> I'm a little lost here, if the operation was to read the device_state
> and mvdev->vmig.vfio_dev_state was error, that's already been copied to
> the user buffer, so the user continues to see the error state for the
> first read of device_state after reset if they encounter this race?

Yes. If the userspace races ioctls they get a deserved mess.

This race exists no matter what we do: as soon as the unlock happens, a
racing reset ioctl could run during the system call exit path.

The purpose of the locking is to protect the kernel from hostile
userspace, not to allow userspace to execute concurrent ioctl's in a
sensible way.

Jason
Alex Williamson Oct. 27, 2021, 3:29 p.m. UTC | #3
On Tue, 26 Oct 2021 20:50:02 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Oct 26, 2021 at 05:16:44PM -0600, Alex Williamson wrote:
> > > @@ -471,6 +474,47 @@ mlx5vf_pci_migration_data_rw(struct mlx5vf_pci_core_device *mvdev,
> > >  	return count;
> > >  }
> > >  
> > > +/* This function is called in all state_mutex unlock cases to
> > > + * handle a 'defered_reset' if exists.
> > > + */  
> > 
> > I refrained from noting it elsewhere, but we're not in net/ or
> > drivers/net/ here, but we're using their multi-line comment style.  Are
> > we using the strong relation to a driver that does belong there as
> > justification for the style here?  
> 
> I think it is an oversight, tell Yishai you prefer the other format in
> drivers/vfio and it can be fixed

Seems fixed in the new version.

> > > @@ -539,7 +583,7 @@ static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
> > >  	}
> > >  
> > >  end:
> > > -	mutex_unlock(&mvdev->state_mutex);
> > > +	mlx5vf_state_mutex_unlock(mvdev);  
> > 
> > I'm a little lost here, if the operation was to read the device_state
> > and mvdev->vmig.vfio_dev_state was error, that's already been copied to
> > the user buffer, so the user continues to see the error state for the
> > first read of device_state after reset if they encounter this race?  
> 
> Yes. If the userspace races ioctls they get a deserved mess.
> 
> This race exists no matter what we do: as soon as the unlock happens, a
> racing reset ioctl could run during the system call exit path.
> 
> The purpose of the locking is to protect the kernel from hostile
> userspace, not to allow userspace to execute concurrent ioctl's in a
> sensible way.

The reset_done handler sets deferred_reset = true and if it's possible
to get the state_mutex, will reset migration data and device_state as
part of releasing that mutex.  If there's contention on state_mutex,
the deferred_reset field flags that this migration state is still stale.

So, I assume that it's possible that a user resets the device via ioctl
or config space, there was contention and the migration state is still
stale, right?

The user then goes to read device_state, but the staleness of the
migration state is not resolved until *after* the stale device state is
copied to the user buffer.

What did the user do wrong to see stale data?  Thanks,

Alex
Jason Gunthorpe Oct. 27, 2021, 3:53 p.m. UTC | #4
On Wed, Oct 27, 2021 at 09:29:43AM -0600, Alex Williamson wrote:

> The reset_done handler sets deferred_reset = true and if it's possible
> to get the state_mutex, will reset migration data and device_state as
> part of releasing that mutex.  If there's contention on state_mutex,
> the deferred_reset field flags that this migration state is still stale.
> 
> So, I assume that it's possible that a user resets the device via ioctl
> or config space, there was contention and the migration state is still
> stale, right?

If this occurs it is a userspace bug and the goal here is to maintain
kernel integrity.

> The user then goes to read device_state, but the staleness of the
> migration state is not resolved until *after* the stale device state is
> copied to the user buffer.

This is not preventable in the general case. Assume we have sane
locking and it looks like this:

   CPU0                            CPU1
  ioctl state change
    mutex_lock
    copy_to_user(state == !RUNNING)
    mutex_unlock
                               ioctl reset
                                 mutex_lock
                                 state = RUNNING
                                 mutex_unlock
                               return to userspace
  return to userspace
  Userspace sees state != RUNNING

Same issue. Userspace cannot race state-manipulating ioctls and expect
things to make any sense.

In all cases contention on the mutex during reset causes the reset to
order after the mutex is released. This is true with this approach and
it is true with a simple direct use of mutex.

In either case userspace will see incoherent results, and it is
userspace error to try and run the kernel ioctls this way.

> What did the user do wrong to see stale data?  Thanks,

Userspace allowed two state-affecting IOCTLs to run concurrently.

Userspace must block reset while it is manipulating migration states.
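
A sketch of that serialization on the userspace side, assuming the v1
region-based API used by this series (device_state is the first field of
the migration region) and that mig_off, the file offset of that region,
was already found via VFIO_DEVICE_GET_REGION_INFO; the single lock is
the point here, not the particular calls:

  #include <errno.h>
  #include <pthread.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/types.h>
  #include <linux/vfio.h>

  /* One lock serializes reset against all migration state manipulation. */
  static pthread_mutex_t mig_lock = PTHREAD_MUTEX_INITIALIZER;

  static int set_device_state(int device_fd, off_t mig_off, __u32 state)
  {
          int ret = 0;

          pthread_mutex_lock(&mig_lock);
          /* device_state sits at offset 0 of the migration region */
          if (pwrite(device_fd, &state, sizeof(state), mig_off) != sizeof(state))
                  ret = -errno;
          pthread_mutex_unlock(&mig_lock);
          return ret;
  }

  static int reset_device(int device_fd)
  {
          int ret;

          pthread_mutex_lock(&mig_lock);
          ret = ioctl(device_fd, VFIO_DEVICE_RESET);
          pthread_mutex_unlock(&mig_lock);
          return ret;
  }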

Jason
Alex Williamson Oct. 27, 2021, 4:48 p.m. UTC | #5
On Wed, 27 Oct 2021 12:53:39 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 27, 2021 at 09:29:43AM -0600, Alex Williamson wrote:
> 
> > The reset_done handler sets deferred_reset = true and if it's possible
> > to get the state_mutex, will reset migration data and device_state as
> > part of releasing that mutex.  If there's contention on state_mutex,
> > the deferred_reset field flags that this migration state is still stale.
> > 
> > So, I assume that it's possible that a user resets the device via ioctl
> > or config space, there was contention and the migration state is still
> > stale, right?  
> 
> If this occurs it is a userspace bug and the goal here is to maintain
> kernel integrity.
> 
> > The user then goes to read device_state, but the staleness of the
> > migration state is not resolved until *after* the stale device state is
> > copied to the user buffer.  
> 
> This is not preventable in the general case. Assume we have sane
> locking and it looks like this:
> 
>    CPU0                            CPU1
>   ioctl state change
>     mutex_lock
>     copy_to_user(state == !RUNNING)
>     mutex_unlock
>                                ioctl reset
>                                  mutex_lock
>                                  state = RUNNING
>                                  mutex_unlock
>                                return to userspace
>   return to userspace
>   Userspace sees state != RUNNING
> 
> Same issue. Userspace cannot race state-manipulating ioctls and expect
> things to make any sense.
> 
> In all cases contention on the mutex during reset causes the reset to
> order after the mutex is released. This is true with this approach and
> it is true with a simple direct use of mutex.
> 
> In either case userspace will see incoherent results, and it is
> userspace error to try and run the kernel ioctls this way.
> 
> > What did the user do wrong to see stale data?  Thanks,  
> 
> Userspace allowed two state-affecting IOCTLs to run concurrently.
> 
> Userspace must block reset while it is manipulating migration states.

Ok, I see.  I didn't digest that contention on state_mutex can only
occur from a concurrent migration region access and the stale state is
resolved at the end of that concurrent access, not some subsequent
access.  I agree we have no obligation to resolve anything about the
state that concurrent access would see.  Thanks,

Alex
Jason Gunthorpe Oct. 27, 2021, 4:53 p.m. UTC | #6
On Wed, Oct 27, 2021 at 10:48:55AM -0600, Alex Williamson wrote:

> Ok, I see.  I didn't digest that contention on state_mutex can only
> occur from a concurrent migration region access and the stale state is
> resolved at the end of that concurrent access, not some subsequent
> access.

Ah, I see, yes, that is tricky - the spinlock around the mutex
provides the guarantee: deferral cannot be set at mutex_lock() time.
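
Roughly, following the code in this patch, the contended case looks like:

   reset_done                         state_mutex holder, unlock path
  spin_lock(reset_lock)
  defered_reset = true
  mutex_trylock() fails               ... still holds state_mutex
  spin_unlock(reset_lock)
  return
                                      spin_lock(reset_lock)
                                      defered_reset set ->
                                        drop reset_lock, reset the
                                        migration state, goto again
                                      spin_lock(reset_lock)
                                      defered_reset clear ->
                                        mutex_unlock(state_mutex)
                                        spin_unlock(reset_lock)

If the trylock succeeds instead, reset_done owns state_mutex itself and
processes the deferral through the same mlx5vf_state_mutex_unlock() path.
Since the final defered_reset check and mutex_unlock() happen together
under reset_lock, a deferral can never be left pending with no context
responsible for handling it.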

Jason

Patch

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 4b21b388dcc5..c157f540d384 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -55,8 +55,11 @@  struct mlx5vf_pci_migration_info {
 struct mlx5vf_pci_core_device {
 	struct vfio_pci_core_device core_device;
 	u8 migrate_cap:1;
+	u8 defered_reset:1;
 	/* protect migration state */
 	struct mutex state_mutex;
+	/* protect the reset_done flow */
+	spinlock_t reset_lock;
 	struct mlx5vf_pci_migration_info vmig;
 };
 
@@ -471,6 +474,47 @@  mlx5vf_pci_migration_data_rw(struct mlx5vf_pci_core_device *mvdev,
 	return count;
 }
 
+/* This function is called in all state_mutex unlock cases to
+ * handle a 'defered_reset' if exists.
+ */
+static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
+{
+again:
+	spin_lock(&mvdev->reset_lock);
+	if (mvdev->defered_reset) {
+		mvdev->defered_reset = false;
+		spin_unlock(&mvdev->reset_lock);
+		mlx5vf_reset_mig_state(mvdev);
+		mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
+		goto again;
+	}
+	mutex_unlock(&mvdev->state_mutex);
+	spin_unlock(&mvdev->reset_lock);
+}
+
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
+
+	if (!mvdev->migrate_cap)
+		return;
+
+	/* As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken alreday we differ the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&mvdev->reset_lock);
+	mvdev->defered_reset = true;
+	if (!mutex_trylock(&mvdev->state_mutex)) {
+		spin_unlock(&mvdev->reset_lock);
+		return;
+	}
+	spin_unlock(&mvdev->reset_lock);
+	mlx5vf_state_mutex_unlock(mvdev);
+}
+
 static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
 				 char __user *buf, size_t count, loff_t *ppos,
 				 bool iswrite)
@@ -539,7 +583,7 @@  static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
 	}
 
 end:
-	mutex_unlock(&mvdev->state_mutex);
+	mlx5vf_state_mutex_unlock(mvdev);
 	return ret;
 }
 
@@ -634,6 +678,7 @@  static int mlx5vf_pci_probe(struct pci_dev *pdev,
 			if (MLX5_CAP_GEN(mdev, migration)) {
 				mvdev->migrate_cap = 1;
 				mutex_init(&mvdev->state_mutex);
+				spin_lock_init(&mvdev->reset_lock);
 			}
 			mlx5_vf_put_core_dev(mdev);
 		}
@@ -668,12 +713,17 @@  static const struct pci_device_id mlx5vf_pci_table[] = {
 
 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
 
+const struct pci_error_handlers mlx5vf_err_handlers = {
+	.reset_done = mlx5vf_pci_aer_reset_done,
+	.error_detected = vfio_pci_aer_err_detected,
+};
+
 static struct pci_driver mlx5vf_pci_driver = {
 	.name = KBUILD_MODNAME,
 	.id_table = mlx5vf_pci_table,
 	.probe = mlx5vf_pci_probe,
 	.remove = mlx5vf_pci_remove,
-	.err_handler = &vfio_pci_core_err_handlers,
+	.err_handler = &mlx5vf_err_handlers,
 };
 
 static void __exit mlx5vf_pci_cleanup(void)