diff mbox series

[4/6] vfio: Fully lock struct vfio_group::container

Message ID 4-v1-c1d14aae2e8f+2f4-vfio_group_locking_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Fully lock the container members of struct vfio_group | expand

Commit Message

Jason Gunthorpe May 6, 2022, 12:25 a.m. UTC
This is necessary to avoid various user triggerable races, for instance
racing SET_CONTAINER/UNSET_CONTAINER:

                                  ioctl(VFIO_GROUP_SET_CONTAINER)
ioctl(VFIO_GROUP_UNSET_CONTAINER)
 vfio_group_unset_container
    int users = atomic_cmpxchg(&group->container_users, 1, 0);
    // users == 1 container_users == 0
    __vfio_group_unset_container(group);

                                    vfio_group_set_container()
	                              if (atomic_read(&group->container_users))
				        down_write(&container->group_lock);
				        group->container = container;
				        up_write(&container->group_lock);

      down_write(&container->group_lock);
      group->container = NULL;
      up_write(&container->group_lock);
      vfio_container_put(container);
      /* woops we leaked the original container  */

This can then go on to NULL pointer deref since container == 0 and
container_users == 1.

Wrap all touches of container, except those on a performance path with a
known open device, with the group_rwsem.

The only user of vfio_group_add_container_user() holds the user count for
a simple operation, change it to just hold the group_lock over the
operation and delete vfio_group_add_container_user(). Containers now only
gain a user when a device FD is opened.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/vfio.c | 66 +++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 26 deletions(-)

Comments

Tian, Kevin May 13, 2022, 9:53 a.m. UTC | #1
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Friday, May 6, 2022 8:25 AM
> 
> This is necessary to avoid various user triggerable races, for instance
> racing SET_CONTAINER/UNSET_CONTAINER:
> 
>                                   ioctl(VFIO_GROUP_SET_CONTAINER)
> ioctl(VFIO_GROUP_UNSET_CONTAINER)
>  vfio_group_unset_container
>     int users = atomic_cmpxchg(&group->container_users, 1, 0);
>     // users == 1 container_users == 0
>     __vfio_group_unset_container(group);
> 
>                                     vfio_group_set_container()
> 	                              if (atomic_read(&group->container_users))

	                       if (!atomic_read(...))

> 				        down_write(&container->group_lock);
> 				        group->container = container;
> 				        up_write(&container->group_lock);
> 

It'd be clearer to add below step here:

       container = group->container;

otherwise if this assignment is done before above race then the
original container is not leaked.

>       down_write(&container->group_lock);
>       group->container = NULL;
>       up_write(&container->group_lock);
>       vfio_container_put(container);
>       /* woops we leaked the original container  */
> 
> This can then go on to NULL pointer deref since container == 0 and
> container_users == 1.
> 
> Wrap all touches of container, except those on a performance path with a
> known open device, with the group_rwsem.
> 
> The only user of vfio_group_add_container_user() holds the user count for
> a simple operation, change it to just hold the group_lock over the
> operation and delete vfio_group_add_container_user(). Containers now only
> gain a user when a device FD is opened.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Reviewed-by: Kevin Tian <kevin.tian@intel.com>

> ---
>  drivers/vfio/vfio.c | 66 +++++++++++++++++++++++++++------------------
>  1 file changed, 40 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index d8d14e528ab795..63f7fa872eae60 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -937,6 +937,8 @@ static void __vfio_group_unset_container(struct vfio_group *group)
>  	struct vfio_container *container = group->container;
>  	struct vfio_iommu_driver *driver;
> 
> +	lockdep_assert_held_write(&group->group_rwsem);
> +
>  	down_write(&container->group_lock);
> 
>  	driver = container->iommu_driver;
> @@ -973,6 +975,8 @@ static int vfio_group_unset_container(struct vfio_group *group)
>  {
>  	int users = atomic_cmpxchg(&group->container_users, 1, 0);
> 
> +	lockdep_assert_held_write(&group->group_rwsem);
> +
>  	if (!users)
>  		return -EINVAL;
>  	if (users != 1)
> @@ -991,8 +995,10 @@ static int vfio_group_unset_container(struct vfio_group *group)
>   */
>  static void vfio_group_try_dissolve_container(struct vfio_group *group)
>  {
> +	down_write(&group->group_rwsem);
>  	if (0 == atomic_dec_if_positive(&group->container_users))
>  		__vfio_group_unset_container(group);
> +	up_write(&group->group_rwsem);
>  }
> 
>  static int vfio_group_set_container(struct vfio_group *group, int container_fd)
> @@ -1002,6 +1008,8 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
>  	struct vfio_iommu_driver *driver;
>  	int ret = 0;
> 
> +	lockdep_assert_held_write(&group->group_rwsem);
> +
>  	if (atomic_read(&group->container_users))
>  		return -EINVAL;
> 
> @@ -1059,23 +1067,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
>  	return ret;
>  }
> 
> -static int vfio_group_add_container_user(struct vfio_group *group)
> -{
> -	if (!atomic_inc_not_zero(&group->container_users))
> -		return -EINVAL;
> -
> -	if (group->type == VFIO_NO_IOMMU) {
> -		atomic_dec(&group->container_users);
> -		return -EPERM;
> -	}
> -	if (!group->container->iommu_driver) {
> -		atomic_dec(&group->container_users);
> -		return -EINVAL;
> -	}
> -
> -	return 0;
> -}
> -
>  static const struct file_operations vfio_device_fops;
> 
>  /* true if the vfio_device has open_device() called but not close_device() */
> @@ -1088,6 +1079,8 @@ static int vfio_device_assign_container(struct vfio_device *device)
>  {
>  	struct vfio_group *group = device->group;
> 
> +	lockdep_assert_held_write(&group->group_rwsem);
> +
>  	if (0 == atomic_read(&group->container_users) ||
>  	    !group->container->iommu_driver)
>  		return -EINVAL;
> @@ -1109,7 +1102,9 @@ static struct file *vfio_device_open(struct vfio_device *device)
>  	struct file *filep;
>  	int ret;
> 
> +	down_write(&device->group->group_rwsem);
>  	ret = vfio_device_assign_container(device);
> +	up_write(&device->group->group_rwsem);
>  	if (ret)
>  		return ERR_PTR(ret);
> 
> @@ -1219,11 +1214,13 @@ static long vfio_group_fops_unl_ioctl(struct file *filep,
> 
>  		status.flags = 0;
> 
> +		down_read(&group->group_rwsem);
>  		if (group->container)
>  			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
>  					VFIO_GROUP_FLAGS_VIABLE;
>  		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
>  			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
> +		up_read(&group->group_rwsem);
> 
>  		if (copy_to_user((void __user *)arg, &status, minsz))
>  			return -EFAULT;
> @@ -1241,11 +1238,15 @@ static long vfio_group_fops_unl_ioctl(struct file *filep,
>  		if (fd < 0)
>  			return -EINVAL;
> 
> +		down_write(&group->group_rwsem);
>  		ret = vfio_group_set_container(group, fd);
> +		up_write(&group->group_rwsem);
>  		break;
>  	}
>  	case VFIO_GROUP_UNSET_CONTAINER:
> +		down_write(&group->group_rwsem);
>  		ret = vfio_group_unset_container(group);
> +		up_write(&group->group_rwsem);
>  		break;
>  	case VFIO_GROUP_GET_DEVICE_FD:
>  	{
> @@ -1731,15 +1732,19 @@ bool vfio_file_enforced_coherent(struct file *file)
>  	if (file->f_op != &vfio_group_fops)
>  		return true;
> 
> -	/*
> -	 * Since the coherency state is determined only once a container is
> -	 * attached the user must do so before they can prove they have
> -	 * permission.
> -	 */
> -	if (vfio_group_add_container_user(group))
> -		return true;
> -	ret = vfio_ioctl_check_extension(group->container, VFIO_DMA_CC_IOMMU);
> -	vfio_group_try_dissolve_container(group);
> +	down_read(&group->group_rwsem);
> +	if (group->container) {
> +		ret = vfio_ioctl_check_extension(group->container,
> +						 VFIO_DMA_CC_IOMMU);
> +	} else {
> +		/*
> +		 * Since the coherency state is determined only once a container
> +		 * is attached the user must do so before they can prove they
> +		 * have permission.
> +		 */
> +		ret = true;
> +	}
> +	up_read(&group->group_rwsem);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
> @@ -1932,6 +1937,7 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
>  	if (group->dev_counter > 1)
>  		return -EINVAL;
> 
> +	/* group->container cannot change while a vfio device is open */
>  	container = group->container;
>  	driver = container->iommu_driver;
>  	if (likely(driver && driver->ops->pin_pages))
> @@ -1967,6 +1973,7 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
>  	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
>  		return -E2BIG;
> 
> +	/* group->container cannot change while a vfio device is open */
>  	container = device->group->container;
>  	driver = container->iommu_driver;
>  	if (likely(driver && driver->ops->unpin_pages))
> @@ -2006,6 +2013,7 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
>  	if (!data || len <= 0 || !vfio_assert_device_open(device))
>  		return -EINVAL;
> 
> +	/* group->container cannot change while a vfio device is open */
>  	container = device->group->container;
>  	driver = container->iommu_driver;
> 
> @@ -2026,6 +2034,7 @@ static int vfio_register_iommu_notifier(struct vfio_group *group,
>  	struct vfio_iommu_driver *driver;
>  	int ret;
> 
> +	down_read(&group->group_rwsem);
>  	container = group->container;
>  	driver = container->iommu_driver;
>  	if (likely(driver && driver->ops->register_notifier))
> @@ -2033,6 +2042,8 @@ static int vfio_register_iommu_notifier(struct vfio_group *group,
>  						     events, nb);
>  	else
>  		ret = -ENOTTY;
> +	up_read(&group->group_rwsem);
> +
>  	return ret;
>  }
> 
> @@ -2043,6 +2054,7 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group,
>  	struct vfio_iommu_driver *driver;
>  	int ret;
> 
> +	down_read(&group->group_rwsem);
>  	container = group->container;
>  	driver = container->iommu_driver;
>  	if (likely(driver && driver->ops->unregister_notifier))
> @@ -2050,6 +2062,8 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group,
>  						       nb);
>  	else
>  		ret = -ENOTTY;
> +	up_read(&group->group_rwsem);
> +
>  	return ret;
>  }
> 
> --
> 2.36.0
Jason Gunthorpe May 13, 2022, 1:29 p.m. UTC | #2
On Fri, May 13, 2022 at 09:53:05AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Friday, May 6, 2022 8:25 AM
> > 
> > This is necessary to avoid various user triggerable races, for instance
> > racing SET_CONTAINER/UNSET_CONTAINER:
> > 
> >                                   ioctl(VFIO_GROUP_SET_CONTAINER)
> > ioctl(VFIO_GROUP_UNSET_CONTAINER)
> >  vfio_group_unset_container
> >     int users = atomic_cmpxchg(&group->container_users, 1, 0);
> >     // users == 1 container_users == 0
> >     __vfio_group_unset_container(group);
> > 
> >                                     vfio_group_set_container()
> > 	                              if (atomic_read(&group->container_users))
> 
> 	                       if (!atomic_read(...))
> 
> > 				        down_write(&container->group_lock);
> > 				        group->container = container;
> > 				        up_write(&container->group_lock);
> > 
> 
> It'd be clearer to add below step here:
> 
>        container = group->container;
> 
> otherwise if this assignment is done before above race then the
> original container is not leaked.

Not quite, it is the assignment to NULL that is trouble:

> >       down_write(&container->group_lock);
> >       group->container = NULL;
> >       up_write(&container->group_lock);

Either we leaked the original container or we leaked the new
container, and now the atomics are out of sync with the state of
group->container.

But sure, it is a touch clearer to focus only one scenario, I picked
the one where group->container is read early and we leak the new
container.

Thanks,
Jason
diff mbox series

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index d8d14e528ab795..63f7fa872eae60 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -937,6 +937,8 @@  static void __vfio_group_unset_container(struct vfio_group *group)
 	struct vfio_container *container = group->container;
 	struct vfio_iommu_driver *driver;
 
+	lockdep_assert_held_write(&group->group_rwsem);
+
 	down_write(&container->group_lock);
 
 	driver = container->iommu_driver;
@@ -973,6 +975,8 @@  static int vfio_group_unset_container(struct vfio_group *group)
 {
 	int users = atomic_cmpxchg(&group->container_users, 1, 0);
 
+	lockdep_assert_held_write(&group->group_rwsem);
+
 	if (!users)
 		return -EINVAL;
 	if (users != 1)
@@ -991,8 +995,10 @@  static int vfio_group_unset_container(struct vfio_group *group)
  */
 static void vfio_group_try_dissolve_container(struct vfio_group *group)
 {
+	down_write(&group->group_rwsem);
 	if (0 == atomic_dec_if_positive(&group->container_users))
 		__vfio_group_unset_container(group);
+	up_write(&group->group_rwsem);
 }
 
 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
@@ -1002,6 +1008,8 @@  static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	struct vfio_iommu_driver *driver;
 	int ret = 0;
 
+	lockdep_assert_held_write(&group->group_rwsem);
+
 	if (atomic_read(&group->container_users))
 		return -EINVAL;
 
@@ -1059,23 +1067,6 @@  static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	return ret;
 }
 
-static int vfio_group_add_container_user(struct vfio_group *group)
-{
-	if (!atomic_inc_not_zero(&group->container_users))
-		return -EINVAL;
-
-	if (group->type == VFIO_NO_IOMMU) {
-		atomic_dec(&group->container_users);
-		return -EPERM;
-	}
-	if (!group->container->iommu_driver) {
-		atomic_dec(&group->container_users);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static const struct file_operations vfio_device_fops;
 
 /* true if the vfio_device has open_device() called but not close_device() */
@@ -1088,6 +1079,8 @@  static int vfio_device_assign_container(struct vfio_device *device)
 {
 	struct vfio_group *group = device->group;
 
+	lockdep_assert_held_write(&group->group_rwsem);
+
 	if (0 == atomic_read(&group->container_users) ||
 	    !group->container->iommu_driver)
 		return -EINVAL;
@@ -1109,7 +1102,9 @@  static struct file *vfio_device_open(struct vfio_device *device)
 	struct file *filep;
 	int ret;
 
+	down_write(&device->group->group_rwsem);
 	ret = vfio_device_assign_container(device);
+	up_write(&device->group->group_rwsem);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -1219,11 +1214,13 @@  static long vfio_group_fops_unl_ioctl(struct file *filep,
 
 		status.flags = 0;
 
+		down_read(&group->group_rwsem);
 		if (group->container)
 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
 					VFIO_GROUP_FLAGS_VIABLE;
 		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
+		up_read(&group->group_rwsem);
 
 		if (copy_to_user((void __user *)arg, &status, minsz))
 			return -EFAULT;
@@ -1241,11 +1238,15 @@  static long vfio_group_fops_unl_ioctl(struct file *filep,
 		if (fd < 0)
 			return -EINVAL;
 
+		down_write(&group->group_rwsem);
 		ret = vfio_group_set_container(group, fd);
+		up_write(&group->group_rwsem);
 		break;
 	}
 	case VFIO_GROUP_UNSET_CONTAINER:
+		down_write(&group->group_rwsem);
 		ret = vfio_group_unset_container(group);
+		up_write(&group->group_rwsem);
 		break;
 	case VFIO_GROUP_GET_DEVICE_FD:
 	{
@@ -1731,15 +1732,19 @@  bool vfio_file_enforced_coherent(struct file *file)
 	if (file->f_op != &vfio_group_fops)
 		return true;
 
-	/*
-	 * Since the coherency state is determined only once a container is
-	 * attached the user must do so before they can prove they have
-	 * permission.
-	 */
-	if (vfio_group_add_container_user(group))
-		return true;
-	ret = vfio_ioctl_check_extension(group->container, VFIO_DMA_CC_IOMMU);
-	vfio_group_try_dissolve_container(group);
+	down_read(&group->group_rwsem);
+	if (group->container) {
+		ret = vfio_ioctl_check_extension(group->container,
+						 VFIO_DMA_CC_IOMMU);
+	} else {
+		/*
+		 * Since the coherency state is determined only once a container
+		 * is attached the user must do so before they can prove they
+		 * have permission.
+		 */
+		ret = true;
+	}
+	up_read(&group->group_rwsem);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
@@ -1932,6 +1937,7 @@  int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	if (group->dev_counter > 1)
 		return -EINVAL;
 
+	/* group->container cannot change while a vfio device is open */
 	container = group->container;
 	driver = container->iommu_driver;
 	if (likely(driver && driver->ops->pin_pages))
@@ -1967,6 +1973,7 @@  int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
 		return -E2BIG;
 
+	/* group->container cannot change while a vfio device is open */
 	container = device->group->container;
 	driver = container->iommu_driver;
 	if (likely(driver && driver->ops->unpin_pages))
@@ -2006,6 +2013,7 @@  int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
 	if (!data || len <= 0 || !vfio_assert_device_open(device))
 		return -EINVAL;
 
+	/* group->container cannot change while a vfio device is open */
 	container = device->group->container;
 	driver = container->iommu_driver;
 
@@ -2026,6 +2034,7 @@  static int vfio_register_iommu_notifier(struct vfio_group *group,
 	struct vfio_iommu_driver *driver;
 	int ret;
 
+	down_read(&group->group_rwsem);
 	container = group->container;
 	driver = container->iommu_driver;
 	if (likely(driver && driver->ops->register_notifier))
@@ -2033,6 +2042,8 @@  static int vfio_register_iommu_notifier(struct vfio_group *group,
 						     events, nb);
 	else
 		ret = -ENOTTY;
+	up_read(&group->group_rwsem);
+
 	return ret;
 }
 
@@ -2043,6 +2054,7 @@  static int vfio_unregister_iommu_notifier(struct vfio_group *group,
 	struct vfio_iommu_driver *driver;
 	int ret;
 
+	down_read(&group->group_rwsem);
 	container = group->container;
 	driver = container->iommu_driver;
 	if (likely(driver && driver->ops->unregister_notifier))
@@ -2050,6 +2062,8 @@  static int vfio_unregister_iommu_notifier(struct vfio_group *group,
 						       nb);
 	else
 		ret = -ENOTTY;
+	up_read(&group->group_rwsem);
+
 	return ret;
 }