diff mbox series

[v3,06/14] vfio/type1: Add VFIO_IOMMU_PASID_REQUEST (alloc/free)

Message ID 1592988927-48009-7-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio: expose virtual Shared Virtual Addressing to VMs | expand

Commit Message

Yi Liu June 24, 2020, 8:55 a.m. UTC
This patch allows user space to request PASID allocation/free, e.g. when
serving the request from the guest.

PASIDs that are not freed by userspace are automatically freed when the
IOASID set is destroyed when process exits.

Cc: Kevin Tian <kevin.tian@intel.com>
CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
v1 -> v2:
*) move the vfio_mm related code to be a seprate module
*) use a single structure for alloc/free, could support a range of PASIDs
*) fetch vfio_mm at group_attach time instead of at iommu driver open time
---
 drivers/vfio/Kconfig            |  1 +
 drivers/vfio/vfio_iommu_type1.c | 96 ++++++++++++++++++++++++++++++++++++++++-
 drivers/vfio/vfio_pasid.c       | 10 +++++
 include/linux/vfio.h            |  6 +++
 include/uapi/linux/vfio.h       | 36 ++++++++++++++++
 5 files changed, 147 insertions(+), 2 deletions(-)

Comments

Alex Williamson July 2, 2020, 9:18 p.m. UTC | #1
On Wed, 24 Jun 2020 01:55:19 -0700
Liu Yi L <yi.l.liu@intel.com> wrote:

> This patch allows user space to request PASID allocation/free, e.g. when
> serving the request from the guest.
> 
> PASIDs that are not freed by userspace are automatically freed when the
> IOASID set is destroyed when process exits.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
> v1 -> v2:
> *) move the vfio_mm related code to be a seprate module
> *) use a single structure for alloc/free, could support a range of PASIDs
> *) fetch vfio_mm at group_attach time instead of at iommu driver open time
> ---
>  drivers/vfio/Kconfig            |  1 +
>  drivers/vfio/vfio_iommu_type1.c | 96 ++++++++++++++++++++++++++++++++++++++++-
>  drivers/vfio/vfio_pasid.c       | 10 +++++
>  include/linux/vfio.h            |  6 +++
>  include/uapi/linux/vfio.h       | 36 ++++++++++++++++
>  5 files changed, 147 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 3d8a108..95d90c6 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -2,6 +2,7 @@
>  config VFIO_IOMMU_TYPE1
>  	tristate
>  	depends on VFIO
> +	select VFIO_PASID if (X86)
>  	default n
>  
>  config VFIO_IOMMU_SPAPR_TCE
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 8c143d5..d0891c5 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -73,6 +73,7 @@ struct vfio_iommu {
>  	bool			v2;
>  	bool			nesting;
>  	struct iommu_nesting_info *nesting_info;
> +	struct vfio_mm		*vmm;

Structure alignment again.

>  	bool			dirty_page_tracking;
>  	bool			pinned_page_dirty_scope;
>  };
> @@ -1933,6 +1934,17 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
>  
>  	list_splice_tail(iova_copy, iova);
>  }
> +
> +static void vfio_iommu_release_nesting_info(struct vfio_iommu *iommu)
> +{
> +	if (iommu->vmm) {
> +		vfio_mm_put(iommu->vmm);
> +		iommu->vmm = NULL;
> +	}
> +
> +	kfree(iommu->nesting_info);

iommu->nesting_info = NULL;

> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
> @@ -2067,6 +2079,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  			goto out_detach;
>  		}
>  		iommu->nesting_info = info;
> +
> +		if (info->features & IOMMU_NESTING_FEAT_SYSWIDE_PASID) {
> +			struct vfio_mm *vmm;
> +			int sid;
> +
> +			vmm = vfio_mm_get_from_task(current);
> +			if (IS_ERR(vmm)) {
> +				ret = PTR_ERR(vmm);
> +				goto out_detach;
> +			}
> +			iommu->vmm = vmm;
> +
> +			sid = vfio_mm_ioasid_sid(vmm);
> +			ret = iommu_domain_set_attr(domain->domain,
> +						    DOMAIN_ATTR_IOASID_SID,
> +						    &sid);

This looks pretty dicey in the case of !CONFIG_VFIO_PASID, can we get
here in that case?  If so it looks like we're doing bad things with
setting the domain->ioasid_sid.

> +			if (ret)
> +				goto out_detach;
> +		}
>  	}
>  
>  	/* Get aperture info */
> @@ -2178,7 +2209,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	return 0;
>  
>  out_detach:
> -	kfree(iommu->nesting_info);
> +	if (iommu->nesting_info)
> +		vfio_iommu_release_nesting_info(iommu);

Make vfio_iommu_release_nesting_info() check iommu->nesting_info, then
call it unconditionally?

>  	vfio_iommu_detach_group(domain, group);
>  out_domain:
>  	iommu_domain_free(domain->domain);
> @@ -2380,7 +2412,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  				else
>  					vfio_iommu_unmap_unpin_reaccount(iommu);
>  
> -				kfree(iommu->nesting_info);
> +				if (iommu->nesting_info)
> +					vfio_iommu_release_nesting_info(iommu);
>  			}
>  			iommu_domain_free(domain->domain);
>  			list_del(&domain->next);
> @@ -2852,6 +2885,63 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>  	return -EINVAL;
>  }
>  
> +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> +					unsigned int min,
> +					unsigned int max)
> +{
> +	int ret = -ENOTSUPP;
> +
> +	mutex_lock(&iommu->lock);
> +	if (iommu->vmm)
> +		ret = vfio_pasid_alloc(iommu->vmm, min, max);
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> +					unsigned int min,
> +					unsigned int max)
> +{
> +	int ret = -ENOTSUPP;
> +
> +	mutex_lock(&iommu->lock);
> +	if (iommu->vmm) {
> +		vfio_pasid_free_range(iommu->vmm, min, max);
> +		ret = 0;
> +	}
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_pasid_request(struct vfio_iommu *iommu,
> +					  unsigned long arg)
> +{
> +	struct vfio_iommu_type1_pasid_request req;
> +	unsigned long minsz;
> +
> +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request, range);
> +
> +	if (copy_from_user(&req, (void __user *)arg, minsz))
> +		return -EFAULT;
> +
> +	if (req.argsz < minsz || (req.flags & ~VFIO_PASID_REQUEST_MASK))
> +		return -EINVAL;
> +
> +	if (req.range.min > req.range.max)

Is it exploitable that a user can spin the kernel for a long time in
the case of a free by calling this with [0, MAX_UINT] regardless of
their actual allocations?

> +		return -EINVAL;
> +
> +	switch (req.flags & VFIO_PASID_REQUEST_MASK) {
> +	case VFIO_IOMMU_ALLOC_PASID:
> +		return vfio_iommu_type1_pasid_alloc(iommu,
> +					req.range.min, req.range.max);
> +	case VFIO_IOMMU_FREE_PASID:
> +		return vfio_iommu_type1_pasid_free(iommu,
> +					req.range.min, req.range.max);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -2868,6 +2958,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  		return vfio_iommu_type1_unmap_dma(iommu, arg);
>  	case VFIO_IOMMU_DIRTY_PAGES:
>  		return vfio_iommu_type1_dirty_pages(iommu, arg);
> +	case VFIO_IOMMU_PASID_REQUEST:
> +		return vfio_iommu_type1_pasid_request(iommu, arg);
>  	}
>  
>  	return -ENOTTY;
> diff --git a/drivers/vfio/vfio_pasid.c b/drivers/vfio/vfio_pasid.c
> index dd5b6d1..2ea9f1a 100644
> --- a/drivers/vfio/vfio_pasid.c
> +++ b/drivers/vfio/vfio_pasid.c
> @@ -54,6 +54,7 @@ void vfio_mm_put(struct vfio_mm *vmm)
>  {
>  	kref_put_mutex(&vmm->kref, vfio_mm_release, &vfio_pasid.vfio_mm_lock);
>  }
> +EXPORT_SYMBOL_GPL(vfio_mm_put);
>  
>  static void vfio_mm_get(struct vfio_mm *vmm)
>  {
> @@ -103,6 +104,13 @@ struct vfio_mm *vfio_mm_get_from_task(struct task_struct *task)
>  	mmput(mm);
>  	return vmm;
>  }
> +EXPORT_SYMBOL_GPL(vfio_mm_get_from_task);
> +
> +int vfio_mm_ioasid_sid(struct vfio_mm *vmm)
> +{
> +	return vmm->ioasid_sid;
> +}
> +EXPORT_SYMBOL_GPL(vfio_mm_ioasid_sid);
>  
>  int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)
>  {
> @@ -112,6 +120,7 @@ int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)
>  
>  	return (pasid == INVALID_IOASID) ? -ENOSPC : pasid;
>  }
> +EXPORT_SYMBOL_GPL(vfio_pasid_alloc);
>  
>  void vfio_pasid_free_range(struct vfio_mm *vmm,
>  			    ioasid_t min, ioasid_t max)
> @@ -129,6 +138,7 @@ void vfio_pasid_free_range(struct vfio_mm *vmm,
>  	for (; pasid <= max; pasid++)
>  		ioasid_free(pasid);
>  }
> +EXPORT_SYMBOL_GPL(vfio_pasid_free_range);
>  
>  static int __init vfio_pasid_init(void)
>  {
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 74e077d..8e60a32 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -101,6 +101,7 @@ struct vfio_mm;
>  #if IS_ENABLED(CONFIG_VFIO_PASID)
>  extern struct vfio_mm *vfio_mm_get_from_task(struct task_struct *task);
>  extern void vfio_mm_put(struct vfio_mm *vmm);
> +int vfio_mm_ioasid_sid(struct vfio_mm *vmm);
>  extern int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max);
>  extern void vfio_pasid_free_range(struct vfio_mm *vmm,
>  					ioasid_t min, ioasid_t max);
> @@ -114,6 +115,11 @@ static inline void vfio_mm_put(struct vfio_mm *vmm)
>  {
>  }
>  
> +static inline int vfio_mm_ioasid_sid(struct vfio_mm *vmm)
> +{
> +	return -ENOTTY;
> +}
> +
>  static inline int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)
>  {
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index f1f39e1..657b2db 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1162,6 +1162,42 @@ struct vfio_iommu_type1_dirty_bitmap_get {
>  
>  #define VFIO_IOMMU_DIRTY_PAGES             _IO(VFIO_TYPE, VFIO_BASE + 17)
>  
> +/**
> + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
> + *				struct vfio_iommu_type1_pasid_request)
> + *
> + * PASID (Processor Address Space ID) is a PCIe concept for tagging
> + * address spaces in DMA requests. When system-wide PASID allocation
> + * is required by underlying iommu driver (e.g. Intel VT-d), this
> + * provides an interface for userspace to request pasid alloc/free
> + * for its assigned devices. Userspace should check the availability
> + * of this API through VFIO_IOMMU_GET_INFO.
> + *
> + * @flags=VFIO_IOMMU_ALLOC_PASID, allocate a single PASID within @range.
> + * @flags=VFIO_IOMMU_FREE_PASID, free the PASIDs within @range.
> + * @range is [min, max], which means both @min and @max are inclusive.
> + * ALLOC_PASID and FREE_PASID are mutually exclusive.
> + *
> + * returns: allocated PASID value on success, -errno on failure for
> + *	     ALLOC_PASID;
> + *	     0 for FREE_PASID operation;
> + */
> +struct vfio_iommu_type1_pasid_request {
> +	__u32	argsz;
> +#define VFIO_IOMMU_ALLOC_PASID	(1 << 0)
> +#define VFIO_IOMMU_FREE_PASID	(1 << 1)

VFIO_IOMMU_PASID_FLAG_{ALLOC,FREE} would be more similar to other VFIO
UAPI conventions.  Thanks,

Alex

> +	__u32	flags;
> +	struct {
> +		__u32	min;
> +		__u32	max;
> +	} range;
> +};
> +
> +#define VFIO_PASID_REQUEST_MASK	(VFIO_IOMMU_ALLOC_PASID | \
> +					 VFIO_IOMMU_FREE_PASID)
> +
> +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 18)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>  
>  /*
Yi Liu July 3, 2020, 6:28 a.m. UTC | #2
Hi Alex,

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, July 3, 2020 5:19 AM
> 
> On Wed, 24 Jun 2020 01:55:19 -0700
> Liu Yi L <yi.l.liu@intel.com> wrote:
> 
> > This patch allows user space to request PASID allocation/free, e.g.
> > when serving the request from the guest.
> >
> > PASIDs that are not freed by userspace are automatically freed when
> > the IOASID set is destroyed when process exits.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> > v1 -> v2:
> > *) move the vfio_mm related code to be a seprate module
> > *) use a single structure for alloc/free, could support a range of
> > PASIDs
> > *) fetch vfio_mm at group_attach time instead of at iommu driver open
> > time
> > ---
> >  drivers/vfio/Kconfig            |  1 +
> >  drivers/vfio/vfio_iommu_type1.c | 96
> ++++++++++++++++++++++++++++++++++++++++-
> >  drivers/vfio/vfio_pasid.c       | 10 +++++
> >  include/linux/vfio.h            |  6 +++
> >  include/uapi/linux/vfio.h       | 36 ++++++++++++++++
> >  5 files changed, 147 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index
> > 3d8a108..95d90c6 100644
> > --- a/drivers/vfio/Kconfig
> > +++ b/drivers/vfio/Kconfig
> > @@ -2,6 +2,7 @@
> >  config VFIO_IOMMU_TYPE1
> >  	tristate
> >  	depends on VFIO
> > +	select VFIO_PASID if (X86)
> >  	default n
> >
> >  config VFIO_IOMMU_SPAPR_TCE
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 8c143d5..d0891c5 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -73,6 +73,7 @@ struct vfio_iommu {
> >  	bool			v2;
> >  	bool			nesting;
> >  	struct iommu_nesting_info *nesting_info;
> > +	struct vfio_mm		*vmm;
> 
> Structure alignment again.

sure. may get agreement in the prior email.

> 
> >  	bool			dirty_page_tracking;
> >  	bool			pinned_page_dirty_scope;
> >  };
> > @@ -1933,6 +1934,17 @@ static void vfio_iommu_iova_insert_copy(struct
> > vfio_iommu *iommu,
> >
> >  	list_splice_tail(iova_copy, iova);
> >  }
> > +
> > +static void vfio_iommu_release_nesting_info(struct vfio_iommu *iommu)
> > +{
> > +	if (iommu->vmm) {
> > +		vfio_mm_put(iommu->vmm);
> > +		iommu->vmm = NULL;
> > +	}
> > +
> > +	kfree(iommu->nesting_info);
> 
> iommu->nesting_info = NULL;

got it.

> > +}
> > +
> >  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  					 struct iommu_group *iommu_group)
> { @@ -2067,6 +2079,25 @@
> > static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  			goto out_detach;
> >  		}
> >  		iommu->nesting_info = info;
> > +
> > +		if (info->features & IOMMU_NESTING_FEAT_SYSWIDE_PASID) {
> > +			struct vfio_mm *vmm;
> > +			int sid;
> > +
> > +			vmm = vfio_mm_get_from_task(current);
> > +			if (IS_ERR(vmm)) {
> > +				ret = PTR_ERR(vmm);
> > +				goto out_detach;
> > +			}
> > +			iommu->vmm = vmm;
> > +
> > +			sid = vfio_mm_ioasid_sid(vmm);
> > +			ret = iommu_domain_set_attr(domain->domain,
> > +						    DOMAIN_ATTR_IOASID_SID,
> > +						    &sid);
> 
> This looks pretty dicey in the case of !CONFIG_VFIO_PASID, can we get here in
> that case?  If so it looks like we're doing bad things with setting the domain-
> >ioasid_sid.

I guess not. So far, vfio_iommu_type1 will select CONFIG_VFIO_PASID for X86.
do you think it is enough?

> 
> > +			if (ret)
> > +				goto out_detach;
> > +		}
> >  	}
> >
> >  	/* Get aperture info */
> > @@ -2178,7 +2209,8 @@ static int vfio_iommu_type1_attach_group(void
> *iommu_data,
> >  	return 0;
> >
> >  out_detach:
> > -	kfree(iommu->nesting_info);
> > +	if (iommu->nesting_info)
> > +		vfio_iommu_release_nesting_info(iommu);
> 
> Make vfio_iommu_release_nesting_info() check iommu->nesting_info, then call
> it unconditionally?

got it. :-)

> >  	vfio_iommu_detach_group(domain, group);
> >  out_domain:
> >  	iommu_domain_free(domain->domain);
> > @@ -2380,7 +2412,8 @@ static void vfio_iommu_type1_detach_group(void
> *iommu_data,
> >  				else
> >
> 	vfio_iommu_unmap_unpin_reaccount(iommu);
> >
> > -				kfree(iommu->nesting_info);
> > +				if (iommu->nesting_info)
> > +
> 	vfio_iommu_release_nesting_info(iommu);
> >  			}
> >  			iommu_domain_free(domain->domain);
> >  			list_del(&domain->next);
> > @@ -2852,6 +2885,63 @@ static int vfio_iommu_type1_dirty_pages(struct
> vfio_iommu *iommu,
> >  	return -EINVAL;
> >  }
> >
> > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > +					unsigned int min,
> > +					unsigned int max)
> > +{
> > +	int ret = -ENOTSUPP;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (iommu->vmm)
> > +		ret = vfio_pasid_alloc(iommu->vmm, min, max);
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > +					unsigned int min,
> > +					unsigned int max)
> > +{
> > +	int ret = -ENOTSUPP;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (iommu->vmm) {
> > +		vfio_pasid_free_range(iommu->vmm, min, max);
> > +		ret = 0;
> > +	}
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu *iommu,
> > +					  unsigned long arg)
> > +{
> > +	struct vfio_iommu_type1_pasid_request req;
> > +	unsigned long minsz;
> > +
> > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request, range);
> > +
> > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (req.argsz < minsz || (req.flags & ~VFIO_PASID_REQUEST_MASK))
> > +		return -EINVAL;
> > +
> > +	if (req.range.min > req.range.max)
> 
> Is it exploitable that a user can spin the kernel for a long time in the case of a free
> by calling this with [0, MAX_UINT] regardless of their actual allocations?

IOASID can ensure that user can only free the PASIDs allocated to the
user. but it's true, kernel needs to loop all the PASIDs within the
range provided by user. it may take a long time. is there anything we
can do? one thing may limit the range provided by user?

> > +		return -EINVAL;
> > +
> > +	switch (req.flags & VFIO_PASID_REQUEST_MASK) {
> > +	case VFIO_IOMMU_ALLOC_PASID:
> > +		return vfio_iommu_type1_pasid_alloc(iommu,
> > +					req.range.min, req.range.max);
> > +	case VFIO_IOMMU_FREE_PASID:
> > +		return vfio_iommu_type1_pasid_free(iommu,
> > +					req.range.min, req.range.max);
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)  { @@ -
> 2868,6 +2958,8 @@
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  		return vfio_iommu_type1_unmap_dma(iommu, arg);
> >  	case VFIO_IOMMU_DIRTY_PAGES:
> >  		return vfio_iommu_type1_dirty_pages(iommu, arg);
> > +	case VFIO_IOMMU_PASID_REQUEST:
> > +		return vfio_iommu_type1_pasid_request(iommu, arg);
> >  	}
> >
> >  	return -ENOTTY;
> > diff --git a/drivers/vfio/vfio_pasid.c b/drivers/vfio/vfio_pasid.c
> > index dd5b6d1..2ea9f1a 100644
> > --- a/drivers/vfio/vfio_pasid.c
> > +++ b/drivers/vfio/vfio_pasid.c
> > @@ -54,6 +54,7 @@ void vfio_mm_put(struct vfio_mm *vmm)  {
> >  	kref_put_mutex(&vmm->kref, vfio_mm_release,
> > &vfio_pasid.vfio_mm_lock);  }
> > +EXPORT_SYMBOL_GPL(vfio_mm_put);
> >
> >  static void vfio_mm_get(struct vfio_mm *vmm)  { @@ -103,6 +104,13 @@
> > struct vfio_mm *vfio_mm_get_from_task(struct task_struct *task)
> >  	mmput(mm);
> >  	return vmm;
> >  }
> > +EXPORT_SYMBOL_GPL(vfio_mm_get_from_task);
> > +
> > +int vfio_mm_ioasid_sid(struct vfio_mm *vmm) {
> > +	return vmm->ioasid_sid;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_mm_ioasid_sid);
> >
> >  int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)  { @@
> > -112,6 +120,7 @@ int vfio_pasid_alloc(struct vfio_mm *vmm, int min,
> > int max)
> >
> >  	return (pasid == INVALID_IOASID) ? -ENOSPC : pasid;  }
> > +EXPORT_SYMBOL_GPL(vfio_pasid_alloc);
> >
> >  void vfio_pasid_free_range(struct vfio_mm *vmm,
> >  			    ioasid_t min, ioasid_t max)
> > @@ -129,6 +138,7 @@ void vfio_pasid_free_range(struct vfio_mm *vmm,
> >  	for (; pasid <= max; pasid++)
> >  		ioasid_free(pasid);
> >  }
> > +EXPORT_SYMBOL_GPL(vfio_pasid_free_range);
> >
> >  static int __init vfio_pasid_init(void)  { diff --git
> > a/include/linux/vfio.h b/include/linux/vfio.h index 74e077d..8e60a32
> > 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -101,6 +101,7 @@ struct vfio_mm;
> >  #if IS_ENABLED(CONFIG_VFIO_PASID)
> >  extern struct vfio_mm *vfio_mm_get_from_task(struct task_struct
> > *task);  extern void vfio_mm_put(struct vfio_mm *vmm);
> > +int vfio_mm_ioasid_sid(struct vfio_mm *vmm);
> >  extern int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max);
> > extern void vfio_pasid_free_range(struct vfio_mm *vmm,
> >  					ioasid_t min, ioasid_t max);
> > @@ -114,6 +115,11 @@ static inline void vfio_mm_put(struct vfio_mm
> > *vmm)  {  }
> >
> > +static inline int vfio_mm_ioasid_sid(struct vfio_mm *vmm) {
> > +	return -ENOTTY;
> > +}
> > +
> >  static inline int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int
> > max)  {
> >  	return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index f1f39e1..657b2db 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -1162,6 +1162,42 @@ struct vfio_iommu_type1_dirty_bitmap_get {
> >
> >  #define VFIO_IOMMU_DIRTY_PAGES             _IO(VFIO_TYPE, VFIO_BASE + 17)
> >
> > +/**
> > + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
> > + *				struct vfio_iommu_type1_pasid_request)
> > + *
> > + * PASID (Processor Address Space ID) is a PCIe concept for tagging
> > + * address spaces in DMA requests. When system-wide PASID allocation
> > + * is required by underlying iommu driver (e.g. Intel VT-d), this
> > + * provides an interface for userspace to request pasid alloc/free
> > + * for its assigned devices. Userspace should check the availability
> > + * of this API through VFIO_IOMMU_GET_INFO.
> > + *
> > + * @flags=VFIO_IOMMU_ALLOC_PASID, allocate a single PASID within @range.
> > + * @flags=VFIO_IOMMU_FREE_PASID, free the PASIDs within @range.
> > + * @range is [min, max], which means both @min and @max are inclusive.
> > + * ALLOC_PASID and FREE_PASID are mutually exclusive.
> > + *
> > + * returns: allocated PASID value on success, -errno on failure for
> > + *	     ALLOC_PASID;
> > + *	     0 for FREE_PASID operation;
> > + */
> > +struct vfio_iommu_type1_pasid_request {
> > +	__u32	argsz;
> > +#define VFIO_IOMMU_ALLOC_PASID	(1 << 0)
> > +#define VFIO_IOMMU_FREE_PASID	(1 << 1)
> 
> VFIO_IOMMU_PASID_FLAG_{ALLOC,FREE} would be more similar to other VFIO
> UAPI conventions.  Thanks,

yes, much better. will modify it.

Thanks,
Yi Liu

> Alex
> 
> > +	__u32	flags;
> > +	struct {
> > +		__u32	min;
> > +		__u32	max;
> > +	} range;
> > +};
> > +
> > +#define VFIO_PASID_REQUEST_MASK	(VFIO_IOMMU_ALLOC_PASID | \
> > +					 VFIO_IOMMU_FREE_PASID)
> > +
> > +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 18)
> > +
> >  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU
> > -------- */
> >
> >  /*
Yi Liu July 8, 2020, 8:16 a.m. UTC | #3
Hi Alex,

> From: Liu, Yi L < yi.l.liu@intel.com>
> Sent: Friday, July 3, 2020 2:28 PM
> 
> Hi Alex,
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Friday, July 3, 2020 5:19 AM
> >
> > On Wed, 24 Jun 2020 01:55:19 -0700
> > Liu Yi L <yi.l.liu@intel.com> wrote:
> >
> > > This patch allows user space to request PASID allocation/free, e.g.
> > > when serving the request from the guest.
> > >
> > > PASIDs that are not freed by userspace are automatically freed when
> > > the IOASID set is destroyed when process exits.
[...]
> > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu *iommu,
> > > +					  unsigned long arg)
> > > +{
> > > +	struct vfio_iommu_type1_pasid_request req;
> > > +	unsigned long minsz;
> > > +
> > > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request, range);
> > > +
> > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > +		return -EFAULT;
> > > +
> > > +	if (req.argsz < minsz || (req.flags & ~VFIO_PASID_REQUEST_MASK))
> > > +		return -EINVAL;
> > > +
> > > +	if (req.range.min > req.range.max)
> >
> > Is it exploitable that a user can spin the kernel for a long time in
> > the case of a free by calling this with [0, MAX_UINT] regardless of their actual
> allocations?
> 
> IOASID can ensure that user can only free the PASIDs allocated to the user. but
> it's true, kernel needs to loop all the PASIDs within the range provided by user. it
> may take a long time. is there anything we can do? one thing may limit the range
> provided by user?

thought about it more, we have per-VM pasid quota (say 1000), so even if
user passed down [0, MAX_UNIT], kernel will only loop the 1000 pasids at
most. do you think we still need to do something on it?

Regards,
Yi Liu
Alex Williamson July 8, 2020, 7:54 p.m. UTC | #4
On Wed, 8 Jul 2020 08:16:16 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> Hi Alex,
> 
> > From: Liu, Yi L < yi.l.liu@intel.com>
> > Sent: Friday, July 3, 2020 2:28 PM
> > 
> > Hi Alex,
> >   
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Friday, July 3, 2020 5:19 AM
> > >
> > > On Wed, 24 Jun 2020 01:55:19 -0700
> > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > >  
> > > > This patch allows user space to request PASID allocation/free, e.g.
> > > > when serving the request from the guest.
> > > >
> > > > PASIDs that are not freed by userspace are automatically freed when
> > > > the IOASID set is destroyed when process exits.  
> [...]
> > > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu *iommu,
> > > > +					  unsigned long arg)
> > > > +{
> > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > +	unsigned long minsz;
> > > > +
> > > > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request, range);
> > > > +
> > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > +		return -EFAULT;
> > > > +
> > > > +	if (req.argsz < minsz || (req.flags & ~VFIO_PASID_REQUEST_MASK))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (req.range.min > req.range.max)  
> > >
> > > Is it exploitable that a user can spin the kernel for a long time in
> > > the case of a free by calling this with [0, MAX_UINT] regardless of their actual  
> > allocations?
> > 
> > IOASID can ensure that user can only free the PASIDs allocated to the user. but
> > it's true, kernel needs to loop all the PASIDs within the range provided by user. it
> > may take a long time. is there anything we can do? one thing may limit the range
> > provided by user?  
> 
> thought about it more, we have per-VM pasid quota (say 1000), so even if
> user passed down [0, MAX_UNIT], kernel will only loop the 1000 pasids at
> most. do you think we still need to do something on it?

How do you figure that?  vfio_iommu_type1_pasid_request() accepts the
user's min/max so long as (max > min) and passes that to
vfio_iommu_type1_pasid_free(), then to vfio_pasid_free_range()  which
loops as:

	ioasid_t pasid = min;
	for (; pasid <= max; pasid++)
		ioasid_free(pasid);

A user might only be able to allocate 1000 pasids, but apparently they
can ask to free all they want.

It's also not obvious to me that calling ioasid_free() is only allowing
the user to free their own passid.  Does it?  It would be a pretty
gaping hole if a user could free arbitrary pasids.  A r-b tree of
passids might help both for security and to bound spinning in a loop.
Thanks,

Alex
Yi Liu July 9, 2020, 12:32 a.m. UTC | #5
Hi Alex,

> Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, July 9, 2020 3:55 AM
> 
> On Wed, 8 Jul 2020 08:16:16 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > Hi Alex,
> >
> > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > Sent: Friday, July 3, 2020 2:28 PM
> > >
> > > Hi Alex,
> > >
> > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > Sent: Friday, July 3, 2020 5:19 AM
> > > >
> > > > On Wed, 24 Jun 2020 01:55:19 -0700
> > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > >
> > > > > This patch allows user space to request PASID allocation/free, e.g.
> > > > > when serving the request from the guest.
> > > > >
> > > > > PASIDs that are not freed by userspace are automatically freed when
> > > > > the IOASID set is destroyed when process exits.
> > [...]
> > > > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu *iommu,
> > > > > +					  unsigned long arg)
> > > > > +{
> > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > +	unsigned long minsz;
> > > > > +
> > > > > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> range);
> > > > > +
> > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > +		return -EFAULT;
> > > > > +
> > > > > +	if (req.argsz < minsz || (req.flags &
> ~VFIO_PASID_REQUEST_MASK))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (req.range.min > req.range.max)
> > > >
> > > > Is it exploitable that a user can spin the kernel for a long time in
> > > > the case of a free by calling this with [0, MAX_UINT] regardless of their
> actual
> > > allocations?
> > >
> > > IOASID can ensure that user can only free the PASIDs allocated to the user.
> but
> > > it's true, kernel needs to loop all the PASIDs within the range provided
> > > by user.
> it
> > > may take a long time. is there anything we can do? one thing may limit the
> range
> > > provided by user?
> >
> > thought about it more, we have per-VM pasid quota (say 1000), so even if
> > user passed down [0, MAX_UNIT], kernel will only loop the 1000 pasids at
> > most. do you think we still need to do something on it?
> 
> How do you figure that?  vfio_iommu_type1_pasid_request() accepts the
> user's min/max so long as (max > min) and passes that to
> vfio_iommu_type1_pasid_free(), then to vfio_pasid_free_range()  which
> loops as:
> 
> 	ioasid_t pasid = min;
> 	for (; pasid <= max; pasid++)
> 		ioasid_free(pasid);
> 
> A user might only be able to allocate 1000 pasids, but apparently they
> can ask to free all they want.
> 
> It's also not obvious to me that calling ioasid_free() is only allowing
> the user to free their own passid.  Does it?  It would be a pretty
> gaping hole if a user could free arbitrary pasids.  A r-b tree of
> passids might help both for security and to bound spinning in a loop.

oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an ioasid_set
parameter for ioasid_free(), thus to prevent the user from freeing PASIDs
that doesn't belong to it. I remember Jacob mentioned it before.

@Jacob, is it still in your plan?

Regards,
Yi Liu

> Thanks,
> 
> Alex
Tian, Kevin July 9, 2020, 1:56 a.m. UTC | #6
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, July 9, 2020 8:32 AM
> 
> Hi Alex,
> 
> > Alex Williamson <alex.williamson@redhat.com>
> > Sent: Thursday, July 9, 2020 3:55 AM
> >
> > On Wed, 8 Jul 2020 08:16:16 +0000
> > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> >
> > > Hi Alex,
> > >
> > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > Sent: Friday, July 3, 2020 2:28 PM
> > > >
> > > > Hi Alex,
> > > >
> > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > >
> > > > > On Wed, 24 Jun 2020 01:55:19 -0700
> > > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > > >
> > > > > > This patch allows user space to request PASID allocation/free, e.g.
> > > > > > when serving the request from the guest.
> > > > > >
> > > > > > PASIDs that are not freed by userspace are automatically freed
> when
> > > > > > the IOASID set is destroyed when process exits.
> > > [...]
> > > > > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu
> *iommu,
> > > > > > +					  unsigned long arg)
> > > > > > +{
> > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > +	unsigned long minsz;
> > > > > > +
> > > > > > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> > range);
> > > > > > +
> > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > +		return -EFAULT;
> > > > > > +
> > > > > > +	if (req.argsz < minsz || (req.flags &
> > ~VFIO_PASID_REQUEST_MASK))
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	if (req.range.min > req.range.max)
> > > > >
> > > > > Is it exploitable that a user can spin the kernel for a long time in
> > > > > the case of a free by calling this with [0, MAX_UINT] regardless of their
> > actual
> > > > allocations?
> > > >
> > > > IOASID can ensure that user can only free the PASIDs allocated to the
> user.
> > but
> > > > it's true, kernel needs to loop all the PASIDs within the range provided
> > > > by user.
> > it
> > > > may take a long time. is there anything we can do? one thing may limit
> the
> > range
> > > > provided by user?
> > >
> > > thought about it more, we have per-VM pasid quota (say 1000), so even if
> > > user passed down [0, MAX_UNIT], kernel will only loop the 1000 pasids at
> > > most. do you think we still need to do something on it?
> >
> > How do you figure that?  vfio_iommu_type1_pasid_request() accepts the
> > user's min/max so long as (max > min) and passes that to
> > vfio_iommu_type1_pasid_free(), then to vfio_pasid_free_range()  which
> > loops as:
> >
> > 	ioasid_t pasid = min;
> > 	for (; pasid <= max; pasid++)
> > 		ioasid_free(pasid);
> >
> > A user might only be able to allocate 1000 pasids, but apparently they
> > can ask to free all they want.
> >
> > It's also not obvious to me that calling ioasid_free() is only allowing
> > the user to free their own passid.  Does it?  It would be a pretty

Agree. I thought ioasid_free should at least carry a token since the
user space is only allowed to manage PASIDs in its own set...

> > gaping hole if a user could free arbitrary pasids.  A r-b tree of
> > passids might help both for security and to bound spinning in a loop.
> 
> oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an ioasid_set
> parameter for ioasid_free(), thus to prevent the user from freeing PASIDs
> that doesn't belong to it. I remember Jacob mentioned it before.
> 

check current ioasid_free:

        spin_lock(&ioasid_allocator_lock);
        ioasid_data = xa_load(&active_allocator->xa, ioasid);
        if (!ioasid_data) {
                pr_err("Trying to free unknown IOASID %u\n", ioasid);
                goto exit_unlock;
        }

Allow an user to trigger above lock paths with MAX_UINT times might still
be bad. 

Thanks
Kevin
Yi Liu July 9, 2020, 2:08 a.m. UTC | #7
Hi Kevin,

> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Thursday, July 9, 2020 9:57 AM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, July 9, 2020 8:32 AM
> >
> > Hi Alex,
> >
> > > Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Thursday, July 9, 2020 3:55 AM
> > >
> > > On Wed, 8 Jul 2020 08:16:16 +0000
> > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > >
> > > > Hi Alex,
> > > >
> > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > >
> > > > > Hi Alex,
> > > > >
> > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > >
> > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > <yi.l.liu@intel.com> wrote:
> > > > > >
> > > > > > > This patch allows user space to request PASID allocation/free, e.g.
> > > > > > > when serving the request from the guest.
> > > > > > >
> > > > > > > PASIDs that are not freed by userspace are automatically
> > > > > > > freed
> > when
> > > > > > > the IOASID set is destroyed when process exits.
> > > > [...]
> > > > > > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu
> > *iommu,
> > > > > > > +					  unsigned long arg)
> > > > > > > +{
> > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > +	unsigned long minsz;
> > > > > > > +
> > > > > > > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> > > range);
> > > > > > > +
> > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > +		return -EFAULT;
> > > > > > > +
> > > > > > > +	if (req.argsz < minsz || (req.flags &
> > > ~VFIO_PASID_REQUEST_MASK))
> > > > > > > +		return -EINVAL;
> > > > > > > +
> > > > > > > +	if (req.range.min > req.range.max)
> > > > > >
> > > > > > Is it exploitable that a user can spin the kernel for a long
> > > > > > time in the case of a free by calling this with [0, MAX_UINT]
> > > > > > regardless of their
> > > actual
> > > > > allocations?
> > > > >
> > > > > IOASID can ensure that user can only free the PASIDs allocated
> > > > > to the
> > user.
> > > but
> > > > > it's true, kernel needs to loop all the PASIDs within the range
> > > > > provided by user.
> > > it
> > > > > may take a long time. is there anything we can do? one thing may
> > > > > limit
> > the
> > > range
> > > > > provided by user?
> > > >
> > > > thought about it more, we have per-VM pasid quota (say 1000), so
> > > > even if user passed down [0, MAX_UNIT], kernel will only loop the
> > > > 1000 pasids at most. do you think we still need to do something on it?
> > >
> > > How do you figure that?  vfio_iommu_type1_pasid_request() accepts
> > > the user's min/max so long as (max > min) and passes that to
> > > vfio_iommu_type1_pasid_free(), then to vfio_pasid_free_range()
> > > which loops as:
> > >
> > > 	ioasid_t pasid = min;
> > > 	for (; pasid <= max; pasid++)
> > > 		ioasid_free(pasid);
> > >
> > > A user might only be able to allocate 1000 pasids, but apparently
> > > they can ask to free all they want.
> > >
> > > It's also not obvious to me that calling ioasid_free() is only
> > > allowing the user to free their own passid.  Does it?  It would be a
> > > pretty
> 
> Agree. I thought ioasid_free should at least carry a token since the user space is
> only allowed to manage PASIDs in its own set...
> 
> > > gaping hole if a user could free arbitrary pasids.  A r-b tree of
> > > passids might help both for security and to bound spinning in a loop.
> >
> > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > ioasid_set parameter for ioasid_free(), thus to prevent the user from
> > freeing PASIDs that doesn't belong to it. I remember Jacob mentioned it before.
> >
> 
> check current ioasid_free:
> 
>         spin_lock(&ioasid_allocator_lock);
>         ioasid_data = xa_load(&active_allocator->xa, ioasid);
>         if (!ioasid_data) {
>                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
>                 goto exit_unlock;
>         }
> 
> Allow an user to trigger above lock paths with MAX_UINT times might still be bad.

yeah, how about the below two options:

- comparing the max - min with the quota before calling ioasid_free().
  If max - min > current quota of the user, then should fail it. If
  max - min < quota, then call ioasid_free() one by one. still trigger
  the above lock path with quota times.

- pass the max and min to ioasid_free(), let ioasid_free() decide. should
  be able to avoid trigger the lock multiple times, and ioasid has have a
  track on how may PASIDs have been allocated, if max - min is larger than
  the allocated number, should fail anyway.
 
thoughts on the above reply?

Regards,
Yi Liu

> Thanks
> Kevin
Tian, Kevin July 9, 2020, 2:18 a.m. UTC | #8
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, July 9, 2020 10:08 AM
> 
> Hi Kevin,
> 
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Thursday, July 9, 2020 9:57 AM
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Thursday, July 9, 2020 8:32 AM
> > >
> > > Hi Alex,
> > >
> > > > Alex Williamson <alex.williamson@redhat.com>
> > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > >
> > > > On Wed, 8 Jul 2020 08:16:16 +0000
> > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > >
> > > > > Hi Alex,
> > > > >
> > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > >
> > > > > > Hi Alex,
> > > > > >
> > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > >
> > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > >
> > > > > > > > This patch allows user space to request PASID allocation/free,
> e.g.
> > > > > > > > when serving the request from the guest.
> > > > > > > >
> > > > > > > > PASIDs that are not freed by userspace are automatically
> > > > > > > > freed
> > > when
> > > > > > > > the IOASID set is destroyed when process exits.
> > > > > [...]
> > > > > > > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu
> > > *iommu,
> > > > > > > > +					  unsigned long arg)
> > > > > > > > +{
> > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > +	unsigned long minsz;
> > > > > > > > +
> > > > > > > > +	minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> > > > range);
> > > > > > > > +
> > > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > > +		return -EFAULT;
> > > > > > > > +
> > > > > > > > +	if (req.argsz < minsz || (req.flags &
> > > > ~VFIO_PASID_REQUEST_MASK))
> > > > > > > > +		return -EINVAL;
> > > > > > > > +
> > > > > > > > +	if (req.range.min > req.range.max)
> > > > > > >
> > > > > > > Is it exploitable that a user can spin the kernel for a long
> > > > > > > time in the case of a free by calling this with [0, MAX_UINT]
> > > > > > > regardless of their
> > > > actual
> > > > > > allocations?
> > > > > >
> > > > > > IOASID can ensure that user can only free the PASIDs allocated
> > > > > > to the
> > > user.
> > > > but
> > > > > > it's true, kernel needs to loop all the PASIDs within the range
> > > > > > provided by user.
> > > > it
> > > > > > may take a long time. is there anything we can do? one thing may
> > > > > > limit
> > > the
> > > > range
> > > > > > provided by user?
> > > > >
> > > > > thought about it more, we have per-VM pasid quota (say 1000), so
> > > > > even if user passed down [0, MAX_UNIT], kernel will only loop the
> > > > > 1000 pasids at most. do you think we still need to do something on it?
> > > >
> > > > How do you figure that?  vfio_iommu_type1_pasid_request() accepts
> > > > the user's min/max so long as (max > min) and passes that to
> > > > vfio_iommu_type1_pasid_free(), then to vfio_pasid_free_range()
> > > > which loops as:
> > > >
> > > > 	ioasid_t pasid = min;
> > > > 	for (; pasid <= max; pasid++)
> > > > 		ioasid_free(pasid);
> > > >
> > > > A user might only be able to allocate 1000 pasids, but apparently
> > > > they can ask to free all they want.
> > > >
> > > > It's also not obvious to me that calling ioasid_free() is only
> > > > allowing the user to free their own passid.  Does it?  It would be a
> > > > pretty
> >
> > Agree. I thought ioasid_free should at least carry a token since the user
> space is
> > only allowed to manage PASIDs in its own set...
> >
> > > > gaping hole if a user could free arbitrary pasids.  A r-b tree of
> > > > passids might help both for security and to bound spinning in a loop.
> > >
> > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > ioasid_set parameter for ioasid_free(), thus to prevent the user from
> > > freeing PASIDs that doesn't belong to it. I remember Jacob mentioned it
> before.
> > >
> >
> > check current ioasid_free:
> >
> >         spin_lock(&ioasid_allocator_lock);
> >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> >         if (!ioasid_data) {
> >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> >                 goto exit_unlock;
> >         }
> >
> > Allow an user to trigger above lock paths with MAX_UINT times might still
> be bad.
> 
> yeah, how about the below two options:
> 
> - comparing the max - min with the quota before calling ioasid_free().
>   If max - min > current quota of the user, then should fail it. If
>   max - min < quota, then call ioasid_free() one by one. still trigger
>   the above lock path with quota times.

This is definitely wrong. [min, max] is about the range of the PASID value,
while quota is about the number of allocated PASIDs. It's a bit weird to
mix two together. btw what is the main purpose of allowing batch PASID
free requests? Can we just simplify to allow one PASID in each free just
like how is it done in allocation path?

> 
> - pass the max and min to ioasid_free(), let ioasid_free() decide. should
>   be able to avoid trigger the lock multiple times, and ioasid has have a
>   track on how may PASIDs have been allocated, if max - min is larger than
>   the allocated number, should fail anyway.

What about Alex's r-b tree suggestion? Is there any downside in you mind?

Thanks,
Kevin
Yi Liu July 9, 2020, 2:26 a.m. UTC | #9
Hi Kevin,

> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Thursday, July 9, 2020 10:18 AM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, July 9, 2020 10:08 AM
> >
> > Hi Kevin,
> >
> > > From: Tian, Kevin <kevin.tian@intel.com>
> > > Sent: Thursday, July 9, 2020 9:57 AM
> > >
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Thursday, July 9, 2020 8:32 AM
> > > >
> > > > Hi Alex,
> > > >
> > > > > Alex Williamson <alex.williamson@redhat.com>
> > > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > > >
> > > > > On Wed, 8 Jul 2020 08:16:16 +0000
> > > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > > >
> > > > > > Hi Alex,
> > > > > >
> > > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > > >
> > > > > > > Hi Alex,
> > > > > > >
> > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > > >
> > > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > >
> > > > > > > > > This patch allows user space to request PASID allocation/free,
> > e.g.
> > > > > > > > > when serving the request from the guest.
> > > > > > > > >
> > > > > > > > > PASIDs that are not freed by userspace are automatically
> > > > > > > > > freed
> > > > when
> > > > > > > > > the IOASID set is destroyed when process exits.
> > > > > > [...]
> > > > > > > > > +static int vfio_iommu_type1_pasid_request(struct vfio_iommu
> > > > *iommu,
> > > > > > > > > +					  unsigned long arg)
> > > > > > > > > +{
> > > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > > +	unsigned long minsz;
> > > > > > > > > +
> > > > > > > > > +	minsz = offsetofend(struct
> vfio_iommu_type1_pasid_request,
> > > > > range);
> > > > > > > > > +
> > > > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > > > +		return -EFAULT;
> > > > > > > > > +
> > > > > > > > > +	if (req.argsz < minsz || (req.flags &
> > > > > ~VFIO_PASID_REQUEST_MASK))
> > > > > > > > > +		return -EINVAL;
> > > > > > > > > +
> > > > > > > > > +	if (req.range.min > req.range.max)
> > > > > > > >
> > > > > > > > Is it exploitable that a user can spin the kernel for a long
> > > > > > > > time in the case of a free by calling this with [0, MAX_UINT]
> > > > > > > > regardless of their
> > > > > actual
> > > > > > > allocations?
> > > > > > >
> > > > > > > IOASID can ensure that user can only free the PASIDs allocated
> > > > > > > to the
> > > > user.
> > > > > but
> > > > > > > it's true, kernel needs to loop all the PASIDs within the range
> > > > > > > provided by user.
> > > > > it
> > > > > > > may take a long time. is there anything we can do? one thing may
> > > > > > > limit
> > > > the
> > > > > range
> > > > > > > provided by user?
> > > > > >
> > > > > > thought about it more, we have per-VM pasid quota (say 1000), so
> > > > > > even if user passed down [0, MAX_UNIT], kernel will only loop the
> > > > > > 1000 pasids at most. do you think we still need to do something on it?
> > > > >
> > > > > How do you figure that?  vfio_iommu_type1_pasid_request() accepts
> > > > > the user's min/max so long as (max > min) and passes that to
> > > > > vfio_iommu_type1_pasid_free(), then to vfio_pasid_free_range()
> > > > > which loops as:
> > > > >
> > > > > 	ioasid_t pasid = min;
> > > > > 	for (; pasid <= max; pasid++)
> > > > > 		ioasid_free(pasid);
> > > > >
> > > > > A user might only be able to allocate 1000 pasids, but apparently
> > > > > they can ask to free all they want.
> > > > >
> > > > > It's also not obvious to me that calling ioasid_free() is only
> > > > > allowing the user to free their own passid.  Does it?  It would be a
> > > > > pretty
> > >
> > > Agree. I thought ioasid_free should at least carry a token since the user
> > space is
> > > only allowed to manage PASIDs in its own set...
> > >
> > > > > gaping hole if a user could free arbitrary pasids.  A r-b tree of
> > > > > passids might help both for security and to bound spinning in a loop.
> > > >
> > > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > > ioasid_set parameter for ioasid_free(), thus to prevent the user from
> > > > freeing PASIDs that doesn't belong to it. I remember Jacob mentioned it
> > before.
> > > >
> > >
> > > check current ioasid_free:
> > >
> > >         spin_lock(&ioasid_allocator_lock);
> > >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> > >         if (!ioasid_data) {
> > >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> > >                 goto exit_unlock;
> > >         }
> > >
> > > Allow an user to trigger above lock paths with MAX_UINT times might still
> > be bad.
> >
> > yeah, how about the below two options:
> >
> > - comparing the max - min with the quota before calling ioasid_free().
> >   If max - min > current quota of the user, then should fail it. If
> >   max - min < quota, then call ioasid_free() one by one. still trigger
> >   the above lock path with quota times.
> 
> This is definitely wrong. [min, max] is about the range of the PASID value,
> while quota is about the number of allocated PASIDs. It's a bit weird to
> mix two together.

got it.

> btw what is the main purpose of allowing batch PASID
> free requests? Can we just simplify to allow one PASID in each free just
> like how is it done in allocation path?

it's an intention to reuse the [min, max] range as allocation path. currently,
we don't have such request as far as I can see.

> >
> > - pass the max and min to ioasid_free(), let ioasid_free() decide. should
> >   be able to avoid trigger the lock multiple times, and ioasid has have a
> >   track on how may PASIDs have been allocated, if max - min is larger than
> >   the allocated number, should fail anyway.
> 
> What about Alex's r-b tree suggestion? Is there any downside in you mind?

no downside, I was just wanting to reuse the tracks in ioasid_set. I can add
a r-b for allocated PASIDs and find the PASIDs in the r-b tree only do free
for the PASIDs found in r-b tree, others in the range would be ignored.
does it look good?

Regards,
Yi Liu

> Thanks,
> Kevin
Yi Liu July 9, 2020, 7:16 a.m. UTC | #10
Hi Alex,

After more thinking, looks like adding a r-b tree is still not enough to
solve the potential problem for free a range of PASID in one ioctl. If
caller gives [0, MAX_UNIT] in the free request, kernel anyhow should
loop all the PASIDs and search in the r-b tree. Even VFIO can track the
smallest/largest allocated PASID, and limit the free range to an accurate
range, it is still no efficient. For example, user has allocated two PASIDs
( 1 and 999), and user gives the [0, MAX_UNIT] range in free request. VFIO
will limit the free range to be [1, 999], but still needs to loop PASID 1 -
999, and search in r-b tree.

So I'm wondering can we fall back to prior proposal which only free one
PASID for a free request. how about your opinion?

https://lore.kernel.org/linux-iommu/20200416084031.7266ad40@w520.home/

Regards,
Yi Liu

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, July 9, 2020 10:26 AM
> 
> Hi Kevin,
> 
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Thursday, July 9, 2020 10:18 AM
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Thursday, July 9, 2020 10:08 AM
> > >
> > > Hi Kevin,
> > >
> > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > Sent: Thursday, July 9, 2020 9:57 AM
> > > >
> > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Sent: Thursday, July 9, 2020 8:32 AM
> > > > >
> > > > > Hi Alex,
> > > > >
> > > > > > Alex Williamson <alex.williamson@redhat.com>
> > > > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > > > >
> > > > > > On Wed, 8 Jul 2020 08:16:16 +0000 "Liu, Yi L"
> > > > > > <yi.l.liu@intel.com> wrote:
> > > > > >
> > > > > > > Hi Alex,
> > > > > > >
> > > > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > > > >
> > > > > > > > Hi Alex,
> > > > > > > >
> > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > > > >
> > > > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > >
> > > > > > > > > > This patch allows user space to request PASID
> > > > > > > > > > allocation/free,
> > > e.g.
> > > > > > > > > > when serving the request from the guest.
> > > > > > > > > >
> > > > > > > > > > PASIDs that are not freed by userspace are
> > > > > > > > > > automatically freed
> > > > > when
> > > > > > > > > > the IOASID set is destroyed when process exits.
> > > > > > > [...]
> > > > > > > > > > +static int vfio_iommu_type1_pasid_request(struct
> > > > > > > > > > +vfio_iommu
> > > > > *iommu,
> > > > > > > > > > +					  unsigned long arg) {
> > > > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > > > +	unsigned long minsz;
> > > > > > > > > > +
> > > > > > > > > > +	minsz = offsetofend(struct
> > vfio_iommu_type1_pasid_request,
> > > > > > range);
> > > > > > > > > > +
> > > > > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > > > > +		return -EFAULT;
> > > > > > > > > > +
> > > > > > > > > > +	if (req.argsz < minsz || (req.flags &
> > > > > > ~VFIO_PASID_REQUEST_MASK))
> > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > +
> > > > > > > > > > +	if (req.range.min > req.range.max)
> > > > > > > > >
> > > > > > > > > Is it exploitable that a user can spin the kernel for a
> > > > > > > > > long time in the case of a free by calling this with [0,
> > > > > > > > > MAX_UINT] regardless of their
> > > > > > actual
> > > > > > > > allocations?
> > > > > > > >
> > > > > > > > IOASID can ensure that user can only free the PASIDs
> > > > > > > > allocated to the
> > > > > user.
> > > > > > but
> > > > > > > > it's true, kernel needs to loop all the PASIDs within the
> > > > > > > > range provided by user.
> > > > > > it
> > > > > > > > may take a long time. is there anything we can do? one
> > > > > > > > thing may limit
> > > > > the
> > > > > > range
> > > > > > > > provided by user?
> > > > > > >
> > > > > > > thought about it more, we have per-VM pasid quota (say
> > > > > > > 1000), so even if user passed down [0, MAX_UNIT], kernel
> > > > > > > will only loop the
> > > > > > > 1000 pasids at most. do you think we still need to do something on it?
> > > > > >
> > > > > > How do you figure that?  vfio_iommu_type1_pasid_request()
> > > > > > accepts the user's min/max so long as (max > min) and passes
> > > > > > that to vfio_iommu_type1_pasid_free(), then to
> > > > > > vfio_pasid_free_range() which loops as:
> > > > > >
> > > > > > 	ioasid_t pasid = min;
> > > > > > 	for (; pasid <= max; pasid++)
> > > > > > 		ioasid_free(pasid);
> > > > > >
> > > > > > A user might only be able to allocate 1000 pasids, but
> > > > > > apparently they can ask to free all they want.
> > > > > >
> > > > > > It's also not obvious to me that calling ioasid_free() is only
> > > > > > allowing the user to free their own passid.  Does it?  It
> > > > > > would be a pretty
> > > >
> > > > Agree. I thought ioasid_free should at least carry a token since
> > > > the user
> > > space is
> > > > only allowed to manage PASIDs in its own set...
> > > >
> > > > > > gaping hole if a user could free arbitrary pasids.  A r-b tree
> > > > > > of passids might help both for security and to bound spinning in a loop.
> > > > >
> > > > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > > > ioasid_set parameter for ioasid_free(), thus to prevent the user
> > > > > from freeing PASIDs that doesn't belong to it. I remember Jacob
> > > > > mentioned it
> > > before.
> > > > >
> > > >
> > > > check current ioasid_free:
> > > >
> > > >         spin_lock(&ioasid_allocator_lock);
> > > >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> > > >         if (!ioasid_data) {
> > > >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> > > >                 goto exit_unlock;
> > > >         }
> > > >
> > > > Allow an user to trigger above lock paths with MAX_UINT times
> > > > might still
> > > be bad.
> > >
> > > yeah, how about the below two options:
> > >
> > > - comparing the max - min with the quota before calling ioasid_free().
> > >   If max - min > current quota of the user, then should fail it. If
> > >   max - min < quota, then call ioasid_free() one by one. still trigger
> > >   the above lock path with quota times.
> >
> > This is definitely wrong. [min, max] is about the range of the PASID
> > value, while quota is about the number of allocated PASIDs. It's a bit
> > weird to mix two together.
> 
> got it.
> 
> > btw what is the main purpose of allowing batch PASID free requests?
> > Can we just simplify to allow one PASID in each free just like how is
> > it done in allocation path?
> 
> it's an intention to reuse the [min, max] range as allocation path. currently, we
> don't have such request as far as I can see.
> 
> > >
> > > - pass the max and min to ioasid_free(), let ioasid_free() decide. should
> > >   be able to avoid trigger the lock multiple times, and ioasid has have a
> > >   track on how may PASIDs have been allocated, if max - min is larger than
> > >   the allocated number, should fail anyway.
> >
> > What about Alex's r-b tree suggestion? Is there any downside in you mind?
> 
> no downside, I was just wanting to reuse the tracks in ioasid_set. I can add a r-b
> for allocated PASIDs and find the PASIDs in the r-b tree only do free for the
> PASIDs found in r-b tree, others in the range would be ignored.
> does it look good?
> 
> Regards,
> Yi Liu
> 
> > Thanks,
> > Kevin
Alex Williamson July 9, 2020, 2:27 p.m. UTC | #11
On Thu, 9 Jul 2020 07:16:31 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> Hi Alex,
> 
> After more thinking, looks like adding a r-b tree is still not enough to
> solve the potential problem for free a range of PASID in one ioctl. If
> caller gives [0, MAX_UNIT] in the free request, kernel anyhow should
> loop all the PASIDs and search in the r-b tree. Even VFIO can track the
> smallest/largest allocated PASID, and limit the free range to an accurate
> range, it is still no efficient. For example, user has allocated two PASIDs
> ( 1 and 999), and user gives the [0, MAX_UNIT] range in free request. VFIO
> will limit the free range to be [1, 999], but still needs to loop PASID 1 -
> 999, and search in r-b tree.

That sounds like a poor tree implementation.  Look at vfio_find_dma()
for instance, it returns a node within the specified range.  If the
tree has two nodes within the specified range we should never need to
call a search function like vfio_find_dma() more than three times.  We
call it once, get the first node, remove it.  Call it again, get the
other node, remove it.  Call a third time, find no matches, we're done.
So such an implementation limits searches to N+1 where N is the number
of nodes within the range.

> So I'm wondering can we fall back to prior proposal which only free one
> PASID for a free request. how about your opinion?

Doesn't it still seem like it would be a useful user interface to have
a mechanism to free all pasids, by calling with exactly [0, MAX_UINT]?
I'm not sure if there's another use case for this given than the user
doesn't have strict control of the pasid values they get.  Thanks,

Alex

> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, July 9, 2020 10:26 AM
> > 
> > Hi Kevin,
> >   
> > > From: Tian, Kevin <kevin.tian@intel.com>
> > > Sent: Thursday, July 9, 2020 10:18 AM
> > >  
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Thursday, July 9, 2020 10:08 AM
> > > >
> > > > Hi Kevin,
> > > >  
> > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > Sent: Thursday, July 9, 2020 9:57 AM
> > > > >  
> > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > Sent: Thursday, July 9, 2020 8:32 AM
> > > > > >
> > > > > > Hi Alex,
> > > > > >  
> > > > > > > Alex Williamson <alex.williamson@redhat.com>
> > > > > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > > > > >
> > > > > > > On Wed, 8 Jul 2020 08:16:16 +0000 "Liu, Yi L"
> > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > >  
> > > > > > > > Hi Alex,
> > > > > > > >  
> > > > > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > > > > >
> > > > > > > > > Hi Alex,
> > > > > > > > >  
> > > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > > > > >
> > > > > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > > >  
> > > > > > > > > > > This patch allows user space to request PASID
> > > > > > > > > > > allocation/free,  
> > > > e.g.  
> > > > > > > > > > > when serving the request from the guest.
> > > > > > > > > > >
> > > > > > > > > > > PASIDs that are not freed by userspace are
> > > > > > > > > > > automatically freed  
> > > > > > when  
> > > > > > > > > > > the IOASID set is destroyed when process exits.  
> > > > > > > > [...]  
> > > > > > > > > > > +static int vfio_iommu_type1_pasid_request(struct
> > > > > > > > > > > +vfio_iommu  
> > > > > > *iommu,  
> > > > > > > > > > > +					  unsigned long arg) {
> > > > > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > > > > +	unsigned long minsz;
> > > > > > > > > > > +
> > > > > > > > > > > +	minsz = offsetofend(struct  
> > > vfio_iommu_type1_pasid_request,  
> > > > > > > range);  
> > > > > > > > > > > +
> > > > > > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > > > > > +		return -EFAULT;
> > > > > > > > > > > +
> > > > > > > > > > > +	if (req.argsz < minsz || (req.flags &  
> > > > > > > ~VFIO_PASID_REQUEST_MASK))  
> > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > +
> > > > > > > > > > > +	if (req.range.min > req.range.max)  
> > > > > > > > > >
> > > > > > > > > > Is it exploitable that a user can spin the kernel for a
> > > > > > > > > > long time in the case of a free by calling this with [0,
> > > > > > > > > > MAX_UINT] regardless of their  
> > > > > > > actual  
> > > > > > > > > allocations?
> > > > > > > > >
> > > > > > > > > IOASID can ensure that user can only free the PASIDs
> > > > > > > > > allocated to the  
> > > > > > user.  
> > > > > > > but  
> > > > > > > > > it's true, kernel needs to loop all the PASIDs within the
> > > > > > > > > range provided by user.  
> > > > > > > it  
> > > > > > > > > may take a long time. is there anything we can do? one
> > > > > > > > > thing may limit  
> > > > > > the  
> > > > > > > range  
> > > > > > > > > provided by user?  
> > > > > > > >
> > > > > > > > thought about it more, we have per-VM pasid quota (say
> > > > > > > > 1000), so even if user passed down [0, MAX_UNIT], kernel
> > > > > > > > will only loop the
> > > > > > > > 1000 pasids at most. do you think we still need to do something on it?  
> > > > > > >
> > > > > > > How do you figure that?  vfio_iommu_type1_pasid_request()
> > > > > > > accepts the user's min/max so long as (max > min) and passes
> > > > > > > that to vfio_iommu_type1_pasid_free(), then to
> > > > > > > vfio_pasid_free_range() which loops as:
> > > > > > >
> > > > > > > 	ioasid_t pasid = min;
> > > > > > > 	for (; pasid <= max; pasid++)
> > > > > > > 		ioasid_free(pasid);
> > > > > > >
> > > > > > > A user might only be able to allocate 1000 pasids, but
> > > > > > > apparently they can ask to free all they want.
> > > > > > >
> > > > > > > It's also not obvious to me that calling ioasid_free() is only
> > > > > > > allowing the user to free their own passid.  Does it?  It
> > > > > > > would be a pretty  
> > > > >
> > > > > Agree. I thought ioasid_free should at least carry a token since
> > > > > the user  
> > > > space is  
> > > > > only allowed to manage PASIDs in its own set...
> > > > >  
> > > > > > > gaping hole if a user could free arbitrary pasids.  A r-b tree
> > > > > > > of passids might help both for security and to bound spinning in a loop.  
> > > > > >
> > > > > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > > > > ioasid_set parameter for ioasid_free(), thus to prevent the user
> > > > > > from freeing PASIDs that doesn't belong to it. I remember Jacob
> > > > > > mentioned it  
> > > > before.  
> > > > > >  
> > > > >
> > > > > check current ioasid_free:
> > > > >
> > > > >         spin_lock(&ioasid_allocator_lock);
> > > > >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> > > > >         if (!ioasid_data) {
> > > > >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> > > > >                 goto exit_unlock;
> > > > >         }
> > > > >
> > > > > Allow an user to trigger above lock paths with MAX_UINT times
> > > > > might still  
> > > > be bad.
> > > >
> > > > yeah, how about the below two options:
> > > >
> > > > - comparing the max - min with the quota before calling ioasid_free().
> > > >   If max - min > current quota of the user, then should fail it. If
> > > >   max - min < quota, then call ioasid_free() one by one. still trigger
> > > >   the above lock path with quota times.  
> > >
> > > This is definitely wrong. [min, max] is about the range of the PASID
> > > value, while quota is about the number of allocated PASIDs. It's a bit
> > > weird to mix two together.  
> > 
> > got it.
> >   
> > > btw what is the main purpose of allowing batch PASID free requests?
> > > Can we just simplify to allow one PASID in each free just like how is
> > > it done in allocation path?  
> > 
> > it's an intention to reuse the [min, max] range as allocation path. currently, we
> > don't have such request as far as I can see.
> >   
> > > >
> > > > - pass the max and min to ioasid_free(), let ioasid_free() decide. should
> > > >   be able to avoid trigger the lock multiple times, and ioasid has have a
> > > >   track on how may PASIDs have been allocated, if max - min is larger than
> > > >   the allocated number, should fail anyway.  
> > >
> > > What about Alex's r-b tree suggestion? Is there any downside in you mind?  
> > 
> > no downside, I was just wanting to reuse the tracks in ioasid_set. I can add a r-b
> > for allocated PASIDs and find the PASIDs in the r-b tree only do free for the
> > PASIDs found in r-b tree, others in the range would be ignored.
> > does it look good?
> > 
> > Regards,
> > Yi Liu
> >   
> > > Thanks,
> > > Kevin  
>
Jacob Pan July 9, 2020, 6:05 p.m. UTC | #12
On Thu, 9 Jul 2020 08:27:51 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> > So I'm wondering can we fall back to prior proposal which only free
> > one PASID for a free request. how about your opinion?  
> 
> Doesn't it still seem like it would be a useful user interface to have
> a mechanism to free all pasids, by calling with exactly [0, MAX_UINT]?
> I'm not sure if there's another use case for this given than the user
> doesn't have strict control of the pasid values they get.  Thanks,

Yes, I agree free all pasids of a guest is a useful interface. Since all
PASIDs under one VM is already tracked by an IOASID set with its XArray,
I don't see a need to track again in VFIO.

Shall we only free one & free all? IMHO, free range isn't that useful
and not really symmetric to PASID allocation in that allocation is one
at a time.

Can we just add a new flag, e.g.  VFIO_IOMMU_FREE_ALL_PASID, and
ignored th range in free?
Yi Liu July 10, 2020, 5:39 a.m. UTC | #13
Hi Alex, 

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, July 9, 2020 10:28 PM
> 
> On Thu, 9 Jul 2020 07:16:31 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > Hi Alex,
> >
> > After more thinking, looks like adding a r-b tree is still not enough to
> > solve the potential problem for free a range of PASID in one ioctl. If
> > caller gives [0, MAX_UNIT] in the free request, kernel anyhow should
> > loop all the PASIDs and search in the r-b tree. Even VFIO can track the
> > smallest/largest allocated PASID, and limit the free range to an accurate
> > range, it is still no efficient. For example, user has allocated two PASIDs
> > ( 1 and 999), and user gives the [0, MAX_UNIT] range in free request. VFIO
> > will limit the free range to be [1, 999], but still needs to loop PASID 1 -
> > 999, and search in r-b tree.
> 
> That sounds like a poor tree implementation.  Look at vfio_find_dma()
> for instance, it returns a node within the specified range.  If the
> tree has two nodes within the specified range we should never need to
> call a search function like vfio_find_dma() more than three times.  We
> call it once, get the first node, remove it.  Call it again, get the
> other node, remove it.  Call a third time, find no matches, we're done.
> So such an implementation limits searches to N+1 where N is the number
> of nodes within the range.

I see. When getting a free range from user. Use the range to find suited
PASIDs in the r-b tree. For the example I mentioned, if giving [0, MAX_UNIT],
will find two nodes. If giving [0, 100] range, then only one node will be
found. But even though, it still take some time if the user holds a bunch
of PASIDs and user gives a big free range.

> > So I'm wondering can we fall back to prior proposal which only free one
> > PASID for a free request. how about your opinion?
> 
> Doesn't it still seem like it would be a useful user interface to have
> a mechanism to free all pasids, by calling with exactly [0, MAX_UINT]?
> I'm not sure if there's another use case for this given than the user
> doesn't have strict control of the pasid values they get.  Thanks,

I don't have such use case neither. perhaps we may allow it in future by
adding flag. but if it's still useful, I may try with your suggestion. :-)

Regards,
Yi Liu

> Alex
> 
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Thursday, July 9, 2020 10:26 AM
> > >
> > > Hi Kevin,
> > >
> > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > Sent: Thursday, July 9, 2020 10:18 AM
> > > >
> > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Sent: Thursday, July 9, 2020 10:08 AM
> > > > >
> > > > > Hi Kevin,
> > > > >
> > > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > > Sent: Thursday, July 9, 2020 9:57 AM
> > > > > >
> > > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > Sent: Thursday, July 9, 2020 8:32 AM
> > > > > > >
> > > > > > > Hi Alex,
> > > > > > >
> > > > > > > > Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > > > > > >
> > > > > > > > On Wed, 8 Jul 2020 08:16:16 +0000 "Liu, Yi L"
> > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > >
> > > > > > > > > Hi Alex,
> > > > > > > > >
> > > > > > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > > > > > >
> > > > > > > > > > Hi Alex,
> > > > > > > > > >
> > > > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > > > > > >
> > > > > > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > > This patch allows user space to request PASID
> > > > > > > > > > > > allocation/free,
> > > > > e.g.
> > > > > > > > > > > > when serving the request from the guest.
> > > > > > > > > > > >
> > > > > > > > > > > > PASIDs that are not freed by userspace are
> > > > > > > > > > > > automatically freed
> > > > > > > when
> > > > > > > > > > > > the IOASID set is destroyed when process exits.
> > > > > > > > > [...]
> > > > > > > > > > > > +static int vfio_iommu_type1_pasid_request(struct
> > > > > > > > > > > > +vfio_iommu
> > > > > > > *iommu,
> > > > > > > > > > > > +					  unsigned long arg) {
> > > > > > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > > > > > +	unsigned long minsz;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	minsz = offsetofend(struct
> > > > vfio_iommu_type1_pasid_request,
> > > > > > > > range);
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > > > > > > +		return -EFAULT;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (req.argsz < minsz || (req.flags &
> > > > > > > > ~VFIO_PASID_REQUEST_MASK))
> > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (req.range.min > req.range.max)
> > > > > > > > > > >
> > > > > > > > > > > Is it exploitable that a user can spin the kernel for a
> > > > > > > > > > > long time in the case of a free by calling this with [0,
> > > > > > > > > > > MAX_UINT] regardless of their
> > > > > > > > actual
> > > > > > > > > > allocations?
> > > > > > > > > >
> > > > > > > > > > IOASID can ensure that user can only free the PASIDs
> > > > > > > > > > allocated to the
> > > > > > > user.
> > > > > > > > but
> > > > > > > > > > it's true, kernel needs to loop all the PASIDs within the
> > > > > > > > > > range provided by user.
> > > > > > > > it
> > > > > > > > > > may take a long time. is there anything we can do? one
> > > > > > > > > > thing may limit
> > > > > > > the
> > > > > > > > range
> > > > > > > > > > provided by user?
> > > > > > > > >
> > > > > > > > > thought about it more, we have per-VM pasid quota (say
> > > > > > > > > 1000), so even if user passed down [0, MAX_UNIT], kernel
> > > > > > > > > will only loop the
> > > > > > > > > 1000 pasids at most. do you think we still need to do something on
> it?
> > > > > > > >
> > > > > > > > How do you figure that?  vfio_iommu_type1_pasid_request()
> > > > > > > > accepts the user's min/max so long as (max > min) and passes
> > > > > > > > that to vfio_iommu_type1_pasid_free(), then to
> > > > > > > > vfio_pasid_free_range() which loops as:
> > > > > > > >
> > > > > > > > 	ioasid_t pasid = min;
> > > > > > > > 	for (; pasid <= max; pasid++)
> > > > > > > > 		ioasid_free(pasid);
> > > > > > > >
> > > > > > > > A user might only be able to allocate 1000 pasids, but
> > > > > > > > apparently they can ask to free all they want.
> > > > > > > >
> > > > > > > > It's also not obvious to me that calling ioasid_free() is only
> > > > > > > > allowing the user to free their own passid.  Does it?  It
> > > > > > > > would be a pretty
> > > > > >
> > > > > > Agree. I thought ioasid_free should at least carry a token since
> > > > > > the user
> > > > > space is
> > > > > > only allowed to manage PASIDs in its own set...
> > > > > >
> > > > > > > > gaping hole if a user could free arbitrary pasids.  A r-b tree
> > > > > > > > of passids might help both for security and to bound spinning in a
> loop.
> > > > > > >
> > > > > > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > > > > > ioasid_set parameter for ioasid_free(), thus to prevent the user
> > > > > > > from freeing PASIDs that doesn't belong to it. I remember Jacob
> > > > > > > mentioned it
> > > > > before.
> > > > > > >
> > > > > >
> > > > > > check current ioasid_free:
> > > > > >
> > > > > >         spin_lock(&ioasid_allocator_lock);
> > > > > >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> > > > > >         if (!ioasid_data) {
> > > > > >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> > > > > >                 goto exit_unlock;
> > > > > >         }
> > > > > >
> > > > > > Allow an user to trigger above lock paths with MAX_UINT times
> > > > > > might still
> > > > > be bad.
> > > > >
> > > > > yeah, how about the below two options:
> > > > >
> > > > > - comparing the max - min with the quota before calling ioasid_free().
> > > > >   If max - min > current quota of the user, then should fail it. If
> > > > >   max - min < quota, then call ioasid_free() one by one. still trigger
> > > > >   the above lock path with quota times.
> > > >
> > > > This is definitely wrong. [min, max] is about the range of the PASID
> > > > value, while quota is about the number of allocated PASIDs. It's a bit
> > > > weird to mix two together.
> > >
> > > got it.
> > >
> > > > btw what is the main purpose of allowing batch PASID free requests?
> > > > Can we just simplify to allow one PASID in each free just like how is
> > > > it done in allocation path?
> > >
> > > it's an intention to reuse the [min, max] range as allocation path. currently,
> we
> > > don't have such request as far as I can see.
> > >
> > > > >
> > > > > - pass the max and min to ioasid_free(), let ioasid_free() decide. should
> > > > >   be able to avoid trigger the lock multiple times, and ioasid has have a
> > > > >   track on how may PASIDs have been allocated, if max - min is larger than
> > > > >   the allocated number, should fail anyway.
> > > >
> > > > What about Alex's r-b tree suggestion? Is there any downside in you mind?
> > >
> > > no downside, I was just wanting to reuse the tracks in ioasid_set. I can add a
> r-b
> > > for allocated PASIDs and find the PASIDs in the r-b tree only do free for the
> > > PASIDs found in r-b tree, others in the range would be ignored.
> > > does it look good?
> > >
> > > Regards,
> > > Yi Liu
> > >
> > > > Thanks,
> > > > Kevin
> >
Alex Williamson July 10, 2020, 12:55 p.m. UTC | #14
On Fri, 10 Jul 2020 05:39:57 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> Hi Alex, 
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Thursday, July 9, 2020 10:28 PM
> > 
> > On Thu, 9 Jul 2020 07:16:31 +0000
> > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> >   
> > > Hi Alex,
> > >
> > > After more thinking, looks like adding a r-b tree is still not enough to
> > > solve the potential problem for free a range of PASID in one ioctl. If
> > > caller gives [0, MAX_UNIT] in the free request, kernel anyhow should
> > > loop all the PASIDs and search in the r-b tree. Even VFIO can track the
> > > smallest/largest allocated PASID, and limit the free range to an accurate
> > > range, it is still no efficient. For example, user has allocated two PASIDs
> > > ( 1 and 999), and user gives the [0, MAX_UNIT] range in free request. VFIO
> > > will limit the free range to be [1, 999], but still needs to loop PASID 1 -
> > > 999, and search in r-b tree.  
> > 
> > That sounds like a poor tree implementation.  Look at vfio_find_dma()
> > for instance, it returns a node within the specified range.  If the
> > tree has two nodes within the specified range we should never need to
> > call a search function like vfio_find_dma() more than three times.  We
> > call it once, get the first node, remove it.  Call it again, get the
> > other node, remove it.  Call a third time, find no matches, we're done.
> > So such an implementation limits searches to N+1 where N is the number
> > of nodes within the range.  
> 
> I see. When getting a free range from user. Use the range to find suited
> PASIDs in the r-b tree. For the example I mentioned, if giving [0, MAX_UNIT],
> will find two nodes. If giving [0, 100] range, then only one node will be
> found. But even though, it still take some time if the user holds a bunch
> of PASIDs and user gives a big free range.


But that time is bounded.  The complexity of the tree and maximum
number of operations on the tree are bounded by the number of nodes,
which is bound by the user's pasid quota.  Thanks,

Alex
 
> > > So I'm wondering can we fall back to prior proposal which only free one
> > > PASID for a free request. how about your opinion?  
> > 
> > Doesn't it still seem like it would be a useful user interface to have
> > a mechanism to free all pasids, by calling with exactly [0, MAX_UINT]?
> > I'm not sure if there's another use case for this given than the user
> > doesn't have strict control of the pasid values they get.  Thanks,  
> 
> I don't have such use case neither. perhaps we may allow it in future by
> adding flag. but if it's still useful, I may try with your suggestion. :-)
> 
> Regards,
> Yi Liu
> 
> > Alex
> >   
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Thursday, July 9, 2020 10:26 AM
> > > >
> > > > Hi Kevin,
> > > >  
> > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > Sent: Thursday, July 9, 2020 10:18 AM
> > > > >  
> > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > Sent: Thursday, July 9, 2020 10:08 AM
> > > > > >
> > > > > > Hi Kevin,
> > > > > >  
> > > > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > > > Sent: Thursday, July 9, 2020 9:57 AM
> > > > > > >  
> > > > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > > Sent: Thursday, July 9, 2020 8:32 AM
> > > > > > > >
> > > > > > > > Hi Alex,
> > > > > > > >  
> > > > > > > > > Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > > > > > > >
> > > > > > > > > On Wed, 8 Jul 2020 08:16:16 +0000 "Liu, Yi L"
> > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > >  
> > > > > > > > > > Hi Alex,
> > > > > > > > > >  
> > > > > > > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > > > > > > >
> > > > > > > > > > > Hi Alex,
> > > > > > > > > > >  
> > > > > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > > > > > > >
> > > > > > > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > > > > >  
> > > > > > > > > > > > > This patch allows user space to request PASID
> > > > > > > > > > > > > allocation/free,  
> > > > > > e.g.  
> > > > > > > > > > > > > when serving the request from the guest.
> > > > > > > > > > > > >
> > > > > > > > > > > > > PASIDs that are not freed by userspace are
> > > > > > > > > > > > > automatically freed  
> > > > > > > > when  
> > > > > > > > > > > > > the IOASID set is destroyed when process exits.  
> > > > > > > > > > [...]  
> > > > > > > > > > > > > +static int vfio_iommu_type1_pasid_request(struct
> > > > > > > > > > > > > +vfio_iommu  
> > > > > > > > *iommu,  
> > > > > > > > > > > > > +					  unsigned long arg) {
> > > > > > > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > > > > > > +	unsigned long minsz;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	minsz = offsetofend(struct  
> > > > > vfio_iommu_type1_pasid_request,  
> > > > > > > > > range);  
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > > > > > > > > > > +		return -EFAULT;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	if (req.argsz < minsz || (req.flags &  
> > > > > > > > > ~VFIO_PASID_REQUEST_MASK))  
> > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	if (req.range.min > req.range.max)  
> > > > > > > > > > > >
> > > > > > > > > > > > Is it exploitable that a user can spin the kernel for a
> > > > > > > > > > > > long time in the case of a free by calling this with [0,
> > > > > > > > > > > > MAX_UINT] regardless of their  
> > > > > > > > > actual  
> > > > > > > > > > > allocations?
> > > > > > > > > > >
> > > > > > > > > > > IOASID can ensure that user can only free the PASIDs
> > > > > > > > > > > allocated to the  
> > > > > > > > user.  
> > > > > > > > > but  
> > > > > > > > > > > it's true, kernel needs to loop all the PASIDs within the
> > > > > > > > > > > range provided by user.  
> > > > > > > > > it  
> > > > > > > > > > > may take a long time. is there anything we can do? one
> > > > > > > > > > > thing may limit  
> > > > > > > > the  
> > > > > > > > > range  
> > > > > > > > > > > provided by user?  
> > > > > > > > > >
> > > > > > > > > > thought about it more, we have per-VM pasid quota (say
> > > > > > > > > > 1000), so even if user passed down [0, MAX_UNIT], kernel
> > > > > > > > > > will only loop the
> > > > > > > > > > 1000 pasids at most. do you think we still need to do something on  
> > it?  
> > > > > > > > >
> > > > > > > > > How do you figure that?  vfio_iommu_type1_pasid_request()
> > > > > > > > > accepts the user's min/max so long as (max > min) and passes
> > > > > > > > > that to vfio_iommu_type1_pasid_free(), then to
> > > > > > > > > vfio_pasid_free_range() which loops as:
> > > > > > > > >
> > > > > > > > > 	ioasid_t pasid = min;
> > > > > > > > > 	for (; pasid <= max; pasid++)
> > > > > > > > > 		ioasid_free(pasid);
> > > > > > > > >
> > > > > > > > > A user might only be able to allocate 1000 pasids, but
> > > > > > > > > apparently they can ask to free all they want.
> > > > > > > > >
> > > > > > > > > It's also not obvious to me that calling ioasid_free() is only
> > > > > > > > > allowing the user to free their own passid.  Does it?  It
> > > > > > > > > would be a pretty  
> > > > > > >
> > > > > > > Agree. I thought ioasid_free should at least carry a token since
> > > > > > > the user  
> > > > > > space is  
> > > > > > > only allowed to manage PASIDs in its own set...
> > > > > > >  
> > > > > > > > > gaping hole if a user could free arbitrary pasids.  A r-b tree
> > > > > > > > > of passids might help both for security and to bound spinning in a  
> > loop.  
> > > > > > > >
> > > > > > > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > > > > > > ioasid_set parameter for ioasid_free(), thus to prevent the user
> > > > > > > > from freeing PASIDs that doesn't belong to it. I remember Jacob
> > > > > > > > mentioned it  
> > > > > > before.  
> > > > > > > >  
> > > > > > >
> > > > > > > check current ioasid_free:
> > > > > > >
> > > > > > >         spin_lock(&ioasid_allocator_lock);
> > > > > > >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> > > > > > >         if (!ioasid_data) {
> > > > > > >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> > > > > > >                 goto exit_unlock;
> > > > > > >         }
> > > > > > >
> > > > > > > Allow an user to trigger above lock paths with MAX_UINT times
> > > > > > > might still  
> > > > > > be bad.
> > > > > >
> > > > > > yeah, how about the below two options:
> > > > > >
> > > > > > - comparing the max - min with the quota before calling ioasid_free().
> > > > > >   If max - min > current quota of the user, then should fail it. If
> > > > > >   max - min < quota, then call ioasid_free() one by one. still trigger
> > > > > >   the above lock path with quota times.  
> > > > >
> > > > > This is definitely wrong. [min, max] is about the range of the PASID
> > > > > value, while quota is about the number of allocated PASIDs. It's a bit
> > > > > weird to mix two together.  
> > > >
> > > > got it.
> > > >  
> > > > > btw what is the main purpose of allowing batch PASID free requests?
> > > > > Can we just simplify to allow one PASID in each free just like how is
> > > > > it done in allocation path?  
> > > >
> > > > it's an intention to reuse the [min, max] range as allocation path. currently,  
> > we  
> > > > don't have such request as far as I can see.
> > > >  
> > > > > >
> > > > > > - pass the max and min to ioasid_free(), let ioasid_free() decide. should
> > > > > >   be able to avoid trigger the lock multiple times, and ioasid has have a
> > > > > >   track on how may PASIDs have been allocated, if max - min is larger than
> > > > > >   the allocated number, should fail anyway.  
> > > > >
> > > > > What about Alex's r-b tree suggestion? Is there any downside in you mind?  
> > > >
> > > > no downside, I was just wanting to reuse the tracks in ioasid_set. I can add a  
> > r-b  
> > > > for allocated PASIDs and find the PASIDs in the r-b tree only do free for the
> > > > PASIDs found in r-b tree, others in the range would be ignored.
> > > > does it look good?
> > > >
> > > > Regards,
> > > > Yi Liu
> > > >  
> > > > > Thanks,
> > > > > Kevin  
> > >  
>
Yi Liu July 10, 2020, 1:03 p.m. UTC | #15
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, July 10, 2020 8:55 PM
> 
> On Fri, 10 Jul 2020 05:39:57 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > Hi Alex,
> >
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Thursday, July 9, 2020 10:28 PM
> > >
> > > On Thu, 9 Jul 2020 07:16:31 +0000
> > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > >
> > > > Hi Alex,
> > > >
> > > > After more thinking, looks like adding a r-b tree is still not enough to
> > > > solve the potential problem for free a range of PASID in one ioctl. If
> > > > caller gives [0, MAX_UNIT] in the free request, kernel anyhow should
> > > > loop all the PASIDs and search in the r-b tree. Even VFIO can track the
> > > > smallest/largest allocated PASID, and limit the free range to an accurate
> > > > range, it is still no efficient. For example, user has allocated two PASIDs
> > > > ( 1 and 999), and user gives the [0, MAX_UNIT] range in free request. VFIO
> > > > will limit the free range to be [1, 999], but still needs to loop PASID 1 -
> > > > 999, and search in r-b tree.
> > >
> > > That sounds like a poor tree implementation.  Look at vfio_find_dma()
> > > for instance, it returns a node within the specified range.  If the
> > > tree has two nodes within the specified range we should never need to
> > > call a search function like vfio_find_dma() more than three times.  We
> > > call it once, get the first node, remove it.  Call it again, get the
> > > other node, remove it.  Call a third time, find no matches, we're done.
> > > So such an implementation limits searches to N+1 where N is the number
> > > of nodes within the range.
> >
> > I see. When getting a free range from user. Use the range to find suited
> > PASIDs in the r-b tree. For the example I mentioned, if giving [0, MAX_UNIT],
> > will find two nodes. If giving [0, 100] range, then only one node will be
> > found. But even though, it still take some time if the user holds a bunch
> > of PASIDs and user gives a big free range.
> 
> 
> But that time is bounded.  The complexity of the tree and maximum
> number of operations on the tree are bounded by the number of nodes,
> which is bound by the user's pasid quota.  Thanks,

yes, let me try it. thanks. :-)

Regards,
Yi Liu

> Alex
> 
> > > > So I'm wondering can we fall back to prior proposal which only free one
> > > > PASID for a free request. how about your opinion?
> > >
> > > Doesn't it still seem like it would be a useful user interface to have
> > > a mechanism to free all pasids, by calling with exactly [0, MAX_UINT]?
> > > I'm not sure if there's another use case for this given than the user
> > > doesn't have strict control of the pasid values they get.  Thanks,
> >
> > I don't have such use case neither. perhaps we may allow it in future by
> > adding flag. but if it's still useful, I may try with your suggestion. :-)
> >
> > Regards,
> > Yi Liu
> >
> > > Alex
> > >
> > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Sent: Thursday, July 9, 2020 10:26 AM
> > > > >
> > > > > Hi Kevin,
> > > > >
> > > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > > Sent: Thursday, July 9, 2020 10:18 AM
> > > > > >
> > > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > Sent: Thursday, July 9, 2020 10:08 AM
> > > > > > >
> > > > > > > Hi Kevin,
> > > > > > >
> > > > > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > > > > Sent: Thursday, July 9, 2020 9:57 AM
> > > > > > > >
> > > > > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > > > Sent: Thursday, July 9, 2020 8:32 AM
> > > > > > > > >
> > > > > > > > > Hi Alex,
> > > > > > > > >
> > > > > > > > > > Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > Sent: Thursday, July 9, 2020 3:55 AM
> > > > > > > > > >
> > > > > > > > > > On Wed, 8 Jul 2020 08:16:16 +0000 "Liu, Yi L"
> > > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > > >
> > > > > > > > > > > Hi Alex,
> > > > > > > > > > >
> > > > > > > > > > > > From: Liu, Yi L < yi.l.liu@intel.com>
> > > > > > > > > > > > Sent: Friday, July 3, 2020 2:28 PM
> > > > > > > > > > > >
> > > > > > > > > > > > Hi Alex,
> > > > > > > > > > > >
> > > > > > > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > > > > > > Sent: Friday, July 3, 2020 5:19 AM
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Wed, 24 Jun 2020 01:55:19 -0700 Liu Yi L
> > > > > > > > > > > > > <yi.l.liu@intel.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > > This patch allows user space to request PASID
> > > > > > > > > > > > > > allocation/free,
> > > > > > > e.g.
> > > > > > > > > > > > > > when serving the request from the guest.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > PASIDs that are not freed by userspace are
> > > > > > > > > > > > > > automatically freed
> > > > > > > > > when
> > > > > > > > > > > > > > the IOASID set is destroyed when process exits.
> > > > > > > > > > > [...]
> > > > > > > > > > > > > > +static int vfio_iommu_type1_pasid_request(struct
> > > > > > > > > > > > > > +vfio_iommu
> > > > > > > > > *iommu,
> > > > > > > > > > > > > > +					  unsigned long
> arg) {
> > > > > > > > > > > > > > +	struct vfio_iommu_type1_pasid_request req;
> > > > > > > > > > > > > > +	unsigned long minsz;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	minsz = offsetofend(struct
> > > > > > vfio_iommu_type1_pasid_request,
> > > > > > > > > > range);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	if (copy_from_user(&req, (void __user *)arg,
> minsz))
> > > > > > > > > > > > > > +		return -EFAULT;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	if (req.argsz < minsz || (req.flags &
> > > > > > > > > > ~VFIO_PASID_REQUEST_MASK))
> > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	if (req.range.min > req.range.max)
> > > > > > > > > > > > >
> > > > > > > > > > > > > Is it exploitable that a user can spin the kernel for a
> > > > > > > > > > > > > long time in the case of a free by calling this with [0,
> > > > > > > > > > > > > MAX_UINT] regardless of their
> > > > > > > > > > actual
> > > > > > > > > > > > allocations?
> > > > > > > > > > > >
> > > > > > > > > > > > IOASID can ensure that user can only free the PASIDs
> > > > > > > > > > > > allocated to the
> > > > > > > > > user.
> > > > > > > > > > but
> > > > > > > > > > > > it's true, kernel needs to loop all the PASIDs within the
> > > > > > > > > > > > range provided by user.
> > > > > > > > > > it
> > > > > > > > > > > > may take a long time. is there anything we can do? one
> > > > > > > > > > > > thing may limit
> > > > > > > > > the
> > > > > > > > > > range
> > > > > > > > > > > > provided by user?
> > > > > > > > > > >
> > > > > > > > > > > thought about it more, we have per-VM pasid quota (say
> > > > > > > > > > > 1000), so even if user passed down [0, MAX_UNIT], kernel
> > > > > > > > > > > will only loop the
> > > > > > > > > > > 1000 pasids at most. do you think we still need to do something
> on
> > > it?
> > > > > > > > > >
> > > > > > > > > > How do you figure that?  vfio_iommu_type1_pasid_request()
> > > > > > > > > > accepts the user's min/max so long as (max > min) and passes
> > > > > > > > > > that to vfio_iommu_type1_pasid_free(), then to
> > > > > > > > > > vfio_pasid_free_range() which loops as:
> > > > > > > > > >
> > > > > > > > > > 	ioasid_t pasid = min;
> > > > > > > > > > 	for (; pasid <= max; pasid++)
> > > > > > > > > > 		ioasid_free(pasid);
> > > > > > > > > >
> > > > > > > > > > A user might only be able to allocate 1000 pasids, but
> > > > > > > > > > apparently they can ask to free all they want.
> > > > > > > > > >
> > > > > > > > > > It's also not obvious to me that calling ioasid_free() is only
> > > > > > > > > > allowing the user to free their own passid.  Does it?  It
> > > > > > > > > > would be a pretty
> > > > > > > >
> > > > > > > > Agree. I thought ioasid_free should at least carry a token since
> > > > > > > > the user
> > > > > > > space is
> > > > > > > > only allowed to manage PASIDs in its own set...
> > > > > > > >
> > > > > > > > > > gaping hole if a user could free arbitrary pasids.  A r-b tree
> > > > > > > > > > of passids might help both for security and to bound spinning in a
> > > loop.
> > > > > > > > >
> > > > > > > > > oh, yes. BTW. instead of r-b tree in VFIO, maybe we can add an
> > > > > > > > > ioasid_set parameter for ioasid_free(), thus to prevent the user
> > > > > > > > > from freeing PASIDs that doesn't belong to it. I remember Jacob
> > > > > > > > > mentioned it
> > > > > > > before.
> > > > > > > > >
> > > > > > > >
> > > > > > > > check current ioasid_free:
> > > > > > > >
> > > > > > > >         spin_lock(&ioasid_allocator_lock);
> > > > > > > >         ioasid_data = xa_load(&active_allocator->xa, ioasid);
> > > > > > > >         if (!ioasid_data) {
> > > > > > > >                 pr_err("Trying to free unknown IOASID %u\n", ioasid);
> > > > > > > >                 goto exit_unlock;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > Allow an user to trigger above lock paths with MAX_UINT times
> > > > > > > > might still
> > > > > > > be bad.
> > > > > > >
> > > > > > > yeah, how about the below two options:
> > > > > > >
> > > > > > > - comparing the max - min with the quota before calling ioasid_free().
> > > > > > >   If max - min > current quota of the user, then should fail it. If
> > > > > > >   max - min < quota, then call ioasid_free() one by one. still trigger
> > > > > > >   the above lock path with quota times.
> > > > > >
> > > > > > This is definitely wrong. [min, max] is about the range of the PASID
> > > > > > value, while quota is about the number of allocated PASIDs. It's a bit
> > > > > > weird to mix two together.
> > > > >
> > > > > got it.
> > > > >
> > > > > > btw what is the main purpose of allowing batch PASID free requests?
> > > > > > Can we just simplify to allow one PASID in each free just like how is
> > > > > > it done in allocation path?
> > > > >
> > > > > it's an intention to reuse the [min, max] range as allocation path. currently,
> > > we
> > > > > don't have such request as far as I can see.
> > > > >
> > > > > > >
> > > > > > > - pass the max and min to ioasid_free(), let ioasid_free() decide.
> should
> > > > > > >   be able to avoid trigger the lock multiple times, and ioasid has have a
> > > > > > >   track on how may PASIDs have been allocated, if max - min is larger
> than
> > > > > > >   the allocated number, should fail anyway.
> > > > > >
> > > > > > What about Alex's r-b tree suggestion? Is there any downside in you
> mind?
> > > > >
> > > > > no downside, I was just wanting to reuse the tracks in ioasid_set. I can add
> a
> > > r-b
> > > > > for allocated PASIDs and find the PASIDs in the r-b tree only do free for
> the
> > > > > PASIDs found in r-b tree, others in the range would be ignored.
> > > > > does it look good?
> > > > >
> > > > > Regards,
> > > > > Yi Liu
> > > > >
> > > > > > Thanks,
> > > > > > Kevin
> > > >
> >
diff mbox series

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 3d8a108..95d90c6 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -2,6 +2,7 @@ 
 config VFIO_IOMMU_TYPE1
 	tristate
 	depends on VFIO
+	select VFIO_PASID if (X86)
 	default n
 
 config VFIO_IOMMU_SPAPR_TCE
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8c143d5..d0891c5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@  struct vfio_iommu {
 	bool			v2;
 	bool			nesting;
 	struct iommu_nesting_info *nesting_info;
+	struct vfio_mm		*vmm;
 	bool			dirty_page_tracking;
 	bool			pinned_page_dirty_scope;
 };
@@ -1933,6 +1934,17 @@  static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
 
 	list_splice_tail(iova_copy, iova);
 }
+
+static void vfio_iommu_release_nesting_info(struct vfio_iommu *iommu)
+{
+	if (iommu->vmm) {
+		vfio_mm_put(iommu->vmm);
+		iommu->vmm = NULL;
+	}
+
+	kfree(iommu->nesting_info);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -2067,6 +2079,25 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 			goto out_detach;
 		}
 		iommu->nesting_info = info;
+
+		if (info->features & IOMMU_NESTING_FEAT_SYSWIDE_PASID) {
+			struct vfio_mm *vmm;
+			int sid;
+
+			vmm = vfio_mm_get_from_task(current);
+			if (IS_ERR(vmm)) {
+				ret = PTR_ERR(vmm);
+				goto out_detach;
+			}
+			iommu->vmm = vmm;
+
+			sid = vfio_mm_ioasid_sid(vmm);
+			ret = iommu_domain_set_attr(domain->domain,
+						    DOMAIN_ATTR_IOASID_SID,
+						    &sid);
+			if (ret)
+				goto out_detach;
+		}
 	}
 
 	/* Get aperture info */
@@ -2178,7 +2209,8 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	return 0;
 
 out_detach:
-	kfree(iommu->nesting_info);
+	if (iommu->nesting_info)
+		vfio_iommu_release_nesting_info(iommu);
 	vfio_iommu_detach_group(domain, group);
 out_domain:
 	iommu_domain_free(domain->domain);
@@ -2380,7 +2412,8 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 				else
 					vfio_iommu_unmap_unpin_reaccount(iommu);
 
-				kfree(iommu->nesting_info);
+				if (iommu->nesting_info)
+					vfio_iommu_release_nesting_info(iommu);
 			}
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
@@ -2852,6 +2885,63 @@  static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
 	return -EINVAL;
 }
 
+static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
+					unsigned int min,
+					unsigned int max)
+{
+	int ret = -ENOTSUPP;
+
+	mutex_lock(&iommu->lock);
+	if (iommu->vmm)
+		ret = vfio_pasid_alloc(iommu->vmm, min, max);
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
+					unsigned int min,
+					unsigned int max)
+{
+	int ret = -ENOTSUPP;
+
+	mutex_lock(&iommu->lock);
+	if (iommu->vmm) {
+		vfio_pasid_free_range(iommu->vmm, min, max);
+		ret = 0;
+	}
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_type1_pasid_request(struct vfio_iommu *iommu,
+					  unsigned long arg)
+{
+	struct vfio_iommu_type1_pasid_request req;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_iommu_type1_pasid_request, range);
+
+	if (copy_from_user(&req, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (req.argsz < minsz || (req.flags & ~VFIO_PASID_REQUEST_MASK))
+		return -EINVAL;
+
+	if (req.range.min > req.range.max)
+		return -EINVAL;
+
+	switch (req.flags & VFIO_PASID_REQUEST_MASK) {
+	case VFIO_IOMMU_ALLOC_PASID:
+		return vfio_iommu_type1_pasid_alloc(iommu,
+					req.range.min, req.range.max);
+	case VFIO_IOMMU_FREE_PASID:
+		return vfio_iommu_type1_pasid_free(iommu,
+					req.range.min, req.range.max);
+	default:
+		return -EINVAL;
+	}
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -2868,6 +2958,8 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 		return vfio_iommu_type1_unmap_dma(iommu, arg);
 	case VFIO_IOMMU_DIRTY_PAGES:
 		return vfio_iommu_type1_dirty_pages(iommu, arg);
+	case VFIO_IOMMU_PASID_REQUEST:
+		return vfio_iommu_type1_pasid_request(iommu, arg);
 	}
 
 	return -ENOTTY;
diff --git a/drivers/vfio/vfio_pasid.c b/drivers/vfio/vfio_pasid.c
index dd5b6d1..2ea9f1a 100644
--- a/drivers/vfio/vfio_pasid.c
+++ b/drivers/vfio/vfio_pasid.c
@@ -54,6 +54,7 @@  void vfio_mm_put(struct vfio_mm *vmm)
 {
 	kref_put_mutex(&vmm->kref, vfio_mm_release, &vfio_pasid.vfio_mm_lock);
 }
+EXPORT_SYMBOL_GPL(vfio_mm_put);
 
 static void vfio_mm_get(struct vfio_mm *vmm)
 {
@@ -103,6 +104,13 @@  struct vfio_mm *vfio_mm_get_from_task(struct task_struct *task)
 	mmput(mm);
 	return vmm;
 }
+EXPORT_SYMBOL_GPL(vfio_mm_get_from_task);
+
+int vfio_mm_ioasid_sid(struct vfio_mm *vmm)
+{
+	return vmm->ioasid_sid;
+}
+EXPORT_SYMBOL_GPL(vfio_mm_ioasid_sid);
 
 int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)
 {
@@ -112,6 +120,7 @@  int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)
 
 	return (pasid == INVALID_IOASID) ? -ENOSPC : pasid;
 }
+EXPORT_SYMBOL_GPL(vfio_pasid_alloc);
 
 void vfio_pasid_free_range(struct vfio_mm *vmm,
 			    ioasid_t min, ioasid_t max)
@@ -129,6 +138,7 @@  void vfio_pasid_free_range(struct vfio_mm *vmm,
 	for (; pasid <= max; pasid++)
 		ioasid_free(pasid);
 }
+EXPORT_SYMBOL_GPL(vfio_pasid_free_range);
 
 static int __init vfio_pasid_init(void)
 {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 74e077d..8e60a32 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -101,6 +101,7 @@  struct vfio_mm;
 #if IS_ENABLED(CONFIG_VFIO_PASID)
 extern struct vfio_mm *vfio_mm_get_from_task(struct task_struct *task);
 extern void vfio_mm_put(struct vfio_mm *vmm);
+int vfio_mm_ioasid_sid(struct vfio_mm *vmm);
 extern int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max);
 extern void vfio_pasid_free_range(struct vfio_mm *vmm,
 					ioasid_t min, ioasid_t max);
@@ -114,6 +115,11 @@  static inline void vfio_mm_put(struct vfio_mm *vmm)
 {
 }
 
+static inline int vfio_mm_ioasid_sid(struct vfio_mm *vmm)
+{
+	return -ENOTTY;
+}
+
 static inline int vfio_pasid_alloc(struct vfio_mm *vmm, int min, int max)
 {
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index f1f39e1..657b2db 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1162,6 +1162,42 @@  struct vfio_iommu_type1_dirty_bitmap_get {
 
 #define VFIO_IOMMU_DIRTY_PAGES             _IO(VFIO_TYPE, VFIO_BASE + 17)
 
+/**
+ * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
+ *				struct vfio_iommu_type1_pasid_request)
+ *
+ * PASID (Processor Address Space ID) is a PCIe concept for tagging
+ * address spaces in DMA requests. When system-wide PASID allocation
+ * is required by underlying iommu driver (e.g. Intel VT-d), this
+ * provides an interface for userspace to request pasid alloc/free
+ * for its assigned devices. Userspace should check the availability
+ * of this API through VFIO_IOMMU_GET_INFO.
+ *
+ * @flags=VFIO_IOMMU_ALLOC_PASID, allocate a single PASID within @range.
+ * @flags=VFIO_IOMMU_FREE_PASID, free the PASIDs within @range.
+ * @range is [min, max], which means both @min and @max are inclusive.
+ * ALLOC_PASID and FREE_PASID are mutually exclusive.
+ *
+ * returns: allocated PASID value on success, -errno on failure for
+ *	     ALLOC_PASID;
+ *	     0 for FREE_PASID operation;
+ */
+struct vfio_iommu_type1_pasid_request {
+	__u32	argsz;
+#define VFIO_IOMMU_ALLOC_PASID	(1 << 0)
+#define VFIO_IOMMU_FREE_PASID	(1 << 1)
+	__u32	flags;
+	struct {
+		__u32	min;
+		__u32	max;
+	} range;
+};
+
+#define VFIO_PASID_REQUEST_MASK	(VFIO_IOMMU_ALLOC_PASID | \
+					 VFIO_IOMMU_FREE_PASID)
+
+#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*