diff mbox series

[RFC,v2,2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)

Message ID 1571919983-3231-3-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio: support Shared Virtual Addressing | expand

Commit Message

Yi Liu Oct. 24, 2019, 12:26 p.m. UTC
This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
to passdown PASID allocation/free request from the virtual
iommu. This is required to get PASID managed in system-wide.

Cc: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       |  25 +++++++++
 2 files changed, 139 insertions(+)

Comments

Tian, Kevin Oct. 25, 2019, 10:06 a.m. UTC | #1
> From: Liu Yi L
> Sent: Thursday, October 24, 2019 8:26 PM
> 
> This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> to passdown PASID allocation/free request from the virtual
> iommu. This is required to get PASID managed in system-wide.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 114
> ++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       |  25 +++++++++
>  2 files changed, 139 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> index cd8d3a5..3d73a7d 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev,
> void *data)
>  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
>  }
> 
> +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> +					 int min_pasid,
> +					 int max_pasid)
> +{
> +	int ret;
> +	ioasid_t pasid;
> +	struct mm_struct *mm = NULL;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +	mm = get_task_mm(current);
> +	/* Track ioasid allocation owner by mm */
below is purely allocation. Where does 'track' come to play?

> +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> +				max_pasid, NULL);
> +	if (pasid == INVALID_IOASID) {
> +		ret = -ENOSPC;
> +		goto out_unlock;
> +	}
> +	ret = pasid;
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	if (mm)
> +		mmput(mm);
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> +				       unsigned int pasid)
> +{
> +	struct mm_struct *mm = NULL;
> +	void *pdata;
> +	int ret = 0;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	/**
> +	 * REVISIT:
> +	 * There are two cases free could fail:
> +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> +	 * the set does not match, caller is not permitted to free.
> +	 * 2. free before unbind all devices, we can check if ioasid private
> +	 * data, if data != NULL, then fail to free.
> +	 */

Does REVISIT mean that above comment is the right way but
the code doesn't follow yet, or the comment itself should be
revisited?

should we have some notification mechanism, so the guy
who holds the reference to the pasid can be notified to
release its usage?

> +	mm = get_task_mm(current);
> +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> +	if (IS_ERR(pdata)) {
> +		if (pdata == ERR_PTR(-ENOENT))
> +			pr_err("PASID %u is not allocated\n", pasid);
> +		else if (pdata == ERR_PTR(-EACCES))
> +			pr_err("Free PASID %u by non-owner, denied",
> pasid);
> +		else
> +			pr_err("Error searching PASID %u\n", pasid);
> +		ret = -EPERM;
> +		goto out_unlock;
> +	}
> +	if (pdata) {
> +		pr_debug("Cannot free pasid %d with private data\n",
> pasid);
> +		/* Expect PASID has no private data if not bond */
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +	ioasid_free(pasid);
> +
> +out_unlock:
> +	if (mm)
> +		mmput(mm);
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -2370,6 +2447,43 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
>  					    &ustruct);
>  		mutex_unlock(&iommu->lock);
>  		return ret;
> +
> +	} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
> +		struct vfio_iommu_type1_pasid_request req;
> +		int min_pasid, max_pasid, pasid;
> +
> +		minsz = offsetofend(struct
> vfio_iommu_type1_pasid_request,
> +				    flag);
> +
> +		if (copy_from_user(&req, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (req.argsz < minsz)
> +			return -EINVAL;
> +
> +		switch (req.flag) {
> +		/**
> +		 * TODO: min_pasid and max_pasid align with
> +		 * typedef unsigned int ioasid_t
> +		 */
> +		case VFIO_IOMMU_PASID_ALLOC:
> +			if (copy_from_user(&min_pasid,
> +				(void __user *)arg + minsz,
> sizeof(min_pasid)))
> +				return -EFAULT;
> +			if (copy_from_user(&max_pasid,
> +				(void __user *)arg + minsz +
> sizeof(min_pasid),
> +				sizeof(max_pasid)))
> +				return -EFAULT;
> +			return vfio_iommu_type1_pasid_alloc(iommu,
> +						min_pasid, max_pasid);
> +		case VFIO_IOMMU_PASID_FREE:
> +			if (copy_from_user(&pasid,
> +				(void __user *)arg + minsz, sizeof(pasid)))
> +				return -EFAULT;
> +			return vfio_iommu_type1_pasid_free(iommu,
> pasid);
> +		default:
> +			return -EINVAL;
> +		}
>  	}
> 
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ccf60a2..04de290 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -807,6 +807,31 @@ struct vfio_iommu_type1_cache_invalidate {
>  };
>  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE
> + 24)
> 
> +/*
> + * @flag=VFIO_IOMMU_PASID_ALLOC, refer to the @min_pasid and
> @max_pasid fields
> + * @flag=VFIO_IOMMU_PASID_FREE, refer to @pasid field
> + */
> +struct vfio_iommu_type1_pasid_request {
> +	__u32	argsz;
> +#define VFIO_IOMMU_PASID_ALLOC	(1 << 0)
> +#define VFIO_IOMMU_PASID_FREE	(1 << 1)
> +	__u32	flag;
> +	union {
> +		struct {
> +			int min_pasid;
> +			int max_pasid;
> +		};
> +		int pasid;
> +	};
> +};
> +
> +/**
> + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 27,
> + *				struct vfio_iommu_type1_pasid_request)
> + *
> + */
> +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE
> + 27)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU --------
> */
> 
>  /*
> --
> 2.7.4
Yi Liu Oct. 25, 2019, 11:16 a.m. UTC | #2
Hi Kevin,

> From: Tian, Kevin
> Sent: Friday, October 25, 2019 6:06 PM
> To: Liu, Yi L <yi.l.liu@intel.com>; alex.williamson@redhat.com;
> Subject: RE: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> 
> > From: Liu Yi L
> > Sent: Thursday, October 24, 2019 8:26 PM
> >
> > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims to passdown
> > PASID allocation/free request from the virtual iommu. This is required
> > to get PASID managed in system-wide.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 114
> > ++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       |  25 +++++++++
> >  2 files changed, 139 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index cd8d3a5..3d73a7d 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device
> > *dev, void *data)
> >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);  }
> >
> > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > +					 int min_pasid,
> > +					 int max_pasid)
> > +{
> > +	int ret;
> > +	ioasid_t pasid;
> > +	struct mm_struct *mm = NULL;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +	mm = get_task_mm(current);
> > +	/* Track ioasid allocation owner by mm */
> below is purely allocation. Where does 'track' come to play?

ioasid_set is kind of owner track. As allocation is separate with
bind, here set the "owner" could be used to do sanity check when
a pasid bind comes.

> > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > +				max_pasid, NULL);
> > +	if (pasid == INVALID_IOASID) {
> > +		ret = -ENOSPC;
> > +		goto out_unlock;
> > +	}
> > +	ret = pasid;
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	if (mm)
> > +		mmput(mm);
> > +	return ret;
> > +}
> > +
> > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > +				       unsigned int pasid)
> > +{
> > +	struct mm_struct *mm = NULL;
> > +	void *pdata;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/**
> > +	 * REVISIT:
> > +	 * There are two cases free could fail:
> > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > +	 * the set does not match, caller is not permitted to free.
> > +	 * 2. free before unbind all devices, we can check if ioasid private
> > +	 * data, if data != NULL, then fail to free.
> > +	 */
> 
> Does REVISIT mean that above comment is the right way but the code doesn't follow
> yet, or the comment itself should be revisited?

Sorry, it's a mistake... should be removed. It's added in the development phase
for remind. Will remove it.

> 
> should we have some notification mechanism, so the guy who holds the reference to
> the pasid can be notified to release its usage?

Do you mean the ioasid itself to provide such a notification mechanism?

Currently, we prevent pasid free before all user (iommu driver, guest) released
their usage. This is achieved by checking the private data, in which there is
a user_cnt of a pasid. e.g. struct intel_svm. A fresh guest pasid bind will allocate
the private data. A second guest pasid bind will increase the user_cnt. guest pasid
unbind decreases the user_cnt. The private data will be freed by the last guest
pasid unbind. Do you think it is sufficient? or we may want to have a notification
mechanism to allow such pasid free and keep the user updated?

Thanks,
Yi Liu
Alex Williamson Nov. 5, 2019, 11:35 p.m. UTC | #3
On Thu, 24 Oct 2019 08:26:22 -0400
Liu Yi L <yi.l.liu@intel.com> wrote:

> This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> to passdown PASID allocation/free request from the virtual
> iommu. This is required to get PASID managed in system-wide.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 114 ++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       |  25 +++++++++
>  2 files changed, 139 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index cd8d3a5..3d73a7d 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
>  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
>  }
>  
> +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> +					 int min_pasid,
> +					 int max_pasid)
> +{
> +	int ret;
> +	ioasid_t pasid;
> +	struct mm_struct *mm = NULL;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +	mm = get_task_mm(current);
> +	/* Track ioasid allocation owner by mm */
> +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> +				max_pasid, NULL);

Are we sure we want to tie this to the task mm vs perhaps the
vfio_iommu pointer?

> +	if (pasid == INVALID_IOASID) {
> +		ret = -ENOSPC;
> +		goto out_unlock;
> +	}
> +	ret = pasid;
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	if (mm)
> +		mmput(mm);
> +	return ret;
> +}
> +
> +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> +				       unsigned int pasid)
> +{
> +	struct mm_struct *mm = NULL;
> +	void *pdata;
> +	int ret = 0;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	/**
> +	 * REVISIT:
> +	 * There are two cases free could fail:
> +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> +	 * the set does not match, caller is not permitted to free.
> +	 * 2. free before unbind all devices, we can check if ioasid private
> +	 * data, if data != NULL, then fail to free.
> +	 */
> +	mm = get_task_mm(current);
> +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> +	if (IS_ERR(pdata)) {
> +		if (pdata == ERR_PTR(-ENOENT))
> +			pr_err("PASID %u is not allocated\n", pasid);
> +		else if (pdata == ERR_PTR(-EACCES))
> +			pr_err("Free PASID %u by non-owner, denied", pasid);
> +		else
> +			pr_err("Error searching PASID %u\n", pasid);

This should be removed, errno is sufficient for the user, this just
provides the user with a trivial DoS vector filling logs.

> +		ret = -EPERM;

But why not return PTR_ERR(pdata)?

> +		goto out_unlock;
> +	}
> +	if (pdata) {
> +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> +		/* Expect PASID has no private data if not bond */
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +	ioasid_free(pasid);

We only ever get here with pasid == NULL?!  Something is wrong.  Should
that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
vector)

> +
> +out_unlock:
> +	if (mm)
> +		mmput(mm);
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -2370,6 +2447,43 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  					    &ustruct);
>  		mutex_unlock(&iommu->lock);
>  		return ret;
> +
> +	} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
> +		struct vfio_iommu_type1_pasid_request req;
> +		int min_pasid, max_pasid, pasid;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> +				    flag);
> +
> +		if (copy_from_user(&req, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (req.argsz < minsz)
> +			return -EINVAL;
> +
> +		switch (req.flag) {

This works, but it's strange.  Let's make the code a little easier for
the next flag bit that gets used so they don't need to rework this case
statement.  I'd suggest creating a VFIO_IOMMU_PASID_OPS_MASK that is
the OR of the ALLOC/FREE options, test that no bits are set outside of
that mask, then AND that mask as the switch arg with the code below.

> +		/**
> +		 * TODO: min_pasid and max_pasid align with
> +		 * typedef unsigned int ioasid_t
> +		 */
> +		case VFIO_IOMMU_PASID_ALLOC:
> +			if (copy_from_user(&min_pasid,
> +				(void __user *)arg + minsz, sizeof(min_pasid)))
> +				return -EFAULT;
> +			if (copy_from_user(&max_pasid,
> +				(void __user *)arg + minsz + sizeof(min_pasid),
> +				sizeof(max_pasid)))
> +				return -EFAULT;
> +			return vfio_iommu_type1_pasid_alloc(iommu,
> +						min_pasid, max_pasid);
> +		case VFIO_IOMMU_PASID_FREE:
> +			if (copy_from_user(&pasid,
> +				(void __user *)arg + minsz, sizeof(pasid)))
> +				return -EFAULT;
> +			return vfio_iommu_type1_pasid_free(iommu, pasid);
> +		default:
> +			return -EINVAL;
> +		}
>  	}
>  
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ccf60a2..04de290 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -807,6 +807,31 @@ struct vfio_iommu_type1_cache_invalidate {
>  };
>  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
>  
> +/*
> + * @flag=VFIO_IOMMU_PASID_ALLOC, refer to the @min_pasid and @max_pasid fields
> + * @flag=VFIO_IOMMU_PASID_FREE, refer to @pasid field
> + */
> +struct vfio_iommu_type1_pasid_request {
> +	__u32	argsz;
> +#define VFIO_IOMMU_PASID_ALLOC	(1 << 0)
> +#define VFIO_IOMMU_PASID_FREE	(1 << 1)
> +	__u32	flag;
> +	union {
> +		struct {
> +			int min_pasid;
> +			int max_pasid;
> +		};
> +		int pasid;

Perhaps:

		struct {
			u32 min;
			u32 max;
		} alloc_pasid;
		u32 free_pasid;

(note also the s/int/u32/)

> +	};
> +};
> +
> +/**
> + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 27,
> + *				struct vfio_iommu_type1_pasid_request)
> + *
> + */
> +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 27)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>  
>  /*
Yi Liu Nov. 6, 2019, 1:27 p.m. UTC | #4
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Wednesday, November 6, 2019 7:36 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> 
> On Thu, 24 Oct 2019 08:26:22 -0400
> Liu Yi L <yi.l.liu@intel.com> wrote:
> 
> > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > to passdown PASID allocation/free request from the virtual
> > iommu. This is required to get PASID managed in system-wide.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 114
> ++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       |  25 +++++++++
> >  2 files changed, 139 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index cd8d3a5..3d73a7d 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev, void
> *data)
> >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> >  }
> >
> > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > +					 int min_pasid,
> > +					 int max_pasid)
> > +{
> > +	int ret;
> > +	ioasid_t pasid;
> > +	struct mm_struct *mm = NULL;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +	mm = get_task_mm(current);
> > +	/* Track ioasid allocation owner by mm */
> > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > +				max_pasid, NULL);
> 
> Are we sure we want to tie this to the task mm vs perhaps the
> vfio_iommu pointer?

Here we want to have a kind of per-VM mark, which can be used to do
ownership check on whether a pasid is held by a specific VM. This is
very important to prevent across VM affect. vfio_iommu pointer is
competent for vfio as vfio is both pasid alloc requester and pasid
consumer. e.g. vfio requests pasid alloc from ioasid and also it will
invoke bind_gpasid(). vfio can either check ownership before invoking
bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
there may be other modules which are just consumers of pasid. And they
also want to do ownership check for a pasid. Then, it would be hard for
them as they are not the pasid alloc requester. So here better to have
a system wide structure to perform as the per-VM mark. task mm looks
to be much competent.

> > +	if (pasid == INVALID_IOASID) {
> > +		ret = -ENOSPC;
> > +		goto out_unlock;
> > +	}
> > +	ret = pasid;
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	if (mm)
> > +		mmput(mm);
> > +	return ret;
> > +}
> > +
> > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > +				       unsigned int pasid)
> > +{
> > +	struct mm_struct *mm = NULL;
> > +	void *pdata;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/**
> > +	 * REVISIT:
> > +	 * There are two cases free could fail:
> > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > +	 * the set does not match, caller is not permitted to free.
> > +	 * 2. free before unbind all devices, we can check if ioasid private
> > +	 * data, if data != NULL, then fail to free.
> > +	 */
> > +	mm = get_task_mm(current);
> > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > +	if (IS_ERR(pdata)) {
> > +		if (pdata == ERR_PTR(-ENOENT))
> > +			pr_err("PASID %u is not allocated\n", pasid);
> > +		else if (pdata == ERR_PTR(-EACCES))
> > +			pr_err("Free PASID %u by non-owner, denied", pasid);
> > +		else
> > +			pr_err("Error searching PASID %u\n", pasid);
> 
> This should be removed, errno is sufficient for the user, this just
> provides the user with a trivial DoS vector filling logs.

sure, will fix it. thanks.

> > +		ret = -EPERM;
> 
> But why not return PTR_ERR(pdata)?

aha, would do it.

> > +		goto out_unlock;
> > +	}
> > +	if (pdata) {
> > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > +		/* Expect PASID has no private data if not bond */
> > +		ret = -EBUSY;
> > +		goto out_unlock;
> > +	}
> > +	ioasid_free(pasid);
> 
> We only ever get here with pasid == NULL?! 

I guess you meant only when pdata==NULL.

> Something is wrong.  Should
> that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> vector)

Oh, yes, just do it as below:

if (!pdata) {
	ioasid_free(pasid);
	ret = SUCCESS;
} else
	ret = -EBUSY;

Is it what you mean?

> > +
> > +out_unlock:
> > +	if (mm)
> > +		mmput(mm);
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)
> >  {
> > @@ -2370,6 +2447,43 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  					    &ustruct);
> >  		mutex_unlock(&iommu->lock);
> >  		return ret;
> > +
> > +	} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
> > +		struct vfio_iommu_type1_pasid_request req;
> > +		int min_pasid, max_pasid, pasid;
> > +
> > +		minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> > +				    flag);
> > +
> > +		if (copy_from_user(&req, (void __user *)arg, minsz))
> > +			return -EFAULT;
> > +
> > +		if (req.argsz < minsz)
> > +			return -EINVAL;
> > +
> > +		switch (req.flag) {
> 
> This works, but it's strange.  Let's make the code a little easier for
> the next flag bit that gets used so they don't need to rework this case
> statement.  I'd suggest creating a VFIO_IOMMU_PASID_OPS_MASK that is
> the OR of the ALLOC/FREE options, test that no bits are set outside of
> that mask, then AND that mask as the switch arg with the code below.

Got it. Let me fix it in next version.

> > +		/**
> > +		 * TODO: min_pasid and max_pasid align with
> > +		 * typedef unsigned int ioasid_t
> > +		 */
> > +		case VFIO_IOMMU_PASID_ALLOC:
> > +			if (copy_from_user(&min_pasid,
> > +				(void __user *)arg + minsz, sizeof(min_pasid)))
> > +				return -EFAULT;
> > +			if (copy_from_user(&max_pasid,
> > +				(void __user *)arg + minsz + sizeof(min_pasid),
> > +				sizeof(max_pasid)))
> > +				return -EFAULT;
> > +			return vfio_iommu_type1_pasid_alloc(iommu,
> > +						min_pasid, max_pasid);
> > +		case VFIO_IOMMU_PASID_FREE:
> > +			if (copy_from_user(&pasid,
> > +				(void __user *)arg + minsz, sizeof(pasid)))
> > +				return -EFAULT;
> > +			return vfio_iommu_type1_pasid_free(iommu, pasid);
> > +		default:
> > +			return -EINVAL;
> > +		}
> >  	}
> >
> >  	return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index ccf60a2..04de290 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -807,6 +807,31 @@ struct vfio_iommu_type1_cache_invalidate {
> >  };
> >  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
> >
> > +/*
> > + * @flag=VFIO_IOMMU_PASID_ALLOC, refer to the @min_pasid and
> @max_pasid fields
> > + * @flag=VFIO_IOMMU_PASID_FREE, refer to @pasid field
> > + */
> > +struct vfio_iommu_type1_pasid_request {
> > +	__u32	argsz;
> > +#define VFIO_IOMMU_PASID_ALLOC	(1 << 0)
> > +#define VFIO_IOMMU_PASID_FREE	(1 << 1)
> > +	__u32	flag;
> > +	union {
> > +		struct {
> > +			int min_pasid;
> > +			int max_pasid;
> > +		};
> > +		int pasid;
> 
> Perhaps:
> 
> 		struct {
> 			u32 min;
> 			u32 max;
> 		} alloc_pasid;
> 		u32 free_pasid;
> 
> (note also the s/int/u32/)

got it. will fix it in next version. Thanks.

> > +	};
> > +};
> > +
> > +/**
> > + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 27,
> > + *				struct vfio_iommu_type1_pasid_request)
> > + *
> > + */
> > +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 27)
> > +
> >  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> >
> >  /*

Regards,
Yi Liu
Alex Williamson Nov. 7, 2019, 10:06 p.m. UTC | #5
On Wed, 6 Nov 2019 13:27:26 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Wednesday, November 6, 2019 7:36 AM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > 
> > On Thu, 24 Oct 2019 08:26:22 -0400
> > Liu Yi L <yi.l.liu@intel.com> wrote:
> >   
> > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > > to passdown PASID allocation/free request from the virtual
> > > iommu. This is required to get PASID managed in system-wide.
> > >
> > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 114  
> > ++++++++++++++++++++++++++++++++++++++++  
> > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > >  2 files changed, 139 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index cd8d3a5..3d73a7d 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev, void  
> > *data)  
> > >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> > >  }
> > >
> > > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > > +					 int min_pasid,
> > > +					 int max_pasid)
> > > +{
> > > +	int ret;
> > > +	ioasid_t pasid;
> > > +	struct mm_struct *mm = NULL;
> > > +
> > > +	mutex_lock(&iommu->lock);
> > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > +		ret = -EINVAL;
> > > +		goto out_unlock;
> > > +	}
> > > +	mm = get_task_mm(current);
> > > +	/* Track ioasid allocation owner by mm */
> > > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > > +				max_pasid, NULL);  
> > 
> > Are we sure we want to tie this to the task mm vs perhaps the
> > vfio_iommu pointer?  
> 
> Here we want to have a kind of per-VM mark, which can be used to do
> ownership check on whether a pasid is held by a specific VM. This is
> very important to prevent across VM affect. vfio_iommu pointer is
> competent for vfio as vfio is both pasid alloc requester and pasid
> consumer. e.g. vfio requests pasid alloc from ioasid and also it will
> invoke bind_gpasid(). vfio can either check ownership before invoking
> bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
> there may be other modules which are just consumers of pasid. And they
> also want to do ownership check for a pasid. Then, it would be hard for
> them as they are not the pasid alloc requester. So here better to have
> a system wide structure to perform as the per-VM mark. task mm looks
> to be much competent.

Ok, so it's intentional to have a VM-wide token.  Elsewhere in the
type1 code (vfio_dma_do_map) we record the task_struct per dma mapping
so that we can get the task mm as needed.  Would the task_struct
pointer provide any advantage?

Also, an overall question, this provides userspace with pasid alloc and
free ioctls, (1) what prevents a userspace process from consuming every
available pasid, and (2) if the process exits or crashes without
freeing pasids, how are they recovered aside from a reboot?

> > > +	if (pasid == INVALID_IOASID) {
> > > +		ret = -ENOSPC;
> > > +		goto out_unlock;
> > > +	}
> > > +	ret = pasid;
> > > +out_unlock:
> > > +	mutex_unlock(&iommu->lock);

What does holding this lock protect?  That the vfio_iommu remains
backed by an iommu during this operation, even though we don't do
anything to release allocated pasids when that iommu backing is removed?

> > > +	if (mm)
> > > +		mmput(mm);
> > > +	return ret;
> > > +}
> > > +
> > > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > > +				       unsigned int pasid)
> > > +{
> > > +	struct mm_struct *mm = NULL;
> > > +	void *pdata;
> > > +	int ret = 0;
> > > +
> > > +	mutex_lock(&iommu->lock);
> > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > +		ret = -EINVAL;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	/**
> > > +	 * REVISIT:
> > > +	 * There are two cases free could fail:
> > > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > > +	 * the set does not match, caller is not permitted to free.
> > > +	 * 2. free before unbind all devices, we can check if ioasid private
> > > +	 * data, if data != NULL, then fail to free.
> > > +	 */
> > > +	mm = get_task_mm(current);
> > > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > > +	if (IS_ERR(pdata)) {
> > > +		if (pdata == ERR_PTR(-ENOENT))
> > > +			pr_err("PASID %u is not allocated\n", pasid);
> > > +		else if (pdata == ERR_PTR(-EACCES))
> > > +			pr_err("Free PASID %u by non-owner, denied", pasid);
> > > +		else
> > > +			pr_err("Error searching PASID %u\n", pasid);  
> > 
> > This should be removed, errno is sufficient for the user, this just
> > provides the user with a trivial DoS vector filling logs.  
> 
> sure, will fix it. thanks.
> 
> > > +		ret = -EPERM;  
> > 
> > But why not return PTR_ERR(pdata)?  
> 
> aha, would do it.
> 
> > > +		goto out_unlock;
> > > +	}
> > > +	if (pdata) {
> > > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > > +		/* Expect PASID has no private data if not bond */
> > > +		ret = -EBUSY;
> > > +		goto out_unlock;
> > > +	}
> > > +	ioasid_free(pasid);  
> > 
> > We only ever get here with pasid == NULL?!   
> 
> I guess you meant only when pdata==NULL.
> 
> > Something is wrong.  Should
> > that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> > vector)  
> 
> Oh, yes, just do it as below:
> 
> if (!pdata) {
> 	ioasid_free(pasid);
> 	ret = SUCCESS;
> } else
> 	ret = -EBUSY;
> 
> Is it what you mean?

No, I think I was just confusing pdata and pasid, but I am still
confused about testing pdata.  We call ioasid_alloc() with private =
NULL, and I don't see any of your patches calling ioasid_set_data() to
change the private data after allocation, so how could this ever be
set?  Should this just be a BUG_ON(pdata) as the integrity of the
system is in question should this state ever occur?  Thanks,

Alex
 
> > > +
> > > +out_unlock:
> > > +	if (mm)
> > > +		mmput(mm);
> > > +	mutex_unlock(&iommu->lock);
> > > +	return ret;
> > > +}
> > > +
> > >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >  				   unsigned int cmd, unsigned long arg)
> > >  {
> > > @@ -2370,6 +2447,43 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >  					    &ustruct);
> > >  		mutex_unlock(&iommu->lock);
> > >  		return ret;
> > > +
> > > +	} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
> > > +		struct vfio_iommu_type1_pasid_request req;
> > > +		int min_pasid, max_pasid, pasid;
> > > +
> > > +		minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> > > +				    flag);
> > > +
> > > +		if (copy_from_user(&req, (void __user *)arg, minsz))
> > > +			return -EFAULT;
> > > +
> > > +		if (req.argsz < minsz)
> > > +			return -EINVAL;
> > > +
> > > +		switch (req.flag) {  
> > 
> > This works, but it's strange.  Let's make the code a little easier for
> > the next flag bit that gets used so they don't need to rework this case
> > statement.  I'd suggest creating a VFIO_IOMMU_PASID_OPS_MASK that is
> > the OR of the ALLOC/FREE options, test that no bits are set outside of
> > that mask, then AND that mask as the switch arg with the code below.  
> 
> Got it. Let me fix it in next version.
> 
> > > +		/**
> > > +		 * TODO: min_pasid and max_pasid align with
> > > +		 * typedef unsigned int ioasid_t
> > > +		 */
> > > +		case VFIO_IOMMU_PASID_ALLOC:
> > > +			if (copy_from_user(&min_pasid,
> > > +				(void __user *)arg + minsz, sizeof(min_pasid)))
> > > +				return -EFAULT;
> > > +			if (copy_from_user(&max_pasid,
> > > +				(void __user *)arg + minsz + sizeof(min_pasid),
> > > +				sizeof(max_pasid)))
> > > +				return -EFAULT;
> > > +			return vfio_iommu_type1_pasid_alloc(iommu,
> > > +						min_pasid, max_pasid);
> > > +		case VFIO_IOMMU_PASID_FREE:
> > > +			if (copy_from_user(&pasid,
> > > +				(void __user *)arg + minsz, sizeof(pasid)))
> > > +				return -EFAULT;
> > > +			return vfio_iommu_type1_pasid_free(iommu, pasid);
> > > +		default:
> > > +			return -EINVAL;
> > > +		}
> > >  	}
> > >
> > >  	return -ENOTTY;
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index ccf60a2..04de290 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -807,6 +807,31 @@ struct vfio_iommu_type1_cache_invalidate {
> > >  };
> > >  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
> > >
> > > +/*
> > > + * @flag=VFIO_IOMMU_PASID_ALLOC, refer to the @min_pasid and  
> > @max_pasid fields  
> > > + * @flag=VFIO_IOMMU_PASID_FREE, refer to @pasid field
> > > + */
> > > +struct vfio_iommu_type1_pasid_request {
> > > +	__u32	argsz;
> > > +#define VFIO_IOMMU_PASID_ALLOC	(1 << 0)
> > > +#define VFIO_IOMMU_PASID_FREE	(1 << 1)
> > > +	__u32	flag;
> > > +	union {
> > > +		struct {
> > > +			int min_pasid;
> > > +			int max_pasid;
> > > +		};
> > > +		int pasid;  
> > 
> > Perhaps:
> > 
> > 		struct {
> > 			u32 min;
> > 			u32 max;
> > 		} alloc_pasid;
> > 		u32 free_pasid;
> > 
> > (note also the s/int/u32/)  
> 
> got it. will fix it in next version. Thanks.
> 
> > > +	};
> > > +};
> > > +
> > > +/**
> > > + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 27,
> > > + *				struct vfio_iommu_type1_pasid_request)
> > > + *
> > > + */
> > > +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 27)
> > > +
> > >  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> > >
> > >  /*  
> 
> Regards,
> Yi Liu
Yi Liu Nov. 8, 2019, 12:23 p.m. UTC | #6
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, November 8, 2019 6:07 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> 
> On Wed, 6 Nov 2019 13:27:26 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > >
> > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > >
> > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > > > to passdown PASID allocation/free request from the virtual
> > > > iommu. This is required to get PASID managed in system-wide.
> > > >
> > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > ---
> > > >  drivers/vfio/vfio_iommu_type1.c | 114
> > > ++++++++++++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > >  2 files changed, 139 insertions(+)
> > > >
> > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> > > > index cd8d3a5..3d73a7d 100644
> > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev,
> void
> > > *data)
> > > >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> > > >  }
> > > >
> > > > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > > > +					 int min_pasid,
> > > > +					 int max_pasid)
> > > > +{
> > > > +	int ret;
> > > > +	ioasid_t pasid;
> > > > +	struct mm_struct *mm = NULL;
> > > > +
> > > > +	mutex_lock(&iommu->lock);
> > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > +		ret = -EINVAL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +	mm = get_task_mm(current);
> > > > +	/* Track ioasid allocation owner by mm */
> > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > > > +				max_pasid, NULL);
> > >
> > > Are we sure we want to tie this to the task mm vs perhaps the
> > > vfio_iommu pointer?
> >
> > Here we want to have a kind of per-VM mark, which can be used to do
> > ownership check on whether a pasid is held by a specific VM. This is
> > very important to prevent across VM affect. vfio_iommu pointer is
> > competent for vfio as vfio is both pasid alloc requester and pasid
> > consumer. e.g. vfio requests pasid alloc from ioasid and also it will
> > invoke bind_gpasid(). vfio can either check ownership before invoking
> > bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
> > there may be other modules which are just consumers of pasid. And they
> > also want to do ownership check for a pasid. Then, it would be hard for
> > them as they are not the pasid alloc requester. So here better to have
> > a system wide structure to perform as the per-VM mark. task mm looks
> > to be much competent.
> 
> Ok, so it's intentional to have a VM-wide token.  Elsewhere in the
> type1 code (vfio_dma_do_map) we record the task_struct per dma mapping
> so that we can get the task mm as needed.  Would the task_struct
> pointer provide any advantage?

I think we may use task_struct pointer to make type1 code consistent.
How do you think?

> Also, an overall question, this provides userspace with pasid alloc and
> free ioctls, (1) what prevents a userspace process from consuming every
> available pasid, and (2) if the process exits or crashes without
> freeing pasids, how are they recovered aside from a reboot?

For question (1), I think we only need to take care about malicious
userspace process. As vfio usage is under privilege mode, so we may
be safe on it so far. However, we may need to introduce a kind of credit
mechanism to protect it. I've thought it, but no good idea yet. Would be
happy to hear from you.

For question (2), I think we need to reclaim the allocated pasids when
the vfio container fd is released just like what vfio does to the domain
mappings. I didn't add it yet. But I can add it in next version if you think
it would make the pasid alloc/free be much sound.

> > > > +	if (pasid == INVALID_IOASID) {
> > > > +		ret = -ENOSPC;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +	ret = pasid;
> > > > +out_unlock:
> > > > +	mutex_unlock(&iommu->lock);
> 
> What does holding this lock protect?  That the vfio_iommu remains
> backed by an iommu during this operation, even though we don't do
> anything to release allocated pasids when that iommu backing is removed?

yes, it is unnecessary to hold the lock here. At least for the operations in
this patch. will remove it. :-)

> > > > +	if (mm)
> > > > +		mmput(mm);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > > > +				       unsigned int pasid)
> > > > +{
> > > > +	struct mm_struct *mm = NULL;
> > > > +	void *pdata;
> > > > +	int ret = 0;
> > > > +
> > > > +	mutex_lock(&iommu->lock);
> > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > +		ret = -EINVAL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	/**
> > > > +	 * REVISIT:
> > > > +	 * There are two cases free could fail:
> > > > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > > > +	 * the set does not match, caller is not permitted to free.
> > > > +	 * 2. free before unbind all devices, we can check if ioasid private
> > > > +	 * data, if data != NULL, then fail to free.
> > > > +	 */
> > > > +	mm = get_task_mm(current);
> > > > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > > > +	if (IS_ERR(pdata)) {
> > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > +			pr_err("PASID %u is not allocated\n", pasid);
> > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > +			pr_err("Free PASID %u by non-owner, denied", pasid);
> > > > +		else
> > > > +			pr_err("Error searching PASID %u\n", pasid);
> > >
> > > This should be removed, errno is sufficient for the user, this just
> > > provides the user with a trivial DoS vector filling logs.
> >
> > sure, will fix it. thanks.
> >
> > > > +		ret = -EPERM;
> > >
> > > But why not return PTR_ERR(pdata)?
> >
> > aha, would do it.
> >
> > > > +		goto out_unlock;
> > > > +	}
> > > > +	if (pdata) {
> > > > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > > > +		/* Expect PASID has no private data if not bond */
> > > > +		ret = -EBUSY;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +	ioasid_free(pasid);
> > >
> > > We only ever get here with pasid == NULL?!
> >
> > I guess you meant only when pdata==NULL.
> >
> > > Something is wrong.  Should
> > > that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> > > vector)
> >
> > Oh, yes, just do it as below:
> >
> > if (!pdata) {
> > 	ioasid_free(pasid);
> > 	ret = SUCCESS;
> > } else
> > 	ret = -EBUSY;
> >
> > Is it what you mean?
> 
> No, I think I was just confusing pdata and pasid, but I am still
> confused about testing pdata.  We call ioasid_alloc() with private =
> NULL, and I don't see any of your patches calling ioasid_set_data() to
> change the private data after allocation, so how could this ever be
> set?  Should this just be a BUG_ON(pdata) as the integrity of the
> system is in question should this state ever occur?  Thanks,

ioasid_set_data() was called  in one patch from Jacob's vSVA patchset.
[PATCH v6 08/10] iommu/vt-d: Add bind guest PASID support
https://lkml.org/lkml/2019/10/22/946

The basic idea is to allocate pasid with private=NULL, and set it when the
pasid is actually bind to a device (bind_gpasid()). Each bind_gpasid() will
increase the ref_cnt in the private data, and each unbind_gpasid() will
decrease the ref_cnt. So if bind/unbind_gpasid() is called in mirror, the
private data should be null when comes to free operation. If not, vfio can
believe that the pasid is still in use.

> Alex

Thanks,
Yi Liu
 
> > > > +
> > > > +out_unlock:
> > > > +	if (mm)
> > > > +		mmput(mm);
> > > > +	mutex_unlock(&iommu->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > >  				   unsigned int cmd, unsigned long arg)
> > > >  {
> > > > @@ -2370,6 +2447,43 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> > > >  					    &ustruct);
> > > >  		mutex_unlock(&iommu->lock);
> > > >  		return ret;
> > > > +
> > > > +	} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
> > > > +		struct vfio_iommu_type1_pasid_request req;
> > > > +		int min_pasid, max_pasid, pasid;
> > > > +
> > > > +		minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
> > > > +				    flag);
> > > > +
> > > > +		if (copy_from_user(&req, (void __user *)arg, minsz))
> > > > +			return -EFAULT;
> > > > +
> > > > +		if (req.argsz < minsz)
> > > > +			return -EINVAL;
> > > > +
> > > > +		switch (req.flag) {
> > >
> > > This works, but it's strange.  Let's make the code a little easier for
> > > the next flag bit that gets used so they don't need to rework this case
> > > statement.  I'd suggest creating a VFIO_IOMMU_PASID_OPS_MASK that is
> > > the OR of the ALLOC/FREE options, test that no bits are set outside of
> > > that mask, then AND that mask as the switch arg with the code below.
> >
> > Got it. Let me fix it in next version.
> >
> > > > +		/**
> > > > +		 * TODO: min_pasid and max_pasid align with
> > > > +		 * typedef unsigned int ioasid_t
> > > > +		 */
> > > > +		case VFIO_IOMMU_PASID_ALLOC:
> > > > +			if (copy_from_user(&min_pasid,
> > > > +				(void __user *)arg + minsz, sizeof(min_pasid)))
> > > > +				return -EFAULT;
> > > > +			if (copy_from_user(&max_pasid,
> > > > +				(void __user *)arg + minsz + sizeof(min_pasid),
> > > > +				sizeof(max_pasid)))
> > > > +				return -EFAULT;
> > > > +			return vfio_iommu_type1_pasid_alloc(iommu,
> > > > +						min_pasid, max_pasid);
> > > > +		case VFIO_IOMMU_PASID_FREE:
> > > > +			if (copy_from_user(&pasid,
> > > > +				(void __user *)arg + minsz, sizeof(pasid)))
> > > > +				return -EFAULT;
> > > > +			return vfio_iommu_type1_pasid_free(iommu, pasid);
> > > > +		default:
> > > > +			return -EINVAL;
> > > > +		}
> > > >  	}
> > > >
> > > >  	return -ENOTTY;
> > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > > index ccf60a2..04de290 100644
> > > > --- a/include/uapi/linux/vfio.h
> > > > +++ b/include/uapi/linux/vfio.h
> > > > @@ -807,6 +807,31 @@ struct vfio_iommu_type1_cache_invalidate {
> > > >  };
> > > >  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE +
> 24)
> > > >
> > > > +/*
> > > > + * @flag=VFIO_IOMMU_PASID_ALLOC, refer to the @min_pasid and
> > > @max_pasid fields
> > > > + * @flag=VFIO_IOMMU_PASID_FREE, refer to @pasid field
> > > > + */
> > > > +struct vfio_iommu_type1_pasid_request {
> > > > +	__u32	argsz;
> > > > +#define VFIO_IOMMU_PASID_ALLOC	(1 << 0)
> > > > +#define VFIO_IOMMU_PASID_FREE	(1 << 1)
> > > > +	__u32	flag;
> > > > +	union {
> > > > +		struct {
> > > > +			int min_pasid;
> > > > +			int max_pasid;
> > > > +		};
> > > > +		int pasid;
> > >
> > > Perhaps:
> > >
> > > 		struct {
> > > 			u32 min;
> > > 			u32 max;
> > > 		} alloc_pasid;
> > > 		u32 free_pasid;
> > >
> > > (note also the s/int/u32/)
> >
> > got it. will fix it in next version. Thanks.
> >
> > > > +	};
> > > > +};
> > > > +
> > > > +/**
> > > > + * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 27,
> > > > + *				struct vfio_iommu_type1_pasid_request)
> > > > + *
> > > > + */
> > > > +#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 27)
> > > > +
> > > >  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> > > >
> > > >  /*
> >
> > Regards,
> > Yi Liu
Alex Williamson Nov. 8, 2019, 3:15 p.m. UTC | #7
On Fri, 8 Nov 2019 12:23:41 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, November 8, 2019 6:07 AM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > 
> > On Wed, 6 Nov 2019 13:27:26 +0000
> > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> >   
> > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > >
> > > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > >  
> > > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > > > > to passdown PASID allocation/free request from the virtual
> > > > > iommu. This is required to get PASID managed in system-wide.
> > > > >
> > > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > > ---
> > > > >  drivers/vfio/vfio_iommu_type1.c | 114  
> > > > ++++++++++++++++++++++++++++++++++++++++  
> > > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > > >  2 files changed, 139 insertions(+)
> > > > >
> > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c  
> > b/drivers/vfio/vfio_iommu_type1.c  
> > > > > index cd8d3a5..3d73a7d 100644
> > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev,  
> > void  
> > > > *data)  
> > > > >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> > > > >  }
> > > > >
> > > > > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > > > > +					 int min_pasid,
> > > > > +					 int max_pasid)
> > > > > +{
> > > > > +	int ret;
> > > > > +	ioasid_t pasid;
> > > > > +	struct mm_struct *mm = NULL;
> > > > > +
> > > > > +	mutex_lock(&iommu->lock);
> > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > +		ret = -EINVAL;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +	mm = get_task_mm(current);
> > > > > +	/* Track ioasid allocation owner by mm */
> > > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > > > > +				max_pasid, NULL);  
> > > >
> > > > Are we sure we want to tie this to the task mm vs perhaps the
> > > > vfio_iommu pointer?  
> > >
> > > Here we want to have a kind of per-VM mark, which can be used to do
> > > ownership check on whether a pasid is held by a specific VM. This is
> > > very important to prevent across VM affect. vfio_iommu pointer is
> > > competent for vfio as vfio is both pasid alloc requester and pasid
> > > consumer. e.g. vfio requests pasid alloc from ioasid and also it will
> > > invoke bind_gpasid(). vfio can either check ownership before invoking
> > > bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
> > > there may be other modules which are just consumers of pasid. And they
> > > also want to do ownership check for a pasid. Then, it would be hard for
> > > them as they are not the pasid alloc requester. So here better to have
> > > a system wide structure to perform as the per-VM mark. task mm looks
> > > to be much competent.  
> > 
> > Ok, so it's intentional to have a VM-wide token.  Elsewhere in the
> > type1 code (vfio_dma_do_map) we record the task_struct per dma mapping
> > so that we can get the task mm as needed.  Would the task_struct
> > pointer provide any advantage?  
> 
> I think we may use task_struct pointer to make type1 code consistent.
> How do you think?

If it has the same utility, sure.
 
> > Also, an overall question, this provides userspace with pasid alloc and
> > free ioctls, (1) what prevents a userspace process from consuming every
> > available pasid, and (2) if the process exits or crashes without
> > freeing pasids, how are they recovered aside from a reboot?  
> 
> For question (1), I think we only need to take care about malicious
> userspace process. As vfio usage is under privilege mode, so we may
> be safe on it so far.

No, where else do we ever make this assumption?  vfio requires a
privileged entity to configure the system for vfio, bind devices for
user use, and grant those devices to the user, but the usage of the
device is always assumed to be by an unprivileged user.  It is
absolutely not acceptable require a privileged user.  It's vfio's
responsibility to protect the system from the user.

> However, we may need to introduce a kind of credit
> mechanism to protect it. I've thought it, but no good idea yet. Would be
> happy to hear from you.

It's a limited system resource and it's unclear how many might
reasonably used by a user.  I don't have an easy answer.

> For question (2), I think we need to reclaim the allocated pasids when
> the vfio container fd is released just like what vfio does to the domain
> mappings. I didn't add it yet. But I can add it in next version if you think
> it would make the pasid alloc/free be much sound.

Consider it required, the interface is susceptible to abuse without it.

> > > > > +	if (pasid == INVALID_IOASID) {
> > > > > +		ret = -ENOSPC;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +	ret = pasid;
> > > > > +out_unlock:
> > > > > +	mutex_unlock(&iommu->lock);  
> > 
> > What does holding this lock protect?  That the vfio_iommu remains
> > backed by an iommu during this operation, even though we don't do
> > anything to release allocated pasids when that iommu backing is removed?  
> 
> yes, it is unnecessary to hold the lock here. At least for the operations in
> this patch. will remove it. :-)
> 
> > > > > +	if (mm)
> > > > > +		mmput(mm);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > > > > +				       unsigned int pasid)
> > > > > +{
> > > > > +	struct mm_struct *mm = NULL;
> > > > > +	void *pdata;
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	mutex_lock(&iommu->lock);
> > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > +		ret = -EINVAL;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +
> > > > > +	/**
> > > > > +	 * REVISIT:
> > > > > +	 * There are two cases free could fail:
> > > > > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > > > > +	 * the set does not match, caller is not permitted to free.
> > > > > +	 * 2. free before unbind all devices, we can check if ioasid private
> > > > > +	 * data, if data != NULL, then fail to free.
> > > > > +	 */
> > > > > +	mm = get_task_mm(current);
> > > > > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > > > > +	if (IS_ERR(pdata)) {
> > > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > > +			pr_err("PASID %u is not allocated\n", pasid);
> > > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > > +			pr_err("Free PASID %u by non-owner, denied", pasid);
> > > > > +		else
> > > > > +			pr_err("Error searching PASID %u\n", pasid);  
> > > >
> > > > This should be removed, errno is sufficient for the user, this just
> > > > provides the user with a trivial DoS vector filling logs.  
> > >
> > > sure, will fix it. thanks.
> > >  
> > > > > +		ret = -EPERM;  
> > > >
> > > > But why not return PTR_ERR(pdata)?  
> > >
> > > aha, would do it.
> > >  
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +	if (pdata) {
> > > > > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > > > > +		/* Expect PASID has no private data if not bond */
> > > > > +		ret = -EBUSY;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +	ioasid_free(pasid);  
> > > >
> > > > We only ever get here with pasid == NULL?!  
> > >
> > > I guess you meant only when pdata==NULL.
> > >  
> > > > Something is wrong.  Should
> > > > that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> > > > vector)  
> > >
> > > Oh, yes, just do it as below:
> > >
> > > if (!pdata) {
> > > 	ioasid_free(pasid);
> > > 	ret = SUCCESS;
> > > } else
> > > 	ret = -EBUSY;
> > >
> > > Is it what you mean?  
> > 
> > No, I think I was just confusing pdata and pasid, but I am still
> > confused about testing pdata.  We call ioasid_alloc() with private =
> > NULL, and I don't see any of your patches calling ioasid_set_data() to
> > change the private data after allocation, so how could this ever be
> > set?  Should this just be a BUG_ON(pdata) as the integrity of the
> > system is in question should this state ever occur?  Thanks,  
> 
> ioasid_set_data() was called  in one patch from Jacob's vSVA patchset.
> [PATCH v6 08/10] iommu/vt-d: Add bind guest PASID support
> https://lkml.org/lkml/2019/10/22/946
> 
> The basic idea is to allocate pasid with private=NULL, and set it when the
> pasid is actually bind to a device (bind_gpasid()). Each bind_gpasid() will
> increase the ref_cnt in the private data, and each unbind_gpasid() will
> decrease the ref_cnt. So if bind/unbind_gpasid() is called in mirror, the
> private data should be null when comes to free operation. If not, vfio can
> believe that the pasid is still in use.

So this is another opportunity to leak pasids.  What's a user supposed
to do when their attempt to free a pasid fails?  It invites leaks to
allow this path to fail.  Thanks,

Alex
Yi Liu Nov. 13, 2019, 11:03 a.m. UTC | #8
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, November 8, 2019 11:15 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> 
> On Fri, 8 Nov 2019 12:23:41 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Friday, November 8, 2019 6:07 AM
> > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > >
> > > On Wed, 6 Nov 2019 13:27:26 +0000
> > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > >
> > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > > >
> > > > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > > >
> > > > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > > > > > to passdown PASID allocation/free request from the virtual
> > > > > > iommu. This is required to get PASID managed in system-wide.
> > > > > >
> > > > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > > > ---
> > > > > >  drivers/vfio/vfio_iommu_type1.c | 114
> > > > > ++++++++++++++++++++++++++++++++++++++++
> > > > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > > > >  2 files changed, 139 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > b/drivers/vfio/vfio_iommu_type1.c
> > > > > > index cd8d3a5..3d73a7d 100644
> > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev,
> > > void
> > > > > *data)
> > > > > >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> > > > > >  }
> > > > > >
> > > > > > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > > > > > +					 int min_pasid,
> > > > > > +					 int max_pasid)
> > > > > > +{
> > > > > > +	int ret;
> > > > > > +	ioasid_t pasid;
> > > > > > +	struct mm_struct *mm = NULL;
> > > > > > +
> > > > > > +	mutex_lock(&iommu->lock);
> > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > +		ret = -EINVAL;
> > > > > > +		goto out_unlock;
> > > > > > +	}
> > > > > > +	mm = get_task_mm(current);
> > > > > > +	/* Track ioasid allocation owner by mm */
> > > > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > > > > > +				max_pasid, NULL);
> > > > >
> > > > > Are we sure we want to tie this to the task mm vs perhaps the
> > > > > vfio_iommu pointer?
> > > >
> > > > Here we want to have a kind of per-VM mark, which can be used to do
> > > > ownership check on whether a pasid is held by a specific VM. This is
> > > > very important to prevent across VM affect. vfio_iommu pointer is
> > > > competent for vfio as vfio is both pasid alloc requester and pasid
> > > > consumer. e.g. vfio requests pasid alloc from ioasid and also it will
> > > > invoke bind_gpasid(). vfio can either check ownership before invoking
> > > > bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
> > > > there may be other modules which are just consumers of pasid. And they
> > > > also want to do ownership check for a pasid. Then, it would be hard for
> > > > them as they are not the pasid alloc requester. So here better to have
> > > > a system wide structure to perform as the per-VM mark. task mm looks
> > > > to be much competent.
> > >
> > > Ok, so it's intentional to have a VM-wide token.  Elsewhere in the
> > > type1 code (vfio_dma_do_map) we record the task_struct per dma mapping
> > > so that we can get the task mm as needed.  Would the task_struct
> > > pointer provide any advantage?
> >
> > I think we may use task_struct pointer to make type1 code consistent.
> > How do you think?
> 
> If it has the same utility, sure.

thanks, I'll make this change.

> > > Also, an overall question, this provides userspace with pasid alloc and
> > > free ioctls, (1) what prevents a userspace process from consuming every
> > > available pasid, and (2) if the process exits or crashes without
> > > freeing pasids, how are they recovered aside from a reboot?
> >
> > For question (1), I think we only need to take care about malicious
> > userspace process. As vfio usage is under privilege mode, so we may
> > be safe on it so far.
> 
> No, where else do we ever make this assumption?  vfio requires a
> privileged entity to configure the system for vfio, bind devices for
> user use, and grant those devices to the user, but the usage of the
> device is always assumed to be by an unprivileged user.  It is
> absolutely not acceptable require a privileged user.  It's vfio's
> responsibility to protect the system from the user.

My assumption is not precise here. sorry for it... Maybe to further
check with you to better understand your point. I think the user (QEMU)
of vfio needs to have a root permission. Thus it can open the vfio fds.
At this point, the user is a privileged one. Also I guess that's why vfio
can grant the user with the usage of VFIO_MAP/UNMAP to config
mappings into iommu page tables. But I'm not quite sure when will
the user be an unprivileged one.

> > However, we may need to introduce a kind of credit
> > mechanism to protect it. I've thought it, but no good idea yet. Would be
> > happy to hear from you.
> 
> It's a limited system resource and it's unclear how many might
> reasonably used by a user.  I don't have an easy answer.

How about the below method? based on some offline chat with Jacob.
a. some reasonable defaults for the initial per VM quota, e.g. 1000 per
process
b. IOASID should be able to enforce per ioasid_set (it is kind of per VM
mark) limit

> > For question (2), I think we need to reclaim the allocated pasids when
> > the vfio container fd is released just like what vfio does to the domain
> > mappings. I didn't add it yet. But I can add it in next version if you think
> > it would make the pasid alloc/free be much sound.
> 
> Consider it required, the interface is susceptible to abuse without it.

sure, let me add it in next version.

> > > > > > +	if (pasid == INVALID_IOASID) {
> > > > > > +		ret = -ENOSPC;
> > > > > > +		goto out_unlock;
> > > > > > +	}
> > > > > > +	ret = pasid;
> > > > > > +out_unlock:
> > > > > > +	mutex_unlock(&iommu->lock);
> > >
> > > What does holding this lock protect?  That the vfio_iommu remains
> > > backed by an iommu during this operation, even though we don't do
> > > anything to release allocated pasids when that iommu backing is removed?
> >
> > yes, it is unnecessary to hold the lock here. At least for the operations in
> > this patch. will remove it. :-)
> >
> > > > > > +	if (mm)
> > > > > > +		mmput(mm);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > > > > > +				       unsigned int pasid)
> > > > > > +{
> > > > > > +	struct mm_struct *mm = NULL;
> > > > > > +	void *pdata;
> > > > > > +	int ret = 0;
> > > > > > +
> > > > > > +	mutex_lock(&iommu->lock);
> > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > +		ret = -EINVAL;
> > > > > > +		goto out_unlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * REVISIT:
> > > > > > +	 * There are two cases free could fail:
> > > > > > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > > > > > +	 * the set does not match, caller is not permitted to free.
> > > > > > +	 * 2. free before unbind all devices, we can check if ioasid private
> > > > > > +	 * data, if data != NULL, then fail to free.
> > > > > > +	 */
> > > > > > +	mm = get_task_mm(current);
> > > > > > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > > > > > +	if (IS_ERR(pdata)) {
> > > > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > > > +			pr_err("PASID %u is not allocated\n", pasid);
> > > > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > > > +			pr_err("Free PASID %u by non-owner, denied",
> pasid);
> > > > > > +		else
> > > > > > +			pr_err("Error searching PASID %u\n", pasid);
> > > > >
> > > > > This should be removed, errno is sufficient for the user, this just
> > > > > provides the user with a trivial DoS vector filling logs.
> > > >
> > > > sure, will fix it. thanks.
> > > >
> > > > > > +		ret = -EPERM;
> > > > >
> > > > > But why not return PTR_ERR(pdata)?
> > > >
> > > > aha, would do it.
> > > >
> > > > > > +		goto out_unlock;
> > > > > > +	}
> > > > > > +	if (pdata) {
> > > > > > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > > > > > +		/* Expect PASID has no private data if not bond */
> > > > > > +		ret = -EBUSY;
> > > > > > +		goto out_unlock;
> > > > > > +	}
> > > > > > +	ioasid_free(pasid);
> > > > >
> > > > > We only ever get here with pasid == NULL?!
> > > >
> > > > I guess you meant only when pdata==NULL.
> > > >
> > > > > Something is wrong.  Should
> > > > > that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> > > > > vector)
> > > >
> > > > Oh, yes, just do it as below:
> > > >
> > > > if (!pdata) {
> > > > 	ioasid_free(pasid);
> > > > 	ret = SUCCESS;
> > > > } else
> > > > 	ret = -EBUSY;
> > > >
> > > > Is it what you mean?
> > >
> > > No, I think I was just confusing pdata and pasid, but I am still
> > > confused about testing pdata.  We call ioasid_alloc() with private =
> > > NULL, and I don't see any of your patches calling ioasid_set_data() to
> > > change the private data after allocation, so how could this ever be
> > > set?  Should this just be a BUG_ON(pdata) as the integrity of the
> > > system is in question should this state ever occur?  Thanks,
> >
> > ioasid_set_data() was called  in one patch from Jacob's vSVA patchset.
> > [PATCH v6 08/10] iommu/vt-d: Add bind guest PASID support
> > https://lkml.org/lkml/2019/10/22/946
> >
> > The basic idea is to allocate pasid with private=NULL, and set it when the
> > pasid is actually bind to a device (bind_gpasid()). Each bind_gpasid() will
> > increase the ref_cnt in the private data, and each unbind_gpasid() will
> > decrease the ref_cnt. So if bind/unbind_gpasid() is called in mirror, the
> > private data should be null when comes to free operation. If not, vfio can
> > believe that the pasid is still in use.
> 
> So this is another opportunity to leak pasids.  What's a user supposed
> to do when their attempt to free a pasid fails?  It invites leaks to
> allow this path to fail.  Thanks,

Agreed, may no need to fail pasid free as it may leak pasid. How about
always let free successful? If the ref_cnt is non-zero, notify the remaining
users to release their reference.

Thanks,
Yi Liu
Alex Williamson Nov. 13, 2019, 3:29 p.m. UTC | #9
On Wed, 13 Nov 2019 11:03:17 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, November 8, 2019 11:15 PM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > 
> > On Fri, 8 Nov 2019 12:23:41 +0000
> > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> >   
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Friday, November 8, 2019 6:07 AM
> > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > >
> > > > On Wed, 6 Nov 2019 13:27:26 +0000
> > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > >  
> > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > Subject: Re: [RFC v2 2/3] vfio/type1:  
> > VFIO_IOMMU_PASID_REQUEST(alloc/free)  
> > > > > >
> > > > > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > > > >  
> > > > > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > > > > > > to passdown PASID allocation/free request from the virtual
> > > > > > > iommu. This is required to get PASID managed in system-wide.
> > > > > > >
> > > > > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > > > > ---
> > > > > > >  drivers/vfio/vfio_iommu_type1.c | 114  
> > > > > > ++++++++++++++++++++++++++++++++++++++++  
> > > > > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > > > > >  2 files changed, 139 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c  
> > > > b/drivers/vfio/vfio_iommu_type1.c  
> > > > > > > index cd8d3a5..3d73a7d 100644
> > > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device *dev,  
> > > > void  
> > > > > > *data)  
> > > > > > >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> > > > > > >  }
> > > > > > >
> > > > > > > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > > > > > > +					 int min_pasid,
> > > > > > > +					 int max_pasid)
> > > > > > > +{
> > > > > > > +	int ret;
> > > > > > > +	ioasid_t pasid;
> > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > +
> > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > +		ret = -EINVAL;
> > > > > > > +		goto out_unlock;
> > > > > > > +	}
> > > > > > > +	mm = get_task_mm(current);
> > > > > > > +	/* Track ioasid allocation owner by mm */
> > > > > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > > > > > > +				max_pasid, NULL);  
> > > > > >
> > > > > > Are we sure we want to tie this to the task mm vs perhaps the
> > > > > > vfio_iommu pointer?  
> > > > >
> > > > > Here we want to have a kind of per-VM mark, which can be used to do
> > > > > ownership check on whether a pasid is held by a specific VM. This is
> > > > > very important to prevent across VM affect. vfio_iommu pointer is
> > > > > competent for vfio as vfio is both pasid alloc requester and pasid
> > > > > consumer. e.g. vfio requests pasid alloc from ioasid and also it will
> > > > > invoke bind_gpasid(). vfio can either check ownership before invoking
> > > > > bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
> > > > > there may be other modules which are just consumers of pasid. And they
> > > > > also want to do ownership check for a pasid. Then, it would be hard for
> > > > > them as they are not the pasid alloc requester. So here better to have
> > > > > a system wide structure to perform as the per-VM mark. task mm looks
> > > > > to be much competent.  
> > > >
> > > > Ok, so it's intentional to have a VM-wide token.  Elsewhere in the
> > > > type1 code (vfio_dma_do_map) we record the task_struct per dma mapping
> > > > so that we can get the task mm as needed.  Would the task_struct
> > > > pointer provide any advantage?  
> > >
> > > I think we may use task_struct pointer to make type1 code consistent.
> > > How do you think?  
> > 
> > If it has the same utility, sure.  
> 
> thanks, I'll make this change.
> 
> > > > Also, an overall question, this provides userspace with pasid alloc and
> > > > free ioctls, (1) what prevents a userspace process from consuming every
> > > > available pasid, and (2) if the process exits or crashes without
> > > > freeing pasids, how are they recovered aside from a reboot?  
> > >
> > > For question (1), I think we only need to take care about malicious
> > > userspace process. As vfio usage is under privilege mode, so we may
> > > be safe on it so far.  
> > 
> > No, where else do we ever make this assumption?  vfio requires a
> > privileged entity to configure the system for vfio, bind devices for
> > user use, and grant those devices to the user, but the usage of the
> > device is always assumed to be by an unprivileged user.  It is
> > absolutely not acceptable require a privileged user.  It's vfio's
> > responsibility to protect the system from the user.  
> 
> My assumption is not precise here. sorry for it... Maybe to further
> check with you to better understand your point. I think the user (QEMU)
> of vfio needs to have a root permission. Thus it can open the vfio fds.
> At this point, the user is a privileged one. Also I guess that's why vfio
> can grant the user with the usage of VFIO_MAP/UNMAP to config
> mappings into iommu page tables. But I'm not quite sure when will
> the user be an unprivileged one.

QEMU does NOT need to be run as root to use vfio.  This is NOT the
model libvirt follows.  libvirt grants a user access to a device, or
rather a set of one or more devices (ie. the group) via standard file
permission access to the group file (/dev/vfio/$GROUP).  Ownership of a
device allows the user permission to make use of the IOMMU.  The user's
ability to create DMA mappings is restricted by their process locked
memory limits, where libvirt elevates the user limit sufficient for the
size of the VM.  QEMU should never need to be run as root and doing so
is entirely unacceptable from a security perspective.  The only mode of
vfio that requires elevated privilege for use is when making use of
no-iommu, where we have no IOMMU protection or translation.

> > > However, we may need to introduce a kind of credit
> > > mechanism to protect it. I've thought it, but no good idea yet. Would be
> > > happy to hear from you.  
> > 
> > It's a limited system resource and it's unclear how many might
> > reasonably used by a user.  I don't have an easy answer.  
> 
> How about the below method? based on some offline chat with Jacob.
> a. some reasonable defaults for the initial per VM quota, e.g. 1000 per
> process
> b. IOASID should be able to enforce per ioasid_set (it is kind of per VM
> mark) limit

We support large numbers of assigned devices, how many IOASIDs might be
reasonably used per device?  Is the mm or the task still the correct
"set" in this scenario?  I don't have any better ideas than setting a
limit, but it probably needs a kernel or module tunable, and it needs
to match the scaling we expect to see when multiple devices are
involved.

> > > For question (2), I think we need to reclaim the allocated pasids when
> > > the vfio container fd is released just like what vfio does to the domain
> > > mappings. I didn't add it yet. But I can add it in next version if you think
> > > it would make the pasid alloc/free be much sound.  
> > 
> > Consider it required, the interface is susceptible to abuse without it.  
> 
> sure, let me add it in next version.
> 
> > > > > > > +	if (pasid == INVALID_IOASID) {
> > > > > > > +		ret = -ENOSPC;
> > > > > > > +		goto out_unlock;
> > > > > > > +	}
> > > > > > > +	ret = pasid;
> > > > > > > +out_unlock:
> > > > > > > +	mutex_unlock(&iommu->lock);  
> > > >
> > > > What does holding this lock protect?  That the vfio_iommu remains
> > > > backed by an iommu during this operation, even though we don't do
> > > > anything to release allocated pasids when that iommu backing is removed?  
> > >
> > > yes, it is unnecessary to hold the lock here. At least for the operations in
> > > this patch. will remove it. :-)
> > >  
> > > > > > > +	if (mm)
> > > > > > > +		mmput(mm);
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > > > > > > +				       unsigned int pasid)
> > > > > > > +{
> > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > +	void *pdata;
> > > > > > > +	int ret = 0;
> > > > > > > +
> > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > +		ret = -EINVAL;
> > > > > > > +		goto out_unlock;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	/**
> > > > > > > +	 * REVISIT:
> > > > > > > +	 * There are two cases free could fail:
> > > > > > > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > > > > > > +	 * the set does not match, caller is not permitted to free.
> > > > > > > +	 * 2. free before unbind all devices, we can check if ioasid private
> > > > > > > +	 * data, if data != NULL, then fail to free.
> > > > > > > +	 */
> > > > > > > +	mm = get_task_mm(current);
> > > > > > > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > > > > > > +	if (IS_ERR(pdata)) {
> > > > > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > > > > +			pr_err("PASID %u is not allocated\n", pasid);
> > > > > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > > > > +			pr_err("Free PASID %u by non-owner, denied",  
> > pasid);  
> > > > > > > +		else
> > > > > > > +			pr_err("Error searching PASID %u\n", pasid);  
> > > > > >
> > > > > > This should be removed, errno is sufficient for the user, this just
> > > > > > provides the user with a trivial DoS vector filling logs.  
> > > > >
> > > > > sure, will fix it. thanks.
> > > > >  
> > > > > > > +		ret = -EPERM;  
> > > > > >
> > > > > > But why not return PTR_ERR(pdata)?  
> > > > >
> > > > > aha, would do it.
> > > > >  
> > > > > > > +		goto out_unlock;
> > > > > > > +	}
> > > > > > > +	if (pdata) {
> > > > > > > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > > > > > > +		/* Expect PASID has no private data if not bond */
> > > > > > > +		ret = -EBUSY;
> > > > > > > +		goto out_unlock;
> > > > > > > +	}
> > > > > > > +	ioasid_free(pasid);  
> > > > > >
> > > > > > We only ever get here with pasid == NULL?!  
> > > > >
> > > > > I guess you meant only when pdata==NULL.
> > > > >  
> > > > > > Something is wrong.  Should
> > > > > > that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> > > > > > vector)  
> > > > >
> > > > > Oh, yes, just do it as below:
> > > > >
> > > > > if (!pdata) {
> > > > > 	ioasid_free(pasid);
> > > > > 	ret = SUCCESS;
> > > > > } else
> > > > > 	ret = -EBUSY;
> > > > >
> > > > > Is it what you mean?  
> > > >
> > > > No, I think I was just confusing pdata and pasid, but I am still
> > > > confused about testing pdata.  We call ioasid_alloc() with private =
> > > > NULL, and I don't see any of your patches calling ioasid_set_data() to
> > > > change the private data after allocation, so how could this ever be
> > > > set?  Should this just be a BUG_ON(pdata) as the integrity of the
> > > > system is in question should this state ever occur?  Thanks,  
> > >
> > > ioasid_set_data() was called  in one patch from Jacob's vSVA patchset.
> > > [PATCH v6 08/10] iommu/vt-d: Add bind guest PASID support
> > > https://lkml.org/lkml/2019/10/22/946
> > >
> > > The basic idea is to allocate pasid with private=NULL, and set it when the
> > > pasid is actually bind to a device (bind_gpasid()). Each bind_gpasid() will
> > > increase the ref_cnt in the private data, and each unbind_gpasid() will
> > > decrease the ref_cnt. So if bind/unbind_gpasid() is called in mirror, the
> > > private data should be null when comes to free operation. If not, vfio can
> > > believe that the pasid is still in use.  
> > 
> > So this is another opportunity to leak pasids.  What's a user supposed
> > to do when their attempt to free a pasid fails?  It invites leaks to
> > allow this path to fail.  Thanks,  
> 
> Agreed, may no need to fail pasid free as it may leak pasid. How about
> always let free successful? If the ref_cnt is non-zero, notify the remaining
> users to release their reference.

If a user frees an PASID, they've done their due diligence in
indicating it's no longer used.  The kernel should handle reclaiming it
from that point.  Thanks,

Alex
Jacob Pan Nov. 13, 2019, 7:45 p.m. UTC | #10
On Wed, 13 Nov 2019 08:29:40 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 13 Nov 2019 11:03:17 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Friday, November 8, 2019 11:15 PM
> > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > Subject: Re: [RFC v2 2/3] vfio/type1:
> > > VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > 
> > > On Fri, 8 Nov 2019 12:23:41 +0000
> > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > >     
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Friday, November 8, 2019 6:07 AM
> > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> > > > > VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > > >
> > > > > On Wed, 6 Nov 2019 13:27:26 +0000
> > > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > > >    
> > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > Subject: Re: [RFC v2 2/3] vfio/type1:    
> > > VFIO_IOMMU_PASID_REQUEST(alloc/free)    
> > > > > > >
> > > > > > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > > > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > > > > >    
> > > > > > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which
> > > > > > > > aims to passdown PASID allocation/free request from the
> > > > > > > > virtual iommu. This is required to get PASID managed in
> > > > > > > > system-wide.
> > > > > > > >
> > > > > > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > > > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > > > > > ---
> > > > > > > >  drivers/vfio/vfio_iommu_type1.c | 114    
> > > > > > > ++++++++++++++++++++++++++++++++++++++++    
> > > > > > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > > > > > >  2 files changed, 139 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c    
> > > > > b/drivers/vfio/vfio_iommu_type1.c    
> > > > > > > > index cd8d3a5..3d73a7d 100644
> > > > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > @@ -2248,6 +2248,83 @@ static int
> > > > > > > > vfio_cache_inv_fn(struct device *dev,    
> > > > > void    
> > > > > > > *data)    
> > > > > > > >  	return iommu_cache_invalidate(dc->domain, dev,
> > > > > > > > &ustruct->info); }
> > > > > > > >
> > > > > > > > +static int vfio_iommu_type1_pasid_alloc(struct
> > > > > > > > vfio_iommu *iommu,
> > > > > > > > +					 int min_pasid,
> > > > > > > > +					 int max_pasid)
> > > > > > > > +{
> > > > > > > > +	int ret;
> > > > > > > > +	ioasid_t pasid;
> > > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > > +		ret = -EINVAL;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	mm = get_task_mm(current);
> > > > > > > > +	/* Track ioasid allocation owner by mm */
> > > > > > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm,
> > > > > > > > min_pasid,
> > > > > > > > +				max_pasid, NULL);    
> > > > > > >
> > > > > > > Are we sure we want to tie this to the task mm vs perhaps
> > > > > > > the vfio_iommu pointer?    
> > > > > >
> > > > > > Here we want to have a kind of per-VM mark, which can be
> > > > > > used to do ownership check on whether a pasid is held by a
> > > > > > specific VM. This is very important to prevent across VM
> > > > > > affect. vfio_iommu pointer is competent for vfio as vfio is
> > > > > > both pasid alloc requester and pasid consumer. e.g. vfio
> > > > > > requests pasid alloc from ioasid and also it will invoke
> > > > > > bind_gpasid(). vfio can either check ownership before
> > > > > > invoking bind_gpasid() or pass vfio_iommu pointer to iommu
> > > > > > driver. But in future, there may be other modules which are
> > > > > > just consumers of pasid. And they also want to do ownership
> > > > > > check for a pasid. Then, it would be hard for them as they
> > > > > > are not the pasid alloc requester. So here better to have a
> > > > > > system wide structure to perform as the per-VM mark. task
> > > > > > mm looks to be much competent.    
> > > > >
> > > > > Ok, so it's intentional to have a VM-wide token.  Elsewhere
> > > > > in the type1 code (vfio_dma_do_map) we record the task_struct
> > > > > per dma mapping so that we can get the task mm as needed.
> > > > > Would the task_struct pointer provide any advantage?    
> > > >
> > > > I think we may use task_struct pointer to make type1 code
> > > > consistent. How do you think?    
> > > 
> > > If it has the same utility, sure.    
> > 
> > thanks, I'll make this change.
> >   
> > > > > Also, an overall question, this provides userspace with pasid
> > > > > alloc and free ioctls, (1) what prevents a userspace process
> > > > > from consuming every available pasid, and (2) if the process
> > > > > exits or crashes without freeing pasids, how are they
> > > > > recovered aside from a reboot?    
> > > >
> > > > For question (1), I think we only need to take care about
> > > > malicious userspace process. As vfio usage is under privilege
> > > > mode, so we may be safe on it so far.    
> > > 
> > > No, where else do we ever make this assumption?  vfio requires a
> > > privileged entity to configure the system for vfio, bind devices
> > > for user use, and grant those devices to the user, but the usage
> > > of the device is always assumed to be by an unprivileged user.
> > > It is absolutely not acceptable require a privileged user.  It's
> > > vfio's responsibility to protect the system from the user.    
> > 
> > My assumption is not precise here. sorry for it... Maybe to further
> > check with you to better understand your point. I think the user
> > (QEMU) of vfio needs to have a root permission. Thus it can open
> > the vfio fds. At this point, the user is a privileged one. Also I
> > guess that's why vfio can grant the user with the usage of
> > VFIO_MAP/UNMAP to config mappings into iommu page tables. But I'm
> > not quite sure when will the user be an unprivileged one.  
> 
> QEMU does NOT need to be run as root to use vfio.  This is NOT the
> model libvirt follows.  libvirt grants a user access to a device, or
> rather a set of one or more devices (ie. the group) via standard file
> permission access to the group file (/dev/vfio/$GROUP).  Ownership of
> a device allows the user permission to make use of the IOMMU.  The
> user's ability to create DMA mappings is restricted by their process
> locked memory limits, where libvirt elevates the user limit
> sufficient for the size of the VM.  QEMU should never need to be run
> as root and doing so is entirely unacceptable from a security
> perspective.  The only mode of vfio that requires elevated privilege
> for use is when making use of no-iommu, where we have no IOMMU
> protection or translation.
> 
> > > > However, we may need to introduce a kind of credit
> > > > mechanism to protect it. I've thought it, but no good idea yet.
> > > > Would be happy to hear from you.    
> > > 
> > > It's a limited system resource and it's unclear how many might
> > > reasonably used by a user.  I don't have an easy answer.    
> > 
> > How about the below method? based on some offline chat with Jacob.
> > a. some reasonable defaults for the initial per VM quota, e.g. 1000
> > per process
> > b. IOASID should be able to enforce per ioasid_set (it is kind of
> > per VM mark) limit  
> 
> We support large numbers of assigned devices, how many IOASIDs might
> be reasonably used per device?  Is the mm or the task still the
> correct "set" in this scenario?  I don't have any better ideas than
> setting a limit, but it probably needs a kernel or module tunable,
> and it needs to match the scaling we expect to see when multiple
> devices are involved.
> 
I think mm/task is still the correct set in that we try to prevent
abuse based on mm not device. Or we need to have some notion of super
container (Ashok proposed a while ago) that maps to a VM.

I am guessing you are suggesting the per mm quota should also be
scaled against number of devices assigned. I think that is very
reasonable. Perhaps we can do:

1. A tunable per iommu group PASID quota with a default of 1000, e.g.
nr_pasid_per_group. Since we are dealing with nested translation and
each device has its own second level so we pretty much have one device
per group. Call it nr_pasid_per_group?

2. Limit number of PASIDs per VM with nr_pasid_per_group * nr_groups.
Probably update the limit when group is added to a container with the
same mm.

I guess we could also use a cgroup controller for PASIDs.

> > > > For question (2), I think we need to reclaim the allocated
> > > > pasids when the vfio container fd is released just like what
> > > > vfio does to the domain mappings. I didn't add it yet. But I
> > > > can add it in next version if you think it would make the pasid
> > > > alloc/free be much sound.    
> > > 
> > > Consider it required, the interface is susceptible to abuse
> > > without it.    
> > 
> > sure, let me add it in next version.
> >   
> > > > > > > > +	if (pasid == INVALID_IOASID) {
> > > > > > > > +		ret = -ENOSPC;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	ret = pasid;
> > > > > > > > +out_unlock:
> > > > > > > > +	mutex_unlock(&iommu->lock);    
> > > > >
> > > > > What does holding this lock protect?  That the vfio_iommu
> > > > > remains backed by an iommu during this operation, even though
> > > > > we don't do anything to release allocated pasids when that
> > > > > iommu backing is removed?    
> > > >
> > > > yes, it is unnecessary to hold the lock here. At least for the
> > > > operations in this patch. will remove it. :-)
> > > >    
> > > > > > > > +	if (mm)
> > > > > > > > +		mmput(mm);
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int vfio_iommu_type1_pasid_free(struct
> > > > > > > > vfio_iommu *iommu,
> > > > > > > > +				       unsigned int
> > > > > > > > pasid) +{
> > > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > > +	void *pdata;
> > > > > > > > +	int ret = 0;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > > +		ret = -EINVAL;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	/**
> > > > > > > > +	 * REVISIT:
> > > > > > > > +	 * There are two cases free could fail:
> > > > > > > > +	 * 1. free pasid by non-owner, we use
> > > > > > > > ioasid_set to track mm, if
> > > > > > > > +	 * the set does not match, caller is not
> > > > > > > > permitted to free.
> > > > > > > > +	 * 2. free before unbind all devices, we can
> > > > > > > > check if ioasid private
> > > > > > > > +	 * data, if data != NULL, then fail to free.
> > > > > > > > +	 */
> > > > > > > > +	mm = get_task_mm(current);
> > > > > > > > +	pdata = ioasid_find((struct ioasid_set *)mm,
> > > > > > > > pasid, NULL);
> > > > > > > > +	if (IS_ERR(pdata)) {
> > > > > > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > > > > > +			pr_err("PASID %u is not
> > > > > > > > allocated\n", pasid);
> > > > > > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > > > > > +			pr_err("Free PASID %u by
> > > > > > > > non-owner, denied",    
> > > pasid);    
> > > > > > > > +		else
> > > > > > > > +			pr_err("Error searching PASID
> > > > > > > > %u\n", pasid);    
> > > > > > >
> > > > > > > This should be removed, errno is sufficient for the user,
> > > > > > > this just provides the user with a trivial DoS vector
> > > > > > > filling logs.    
> > > > > >
> > > > > > sure, will fix it. thanks.
> > > > > >    
> > > > > > > > +		ret = -EPERM;    
> > > > > > >
> > > > > > > But why not return PTR_ERR(pdata)?    
> > > > > >
> > > > > > aha, would do it.
> > > > > >    
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	if (pdata) {
> > > > > > > > +		pr_debug("Cannot free pasid %d with
> > > > > > > > private data\n", pasid);
> > > > > > > > +		/* Expect PASID has no private data if
> > > > > > > > not bond */
> > > > > > > > +		ret = -EBUSY;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	ioasid_free(pasid);    
> > > > > > >
> > > > > > > We only ever get here with pasid == NULL?!    
> > > > > >
> > > > > > I guess you meant only when pdata==NULL.
> > > > > >    
> > > > > > > Something is wrong.  Should
> > > > > > > that be 'if (!pdata)'?  (which also makes that pr_debug
> > > > > > > another DoS vector)    
> > > > > >
> > > > > > Oh, yes, just do it as below:
> > > > > >
> > > > > > if (!pdata) {
> > > > > > 	ioasid_free(pasid);
> > > > > > 	ret = SUCCESS;
> > > > > > } else
> > > > > > 	ret = -EBUSY;
> > > > > >
> > > > > > Is it what you mean?    
> > > > >
> > > > > No, I think I was just confusing pdata and pasid, but I am
> > > > > still confused about testing pdata.  We call ioasid_alloc()
> > > > > with private = NULL, and I don't see any of your patches
> > > > > calling ioasid_set_data() to change the private data after
> > > > > allocation, so how could this ever be set?  Should this just
> > > > > be a BUG_ON(pdata) as the integrity of the system is in
> > > > > question should this state ever occur?  Thanks,    
> > > >
> > > > ioasid_set_data() was called  in one patch from Jacob's vSVA
> > > > patchset. [PATCH v6 08/10] iommu/vt-d: Add bind guest PASID
> > > > support https://lkml.org/lkml/2019/10/22/946
> > > >
> > > > The basic idea is to allocate pasid with private=NULL, and set
> > > > it when the pasid is actually bind to a device (bind_gpasid()).
> > > > Each bind_gpasid() will increase the ref_cnt in the private
> > > > data, and each unbind_gpasid() will decrease the ref_cnt. So if
> > > > bind/unbind_gpasid() is called in mirror, the private data
> > > > should be null when comes to free operation. If not, vfio can
> > > > believe that the pasid is still in use.    
> > > 
> > > So this is another opportunity to leak pasids.  What's a user
> > > supposed to do when their attempt to free a pasid fails?  It
> > > invites leaks to allow this path to fail.  Thanks,    
> > 
> > Agreed, may no need to fail pasid free as it may leak pasid. How
> > about always let free successful? If the ref_cnt is non-zero,
> > notify the remaining users to release their reference.  
> 
> If a user frees an PASID, they've done their due diligence in
> indicating it's no longer used.  The kernel should handle reclaiming
> it from that point.  Thanks,

Yeah, I think we can add a atomic notifier for each PASID.
Consumers such as IOMMU driver and KVM gets notified when IOASID is
freed by VFIO. IOMMU driver can do the unbind and tear down.

In case of all the users already did unbind() before ioasid_free(), the
free will proceed as usual.

> Alex
>
Yi Liu Nov. 18, 2019, 4:50 a.m. UTC | #11
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, November 13, 2019 11:30 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> 
> On Wed, 13 Nov 2019 11:03:17 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Friday, November 8, 2019 11:15 PM
> > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > >
> > > On Fri, 8 Nov 2019 12:23:41 +0000
> > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > >
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Friday, November 8, 2019 6:07 AM
> > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > > >
> > > > > On Wed, 6 Nov 2019 13:27:26 +0000
> > > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > > >
> > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> > > VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > > > > >
> > > > > > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > > > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > > > > >
> > > > > > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which aims
> > > > > > > > to passdown PASID allocation/free request from the virtual
> > > > > > > > iommu. This is required to get PASID managed in system-wide.
> > > > > > > >
> > > > > > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > > > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > > > > > ---
> > > > > > > >  drivers/vfio/vfio_iommu_type1.c | 114
> > > > > > > ++++++++++++++++++++++++++++++++++++++++
> > > > > > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > > > > > >  2 files changed, 139 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > > b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > index cd8d3a5..3d73a7d 100644
> > > > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > @@ -2248,6 +2248,83 @@ static int vfio_cache_inv_fn(struct device
> *dev,
> > > > > void
> > > > > > > *data)
> > > > > > > >  	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
> > > > > > > > +					 int min_pasid,
> > > > > > > > +					 int max_pasid)
> > > > > > > > +{
> > > > > > > > +	int ret;
> > > > > > > > +	ioasid_t pasid;
> > > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > > +		ret = -EINVAL;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	mm = get_task_mm(current);
> > > > > > > > +	/* Track ioasid allocation owner by mm */
> > > > > > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
> > > > > > > > +				max_pasid, NULL);
> > > > > > >
> > > > > > > Are we sure we want to tie this to the task mm vs perhaps the
> > > > > > > vfio_iommu pointer?
> > > > > >
> > > > > > Here we want to have a kind of per-VM mark, which can be used to do
> > > > > > ownership check on whether a pasid is held by a specific VM. This is
> > > > > > very important to prevent across VM affect. vfio_iommu pointer is
> > > > > > competent for vfio as vfio is both pasid alloc requester and pasid
> > > > > > consumer. e.g. vfio requests pasid alloc from ioasid and also it will
> > > > > > invoke bind_gpasid(). vfio can either check ownership before invoking
> > > > > > bind_gpasid() or pass vfio_iommu pointer to iommu driver. But in future,
> > > > > > there may be other modules which are just consumers of pasid. And they
> > > > > > also want to do ownership check for a pasid. Then, it would be hard for
> > > > > > them as they are not the pasid alloc requester. So here better to have
> > > > > > a system wide structure to perform as the per-VM mark. task mm looks
> > > > > > to be much competent.
> > > > >
> > > > > Ok, so it's intentional to have a VM-wide token.  Elsewhere in the
> > > > > type1 code (vfio_dma_do_map) we record the task_struct per dma mapping
> > > > > so that we can get the task mm as needed.  Would the task_struct
> > > > > pointer provide any advantage?
> > > >
> > > > I think we may use task_struct pointer to make type1 code consistent.
> > > > How do you think?
> > >
> > > If it has the same utility, sure.
> >
> > thanks, I'll make this change.
> >
> > > > > Also, an overall question, this provides userspace with pasid alloc and
> > > > > free ioctls, (1) what prevents a userspace process from consuming every
> > > > > available pasid, and (2) if the process exits or crashes without
> > > > > freeing pasids, how are they recovered aside from a reboot?
> > > >
> > > > For question (1), I think we only need to take care about malicious
> > > > userspace process. As vfio usage is under privilege mode, so we may
> > > > be safe on it so far.
> > >
> > > No, where else do we ever make this assumption?  vfio requires a
> > > privileged entity to configure the system for vfio, bind devices for
> > > user use, and grant those devices to the user, but the usage of the
> > > device is always assumed to be by an unprivileged user.  It is
> > > absolutely not acceptable require a privileged user.  It's vfio's
> > > responsibility to protect the system from the user.
> >
> > My assumption is not precise here. sorry for it... Maybe to further
> > check with you to better understand your point. I think the user (QEMU)
> > of vfio needs to have a root permission. Thus it can open the vfio fds.
> > At this point, the user is a privileged one. Also I guess that's why vfio
> > can grant the user with the usage of VFIO_MAP/UNMAP to config
> > mappings into iommu page tables. But I'm not quite sure when will
> > the user be an unprivileged one.
> 
> QEMU does NOT need to be run as root to use vfio.  This is NOT the
> model libvirt follows.  libvirt grants a user access to a device, or
> rather a set of one or more devices (ie. the group) via standard file
> permission access to the group file (/dev/vfio/$GROUP).  Ownership of a
> device allows the user permission to make use of the IOMMU.  The user's
> ability to create DMA mappings is restricted by their process locked
> memory limits, where libvirt elevates the user limit sufficient for the
> size of the VM.  QEMU should never need to be run as root and doing so
> is entirely unacceptable from a security perspective.  The only mode of
> vfio that requires elevated privilege for use is when making use of
> no-iommu, where we have no IOMMU protection or translation.

got it. thanks for the detailed explanation.

> > > > However, we may need to introduce a kind of credit
> > > > mechanism to protect it. I've thought it, but no good idea yet. Would be
> > > > happy to hear from you.
> > >
> > > It's a limited system resource and it's unclear how many might
> > > reasonably used by a user.  I don't have an easy answer.
> >
> > How about the below method? based on some offline chat with Jacob.
> > a. some reasonable defaults for the initial per VM quota, e.g. 1000 per
> > process
> > b. IOASID should be able to enforce per ioasid_set (it is kind of per VM
> > mark) limit
> 
> We support large numbers of assigned devices, how many IOASIDs might be
> reasonably used per device?  Is the mm or the task still the correct
> "set" in this scenario?  I don't have any better ideas than setting a
> limit, but it probably needs a kernel or module tunable, and it needs
> to match the scaling we expect to see when multiple devices are
> involved.

How about Jacob's proposal in his reply?

> > > > For question (2), I think we need to reclaim the allocated pasids when
> > > > the vfio container fd is released just like what vfio does to the domain
> > > > mappings. I didn't add it yet. But I can add it in next version if you think
> > > > it would make the pasid alloc/free be much sound.
> > >
> > > Consider it required, the interface is susceptible to abuse without it.
> >
> > sure, let me add it in next version.
> >
> > > > > > > > +	if (pasid == INVALID_IOASID) {
> > > > > > > > +		ret = -ENOSPC;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	ret = pasid;
> > > > > > > > +out_unlock:
> > > > > > > > +	mutex_unlock(&iommu->lock);
> > > > >
> > > > > What does holding this lock protect?  That the vfio_iommu remains
> > > > > backed by an iommu during this operation, even though we don't do
> > > > > anything to release allocated pasids when that iommu backing is removed?
> > > >
> > > > yes, it is unnecessary to hold the lock here. At least for the operations in
> > > > this patch. will remove it. :-)
> > > >
> > > > > > > > +	if (mm)
> > > > > > > > +		mmput(mm);
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
> > > > > > > > +				       unsigned int pasid)
> > > > > > > > +{
> > > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > > +	void *pdata;
> > > > > > > > +	int ret = 0;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > > +		ret = -EINVAL;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	/**
> > > > > > > > +	 * REVISIT:
> > > > > > > > +	 * There are two cases free could fail:
> > > > > > > > +	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
> > > > > > > > +	 * the set does not match, caller is not permitted to free.
> > > > > > > > +	 * 2. free before unbind all devices, we can check if ioasid private
> > > > > > > > +	 * data, if data != NULL, then fail to free.
> > > > > > > > +	 */
> > > > > > > > +	mm = get_task_mm(current);
> > > > > > > > +	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
> > > > > > > > +	if (IS_ERR(pdata)) {
> > > > > > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > > > > > +			pr_err("PASID %u is not allocated\n", pasid);
> > > > > > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > > > > > +			pr_err("Free PASID %u by non-owner, denied",
> > > pasid);
> > > > > > > > +		else
> > > > > > > > +			pr_err("Error searching PASID %u\n", pasid);
> > > > > > >
> > > > > > > This should be removed, errno is sufficient for the user, this just
> > > > > > > provides the user with a trivial DoS vector filling logs.
> > > > > >
> > > > > > sure, will fix it. thanks.
> > > > > >
> > > > > > > > +		ret = -EPERM;
> > > > > > >
> > > > > > > But why not return PTR_ERR(pdata)?
> > > > > >
> > > > > > aha, would do it.
> > > > > >
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	if (pdata) {
> > > > > > > > +		pr_debug("Cannot free pasid %d with private data\n", pasid);
> > > > > > > > +		/* Expect PASID has no private data if not bond */
> > > > > > > > +		ret = -EBUSY;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > > +	ioasid_free(pasid);
> > > > > > >
> > > > > > > We only ever get here with pasid == NULL?!
> > > > > >
> > > > > > I guess you meant only when pdata==NULL.
> > > > > >
> > > > > > > Something is wrong.  Should
> > > > > > > that be 'if (!pdata)'?  (which also makes that pr_debug another DoS
> > > > > > > vector)
> > > > > >
> > > > > > Oh, yes, just do it as below:
> > > > > >
> > > > > > if (!pdata) {
> > > > > > 	ioasid_free(pasid);
> > > > > > 	ret = SUCCESS;
> > > > > > } else
> > > > > > 	ret = -EBUSY;
> > > > > >
> > > > > > Is it what you mean?
> > > > >
> > > > > No, I think I was just confusing pdata and pasid, but I am still
> > > > > confused about testing pdata.  We call ioasid_alloc() with private =
> > > > > NULL, and I don't see any of your patches calling ioasid_set_data() to
> > > > > change the private data after allocation, so how could this ever be
> > > > > set?  Should this just be a BUG_ON(pdata) as the integrity of the
> > > > > system is in question should this state ever occur?  Thanks,
> > > >
> > > > ioasid_set_data() was called  in one patch from Jacob's vSVA patchset.
> > > > [PATCH v6 08/10] iommu/vt-d: Add bind guest PASID support
> > > > https://lkml.org/lkml/2019/10/22/946
> > > >
> > > > The basic idea is to allocate pasid with private=NULL, and set it when the
> > > > pasid is actually bind to a device (bind_gpasid()). Each bind_gpasid() will
> > > > increase the ref_cnt in the private data, and each unbind_gpasid() will
> > > > decrease the ref_cnt. So if bind/unbind_gpasid() is called in mirror, the
> > > > private data should be null when comes to free operation. If not, vfio can
> > > > believe that the pasid is still in use.
> > >
> > > So this is another opportunity to leak pasids.  What's a user supposed
> > > to do when their attempt to free a pasid fails?  It invites leaks to
> > > allow this path to fail.  Thanks,
> >
> > Agreed, may no need to fail pasid free as it may leak pasid. How about
> > always let free successful? If the ref_cnt is non-zero, notify the remaining
> > users to release their reference.
> 
> If a user frees an PASID, they've done their due diligence in
> indicating it's no longer used.  The kernel should handle reclaiming it
> from that point.  Thanks,

Yes, I've aligned with Jacob offline. Will free PASID per requested, no fail. Jacob
will help to add notifications in ioasid.

> Alex

Thanks,
Yi Liu
Yi Liu Nov. 25, 2019, 8:32 a.m. UTC | #12
Hi Alex,

I think the major opens in this thread are the pasid allocation limit for
each VM and pasid lifecycle management. For lifecycle management, we will
free pasid when userspace requests and reclaim pasids when VM is crashed.
For the pasid allocation limit, may have a tunable quota and scaled per
assigned device number. If no apparent issue, we may prepare a version
to see if it is workable.

Thanks,
Yi Liu

> From: Jacob Pan [mailto:jacob.jun.pan@linux.intel.com]
> Sent: Thursday, November 14, 2019 3:45 AM
> To: Alex Williamson <alex.williamson@redhat.com>
> Cc: Liu, Yi L <yi.l.liu@intel.com>; eric.auger@redhat.com; Tian, Kevin
> <kevin.tian@intel.com>; joro@8bytes.org; Raj, Ashok <ashok.raj@intel.com>; Tian,
> Jun J <jun.j.tian@intel.com>; Sun, Yi Y <yi.y.sun@intel.com>; jean-
> philippe.brucker@arm.com; peterx@redhat.com; iommu@lists.linux-foundation.org;
> kvm@vger.kernel.org; jacob.jun.pan@linux.intel.com
> Subject: Re: [RFC v2 2/3] vfio/type1: VFIO_IOMMU_PASID_REQUEST(alloc/free)
> 
> On Wed, 13 Nov 2019 08:29:40 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Wed, 13 Nov 2019 11:03:17 +0000
> > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> >
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Friday, November 8, 2019 11:15 PM
> > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> > > > VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > >
> > > > On Fri, 8 Nov 2019 12:23:41 +0000
> > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > >
> > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > Sent: Friday, November 8, 2019 6:07 AM
> > > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> > > > > > VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > > > >
> > > > > > On Wed, 6 Nov 2019 13:27:26 +0000
> > > > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > > > > >
> > > > > > > > From: Alex Williamson <alex.williamson@redhat.com>
> > > > > > > > Sent: Wednesday, November 6, 2019 7:36 AM
> > > > > > > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > > Subject: Re: [RFC v2 2/3] vfio/type1:
> > > > VFIO_IOMMU_PASID_REQUEST(alloc/free)
> > > > > > > >
> > > > > > > > On Thu, 24 Oct 2019 08:26:22 -0400
> > > > > > > > Liu Yi L <yi.l.liu@intel.com> wrote:
> > > > > > > >
> > > > > > > > > This patch adds VFIO_IOMMU_PASID_REQUEST ioctl which
> > > > > > > > > aims to passdown PASID allocation/free request from the
> > > > > > > > > virtual iommu. This is required to get PASID managed in
> > > > > > > > > system-wide.
> > > > > > > > >
> > > > > > > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > > > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > > > > > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > > > > > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > > > > > > ---
> > > > > > > > >  drivers/vfio/vfio_iommu_type1.c | 114
> > > > > > > > ++++++++++++++++++++++++++++++++++++++++
> > > > > > > > >  include/uapi/linux/vfio.h       |  25 +++++++++
> > > > > > > > >  2 files changed, 139 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > > > b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > > index cd8d3a5..3d73a7d 100644
> > > > > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > > > > @@ -2248,6 +2248,83 @@ static int
> > > > > > > > > vfio_cache_inv_fn(struct device *dev,
> > > > > > void
> > > > > > > > *data)
> > > > > > > > >  	return iommu_cache_invalidate(dc->domain, dev,
> > > > > > > > > &ustruct->info); }
> > > > > > > > >
> > > > > > > > > +static int vfio_iommu_type1_pasid_alloc(struct
> > > > > > > > > vfio_iommu *iommu,
> > > > > > > > > +					 int min_pasid,
> > > > > > > > > +					 int max_pasid)
> > > > > > > > > +{
> > > > > > > > > +	int ret;
> > > > > > > > > +	ioasid_t pasid;
> > > > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > > > +
> > > > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > > > +		ret = -EINVAL;
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > > +	mm = get_task_mm(current);
> > > > > > > > > +	/* Track ioasid allocation owner by mm */
> > > > > > > > > +	pasid = ioasid_alloc((struct ioasid_set *)mm,
> > > > > > > > > min_pasid,
> > > > > > > > > +				max_pasid, NULL);
> > > > > > > >
> > > > > > > > Are we sure we want to tie this to the task mm vs perhaps
> > > > > > > > the vfio_iommu pointer?
> > > > > > >
> > > > > > > Here we want to have a kind of per-VM mark, which can be
> > > > > > > used to do ownership check on whether a pasid is held by a
> > > > > > > specific VM. This is very important to prevent across VM
> > > > > > > affect. vfio_iommu pointer is competent for vfio as vfio is
> > > > > > > both pasid alloc requester and pasid consumer. e.g. vfio
> > > > > > > requests pasid alloc from ioasid and also it will invoke
> > > > > > > bind_gpasid(). vfio can either check ownership before
> > > > > > > invoking bind_gpasid() or pass vfio_iommu pointer to iommu
> > > > > > > driver. But in future, there may be other modules which are
> > > > > > > just consumers of pasid. And they also want to do ownership
> > > > > > > check for a pasid. Then, it would be hard for them as they
> > > > > > > are not the pasid alloc requester. So here better to have a
> > > > > > > system wide structure to perform as the per-VM mark. task
> > > > > > > mm looks to be much competent.
> > > > > >
> > > > > > Ok, so it's intentional to have a VM-wide token.  Elsewhere
> > > > > > in the type1 code (vfio_dma_do_map) we record the task_struct
> > > > > > per dma mapping so that we can get the task mm as needed.
> > > > > > Would the task_struct pointer provide any advantage?
> > > > >
> > > > > I think we may use task_struct pointer to make type1 code
> > > > > consistent. How do you think?
> > > >
> > > > If it has the same utility, sure.
> > >
> > > thanks, I'll make this change.
> > >
> > > > > > Also, an overall question, this provides userspace with pasid
> > > > > > alloc and free ioctls, (1) what prevents a userspace process
> > > > > > from consuming every available pasid, and (2) if the process
> > > > > > exits or crashes without freeing pasids, how are they
> > > > > > recovered aside from a reboot?
> > > > >
> > > > > For question (1), I think we only need to take care about
> > > > > malicious userspace process. As vfio usage is under privilege
> > > > > mode, so we may be safe on it so far.
> > > >
> > > > No, where else do we ever make this assumption?  vfio requires a
> > > > privileged entity to configure the system for vfio, bind devices
> > > > for user use, and grant those devices to the user, but the usage
> > > > of the device is always assumed to be by an unprivileged user.
> > > > It is absolutely not acceptable require a privileged user.  It's
> > > > vfio's responsibility to protect the system from the user.
> > >
> > > My assumption is not precise here. sorry for it... Maybe to further
> > > check with you to better understand your point. I think the user
> > > (QEMU) of vfio needs to have a root permission. Thus it can open
> > > the vfio fds. At this point, the user is a privileged one. Also I
> > > guess that's why vfio can grant the user with the usage of
> > > VFIO_MAP/UNMAP to config mappings into iommu page tables. But I'm
> > > not quite sure when will the user be an unprivileged one.
> >
> > QEMU does NOT need to be run as root to use vfio.  This is NOT the
> > model libvirt follows.  libvirt grants a user access to a device, or
> > rather a set of one or more devices (ie. the group) via standard file
> > permission access to the group file (/dev/vfio/$GROUP).  Ownership of
> > a device allows the user permission to make use of the IOMMU.  The
> > user's ability to create DMA mappings is restricted by their process
> > locked memory limits, where libvirt elevates the user limit
> > sufficient for the size of the VM.  QEMU should never need to be run
> > as root and doing so is entirely unacceptable from a security
> > perspective.  The only mode of vfio that requires elevated privilege
> > for use is when making use of no-iommu, where we have no IOMMU
> > protection or translation.
> >
> > > > > However, we may need to introduce a kind of credit
> > > > > mechanism to protect it. I've thought it, but no good idea yet.
> > > > > Would be happy to hear from you.
> > > >
> > > > It's a limited system resource and it's unclear how many might
> > > > reasonably used by a user.  I don't have an easy answer.
> > >
> > > How about the below method? based on some offline chat with Jacob.
> > > a. some reasonable defaults for the initial per VM quota, e.g. 1000
> > > per process
> > > b. IOASID should be able to enforce per ioasid_set (it is kind of
> > > per VM mark) limit
> >
> > We support large numbers of assigned devices, how many IOASIDs might
> > be reasonably used per device?  Is the mm or the task still the
> > correct "set" in this scenario?  I don't have any better ideas than
> > setting a limit, but it probably needs a kernel or module tunable,
> > and it needs to match the scaling we expect to see when multiple
> > devices are involved.
> >
> I think mm/task is still the correct set in that we try to prevent
> abuse based on mm not device. Or we need to have some notion of super
> container (Ashok proposed a while ago) that maps to a VM.
> 
> I am guessing you are suggesting the per mm quota should also be
> scaled against number of devices assigned. I think that is very
> reasonable. Perhaps we can do:
> 
> 1. A tunable per iommu group PASID quota with a default of 1000, e.g.
> nr_pasid_per_group. Since we are dealing with nested translation and
> each device has its own second level so we pretty much have one device
> per group. Call it nr_pasid_per_group?
> 
> 2. Limit number of PASIDs per VM with nr_pasid_per_group * nr_groups.
> Probably update the limit when group is added to a container with the
> same mm.
> 
> I guess we could also use a cgroup controller for PASIDs.
> 
> > > > > For question (2), I think we need to reclaim the allocated
> > > > > pasids when the vfio container fd is released just like what
> > > > > vfio does to the domain mappings. I didn't add it yet. But I
> > > > > can add it in next version if you think it would make the pasid
> > > > > alloc/free be much sound.
> > > >
> > > > Consider it required, the interface is susceptible to abuse
> > > > without it.
> > >
> > > sure, let me add it in next version.
> > >
> > > > > > > > > +	if (pasid == INVALID_IOASID) {
> > > > > > > > > +		ret = -ENOSPC;
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > > +	ret = pasid;
> > > > > > > > > +out_unlock:
> > > > > > > > > +	mutex_unlock(&iommu->lock);
> > > > > >
> > > > > > What does holding this lock protect?  That the vfio_iommu
> > > > > > remains backed by an iommu during this operation, even though
> > > > > > we don't do anything to release allocated pasids when that
> > > > > > iommu backing is removed?
> > > > >
> > > > > yes, it is unnecessary to hold the lock here. At least for the
> > > > > operations in this patch. will remove it. :-)
> > > > >
> > > > > > > > > +	if (mm)
> > > > > > > > > +		mmput(mm);
> > > > > > > > > +	return ret;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int vfio_iommu_type1_pasid_free(struct
> > > > > > > > > vfio_iommu *iommu,
> > > > > > > > > +				       unsigned int
> > > > > > > > > pasid) +{
> > > > > > > > > +	struct mm_struct *mm = NULL;
> > > > > > > > > +	void *pdata;
> > > > > > > > > +	int ret = 0;
> > > > > > > > > +
> > > > > > > > > +	mutex_lock(&iommu->lock);
> > > > > > > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > > > > > > +		ret = -EINVAL;
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > > +
> > > > > > > > > +	/**
> > > > > > > > > +	 * REVISIT:
> > > > > > > > > +	 * There are two cases free could fail:
> > > > > > > > > +	 * 1. free pasid by non-owner, we use
> > > > > > > > > ioasid_set to track mm, if
> > > > > > > > > +	 * the set does not match, caller is not
> > > > > > > > > permitted to free.
> > > > > > > > > +	 * 2. free before unbind all devices, we can
> > > > > > > > > check if ioasid private
> > > > > > > > > +	 * data, if data != NULL, then fail to free.
> > > > > > > > > +	 */
> > > > > > > > > +	mm = get_task_mm(current);
> > > > > > > > > +	pdata = ioasid_find((struct ioasid_set *)mm,
> > > > > > > > > pasid, NULL);
> > > > > > > > > +	if (IS_ERR(pdata)) {
> > > > > > > > > +		if (pdata == ERR_PTR(-ENOENT))
> > > > > > > > > +			pr_err("PASID %u is not
> > > > > > > > > allocated\n", pasid);
> > > > > > > > > +		else if (pdata == ERR_PTR(-EACCES))
> > > > > > > > > +			pr_err("Free PASID %u by
> > > > > > > > > non-owner, denied",
> > > > pasid);
> > > > > > > > > +		else
> > > > > > > > > +			pr_err("Error searching PASID
> > > > > > > > > %u\n", pasid);
> > > > > > > >
> > > > > > > > This should be removed, errno is sufficient for the user,
> > > > > > > > this just provides the user with a trivial DoS vector
> > > > > > > > filling logs.
> > > > > > >
> > > > > > > sure, will fix it. thanks.
> > > > > > >
> > > > > > > > > +		ret = -EPERM;
> > > > > > > >
> > > > > > > > But why not return PTR_ERR(pdata)?
> > > > > > >
> > > > > > > aha, would do it.
> > > > > > >
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > > +	if (pdata) {
> > > > > > > > > +		pr_debug("Cannot free pasid %d with
> > > > > > > > > private data\n", pasid);
> > > > > > > > > +		/* Expect PASID has no private data if
> > > > > > > > > not bond */
> > > > > > > > > +		ret = -EBUSY;
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > > +	ioasid_free(pasid);
> > > > > > > >
> > > > > > > > We only ever get here with pasid == NULL?!
> > > > > > >
> > > > > > > I guess you meant only when pdata==NULL.
> > > > > > >
> > > > > > > > Something is wrong.  Should
> > > > > > > > that be 'if (!pdata)'?  (which also makes that pr_debug
> > > > > > > > another DoS vector)
> > > > > > >
> > > > > > > Oh, yes, just do it as below:
> > > > > > >
> > > > > > > if (!pdata) {
> > > > > > > 	ioasid_free(pasid);
> > > > > > > 	ret = SUCCESS;
> > > > > > > } else
> > > > > > > 	ret = -EBUSY;
> > > > > > >
> > > > > > > Is it what you mean?
> > > > > >
> > > > > > No, I think I was just confusing pdata and pasid, but I am
> > > > > > still confused about testing pdata.  We call ioasid_alloc()
> > > > > > with private = NULL, and I don't see any of your patches
> > > > > > calling ioasid_set_data() to change the private data after
> > > > > > allocation, so how could this ever be set?  Should this just
> > > > > > be a BUG_ON(pdata) as the integrity of the system is in
> > > > > > question should this state ever occur?  Thanks,
> > > > >
> > > > > ioasid_set_data() was called  in one patch from Jacob's vSVA
> > > > > patchset. [PATCH v6 08/10] iommu/vt-d: Add bind guest PASID
> > > > > support https://lkml.org/lkml/2019/10/22/946
> > > > >
> > > > > The basic idea is to allocate pasid with private=NULL, and set
> > > > > it when the pasid is actually bind to a device (bind_gpasid()).
> > > > > Each bind_gpasid() will increase the ref_cnt in the private
> > > > > data, and each unbind_gpasid() will decrease the ref_cnt. So if
> > > > > bind/unbind_gpasid() is called in mirror, the private data
> > > > > should be null when comes to free operation. If not, vfio can
> > > > > believe that the pasid is still in use.
> > > >
> > > > So this is another opportunity to leak pasids.  What's a user
> > > > supposed to do when their attempt to free a pasid fails?  It
> > > > invites leaks to allow this path to fail.  Thanks,
> > >
> > > Agreed, may no need to fail pasid free as it may leak pasid. How
> > > about always let free successful? If the ref_cnt is non-zero,
> > > notify the remaining users to release their reference.
> >
> > If a user frees an PASID, they've done their due diligence in
> > indicating it's no longer used.  The kernel should handle reclaiming
> > it from that point.  Thanks,
> 
> Yeah, I think we can add a atomic notifier for each PASID.
> Consumers such as IOMMU driver and KVM gets notified when IOASID is
> freed by VFIO. IOMMU driver can do the unbind and tear down.
> 
> In case of all the users already did unbind() before ioasid_free(), the
> free will proceed as usual.
> 
> > Alex
> >
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index cd8d3a5..3d73a7d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2248,6 +2248,83 @@  static int vfio_cache_inv_fn(struct device *dev, void *data)
 	return iommu_cache_invalidate(dc->domain, dev, &ustruct->info);
 }
 
+static int vfio_iommu_type1_pasid_alloc(struct vfio_iommu *iommu,
+					 int min_pasid,
+					 int max_pasid)
+{
+	int ret;
+	ioasid_t pasid;
+	struct mm_struct *mm = NULL;
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	mm = get_task_mm(current);
+	/* Track ioasid allocation owner by mm */
+	pasid = ioasid_alloc((struct ioasid_set *)mm, min_pasid,
+				max_pasid, NULL);
+	if (pasid == INVALID_IOASID) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+	ret = pasid;
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	if (mm)
+		mmput(mm);
+	return ret;
+}
+
+static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu,
+				       unsigned int pasid)
+{
+	struct mm_struct *mm = NULL;
+	void *pdata;
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/**
+	 * REVISIT:
+	 * There are two cases free could fail:
+	 * 1. free pasid by non-owner, we use ioasid_set to track mm, if
+	 * the set does not match, caller is not permitted to free.
+	 * 2. free before unbind all devices, we can check if ioasid private
+	 * data, if data != NULL, then fail to free.
+	 */
+	mm = get_task_mm(current);
+	pdata = ioasid_find((struct ioasid_set *)mm, pasid, NULL);
+	if (IS_ERR(pdata)) {
+		if (pdata == ERR_PTR(-ENOENT))
+			pr_err("PASID %u is not allocated\n", pasid);
+		else if (pdata == ERR_PTR(-EACCES))
+			pr_err("Free PASID %u by non-owner, denied", pasid);
+		else
+			pr_err("Error searching PASID %u\n", pasid);
+		ret = -EPERM;
+		goto out_unlock;
+	}
+	if (pdata) {
+		pr_debug("Cannot free pasid %d with private data\n", pasid);
+		/* Expect PASID has no private data if not bond */
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	ioasid_free(pasid);
+
+out_unlock:
+	if (mm)
+		mmput(mm);
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -2370,6 +2447,43 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 					    &ustruct);
 		mutex_unlock(&iommu->lock);
 		return ret;
+
+	} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
+		struct vfio_iommu_type1_pasid_request req;
+		int min_pasid, max_pasid, pasid;
+
+		minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
+				    flag);
+
+		if (copy_from_user(&req, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (req.argsz < minsz)
+			return -EINVAL;
+
+		switch (req.flag) {
+		/**
+		 * TODO: min_pasid and max_pasid align with
+		 * typedef unsigned int ioasid_t
+		 */
+		case VFIO_IOMMU_PASID_ALLOC:
+			if (copy_from_user(&min_pasid,
+				(void __user *)arg + minsz, sizeof(min_pasid)))
+				return -EFAULT;
+			if (copy_from_user(&max_pasid,
+				(void __user *)arg + minsz + sizeof(min_pasid),
+				sizeof(max_pasid)))
+				return -EFAULT;
+			return vfio_iommu_type1_pasid_alloc(iommu,
+						min_pasid, max_pasid);
+		case VFIO_IOMMU_PASID_FREE:
+			if (copy_from_user(&pasid,
+				(void __user *)arg + minsz, sizeof(pasid)))
+				return -EFAULT;
+			return vfio_iommu_type1_pasid_free(iommu, pasid);
+		default:
+			return -EINVAL;
+		}
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ccf60a2..04de290 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -807,6 +807,31 @@  struct vfio_iommu_type1_cache_invalidate {
 };
 #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
 
+/*
+ * @flag=VFIO_IOMMU_PASID_ALLOC, refer to the @min_pasid and @max_pasid fields
+ * @flag=VFIO_IOMMU_PASID_FREE, refer to @pasid field
+ */
+struct vfio_iommu_type1_pasid_request {
+	__u32	argsz;
+#define VFIO_IOMMU_PASID_ALLOC	(1 << 0)
+#define VFIO_IOMMU_PASID_FREE	(1 << 1)
+	__u32	flag;
+	union {
+		struct {
+			int min_pasid;
+			int max_pasid;
+		};
+		int pasid;
+	};
+};
+
+/**
+ * VFIO_IOMMU_PASID_REQUEST - _IOWR(VFIO_TYPE, VFIO_BASE + 27,
+ *				struct vfio_iommu_type1_pasid_request)
+ *
+ */
+#define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 27)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*