[v1,6/8] vfio/type1: Bind guest page tables to host

Message ID 1584880325-10561-7-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Series vfio: expose virtual Shared Virtual Addressing to VMs

Commit Message

Yi Liu March 22, 2020, 12:32 p.m. UTC
From: Liu Yi L <yi.l.liu@intel.com>

VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware
IOMMUs that have nesting DMA translation (a.k.a. dual-stage address
translation). For such hardware IOMMUs, there are two stages/levels of
address translation, and software may let userspace/VM own the first-
level/stage-1 translation structures. An example of such usage is vSVA
(virtual Shared Virtual Addressing): the VM owns the first-level/stage-1
translation structures and binds them to the host, and the hardware
IOMMU then uses nesting translation when translating DMA for the
devices behind it.

This patch adds VFIO support for binding the guest translation (a.k.a.
stage-1) structures to the host IOMMU. For VFIO_TYPE1_NESTING_IOMMU,
binding the guest page table is not enough; an interface must also be
exposed to the guest for IOMMU cache invalidation when the guest
modifies the first-level/stage-1 translation structures, since the
hardware needs to be notified to flush stale IOTLBs. That interface is
introduced in the next patch.

In this patch, guest page table bind and unbind are done with the flags
VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL under the
VFIO_IOMMU_BIND ioctl; the bind/unbind data are conveyed in
struct iommu_gpasid_bind_data. Before binding a guest page table to the
host, the VM must have a PASID allocated by the host via
VFIO_IOMMU_PASID_REQUEST.

Binding guest translation structures (here, the guest page table) to
the host is the first step in setting up vSVA.
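
For illustration, a minimal sketch of the intended userspace flow, not
part of the patch itself. The vfio_iommu_type1_bind layout and the
VFIO_IOMMU_BIND_GUEST_PGTBL flag are defined by this patch; the layout
of struct iommu_gpasid_bind_data (beyond its leading version field and
hpasid) comes from the companion IOMMU UAPI series, so the helper below
is an assumption, not an established API:

	/*
	 * Hypothetical userspace sketch: "container" is an open VFIO
	 * container fd and "bd" was filled in by the VMM after a PASID
	 * was obtained through VFIO_IOMMU_PASID_REQUEST.
	 */
	#include <stddef.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/iommu.h>
	#include <linux/vfio.h>

	static int bind_guest_pgtbl(int container,
				    struct iommu_gpasid_bind_data *bd)
	{
		size_t argsz = offsetof(struct vfio_iommu_type1_bind, data) +
			       sizeof(*bd);
		struct vfio_iommu_type1_bind *bind = calloc(1, argsz);
		int ret;

		if (!bind)
			return -1;
		bind->argsz = argsz;
		bind->flags = VFIO_IOMMU_BIND_GUEST_PGTBL;
		memcpy(bind->data, bd, sizeof(*bd));
		ret = ioctl(container, VFIO_IOMMU_BIND, bind);
		free(bind);
		return ret;
	}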

Cc: Kevin Tian <kevin.tian@intel.com>
CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 drivers/vfio/vfio_iommu_type1.c | 158 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       |  46 ++++++++++++
 2 files changed, 204 insertions(+)

Comments

kernel test robot March 22, 2020, 6:10 p.m. UTC | #1
Hi Yi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on vfio/next]
[also build test ERROR on v5.6-rc6 next-20200320]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Liu-Yi-L/vfio-expose-virtual-Shared-Virtual-Addressing-to-VMs/20200322-213259
base:   https://github.com/awilliam/linux-vfio.git next
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gcc (GCC) 9.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=9.2.0 make.cross ARCH=arm64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/vfio/vfio_iommu_type1.c: In function 'vfio_iommu_get_stage1_format':
   drivers/vfio/vfio_iommu_type1.c:2300:4: error: 'DOMAIN_ATTR_PASID_FORMAT' undeclared (first use in this function)
    2300 |    DOMAIN_ATTR_PASID_FORMAT, &format)) {
         |    ^~~~~~~~~~~~~~~~~~~~~~~~
   drivers/vfio/vfio_iommu_type1.c:2300:4: note: each undeclared identifier is reported only once for each function it appears in
   drivers/vfio/vfio_iommu_type1.c: In function 'vfio_iommu_type1_ioctl':
   drivers/vfio/vfio_iommu_type1.c:2464:11: error: implicit declaration of function 'iommu_get_uapi_version' [-Werror=implicit-function-declaration]
    2464 |    return iommu_get_uapi_version();
         |           ^~~~~~~~~~~~~~~~~~~~~~
>> drivers/vfio/vfio_iommu_type1.c:2626:15: error: implicit declaration of function 'iommu_uapi_get_data_size' [-Werror=implicit-function-declaration]
    2626 |   data_size = iommu_uapi_get_data_size(
         |               ^~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/vfio/vfio_iommu_type1.c:2627:5: error: 'IOMMU_UAPI_BIND_GPASID' undeclared (first use in this function)
    2627 |     IOMMU_UAPI_BIND_GPASID, version);
         |     ^~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/iommu_uapi_get_data_size +2626 drivers/vfio/vfio_iommu_type1.c

  2446	
  2447	static long vfio_iommu_type1_ioctl(void *iommu_data,
  2448					   unsigned int cmd, unsigned long arg)
  2449	{
  2450		struct vfio_iommu *iommu = iommu_data;
  2451		unsigned long minsz;
  2452	
  2453		if (cmd == VFIO_CHECK_EXTENSION) {
  2454			switch (arg) {
  2455			case VFIO_TYPE1_IOMMU:
  2456			case VFIO_TYPE1v2_IOMMU:
  2457			case VFIO_TYPE1_NESTING_IOMMU:
  2458				return 1;
  2459			case VFIO_DMA_CC_IOMMU:
  2460				if (!iommu)
  2461					return 0;
  2462				return vfio_domains_have_iommu_cache(iommu);
  2463			case VFIO_NESTING_IOMMU_UAPI:
  2464				return iommu_get_uapi_version();
  2465			default:
  2466				return 0;
  2467			}
  2468		} else if (cmd == VFIO_IOMMU_GET_INFO) {
  2469			struct vfio_iommu_type1_info info;
  2470			struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
  2471			unsigned long capsz;
  2472			int ret;
  2473	
  2474			minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
  2475	
  2476			/* For backward compatibility, cannot require this */
  2477			capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
  2478	
  2479			if (copy_from_user(&info, (void __user *)arg, minsz))
  2480				return -EFAULT;
  2481	
  2482			if (info.argsz < minsz)
  2483				return -EINVAL;
  2484	
  2485			if (info.argsz >= capsz) {
  2486				minsz = capsz;
  2487				info.cap_offset = 0; /* output, no-recopy necessary */
  2488			}
  2489	
  2490			info.flags = VFIO_IOMMU_INFO_PGSIZES;
  2491	
  2492			info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
  2493	
  2494			ret = vfio_iommu_iova_build_caps(iommu, &caps);
  2495			if (ret)
  2496				return ret;
  2497	
  2498			ret = vfio_iommu_info_add_nesting_cap(iommu, &caps);
  2499			if (ret)
  2500				return ret;
  2501	
  2502			if (caps.size) {
  2503				info.flags |= VFIO_IOMMU_INFO_CAPS;
  2504	
  2505				if (info.argsz < sizeof(info) + caps.size) {
  2506					info.argsz = sizeof(info) + caps.size;
  2507				} else {
  2508					vfio_info_cap_shift(&caps, sizeof(info));
  2509					if (copy_to_user((void __user *)arg +
  2510							sizeof(info), caps.buf,
  2511							caps.size)) {
  2512						kfree(caps.buf);
  2513						return -EFAULT;
  2514					}
  2515					info.cap_offset = sizeof(info);
  2516				}
  2517	
  2518				kfree(caps.buf);
  2519			}
  2520	
  2521			return copy_to_user((void __user *)arg, &info, minsz) ?
  2522				-EFAULT : 0;
  2523	
  2524		} else if (cmd == VFIO_IOMMU_MAP_DMA) {
  2525			struct vfio_iommu_type1_dma_map map;
  2526			uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
  2527					VFIO_DMA_MAP_FLAG_WRITE;
  2528	
  2529			minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
  2530	
  2531			if (copy_from_user(&map, (void __user *)arg, minsz))
  2532				return -EFAULT;
  2533	
  2534			if (map.argsz < minsz || map.flags & ~mask)
  2535				return -EINVAL;
  2536	
  2537			return vfio_dma_do_map(iommu, &map);
  2538	
  2539		} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
  2540			struct vfio_iommu_type1_dma_unmap unmap;
  2541			long ret;
  2542	
  2543			minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
  2544	
  2545			if (copy_from_user(&unmap, (void __user *)arg, minsz))
  2546				return -EFAULT;
  2547	
  2548			if (unmap.argsz < minsz || unmap.flags)
  2549				return -EINVAL;
  2550	
  2551			ret = vfio_dma_do_unmap(iommu, &unmap);
  2552			if (ret)
  2553				return ret;
  2554	
  2555			return copy_to_user((void __user *)arg, &unmap, minsz) ?
  2556				-EFAULT : 0;
  2557	
  2558		} else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
  2559			struct vfio_iommu_type1_pasid_request req;
  2560			unsigned long offset;
  2561	
  2562			minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
  2563					    flags);
  2564	
  2565			if (copy_from_user(&req, (void __user *)arg, minsz))
  2566				return -EFAULT;
  2567	
  2568			if (req.argsz < minsz ||
  2569			    !vfio_iommu_type1_pasid_req_valid(req.flags))
  2570				return -EINVAL;
  2571	
  2572			if (copy_from_user((void *)&req + minsz,
  2573					   (void __user *)arg + minsz,
  2574					   sizeof(req) - minsz))
  2575				return -EFAULT;
  2576	
  2577			switch (req.flags & VFIO_PASID_REQUEST_MASK) {
  2578			case VFIO_IOMMU_PASID_ALLOC:
  2579			{
  2580				int ret = 0, result;
  2581	
  2582				result = vfio_iommu_type1_pasid_alloc(iommu,
  2583								req.alloc_pasid.min,
  2584								req.alloc_pasid.max);
  2585				if (result > 0) {
  2586					offset = offsetof(
  2587						struct vfio_iommu_type1_pasid_request,
  2588						alloc_pasid.result);
  2589					ret = copy_to_user(
  2590						      (void __user *) (arg + offset),
  2591						      &result, sizeof(result));
  2592				} else {
  2593					pr_debug("%s: PASID alloc failed\n", __func__);
  2594					ret = -EFAULT;
  2595				}
  2596				return ret;
  2597			}
  2598			case VFIO_IOMMU_PASID_FREE:
  2599				return vfio_iommu_type1_pasid_free(iommu,
  2600								   req.free_pasid);
  2601			default:
  2602				return -EINVAL;
  2603			}
  2604	
  2605		} else if (cmd == VFIO_IOMMU_BIND) {
  2606			struct vfio_iommu_type1_bind bind;
  2607			u32 version;
  2608			int data_size;
  2609			void *gbind_data;
  2610			int ret;
  2611	
  2612			minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
  2613	
  2614			if (copy_from_user(&bind, (void __user *)arg, minsz))
  2615				return -EFAULT;
  2616	
  2617			if (bind.argsz < minsz)
  2618				return -EINVAL;
  2619	
  2620			/* Get the version of struct iommu_gpasid_bind_data */
  2621			if (copy_from_user(&version,
  2622				(void __user *) (arg + minsz),
  2623						sizeof(version)))
  2624				return -EFAULT;
  2625	
> 2626			data_size = iommu_uapi_get_data_size(
> 2627					IOMMU_UAPI_BIND_GPASID, version);
  2628			gbind_data = kzalloc(data_size, GFP_KERNEL);
  2629			if (!gbind_data)
  2630				return -ENOMEM;
  2631	
  2632			if (copy_from_user(gbind_data,
  2633				 (void __user *) (arg + minsz), data_size)) {
  2634				kfree(gbind_data);
  2635				return -EFAULT;
  2636			}
  2637	
  2638			switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
  2639			case VFIO_IOMMU_BIND_GUEST_PGTBL:
  2640				ret = vfio_iommu_type1_bind_gpasid(iommu,
  2641								   gbind_data);
  2642				break;
  2643			case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
  2644				ret = vfio_iommu_type1_unbind_gpasid(iommu,
  2645								     gbind_data);
  2646				break;
  2647			default:
  2648				ret = -EINVAL;
  2649				break;
  2650			}
  2651			kfree(gbind_data);
  2652			return ret;
  2653		}
  2654	
  2655		return -ENOTTY;
  2656	}
  2657	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Tian, Kevin March 30, 2020, 12:46 p.m. UTC | #2
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Sunday, March 22, 2020 8:32 PM
> 
> From: Liu Yi L <yi.l.liu@intel.com>
> 
> VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
> hardware
> IOMMUs that have nesting DMA translation (a.k.a dual stage address
> translation). For such hardware IOMMUs, there are two stages/levels of
> address translation, and software may let userspace/VM to own the first-
> level/stage-1 translation structures. Example of such usage is vSVA (
> virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> translation structures and bind the structures to host, then hardware
> IOMMU would utilize nesting translation when doing DMA translation fo
> the devices behind such hardware IOMMU.
> 
> This patch adds vfio support for binding guest translation (a.k.a stage 1)
> structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only
> bind
> guest page table is needed, it also requires to expose interface to guest
> for iommu cache invalidation when guest modified the first-level/stage-1
> translation structures since hardware needs to be notified to flush stale
> iotlbs. This would be introduced in next patch.
> 
> In this patch, guest page table bind and unbind are done by using flags
> VFIO_IOMMU_BIND_GUEST_PGTBL and
> VFIO_IOMMU_UNBIND_GUEST_PGTBL under IOCTL
> VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> struct iommu_gpasid_bind_data. Before binding guest page table to host,
> VM should have got a PASID allocated by host via
> VFIO_IOMMU_PASID_REQUEST.
> 
> Bind guest translation structures (here is guest page table) to host

Bind -> Binding

> are the first step to setup vSVA (Virtual Shared Virtual Addressing).

are -> is. and you already explained vSVA earlier.

> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 158
> ++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       |  46 ++++++++++++
>  2 files changed, 204 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> index 82a9e0b..a877747 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -130,6 +130,33 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
> 
> +struct domain_capsule {
> +	struct iommu_domain *domain;
> +	void *data;
> +};
> +
> +/* iommu->lock must be held */
> +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> +		      int (*fn)(struct device *dev, void *data),
> +		      void *data)
> +{
> +	struct domain_capsule dc = {.data = data};
> +	struct vfio_domain *d;
> +	struct vfio_group *g;
> +	int ret = 0;
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		dc.domain = d->domain;
> +		list_for_each_entry(g, &d->group_list, next) {
> +			ret = iommu_group_for_each_dev(g->iommu_group,
> +						       &dc, fn);
> +			if (ret)
> +				break;
> +		}
> +	}
> +	return ret;
> +}
> +
>  static int put_pfn(unsigned long pfn, int prot);
> 
>  /*
> @@ -2314,6 +2341,88 @@ static int
> vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
>  	return 0;
>  }
> 
> +static int vfio_bind_gpasid_fn(struct device *dev, void *data)
> +{
> +	struct domain_capsule *dc = (struct domain_capsule *)data;
> +	struct iommu_gpasid_bind_data *gbind_data =
> +		(struct iommu_gpasid_bind_data *) dc->data;
> +

In Jacob's vSVA iommu series, [PATCH 06/11]:

+		/* REVISIT: upper layer/VFIO can track host process that bind the PASID.
+		 * ioasid_set = mm might be sufficient for vfio to check pasid VMM
+		 * ownership.
+		 */

I asked him who exactly should be responsible for tracking the pasid
ownership. Although there is no response yet, I expect vfio/iommu to
have a clear policy, documented here as well, to provide a consistent
message.

> +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
> +}
> +
> +static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
> +{
> +	struct domain_capsule *dc = (struct domain_capsule *)data;
> +	struct iommu_gpasid_bind_data *gbind_data =
> +		(struct iommu_gpasid_bind_data *) dc->data;
> +
> +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> +					gbind_data->hpasid);

Curious why we have to share the same bind_data structure
between bind and unbind, especially when unbind requires
only one field? I didn't see a clear reason, and it is also
unlike the earlier ALLOC/FREE, which don't share a structure.
The current way simply wastes space for the unbind operation...

> +}
> +
> +/**
> + * Unbind specific gpasid, caller of this function requires hold
> + * vfio_iommu->lock
> + */
> +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu
> *iommu,
> +				struct iommu_gpasid_bind_data *gbind_data)
> +{
> +	return vfio_iommu_for_each_dev(iommu,
> +				vfio_unbind_gpasid_fn, gbind_data);
> +}
> +
> +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> +				struct iommu_gpasid_bind_data *gbind_data)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = vfio_iommu_for_each_dev(iommu,
> +			vfio_bind_gpasid_fn, gbind_data);
> +	/*
> +	 * If bind failed, it may not be a total failure. Some devices
> +	 * within the iommu group may have bind successfully. Although
> +	 * we don't enable pasid capability for non-singletion iommu
> +	 * groups, a unbind operation would be helpful to ensure no
> +	 * partial binding for an iommu group.
> +	 */
> +	if (ret)
> +		/*
> +		 * Undo all binds that already succeeded, no need to

binds -> bindings

> +		 * check the return value here since some device within
> +		 * the group has no successful bind when coming to this
> +		 * place switch.
> +		 */

remove 'switch'

> +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> +
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> +				struct iommu_gpasid_bind_data *gbind_data)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> +
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
>  		default:
>  			return -EINVAL;
>  		}
> +
> +	} else if (cmd == VFIO_IOMMU_BIND) {

BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.

> +		struct vfio_iommu_type1_bind bind;
> +		u32 version;
> +		int data_size;
> +		void *gbind_data;
> +		int ret;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> +
> +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (bind.argsz < minsz)
> +			return -EINVAL;
> +
> +		/* Get the version of struct iommu_gpasid_bind_data */
> +		if (copy_from_user(&version,
> +			(void __user *) (arg + minsz),
> +					sizeof(version)))
> +			return -EFAULT;
> +
> +		data_size = iommu_uapi_get_data_size(
> +				IOMMU_UAPI_BIND_GPASID, version);
> +		gbind_data = kzalloc(data_size, GFP_KERNEL);
> +		if (!gbind_data)
> +			return -ENOMEM;
> +
> +		if (copy_from_user(gbind_data,
> +			 (void __user *) (arg + minsz), data_size)) {
> +			kfree(gbind_data);
> +			return -EFAULT;
> +		}
> +
> +		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
> +		case VFIO_IOMMU_BIND_GUEST_PGTBL:
> +			ret = vfio_iommu_type1_bind_gpasid(iommu,
> +							   gbind_data);
> +			break;
> +		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
> +			ret = vfio_iommu_type1_unbind_gpasid(iommu,
> +							     gbind_data);
> +			break;
> +		default:
> +			ret = -EINVAL;
> +			break;
> +		}
> +		kfree(gbind_data);
> +		return ret;
>  	}
> 
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ebeaf3e..2235bc6 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -14,6 +14,7 @@
> 
>  #include <linux/types.h>
>  #include <linux/ioctl.h>
> +#include <linux/iommu.h>
> 
>  #define VFIO_API_VERSION	0
> 
> @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
>   */
>  #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE +
> 22)
> 
> +/**
> + * Supported flags:
> + *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host
> for
> + *			nesting type IOMMUs. In @data field It takes struct
> + *			iommu_gpasid_bind_data.
> + *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page
> table operation
> + *			invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.
> + *
> + */
> +struct vfio_iommu_type1_bind {
> +	__u32		argsz;
> +	__u32		flags;
> +#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)
> +#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)
> +	__u8		data[];
> +};
> +
> +#define VFIO_IOMMU_BIND_MASK	(VFIO_IOMMU_BIND_GUEST_PGTBL
> | \
> +
> 	VFIO_IOMMU_UNBIND_GUEST_PGTBL)
> +
> +/**
> + * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
> + *				struct vfio_iommu_type1_bind)
> + *
> + * Manage address spaces of devices in this container. Initially a TYPE1
> + * container can only have one address space, managed with
> + * VFIO_IOMMU_MAP/UNMAP_DMA.

the last sentence seems irrelevant and more suitable in commit msg.

> + *
> + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by
> both MAP/UNMAP
> + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host)
> page
> + * tables, and BIND manages the stage-1 (guest) page tables. Other types of

Are "other types" the counterpart to VFIO_TYPE1_NESTING_IOMMU?
What are those types? I thought only NESTING_IOMMU allows two
stage translation...

> + * IOMMU may allow MAP/UNMAP and BIND to coexist, where

The first sentence said the same thing. Then what is the exact difference?

> MAP/UNMAP controls
> + * the traffics only require single stage translation while BIND controls the
> + * traffics require nesting translation. But this depends on the underlying
> + * IOMMU architecture and isn't guaranteed. Example of this is the guest
> SVA
> + * traffics, such traffics need nesting translation to gain gVA->gPA and then
> + * gPA->hPA translation.

I'm a bit confused about the content starting at "other types of". Is it
trying to state some exceptions/corner cases that this API cannot
resolve, or to explain the desired behavior of the API? Especially the
last example, which is worded as if it were the example for "isn't
guaranteed", but isn't guest SVA the main purpose of this API?

> + *
> + * Availability of this feature depends on the device, its bus, the underlying
> + * IOMMU and the CPU architecture.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
> 
>  /*
> --
> 2.7.4
Yi Liu April 1, 2020, 9:13 a.m. UTC | #3
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Monday, March 30, 2020 8:46 PM
> Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L <yi.l.liu@intel.com>
> >
> > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware
> > IOMMUs that have nesting DMA translation (a.k.a dual stage address
> > translation). For such hardware IOMMUs, there are two stages/levels of
> > address translation, and software may let userspace/VM to own the
> > first-
> > level/stage-1 translation structures. Example of such usage is vSVA (
> > virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> > translation structures and bind the structures to host, then hardware
> > IOMMU would utilize nesting translation when doing DMA translation fo
> > the devices behind such hardware IOMMU.
> >
> > This patch adds vfio support for binding guest translation (a.k.a
> > stage 1) structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU,
> > not only bind guest page table is needed, it also requires to expose
> > interface to guest for iommu cache invalidation when guest modified
> > the first-level/stage-1 translation structures since hardware needs to
> > be notified to flush stale iotlbs. This would be introduced in next
> > patch.
> >
> > In this patch, guest page table bind and unbind are done by using
> > flags VFIO_IOMMU_BIND_GUEST_PGTBL and
> VFIO_IOMMU_UNBIND_GUEST_PGTBL
> > under IOCTL VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> > struct iommu_gpasid_bind_data. Before binding guest page table to
> > host, VM should have got a PASID allocated by host via
> > VFIO_IOMMU_PASID_REQUEST.
> >
> > Bind guest translation structures (here is guest page table) to host
> 
> Bind -> Binding
got it.
> > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> 
> are -> is. and you already explained vSVA earlier.
oh yes, it is.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 158
> > ++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       |  46 ++++++++++++
> >  2 files changed, 204 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -130,6 +130,33 @@ struct vfio_regions {
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >  					(!list_empty(&iommu->domain_list))
> >
> > +struct domain_capsule {
> > +	struct iommu_domain *domain;
> > +	void *data;
> > +};
> > +
> > +/* iommu->lock must be held */
> > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > +		      int (*fn)(struct device *dev, void *data),
> > +		      void *data)
> > +{
> > +	struct domain_capsule dc = {.data = data};
> > +	struct vfio_domain *d;
> > +	struct vfio_group *g;
> > +	int ret = 0;
> > +
> > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > +		dc.domain = d->domain;
> > +		list_for_each_entry(g, &d->group_list, next) {
> > +			ret = iommu_group_for_each_dev(g->iommu_group,
> > +						       &dc, fn);
> > +			if (ret)
> > +				break;
> > +		}
> > +	}
> > +	return ret;
> > +}
> > +
> >  static int put_pfn(unsigned long pfn, int prot);
> >
> >  /*
> > @@ -2314,6 +2341,88 @@ static int
> > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> >  	return 0;
> >  }
> >
> > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) {
> > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > +	struct iommu_gpasid_bind_data *gbind_data =
> > +		(struct iommu_gpasid_bind_data *) dc->data;
> > +
> 
> In Jacob's vSVA iommu series, [PATCH 06/11]:
> 
> +		/* REVISIT: upper layer/VFIO can track host process that bind the
> PASID.
> +		 * ioasid_set = mm might be sufficient for vfio to check pasid VMM
> +		 * ownership.
> +		 */
> 
> I asked him who exactly should be responsible for tracking the pasid ownership.
> Although no response yet, I expect vfio/iommu can have a clear policy and also
> documented here to provide consistent message.

yep.

> > +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); }
> > +
> > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) {
> > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > +	struct iommu_gpasid_bind_data *gbind_data =
> > +		(struct iommu_gpasid_bind_data *) dc->data;
> > +
> > +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> > +					gbind_data->hpasid);
> 
> curious why we have to share the same bind_data structure between bind and
> unbind, especially when unbind requires only one field? I didn't see a clear reason,
> and just similar to earlier ALLOC/FREE which don't share structure either.
> Current way simply wastes space for unbind operation...

no special reason today. But gIOVA support over nested translation is
planned, and it may require a flag, as the guest iommu driver may use a
single PASID value (RID2PASID) for all devices on the guest side,
especially if the RID2PASID value used for IOVA is the same as on the
host side. So adding a flag to indicate that a binding is for IOVA is
helpful. For PF/VF, the iommu driver just binds with the host side's
RID2PASID, while for ADI (Assignable Device Interface), the vfio layer
needs to figure out the default PASID stored in the aux-domain, and the
iommu driver then binds the gIOVA table to that default PASID. The
potential flag is required in both the bind and unbind paths, so it
would be better to share the structure.
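
To make that concrete, a purely hypothetical sketch of such a flag;
the name and bit position are placeholders, not from any posted patch:

	/* Hypothetical, not in this series: mark a (un)bind as
	 * targeting the guest IOVA (RID2PASID) table, so both the
	 * bind and unbind paths can special-case it (e.g. resolving
	 * the aux-domain default PASID for ADI devices).
	 */
	#define IOMMU_GPASID_BIND_FLAG_GIOVA	(1ULL << 0)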

> > +}
> > +
> > +/**
> > + * Unbind specific gpasid, caller of this function requires hold
> > + * vfio_iommu->lock
> > + */
> > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu
> > *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data) {
> > +	return vfio_iommu_for_each_dev(iommu,
> > +				vfio_unbind_gpasid_fn, gbind_data); }
> > +
> > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data) {
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	ret = vfio_iommu_for_each_dev(iommu,
> > +			vfio_bind_gpasid_fn, gbind_data);
> > +	/*
> > +	 * If bind failed, it may not be a total failure. Some devices
> > +	 * within the iommu group may have bind successfully. Although
> > +	 * we don't enable pasid capability for non-singletion iommu
> > +	 * groups, a unbind operation would be helpful to ensure no
> > +	 * partial binding for an iommu group.
> > +	 */
> > +	if (ret)
> > +		/*
> > +		 * Undo all binds that already succeeded, no need to
> 
> binds -> bindings
got it.
> 
> > +		 * check the return value here since some device within
> > +		 * the group has no successful bind when coming to this
> > +		 * place switch.
> > +		 */
> 
> remove 'switch'
oh, yes.

> 
> > +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> > +
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data) {
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> > +
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)  { @@ -
> 2471,6 +2580,55 @@
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  		default:
> >  			return -EINVAL;
> >  		}
> > +
> > +	} else if (cmd == VFIO_IOMMU_BIND) {
> 
> BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.

Emm, it's up to the flags to indicate what to bind. It was proposed to
cover the three cases below:
a) BIND/UNBIND_GPASID
b) BIND/UNBIND_GPASID_TABLE
c) BIND/UNBIND_PROCESS
<only a) is covered in this patch>
So it's called VFIO_IOMMU_BIND.
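
As an illustrative sketch of that flag space (only the first pair is
defined by this patch; the commented names are placeholders for the
planned cases, not from any posted patch):

	#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)  /* case a), this patch */
	#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)  /* case a), this patch */
	/* hypothetical future flags:
	 *   VFIO_IOMMU_{BIND,UNBIND}_GPASID_TABLE  for case b)
	 *   VFIO_IOMMU_{BIND,UNBIND}_PROCESS       for case c)
	 */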

> 
> > +		struct vfio_iommu_type1_bind bind;
> > +		u32 version;
> > +		int data_size;
> > +		void *gbind_data;
> > +		int ret;
> > +
> > +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> > +
> > +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> > +			return -EFAULT;
> > +
> > +		if (bind.argsz < minsz)
> > +			return -EINVAL;
> > +
> > +		/* Get the version of struct iommu_gpasid_bind_data */
> > +		if (copy_from_user(&version,
> > +			(void __user *) (arg + minsz),
> > +					sizeof(version)))
> > +			return -EFAULT;
> > +
> > +		data_size = iommu_uapi_get_data_size(
> > +				IOMMU_UAPI_BIND_GPASID, version);
> > +		gbind_data = kzalloc(data_size, GFP_KERNEL);
> > +		if (!gbind_data)
> > +			return -ENOMEM;
> > +
> > +		if (copy_from_user(gbind_data,
> > +			 (void __user *) (arg + minsz), data_size)) {
> > +			kfree(gbind_data);
> > +			return -EFAULT;
> > +		}
> > +
> > +		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
> > +		case VFIO_IOMMU_BIND_GUEST_PGTBL:
> > +			ret = vfio_iommu_type1_bind_gpasid(iommu,
> > +							   gbind_data);
> > +			break;
> > +		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
> > +			ret = vfio_iommu_type1_unbind_gpasid(iommu,
> > +							     gbind_data);
> > +			break;
> > +		default:
> > +			ret = -EINVAL;
> > +			break;
> > +		}
> > +		kfree(gbind_data);
> > +		return ret;
> >  	}
> >
> >  	return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index ebeaf3e..2235bc6 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <linux/types.h>
> >  #include <linux/ioctl.h>
> > +#include <linux/iommu.h>
> >
> >  #define VFIO_API_VERSION	0
> >
> > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
> >   */
> >  #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE +
> > 22)
> >
> > +/**
> > + * Supported flags:
> > + *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host
> > for
> > + *			nesting type IOMMUs. In @data field It takes struct
> > + *			iommu_gpasid_bind_data.
> > + *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page
> > table operation
> > + *			invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.
> > + *
> > + */
> > +struct vfio_iommu_type1_bind {
> > +	__u32		argsz;
> > +	__u32		flags;
> > +#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)
> > +#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)
> > +	__u8		data[];
> > +};
> > +
> > +#define VFIO_IOMMU_BIND_MASK	(VFIO_IOMMU_BIND_GUEST_PGTBL
> > | \
> > +
> > 	VFIO_IOMMU_UNBIND_GUEST_PGTBL)
> > +
> > +/**
> > + * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
> > + *				struct vfio_iommu_type1_bind)
> > + *
> > + * Manage address spaces of devices in this container. Initially a
> > +TYPE1
> > + * container can only have one address space, managed with
> > + * VFIO_IOMMU_MAP/UNMAP_DMA.
> 
> the last sentence seems irrelevant and more suitable in commit msg.

oh, I could remove it.

> > + *
> > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by
> > both MAP/UNMAP
> > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2
> > + (host)
> > page
> > + * tables, and BIND manages the stage-1 (guest) page tables. Other
> > + types of
> 
> Are "other types" the counterpart to VFIO_TYPE1_NESTING_IOMMU?
> What are those types? I thought only NESTING_IOMMU allows two stage
> translation...

it's a mistake... please ignore this message. I will correct it in the next version.

> 
> > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where
> 
> The first sentence said the same thing. Then what is the exact difference?

this sentence was added by mistake. I will correct it.

> 
> > MAP/UNMAP controls
> > + * the traffics only require single stage translation while BIND
> > + controls the
> > + * traffics require nesting translation. But this depends on the
> > + underlying
> > + * IOMMU architecture and isn't guaranteed. Example of this is the
> > + guest
> > SVA
> > + * traffics, such traffics need nesting translation to gain gVA->gPA
> > + and then
> > + * gPA->hPA translation.
> 
> I'm a bit confused about the content since "other types of". Are they trying to state
> some exceptions/corner cases that this API cannot resolve or explain the desired
> behavior of the API? Especially the last example, which is worded as if the example
> for "isn't guaranteed"
> but isn't guest SVA the main purpose of this API?
> 
I think the description in the original patch is bad, especially the
"other types" phrase. How about the description below?

/**
 * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
 *				struct vfio_iommu_type1_bind)
 *
 * Manage address spaces of devices in this container when it's an IOMMU
 * of type VFIO_TYPE1_NESTING_IOMMU. Such type IOMMU allows MAP/UNMAP and
 * BIND to coexist, where MAP/UNMAP controls the traffics only require
 * single stage translation while BIND controls the traffics require nesting
 * translation.
 *
 * Availability of this feature depends on the device, its bus, the underlying
 * IOMMU and the CPU architecture.
 *
 * returns: 0 on success, -errno on failure.
 */

Regards,
Yi Liu
Tian, Kevin April 2, 2020, 2:12 a.m. UTC | #4
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, April 1, 2020 5:13 PM
> 
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Monday, March 30, 2020 8:46 PM
> > Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Sunday, March 22, 2020 8:32 PM
> > >
> > > From: Liu Yi L <yi.l.liu@intel.com>
> > >
> > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
> hardware
> > > IOMMUs that have nesting DMA translation (a.k.a dual stage address
> > > translation). For such hardware IOMMUs, there are two stages/levels of
> > > address translation, and software may let userspace/VM to own the
> > > first-
> > > level/stage-1 translation structures. Example of such usage is vSVA (
> > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> > > translation structures and bind the structures to host, then hardware
> > > IOMMU would utilize nesting translation when doing DMA translation fo
> > > the devices behind such hardware IOMMU.
> > >
> > > This patch adds vfio support for binding guest translation (a.k.a
> > > stage 1) structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU,
> > > not only bind guest page table is needed, it also requires to expose
> > > interface to guest for iommu cache invalidation when guest modified
> > > the first-level/stage-1 translation structures since hardware needs to
> > > be notified to flush stale iotlbs. This would be introduced in next
> > > patch.
> > >
> > > In this patch, guest page table bind and unbind are done by using
> > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and
> > VFIO_IOMMU_UNBIND_GUEST_PGTBL
> > > under IOCTL VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> > > struct iommu_gpasid_bind_data. Before binding guest page table to
> > > host, VM should have got a PASID allocated by host via
> > > VFIO_IOMMU_PASID_REQUEST.
> > >
> > > Bind guest translation structures (here is guest page table) to host
> >
> > Bind -> Binding
> got it.
> > > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> >
> > are -> is. and you already explained vSVA earlier.
> oh yes, it is.
> > >
> > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > Cc: Alex Williamson <alex.williamson@redhat.com>
> > > Cc: Eric Auger <eric.auger@redhat.com>
> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 158
> > > ++++++++++++++++++++++++++++++++++++++++
> > >  include/uapi/linux/vfio.h       |  46 ++++++++++++
> > >  2 files changed, 204 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -130,6 +130,33 @@ struct vfio_regions {
> > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > >  					(!list_empty(&iommu->domain_list))
> > >
> > > +struct domain_capsule {
> > > +	struct iommu_domain *domain;
> > > +	void *data;
> > > +};
> > > +
> > > +/* iommu->lock must be held */
> > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > > +		      int (*fn)(struct device *dev, void *data),
> > > +		      void *data)
> > > +{
> > > +	struct domain_capsule dc = {.data = data};
> > > +	struct vfio_domain *d;
> > > +	struct vfio_group *g;
> > > +	int ret = 0;
> > > +
> > > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > > +		dc.domain = d->domain;
> > > +		list_for_each_entry(g, &d->group_list, next) {
> > > +			ret = iommu_group_for_each_dev(g->iommu_group,
> > > +						       &dc, fn);
> > > +			if (ret)
> > > +				break;
> > > +		}
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > >  static int put_pfn(unsigned long pfn, int prot);
> > >
> > >  /*
> > > @@ -2314,6 +2341,88 @@ static int
> > > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > >  	return 0;
> > >  }
> > >
> > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) {
> > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > +
> >
> > In Jacob's vSVA iommu series, [PATCH 06/11]:
> >
> > +		/* REVISIT: upper layer/VFIO can track host process that bind
> the
> > PASID.
> > +		 * ioasid_set = mm might be sufficient for vfio to check pasid
> VMM
> > +		 * ownership.
> > +		 */
> >
> > I asked him who exactly should be responsible for tracking the pasid
> ownership.
> > Although no response yet, I expect vfio/iommu can have a clear policy and
> also
> > documented here to provide consistent message.
> 
> yep.
> 
> > > +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); }
> > > +
> > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) {
> > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > +
> > > +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> > > +					gbind_data->hpasid);
> >
> > curious why we have to share the same bind_data structure between bind
> and
> > unbind, especially when unbind requires only one field? I didn't see a clear
> reason,
> > and just similar to earlier ALLOC/FREE which don't share structure either.
> > Current way simply wastes space for unbind operation...
> 
> no special reason today. But the gIOVA support over nested translation
> is in plan, it may require a flag to indicate it as guest iommu driver
> may user a single PASID value(RID2PASID) for all devices in guest side.
> Especially if the RID2PASID value used for IOVA the the same with host
> side. So adding a flag to indicate the binding is for IOVA is helpful.
> For PF/VF, iommu driver just bind with the host side's RID2PASID. While
> for ADI (Assignable Device Interface),  vfio layer needs to figure out
> the default PASID stored in the aux-domain, and then iommu driver bind
> gIOVA table to the default PASID. The potential flag is required in both
> bind and unbind path. As such, it would be better to share the structure.

I'm fine with it if you are pretty sure that more extensions will be
required in the future, though I didn't fully understand the
explanations above.
Yi Liu April 2, 2020, 8:05 a.m. UTC | #5
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Thursday, April 2, 2020 10:12 AM
> To: Liu, Yi L <yi.l.liu@intel.com>; alex.williamson@redhat.com;
> Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Wednesday, April 1, 2020 5:13 PM
> >
> > > From: Tian, Kevin <kevin.tian@intel.com>
> > > Sent: Monday, March 30, 2020 8:46 PM
> > > Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to
> > > host
> > >
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Sunday, March 22, 2020 8:32 PM
> > > >
> > > > From: Liu Yi L <yi.l.liu@intel.com>
> > > >
> > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
> > hardware
> > > > IOMMUs that have nesting DMA translation (a.k.a dual stage address
> > > > translation). For such hardware IOMMUs, there are two
> > > > stages/levels of address translation, and software may let
> > > > userspace/VM to own the
> > > > first-
> > > > level/stage-1 translation structures. Example of such usage is
> > > > vSVA ( virtual Shared Virtual Addressing). VM owns the
> > > > first-level/stage-1 translation structures and bind the structures
> > > > to host, then hardware IOMMU would utilize nesting translation
> > > > when doing DMA translation fo the devices behind such hardware IOMMU.
> > > >
> > > > This patch adds vfio support for binding guest translation (a.k.a
> > > > stage 1) structure to host iommu. And for
> > > > VFIO_TYPE1_NESTING_IOMMU, not only bind guest page table is
> > > > needed, it also requires to expose interface to guest for iommu
> > > > cache invalidation when guest modified the first-level/stage-1
> > > > translation structures since hardware needs to be notified to
> > > > flush stale iotlbs. This would be introduced in next patch.
> > > >
> > > > In this patch, guest page table bind and unbind are done by using
> > > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and
> > > VFIO_IOMMU_UNBIND_GUEST_PGTBL
> > > > under IOCTL VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> > > > struct iommu_gpasid_bind_data. Before binding guest page table to
> > > > host, VM should have got a PASID allocated by host via
> > > > VFIO_IOMMU_PASID_REQUEST.
> > > >
> > > > Bind guest translation structures (here is guest page table) to
> > > > host
> > >
> > > Bind -> Binding
> > got it.
> > > > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> > >
> > > are -> is. and you already explained vSVA earlier.
> > oh yes, it is.
> > > >
> > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > Cc: Alex Williamson <alex.williamson@redhat.com>
> > > > Cc: Eric Auger <eric.auger@redhat.com>
> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > ---
> > > >  drivers/vfio/vfio_iommu_type1.c | 158
> > > > ++++++++++++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/vfio.h       |  46 ++++++++++++
> > > >  2 files changed, 204 insertions(+)
> > > >
> > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644
> > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > @@ -130,6 +130,33 @@ struct vfio_regions {
> > > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > >  					(!list_empty(&iommu->domain_list))
> > > >
> > > > +struct domain_capsule {
> > > > +	struct iommu_domain *domain;
> > > > +	void *data;
> > > > +};
> > > > +
> > > > +/* iommu->lock must be held */
> > > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > > > +		      int (*fn)(struct device *dev, void *data),
> > > > +		      void *data)
> > > > +{
> > > > +	struct domain_capsule dc = {.data = data};
> > > > +	struct vfio_domain *d;
> > > > +	struct vfio_group *g;
> > > > +	int ret = 0;
> > > > +
> > > > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > > > +		dc.domain = d->domain;
> > > > +		list_for_each_entry(g, &d->group_list, next) {
> > > > +			ret = iommu_group_for_each_dev(g->iommu_group,
> > > > +						       &dc, fn);
> > > > +			if (ret)
> > > > +				break;
> > > > +		}
> > > > +	}
> > > > +	return ret;
> > > > +}
> > > > +
> > > >  static int put_pfn(unsigned long pfn, int prot);
> > > >
> > > >  /*
> > > > @@ -2314,6 +2341,88 @@ static int
> > > > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > > >  	return 0;
> > > >  }
> > > >
> > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) {
> > > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > > +
> > >
> > > In Jacob's vSVA iommu series, [PATCH 06/11]:
> > >
> > > +		/* REVISIT: upper layer/VFIO can track host process that bind
> > the
> > > PASID.
> > > +		 * ioasid_set = mm might be sufficient for vfio to check pasid
> > VMM
> > > +		 * ownership.
> > > +		 */
> > >
> > > I asked him who exactly should be responsible for tracking the pasid
> > ownership.
> > > Although no response yet, I expect vfio/iommu can have a clear
> > > policy and
> > also
> > > documented here to provide consistent message.
> >
> > yep.
> >
> > > > +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); }
> > > > +
> > > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) {
> > > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > > +
> > > > +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> > > > +					gbind_data->hpasid);
> > >
> > > curious why we have to share the same bind_data structure between
> > > bind
> > and
> > > unbind, especially when unbind requires only one field? I didn't see
> > > a clear
> > reason,
> > > and just similar to earlier ALLOC/FREE which don't share structure either.
> > > Current way simply wastes space for unbind operation...
> >
> > no special reason today. But the gIOVA support over nested translation
> > is in plan, it may require a flag to indicate it as guest iommu driver
> > may user a single PASID value(RID2PASID) for all devices in guest side.
> > Especially if the RID2PASID value used for IOVA the the same with host
> > side. So adding a flag to indicate the binding is for IOVA is helpful.
> > For PF/VF, iommu driver just bind with the host side's RID2PASID.
> > While for ADI (Assignable Device Interface),  vfio layer needs to
> > figure out the default PASID stored in the aux-domain, and then iommu
> > driver bind gIOVA table to the default PASID. The potential flag is
> > required in both bind and unbind path. As such, it would be better to share the
> structure.
> 
> I'm fine with it if you are pretty sure that more extension will be required in the
> future, though I didn't fully understand above explanations. 
Alex Williamson April 2, 2020, 7:57 p.m. UTC | #6
On Sun, 22 Mar 2020 05:32:03 -0700
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> From: Liu Yi L <yi.l.liu@intel.com>
> 
> VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware
> IOMMUs that have nesting DMA translation (a.k.a dual stage address
> translation). For such hardware IOMMUs, there are two stages/levels of
> address translation, and software may let userspace/VM to own the first-
> level/stage-1 translation structures. Example of such usage is vSVA (
> virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> translation structures and bind the structures to host, then hardware
> IOMMU would utilize nesting translation when doing DMA translation fo
> the devices behind such hardware IOMMU.
> 
> This patch adds vfio support for binding guest translation (a.k.a stage 1)
> structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only bind
> guest page table is needed, it also requires to expose interface to guest
> for iommu cache invalidation when guest modified the first-level/stage-1
> translation structures since hardware needs to be notified to flush stale
> iotlbs. This would be introduced in next patch.
> 
> In this patch, guest page table bind and unbind are done by using flags
> VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL under IOCTL
> VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> struct iommu_gpasid_bind_data. Before binding guest page table to host,
> VM should have got a PASID allocated by host via VFIO_IOMMU_PASID_REQUEST.
> 
> Bind guest translation structures (here is guest page table) to host
> are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 158 ++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       |  46 ++++++++++++
>  2 files changed, 204 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 82a9e0b..a877747 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -130,6 +130,33 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> +struct domain_capsule {
> +	struct iommu_domain *domain;
> +	void *data;
> +};
> +
> +/* iommu->lock must be held */
> +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> +		      int (*fn)(struct device *dev, void *data),
> +		      void *data)
> +{
> +	struct domain_capsule dc = {.data = data};
> +	struct vfio_domain *d;
> +	struct vfio_group *g;
> +	int ret = 0;
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		dc.domain = d->domain;
> +		list_for_each_entry(g, &d->group_list, next) {
> +			ret = iommu_group_for_each_dev(g->iommu_group,
> +						       &dc, fn);
> +			if (ret)
> +				break;
> +		}
> +	}
> +	return ret;
> +}
> +
>  static int put_pfn(unsigned long pfn, int prot);
>  
>  /*
> @@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
>  	return 0;
>  }
>  
> +static int vfio_bind_gpasid_fn(struct device *dev, void *data)
> +{
> +	struct domain_capsule *dc = (struct domain_capsule *)data;
> +	struct iommu_gpasid_bind_data *gbind_data =
> +		(struct iommu_gpasid_bind_data *) dc->data;
> +
> +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
> +}
> +
> +static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
> +{
> +	struct domain_capsule *dc = (struct domain_capsule *)data;
> +	struct iommu_gpasid_bind_data *gbind_data =
> +		(struct iommu_gpasid_bind_data *) dc->data;
> +
> +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> +					gbind_data->hpasid);
> +}
> +
> +/**
> + * Unbind specific gpasid, caller of this function requires hold
> + * vfio_iommu->lock
> + */
> +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
> +				struct iommu_gpasid_bind_data *gbind_data)
> +{
> +	return vfio_iommu_for_each_dev(iommu,
> +				vfio_unbind_gpasid_fn, gbind_data);
> +}
> +
> +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> +				struct iommu_gpasid_bind_data *gbind_data)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = vfio_iommu_for_each_dev(iommu,
> +			vfio_bind_gpasid_fn, gbind_data);
> +	/*
> +	 * If bind failed, it may not be a total failure. Some devices
> +	 * within the iommu group may have bind successfully. Although
> +	 * we don't enable pasid capability for non-singletion iommu
> +	 * groups, a unbind operation would be helpful to ensure no
> +	 * partial binding for an iommu group.

Where was the non-singleton group restriction done, I missed that.

> +	 */
> +	if (ret)
> +		/*
> +		 * Undo all binds that already succeeded, no need to
> +		 * check the return value here since some device within
> +		 * the group has no successful bind when coming to this
> +		 * place switch.
> +		 */
> +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);

However, the for_each_dev function stops when the callback function
returns an error; are we just assuming that we stop at the same device
we faulted on the first time, and that we traverse the same set of
devices the second time?  It seems strange to me that unbind should be
able to fail.
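
For reference, one hypothetical shape of a deterministic unwind,
sketched from the patch's helpers; the counter plumbing is invented
here, and it still leans on a stable walk order, which is exactly the
assumption in question:

	/* Hypothetical sketch, not a posted fix: record how many
	 * devices bound successfully, then replay an identically
	 * ordered walk and unbind only that many.
	 */
	struct unwind_data {
		struct iommu_gpasid_bind_data *gbind_data;
		long remaining;		/* devices still to unbind */
	};

	static int vfio_unbind_first_n_fn(struct device *dev, void *data)
	{
		struct domain_capsule *dc = data;
		struct unwind_data *ud = dc->data;

		if (ud->remaining-- <= 0)
			return 1;	/* stop the second walk here */
		return iommu_sva_unbind_gpasid(dc->domain, dev,
					       ud->gbind_data->hpasid);
	}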

> +
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> +				struct iommu_gpasid_bind_data *gbind_data)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);

How is a user supposed to respond to their unbind failing?

> +
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  		default:
>  			return -EINVAL;
>  		}
> +
> +	} else if (cmd == VFIO_IOMMU_BIND) {
> +		struct vfio_iommu_type1_bind bind;
> +		u32 version;
> +		int data_size;
> +		void *gbind_data;
> +		int ret;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> +
> +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (bind.argsz < minsz)
> +			return -EINVAL;
> +
> +		/* Get the version of struct iommu_gpasid_bind_data */
> +		if (copy_from_user(&version,
> +			(void __user *) (arg + minsz),
> +					sizeof(version)))
> +			return -EFAULT;

Why are we copying things from beyond the size we've validated that the
user has provided, again?

> +
> +		data_size = iommu_uapi_get_data_size(
> +				IOMMU_UAPI_BIND_GPASID, version);
> +		gbind_data = kzalloc(data_size, GFP_KERNEL);
> +		if (!gbind_data)
> +			return -ENOMEM;
> +
> +		if (copy_from_user(gbind_data,
> +			 (void __user *) (arg + minsz), data_size)) {
> +			kfree(gbind_data);
> +			return -EFAULT;
> +		}

And again.  argsz isn't just for minsz.
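
A minimal sketch of the kind of flow being asked for here — validating
argsz against the full versioned payload before each copy. This is
hypothetical, not part of the patch, and reuses the data_size lookup
quoted above:

		/* argsz must cover at least the version field */
		if (bind.argsz < minsz + sizeof(version))
			return -EINVAL;

		if (copy_from_user(&version, (void __user *)(arg + minsz),
				   sizeof(version)))
			return -EFAULT;

		data_size = iommu_uapi_get_data_size(
				IOMMU_UAPI_BIND_GPASID, version);
		/* ... and the whole versioned structure */
		if (bind.argsz < minsz + data_size)
			return -EINVAL;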

> +
> +		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
> +		case VFIO_IOMMU_BIND_GUEST_PGTBL:
> +			ret = vfio_iommu_type1_bind_gpasid(iommu,
> +							   gbind_data);
> +			break;
> +		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
> +			ret = vfio_iommu_type1_unbind_gpasid(iommu,
> +							     gbind_data);
> +			break;
> +		default:
> +			ret = -EINVAL;
> +			break;
> +		}
> +		kfree(gbind_data);
> +		return ret;
>  	}
>  
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ebeaf3e..2235bc6 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -14,6 +14,7 @@
>  
>  #include <linux/types.h>
>  #include <linux/ioctl.h>
> +#include <linux/iommu.h>
>  
>  #define VFIO_API_VERSION	0
>  
> @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
>   */
>  #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 22)
>  
> +/**
> + * Supported flags:
> + *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for
> + *			nesting type IOMMUs. In @data field It takes struct
> + *			iommu_gpasid_bind_data.
> + *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table operation
> + *			invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.

This must require iommu_gpasid_bind_data in the data field as well,
right?

> + *
> + */
> +struct vfio_iommu_type1_bind {
> +	__u32		argsz;
> +	__u32		flags;
> +#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)
> +#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)
> +	__u8		data[];
> +};
> +
> +#define VFIO_IOMMU_BIND_MASK	(VFIO_IOMMU_BIND_GUEST_PGTBL | \
> +					VFIO_IOMMU_UNBIND_GUEST_PGTBL)
> +
> +/**
> + * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
> + *				struct vfio_iommu_type1_bind)
> + *
> + * Manage address spaces of devices in this container. Initially a TYPE1
> + * container can only have one address space, managed with
> + * VFIO_IOMMU_MAP/UNMAP_DMA.
> + *
> + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
> + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
> + * tables, and BIND manages the stage-1 (guest) page tables. Other types of
> + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
> + * the traffics only require single stage translation while BIND controls the
> + * traffics require nesting translation. But this depends on the underlying
> + * IOMMU architecture and isn't guaranteed. Example of this is the guest SVA
> + * traffics, such traffics need nesting translation to gain gVA->gPA and then
> + * gPA->hPA translation.
> + *
> + * Availability of this feature depends on the device, its bus, the underlying
> + * IOMMU and the CPU architecture.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>  
>  /*
Jean-Philippe Brucker April 3, 2020, 8:34 a.m. UTC | #7
On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote:
> > > > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > >  		default:
> > > > >  			return -EINVAL;
> > > > >  		}
> > > > > +
> > > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > >
> > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.
> > >
> > > Emm, it's up to the flags to indicate bind what. It was proposed to
> > > cover the three cases below:
> > > a) BIND/UNBIND_GPASID
> > > b) BIND/UNBIND_GPASID_TABLE
> > > c) BIND/UNBIND_PROCESS
> > > <only a) is covered in this patch>
> > > So it's called VFIO_IOMMU_BIND.
> > 
> > but aren't they all about PASID related binding?
> 
> yeah, I can rename it. :-)

I don't know if anyone intends to implement it, but SMMUv2 supports
nesting translation without any PASID support. For that case the name
VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more sense.
Ideally we'd also use a neutral name for the IOMMU API instead of
bind_gpasid(), but that's easier to change later.

Thanks,
Jean
Yi Liu April 3, 2020, 1:30 p.m. UTC | #8
Hi Alex,

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, April 3, 2020 3:57 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> 
> On Sun, 22 Mar 2020 05:32:03 -0700
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > From: Liu Yi L <yi.l.liu@intel.com>
> >
> > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware
> > IOMMUs that have nesting DMA translation (a.k.a dual stage address
> > translation). For such hardware IOMMUs, there are two stages/levels of
> > address translation, and software may let userspace/VM to own the first-
> > level/stage-1 translation structures. Example of such usage is vSVA (
> > virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> > translation structures and bind the structures to host, then hardware
> > IOMMU would utilize nesting translation when doing DMA translation fo
> > the devices behind such hardware IOMMU.
> >
> > This patch adds vfio support for binding guest translation (a.k.a stage 1)
> > structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only bind
> > guest page table is needed, it also requires to expose interface to guest
> > for iommu cache invalidation when guest modified the first-level/stage-1
> > translation structures since hardware needs to be notified to flush stale
> > iotlbs. This would be introduced in next patch.
> >
> > In this patch, guest page table bind and unbind are done by using flags
> > VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL
> under IOCTL
> > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> > struct iommu_gpasid_bind_data. Before binding guest page table to host,
> > VM should have got a PASID allocated by host via VFIO_IOMMU_PASID_REQUEST.
> >
> > Bind guest translation structures (here is guest page table) to host
> > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 158
> ++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       |  46 ++++++++++++
> >  2 files changed, 204 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 82a9e0b..a877747 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -130,6 +130,33 @@ struct vfio_regions {
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >  					(!list_empty(&iommu->domain_list))
> >
> > +struct domain_capsule {
> > +	struct iommu_domain *domain;
> > +	void *data;
> > +};
> > +
> > +/* iommu->lock must be held */
> > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > +		      int (*fn)(struct device *dev, void *data),
> > +		      void *data)
> > +{
> > +	struct domain_capsule dc = {.data = data};
> > +	struct vfio_domain *d;
> > +	struct vfio_group *g;
> > +	int ret = 0;
> > +
> > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > +		dc.domain = d->domain;
> > +		list_for_each_entry(g, &d->group_list, next) {
> > +			ret = iommu_group_for_each_dev(g->iommu_group,
> > +						       &dc, fn);
> > +			if (ret)
> > +				break;
> > +		}
> > +	}
> > +	return ret;
> > +}
> > +
> >  static int put_pfn(unsigned long pfn, int prot);
> >
> >  /*
> > @@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct
> vfio_iommu *iommu,
> >  	return 0;
> >  }
> >
> > +static int vfio_bind_gpasid_fn(struct device *dev, void *data)
> > +{
> > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > +	struct iommu_gpasid_bind_data *gbind_data =
> > +		(struct iommu_gpasid_bind_data *) dc->data;
> > +
> > +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
> > +}
> > +
> > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
> > +{
> > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > +	struct iommu_gpasid_bind_data *gbind_data =
> > +		(struct iommu_gpasid_bind_data *) dc->data;
> > +
> > +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> > +					gbind_data->hpasid);
> > +}
> > +
> > +/**
> > + * Unbind specific gpasid, caller of this function requires hold
> > + * vfio_iommu->lock
> > + */
> > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	return vfio_iommu_for_each_dev(iommu,
> > +				vfio_unbind_gpasid_fn, gbind_data);
> > +}
> > +
> > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	ret = vfio_iommu_for_each_dev(iommu,
> > +			vfio_bind_gpasid_fn, gbind_data);
> > +	/*
> > +	 * If bind failed, it may not be a total failure. Some devices
> > +	 * within the iommu group may have bind successfully. Although
> > +	 * we don't enable pasid capability for non-singletion iommu
> > +	 * groups, a unbind operation would be helpful to ensure no
> > +	 * partial binding for an iommu group.
> 
> Where was the non-singleton group restriction done, I missed that.

Hmm, it's missing. Thanks for spotting it. How about adding this
check in vfio_iommu_for_each_dev()? If the loop encounters a
non-singleton group, just skip it. The same applies to the cache_inv
path.

> > +	 */
> > +	if (ret)
> > +		/*
> > +		 * Undo all binds that already succeeded, no need to
> > +		 * check the return value here since some device within
> > +		 * the group has no successful bind when coming to this
> > +		 * place switch.
> > +		 */
> > +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> 
> However, the for_each_dev function stops when the callback function
> returns error, are we just assuming we stop at the same device as we
> faulted on the first time and that we traverse the same set of devices
> the second time?  It seems strange to me that unbind should be able to
> fail.

Unbind can fail if a user attempts to unbind a PASID which doesn't
belong to it, or a PASID which has never been bound. Otherwise, I don't
see a reason for it to fail.

> > +
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> 
> How is a user supposed to respond to their unbind failing?

It can fail if it's a malicious unbind (e.g. unbinding a not-yet-bound
PASID, or a PASID which doesn't belong to the current user).

> > +
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)
> >  {
> > @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> >  		default:
> >  			return -EINVAL;
> >  		}
> > +
> > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > +		struct vfio_iommu_type1_bind bind;
> > +		u32 version;
> > +		int data_size;
> > +		void *gbind_data;
> > +		int ret;
> > +
> > +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> > +
> > +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> > +			return -EFAULT;
> > +
> > +		if (bind.argsz < minsz)
> > +			return -EINVAL;
> > +
> > +		/* Get the version of struct iommu_gpasid_bind_data */
> > +		if (copy_from_user(&version,
> > +			(void __user *) (arg + minsz),
> > +					sizeof(version)))
> > +			return -EFAULT;
> 
> Why are we coping things from beyond the size we've validated that the
> user has provided again?

Let me wait for the result in Jacob's thread below. It looks like we
need a decision from you and Joerg. If using argsz is good, then I
guess we don't need the version-to-size mapping, right? Actually, the
version-to-size mapping was added to ensure vfio copies the data
correctly.
https://lkml.org/lkml/2020/4/2/876
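
For the user side of that contract, a hedged sketch of how a VMM might
size argsz to cover the whole payload (illustrative only; assumes a
PASID already obtained via VFIO_IOMMU_PASID_REQUEST, and the
iommu_gpasid_bind_data layout from this series' <linux/iommu.h>):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

/* hypothetical helper: "container" is an open VFIO container fd */
static int bind_guest_pgtbl(int container,
			    struct iommu_gpasid_bind_data *gbind)
{
	struct vfio_iommu_type1_bind *bind;
	size_t argsz = sizeof(*bind) + sizeof(*gbind);
	int ret;

	bind = calloc(1, argsz);
	if (!bind)
		return -1;

	/* argsz covers both the header and the versioned payload */
	bind->argsz = argsz;
	bind->flags = VFIO_IOMMU_BIND_GUEST_PGTBL;
	memcpy(bind->data, gbind, sizeof(*gbind));

	ret = ioctl(container, VFIO_IOMMU_BIND, bind);
	free(bind);
	return ret;
}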

> > +
> > +		data_size = iommu_uapi_get_data_size(
> > +				IOMMU_UAPI_BIND_GPASID, version);
> > +		gbind_data = kzalloc(data_size, GFP_KERNEL);
> > +		if (!gbind_data)
> > +			return -ENOMEM;
> > +
> > +		if (copy_from_user(gbind_data,
> > +			 (void __user *) (arg + minsz), data_size)) {
> > +			kfree(gbind_data);
> > +			return -EFAULT;
> > +		}
> 
> And again.  argsz isn't just for minsz.
>
> > +
> > +		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
> > +		case VFIO_IOMMU_BIND_GUEST_PGTBL:
> > +			ret = vfio_iommu_type1_bind_gpasid(iommu,
> > +							   gbind_data);
> > +			break;
> > +		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
> > +			ret = vfio_iommu_type1_unbind_gpasid(iommu,
> > +							     gbind_data);
> > +			break;
> > +		default:
> > +			ret = -EINVAL;
> > +			break;
> > +		}
> > +		kfree(gbind_data);
> > +		return ret;
> >  	}
> >
> >  	return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index ebeaf3e..2235bc6 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <linux/types.h>
> >  #include <linux/ioctl.h>
> > +#include <linux/iommu.h>
> >
> >  #define VFIO_API_VERSION	0
> >
> > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
> >   */
> >  #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 22)
> >
> > +/**
> > + * Supported flags:
> > + *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for
> > + *			nesting type IOMMUs. In @data field It takes struct
> > + *			iommu_gpasid_bind_data.
> > + *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table
> operation
> > + *			invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.
> 
> This must require iommu_gpasid_bind_data in the data field as well,
> right?

yes.

Regards,
Yi Liu
Alex Williamson April 3, 2020, 6:11 p.m. UTC | #9
On Fri, 3 Apr 2020 13:30:49 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> Hi Alex,
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Friday, April 3, 2020 3:57 AM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > 
> > On Sun, 22 Mar 2020 05:32:03 -0700
> > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> >   
> > > From: Liu Yi L <yi.l.liu@intel.com>
> > >
> > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware
> > > IOMMUs that have nesting DMA translation (a.k.a dual stage address
> > > translation). For such hardware IOMMUs, there are two stages/levels of
> > > address translation, and software may let userspace/VM to own the first-
> > > level/stage-1 translation structures. Example of such usage is vSVA (
> > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> > > translation structures and bind the structures to host, then hardware
> > > IOMMU would utilize nesting translation when doing DMA translation fo
> > > the devices behind such hardware IOMMU.
> > >
> > > This patch adds vfio support for binding guest translation (a.k.a stage 1)
> > > structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only bind
> > > guest page table is needed, it also requires to expose interface to guest
> > > for iommu cache invalidation when guest modified the first-level/stage-1
> > > translation structures since hardware needs to be notified to flush stale
> > > iotlbs. This would be introduced in next patch.
> > >
> > > In this patch, guest page table bind and unbind are done by using flags
> > > VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL  
> > under IOCTL  
> > > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> > > struct iommu_gpasid_bind_data. Before binding guest page table to host,
> > > VM should have got a PASID allocated by host via VFIO_IOMMU_PASID_REQUEST.
> > >
> > > Bind guest translation structures (here is guest page table) to host
> > > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> > >
> > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > Cc: Alex Williamson <alex.williamson@redhat.com>
> > > Cc: Eric Auger <eric.auger@redhat.com>
> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 158  
> > ++++++++++++++++++++++++++++++++++++++++  
> > >  include/uapi/linux/vfio.h       |  46 ++++++++++++
> > >  2 files changed, 204 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > index 82a9e0b..a877747 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -130,6 +130,33 @@ struct vfio_regions {
> > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > >  					(!list_empty(&iommu->domain_list))
> > >
> > > +struct domain_capsule {
> > > +	struct iommu_domain *domain;
> > > +	void *data;
> > > +};
> > > +
> > > +/* iommu->lock must be held */
> > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > > +		      int (*fn)(struct device *dev, void *data),
> > > +		      void *data)
> > > +{
> > > +	struct domain_capsule dc = {.data = data};
> > > +	struct vfio_domain *d;
> > > +	struct vfio_group *g;
> > > +	int ret = 0;
> > > +
> > > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > > +		dc.domain = d->domain;
> > > +		list_for_each_entry(g, &d->group_list, next) {
> > > +			ret = iommu_group_for_each_dev(g->iommu_group,
> > > +						       &dc, fn);
> > > +			if (ret)
> > > +				break;
> > > +		}
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > >  static int put_pfn(unsigned long pfn, int prot);
> > >
> > >  /*
> > > @@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct  
> > vfio_iommu *iommu,  
> > >  	return 0;
> > >  }
> > >
> > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data)
> > > +{
> > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > +
> > > +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
> > > +}
> > > +
> > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
> > > +{
> > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > +
> > > +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> > > +					gbind_data->hpasid);
> > > +}
> > > +
> > > +/**
> > > + * Unbind specific gpasid, caller of this function requires hold
> > > + * vfio_iommu->lock
> > > + */
> > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
> > > +				struct iommu_gpasid_bind_data *gbind_data)
> > > +{
> > > +	return vfio_iommu_for_each_dev(iommu,
> > > +				vfio_unbind_gpasid_fn, gbind_data);
> > > +}
> > > +
> > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> > > +				struct iommu_gpasid_bind_data *gbind_data)
> > > +{
> > > +	int ret = 0;
> > > +
> > > +	mutex_lock(&iommu->lock);
> > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > +		ret = -EINVAL;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	ret = vfio_iommu_for_each_dev(iommu,
> > > +			vfio_bind_gpasid_fn, gbind_data);
> > > +	/*
> > > +	 * If bind failed, it may not be a total failure. Some devices
> > > +	 * within the iommu group may have bind successfully. Although
> > > +	 * we don't enable pasid capability for non-singletion iommu
> > > +	 * groups, a unbind operation would be helpful to ensure no
> > > +	 * partial binding for an iommu group.  
> > 
> > Where was the non-singleton group restriction done, I missed that.  
> 
> Hmm, it's missed. thanks for spotting it. How about adding this
> check in the vfio_iommu_for_each_dev()? If looped a non-singleton
> group, just skip it. It applies to the cache_inv path all the
> same.

I don't really understand the singleton issue, which is why I was
surprised to see this; I didn't see a discussion of it previously.
Skipping a non-singleton group seems like unpredictable behavior to the
user though.

> > > +	 */
> > > +	if (ret)
> > > +		/*
> > > +		 * Undo all binds that already succeeded, no need to
> > > +		 * check the return value here since some device within
> > > +		 * the group has no successful bind when coming to this
> > > +		 * place switch.
> > > +		 */
> > > +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);  
> > 
> > However, the for_each_dev function stops when the callback function
> > returns error, are we just assuming we stop at the same device as we
> > faulted on the first time and that we traverse the same set of devices
> > the second time?  It seems strange to me that unbind should be able to
> > fail.  
> 
> unbind can fail if a user attempts to unbind a pasid which is not belonged
> to it or a pasid which hasn't ever been bound. Otherwise, I didn't see a
> reason to fail.

Even if so, this doesn't address the first part of the question.  If
our for_each_dev() callback returns an error then the loop stops and we
can't be sure we've triggered it everywhere that it needs to be
triggered.  There's also the question of whether it's an error to
unbind something that is not bound, because the result is still that
the pasid is unbound, right?

> > > +
> > > +out_unlock:
> > > +	mutex_unlock(&iommu->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > > +				struct iommu_gpasid_bind_data *gbind_data)
> > > +{
> > > +	int ret = 0;
> > > +
> > > +	mutex_lock(&iommu->lock);
> > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > +		ret = -EINVAL;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);  
> > 
> > How is a user supposed to respond to their unbind failing?  
> 
> If it's a malicious unbind (e.g. unbind a not yet bound pasid or unbind
> a pasid which doesn't belong to current user).

And if it's not a malicious unbind?  To me this has similar semantics
to free() failing.  Is there any remedy other than to abort?  Thanks,

Alex

> > > +
> > > +out_unlock:
> > > +	mutex_unlock(&iommu->lock);
> > > +	return ret;
> > > +}
> > > +
> > >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> > >  				   unsigned int cmd, unsigned long arg)
> > >  {
> > > @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void  
> > *iommu_data,  
> > >  		default:
> > >  			return -EINVAL;
> > >  		}
> > > +
> > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > +		struct vfio_iommu_type1_bind bind;
> > > +		u32 version;
> > > +		int data_size;
> > > +		void *gbind_data;
> > > +		int ret;
> > > +
> > > +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> > > +
> > > +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> > > +			return -EFAULT;
> > > +
> > > +		if (bind.argsz < minsz)
> > > +			return -EINVAL;
> > > +
> > > +		/* Get the version of struct iommu_gpasid_bind_data */
> > > +		if (copy_from_user(&version,
> > > +			(void __user *) (arg + minsz),
> > > +					sizeof(version)))
> > > +			return -EFAULT;  
> > 
> > Why are we coping things from beyond the size we've validated that the
> > user has provided again?  
> 
> let me wait for the result in Jacob's thread below. looks like need
> to have a decision from you and Joreg. If using argsze is good, then
> I guess we don't need the version-to-size mapping. right? Actually,
> the version-to-size mapping is added to ensure vfio copy data correctly.
> https://lkml.org/lkml/2020/4/2/876
> 
> > > +
> > > +		data_size = iommu_uapi_get_data_size(
> > > +				IOMMU_UAPI_BIND_GPASID, version);
> > > +		gbind_data = kzalloc(data_size, GFP_KERNEL);
> > > +		if (!gbind_data)
> > > +			return -ENOMEM;
> > > +
> > > +		if (copy_from_user(gbind_data,
> > > +			 (void __user *) (arg + minsz), data_size)) {
> > > +			kfree(gbind_data);
> > > +			return -EFAULT;
> > > +		}  
> > 
> > And again.  argsz isn't just for minsz.
> >  
> > > +
> > > +		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
> > > +		case VFIO_IOMMU_BIND_GUEST_PGTBL:
> > > +			ret = vfio_iommu_type1_bind_gpasid(iommu,
> > > +							   gbind_data);
> > > +			break;
> > > +		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
> > > +			ret = vfio_iommu_type1_unbind_gpasid(iommu,
> > > +							     gbind_data);
> > > +			break;
> > > +		default:
> > > +			ret = -EINVAL;
> > > +			break;
> > > +		}
> > > +		kfree(gbind_data);
> > > +		return ret;
> > >  	}
> > >
> > >  	return -ENOTTY;
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index ebeaf3e..2235bc6 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -14,6 +14,7 @@
> > >
> > >  #include <linux/types.h>
> > >  #include <linux/ioctl.h>
> > > +#include <linux/iommu.h>
> > >
> > >  #define VFIO_API_VERSION	0
> > >
> > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
> > >   */
> > >  #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 22)
> > >
> > > +/**
> > > + * Supported flags:
> > > + *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for
> > > + *			nesting type IOMMUs. In @data field It takes struct
> > > + *			iommu_gpasid_bind_data.
> > > + *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table  
> > operation  
> > > + *			invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.  
> > 
> > This must require iommu_gpasid_bind_data in the data field as well,
> > right?  
> 
> yes.
> 
> Regards,
> Yi Liu
>
Yi Liu April 4, 2020, 10:28 a.m. UTC | #10
Hi Alex,

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Saturday, April 4, 2020 2:11 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> 
> On Fri, 3 Apr 2020 13:30:49 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > Hi Alex,
> >
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Friday, April 3, 2020 3:57 AM
> > > To: Liu, Yi L <yi.l.liu@intel.com>
> > >
> > > On Sun, 22 Mar 2020 05:32:03 -0700
> > > "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > >
> > > > From: Liu Yi L <yi.l.liu@intel.com>
> > > >
> > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
> > > > hardware IOMMUs that have nesting DMA translation (a.k.a dual
> > > > stage address translation). For such hardware IOMMUs, there are
> > > > two stages/levels of address translation, and software may let
> > > > userspace/VM to own the first-
> > > > level/stage-1 translation structures. Example of such usage is
> > > > vSVA ( virtual Shared Virtual Addressing). VM owns the
> > > > first-level/stage-1 translation structures and bind the structures
> > > > to host, then hardware IOMMU would utilize nesting translation
> > > > when doing DMA translation fo the devices behind such hardware IOMMU.
> > > >
> > > > This patch adds vfio support for binding guest translation (a.k.a
> > > > stage 1) structure to host iommu. And for
> > > > VFIO_TYPE1_NESTING_IOMMU, not only bind guest page table is
> > > > needed, it also requires to expose interface to guest for iommu
> > > > cache invalidation when guest modified the first-level/stage-1
> > > > translation structures since hardware needs to be notified to flush stale iotlbs.
> This would be introduced in next patch.
> > > >
> > > > In this patch, guest page table bind and unbind are done by using
> > > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and
> > > > VFIO_IOMMU_UNBIND_GUEST_PGTBL
> > > under IOCTL
> > > > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by struct
> > > > iommu_gpasid_bind_data. Before binding guest page table to host,
> > > > VM should have got a PASID allocated by host via
> VFIO_IOMMU_PASID_REQUEST.
> > > >
> > > > Bind guest translation structures (here is guest page table) to
> > > > host are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> > > >
> > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > Cc: Alex Williamson <alex.williamson@redhat.com>
> > > > Cc: Eric Auger <eric.auger@redhat.com>
> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com>
> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > ---
> > > >  drivers/vfio/vfio_iommu_type1.c | 158
> > > ++++++++++++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/vfio.h       |  46 ++++++++++++
> > > >  2 files changed, 204 insertions(+)
> > > >
> > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644
> > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > @@ -130,6 +130,33 @@ struct vfio_regions {
> > > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> > > >  					(!list_empty(&iommu->domain_list))
> > > >
> > > > +struct domain_capsule {
> > > > +	struct iommu_domain *domain;
> > > > +	void *data;
> > > > +};
> > > > +
> > > > +/* iommu->lock must be held */
> > > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > > > +		      int (*fn)(struct device *dev, void *data),
> > > > +		      void *data)
> > > > +{
> > > > +	struct domain_capsule dc = {.data = data};
> > > > +	struct vfio_domain *d;
> > > > +	struct vfio_group *g;
> > > > +	int ret = 0;
> > > > +
> > > > +	list_for_each_entry(d, &iommu->domain_list, next) {
> > > > +		dc.domain = d->domain;
> > > > +		list_for_each_entry(g, &d->group_list, next) {
> > > > +			ret = iommu_group_for_each_dev(g->iommu_group,
> > > > +						       &dc, fn);
> > > > +			if (ret)
> > > > +				break;
> > > > +		}
> > > > +	}
> > > > +	return ret;
> > > > +}
> > > > +
> > > >  static int put_pfn(unsigned long pfn, int prot);
> > > >
> > > >  /*
> > > > @@ -2314,6 +2341,88 @@ static int
> > > > vfio_iommu_info_add_nesting_cap(struct
> > > vfio_iommu *iommu,
> > > >  	return 0;
> > > >  }
> > > >
> > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) {
> > > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > > +
> > > > +	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); }
> > > > +
> > > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
> > > > +{
> > > > +	struct domain_capsule *dc = (struct domain_capsule *)data;
> > > > +	struct iommu_gpasid_bind_data *gbind_data =
> > > > +		(struct iommu_gpasid_bind_data *) dc->data;
> > > > +
> > > > +	return iommu_sva_unbind_gpasid(dc->domain, dev,
> > > > +					gbind_data->hpasid);
> > > > +}
> > > > +
> > > > +/**
> > > > + * Unbind specific gpasid, caller of this function requires hold
> > > > + * vfio_iommu->lock
> > > > + */
> > > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
> > > > +				struct iommu_gpasid_bind_data *gbind_data) {
> > > > +	return vfio_iommu_for_each_dev(iommu,
> > > > +				vfio_unbind_gpasid_fn, gbind_data); }
> > > > +
> > > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> > > > +				struct iommu_gpasid_bind_data *gbind_data) {
> > > > +	int ret = 0;
> > > > +
> > > > +	mutex_lock(&iommu->lock);
> > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > +		ret = -EINVAL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	ret = vfio_iommu_for_each_dev(iommu,
> > > > +			vfio_bind_gpasid_fn, gbind_data);
> > > > +	/*
> > > > +	 * If bind failed, it may not be a total failure. Some devices
> > > > +	 * within the iommu group may have bind successfully. Although
> > > > +	 * we don't enable pasid capability for non-singletion iommu
> > > > +	 * groups, a unbind operation would be helpful to ensure no
> > > > +	 * partial binding for an iommu group.
> > >
> > > Where was the non-singleton group restriction done, I missed that.
> >
> > Hmm, it's missed. thanks for spotting it. How about adding this check
> > in the vfio_iommu_for_each_dev()? If looped a non-singleton group,
> > just skip it. It applies to the cache_inv path all the same.
> 
> I don't really understand the singleton issue, which is why I was surprised to see this
> since I didn't see a discussion previously.
> Skipping a singleton group seems like unpredictable behavior to the user though.

There was a discussion on SVA availability in the link below, which
concluded that SVA should only be supported for singleton groups. I
think binding a guest page table needs to apply the same rule.
https://patchwork.kernel.org/patch/10213877/
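
For reference, one shape such a restriction could take — a hypothetical
helper (not in the patch) that counts devices with
iommu_group_for_each_dev() and treats only single-device groups as
eligible:

static int vfio_dev_count_fn(struct device *dev, void *data)
{
	int *count = data;

	(*count)++;
	return 0;
}

static bool vfio_group_is_singleton(struct iommu_group *iommu_group)
{
	int count = 0;

	iommu_group_for_each_dev(iommu_group, &count, vfio_dev_count_fn);
	return count == 1;
}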

> > > > +	 */
> > > > +	if (ret)
> > > > +		/*
> > > > +		 * Undo all binds that already succeeded, no need to
> > > > +		 * check the return value here since some device within
> > > > +		 * the group has no successful bind when coming to this
> > > > +		 * place switch.
> > > > +		 */
> > > > +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> > >
> > > However, the for_each_dev function stops when the callback function
> > > returns error, are we just assuming we stop at the same device as we
> > > faulted on the first time and that we traverse the same set of
> > > devices the second time?  It seems strange to me that unbind should
> > > be able to fail.
> >
> > unbind can fail if a user attempts to unbind a pasid which is not
> > belonged to it or a pasid which hasn't ever been bound. Otherwise, I
> > didn't see a reason to fail.
> 
> Even if so, this doesn't address the first part of the question.  If
> our for_each_dev() callback returns error then the loop stops and we
> can't be sure we've triggered it everywhere that it needs to be
> triggered.

Hmm, let me pull back a little. Back to the code: in the bind attempt,
the code uses for_each_dev() to loop over the devices. If that fails,
it uses for_each_dev() again to do the unbind. Your question is whether
the second for_each_dev() is able to undo the bind correctly, given
that the second for_each_dev() call has no idea where the bind phase
failed, is it? Actually, this is why I added the comment that there is
no need to check the return value of
vfio_iommu_type1_do_guest_unbind().

> There are also aspects of whether it's an error to unbind something
> that is not bound because the result is still that the pasid is
> unbound, right?

Agreed. As you mentioned in the comment below, there is no need to
fail unbind unless the user is trying to unbind a PASID which doesn't
belong to it.

> > > > +
> > > > +out_unlock:
> > > > +	mutex_unlock(&iommu->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > > > +				struct iommu_gpasid_bind_data *gbind_data) {
> > > > +	int ret = 0;
> > > > +
> > > > +	mutex_lock(&iommu->lock);
> > > > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > > > +		ret = -EINVAL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> > >
> > > How is a user supposed to respond to their unbind failing?
> >
> > If it's a malicious unbind (e.g. unbind a not yet bound pasid or
> > unbind a pasid which doesn't belong to current user).
> 
> And if it's not a malicious unbind?  To me this is similar semantics to
> free() failing.  Is there any remedy other than to abort?  Thanks,

Got it. So if a user is trying to unbind a PASID which doesn't belong
to it, should the kernel return an error to the user or just abort?

Regards,
Yi Liu

> Alex
> 
> > > > +
> > > > +out_unlock:
> > > > +	mutex_unlock(&iommu->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > >  				   unsigned int cmd, unsigned long arg)  { @@ -
> 2471,6
> > > > +2580,55 @@ static long vfio_iommu_type1_ioctl(void
> > > *iommu_data,
> > > >  		default:
> > > >  			return -EINVAL;
> > > >  		}
> > > > +
> > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > > +		struct vfio_iommu_type1_bind bind;
> > > > +		u32 version;
> > > > +		int data_size;
> > > > +		void *gbind_data;
> > > > +		int ret;
> > > > +
> > > > +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> > > > +
> > > > +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> > > > +			return -EFAULT;
> > > > +
> > > > +		if (bind.argsz < minsz)
> > > > +			return -EINVAL;
> > > > +
> > > > +		/* Get the version of struct iommu_gpasid_bind_data */
> > > > +		if (copy_from_user(&version,
> > > > +			(void __user *) (arg + minsz),
> > > > +					sizeof(version)))
> > > > +			return -EFAULT;
> > >
> > > Why are we coping things from beyond the size we've validated that
> > > the user has provided again?
> >
> > let me wait for the result in Jacob's thread below. looks like need to
> > have a decision from you and Joreg. If using argsze is good, then I
> > guess we don't need the version-to-size mapping. right? Actually, the
> > version-to-size mapping is added to ensure vfio copy data correctly.
> > https://lkml.org/lkml/2020/4/2/876
> >
> > > > +
> > > > +		data_size = iommu_uapi_get_data_size(
> > > > +				IOMMU_UAPI_BIND_GPASID, version);
> > > > +		gbind_data = kzalloc(data_size, GFP_KERNEL);
> > > > +		if (!gbind_data)
> > > > +			return -ENOMEM;
> > > > +
> > > > +		if (copy_from_user(gbind_data,
> > > > +			 (void __user *) (arg + minsz), data_size)) {
> > > > +			kfree(gbind_data);
> > > > +			return -EFAULT;
> > > > +		}
> > >
> > > And again.  argsz isn't just for minsz.
> > >
> > > > +
> > > > +		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
> > > > +		case VFIO_IOMMU_BIND_GUEST_PGTBL:
> > > > +			ret = vfio_iommu_type1_bind_gpasid(iommu,
> > > > +							   gbind_data);
> > > > +			break;
> > > > +		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
> > > > +			ret = vfio_iommu_type1_unbind_gpasid(iommu,
> > > > +							     gbind_data);
> > > > +			break;
> > > > +		default:
> > > > +			ret = -EINVAL;
> > > > +			break;
> > > > +		}
> > > > +		kfree(gbind_data);
> > > > +		return ret;
> > > >  	}
> > > >
> > > >  	return -ENOTTY;
> > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > > index ebeaf3e..2235bc6 100644
> > > > --- a/include/uapi/linux/vfio.h
> > > > +++ b/include/uapi/linux/vfio.h
> > > > @@ -14,6 +14,7 @@
> > > >
> > > >  #include <linux/types.h>
> > > >  #include <linux/ioctl.h>
> > > > +#include <linux/iommu.h>
> > > >
> > > >  #define VFIO_API_VERSION	0
> > > >
> > > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
> > > >   */
> > > >  #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 22)
> > > >
> > > > +/**
> > > > + * Supported flags:
> > > > + *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to
> host for
> > > > + *			nesting type IOMMUs. In @data field It takes struct
> > > > + *			iommu_gpasid_bind_data.
> > > > + *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page
> table
> > > operation
> > > > + *			invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.
> > >
> > > This must require iommu_gpasid_bind_data in the data field as well,
> > > right?
> >
> > yes.
> >
> > Regards,
> > Yi Liu
> >
Yi Liu April 7, 2020, 10:33 a.m. UTC | #11
Hi Jean,

> From: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Sent: Friday, April 3, 2020 4:35 PM
> Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> 
> On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote:
> > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > > >  		default:
> > > > > >  			return -EINVAL;
> > > > > >  		}
> > > > > > +
> > > > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > > >
> > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.
> > > >
> > > > Emm, it's up to the flags to indicate bind what. It was proposed to
> > > > cover the three cases below:
> > > > a) BIND/UNBIND_GPASID
> > > > b) BIND/UNBIND_GPASID_TABLE
> > > > c) BIND/UNBIND_PROCESS
> > > > <only a) is covered in this patch>
> > > > So it's called VFIO_IOMMU_BIND.
> > >
> > > but aren't they all about PASID related binding?
> >
> > yeah, I can rename it. :-)
> 
> I don't know if anyone intends to implement it, but SMMUv2 supports
> nesting translation without any PASID support. For that case the name
> VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more sense.
> Ideally we'd also use a neutral name for the IOMMU API instead of
> bind_gpasid(), but that's easier to change later.

I agree VFIO_IOMMU_BIND is somewhat unclear. In particular, it may
cause confusion when thinking about VFIO_SET_IOMMU. How about using
VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b), and adding another
VFIO_BIND_PROCESS in the future for the SVA bind case?

Regards,
Yi Liu
Jean-Philippe Brucker April 9, 2020, 8:28 a.m. UTC | #12
On Tue, Apr 07, 2020 at 10:33:25AM +0000, Liu, Yi L wrote:
> Hi Jean,
> 
> From: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Sent: Friday, April 3, 2020 4:35 PM
> > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> > 
> > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote:
> > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > > > >  		default:
> > > > > > >  			return -EINVAL;
> > > > > > >  		}
> > > > > > > +
> > > > > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > > > >
> > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.
> > > > >
> > > > > Emm, it's up to the flags to indicate bind what. It was proposed to
> > > > > cover the three cases below:
> > > > > a) BIND/UNBIND_GPASID
> > > > > b) BIND/UNBIND_GPASID_TABLE
> > > > > c) BIND/UNBIND_PROCESS
> > > > > <only a) is covered in this patch>
> > > > > So it's called VFIO_IOMMU_BIND.
> > > >
> > > > but aren't they all about PASID related binding?
> > >
> > > yeah, I can rename it. :-)
> > 
> > I don't know if anyone intends to implement it, but SMMUv2 supports
> > nesting translation without any PASID support. For that case the name
> > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more sense.
> > Ideally we'd also use a neutral name for the IOMMU API instead of
> > bind_gpasid(), but that's easier to change later.
> 
> I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may
> cause confusion when thinking about VFIO_SET_IOMMU. How about using
> VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another
> VFIO_BIND_PROCESS in future for the SVA bind case.

I think minimizing the number of ioctls is more important than finding the
ideal name. VFIO_IOMMU_BIND was fine to me, but if it's too vague then
rename it to VFIO_IOMMU_BIND_PASID and we'll just piggy-back on it for
non-PASID things (they should be rare enough).

Thanks,
Jean
Yi Liu April 9, 2020, 9:15 a.m. UTC | #13
> From: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Sent: Thursday, April 9, 2020 4:29 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> 
> On Tue, Apr 07, 2020 at 10:33:25AM +0000, Liu, Yi L wrote:
> > Hi Jean,
> >
> > > From: Jean-Philippe Brucker < jean-philippe@linaro.org >
> > > Sent: Friday, April 3, 2020 4:35 PM
> > > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> > >
> > > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote:
> > > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > > > > >  		default:
> > > > > > > >  			return -EINVAL;
> > > > > > > >  		}
> > > > > > > > +
> > > > > > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > > > > >
> > > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.
> > > > > >
> > > > > > Emm, it's up to the flags to indicate bind what. It was proposed to
> > > > > > cover the three cases below:
> > > > > > a) BIND/UNBIND_GPASID
> > > > > > b) BIND/UNBIND_GPASID_TABLE
> > > > > > c) BIND/UNBIND_PROCESS
> > > > > > <only a) is covered in this patch>
> > > > > > So it's called VFIO_IOMMU_BIND.
> > > > >
> > > > > but aren't they all about PASID related binding?
> > > >
> > > > yeah, I can rename it. :-)
> > >
> > > I don't know if anyone intends to implement it, but SMMUv2 supports
> > > nesting translation without any PASID support. For that case the name
> > > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more
> sense.
> > > Ideally we'd also use a neutral name for the IOMMU API instead of
> > > bind_gpasid(), but that's easier to change later.
> >
> > I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may
> > cause confusion when thinking about VFIO_SET_IOMMU. How about using
> > VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another
> > VFIO_BIND_PROCESS in future for the SVA bind case.
> 
> I think minimizing the number of ioctls is more important than finding the
> ideal name. VFIO_IOMMU_BIND was fine to me, but if it's too vague then
> rename it to VFIO_IOMMU_BIND_PASID and we'll just piggy-back on it for
> non-PASID things (they should be rare enough).
Maybe we can start with VFIO_IOMMU_BIND_PASID. Actually, there is
also a discussion on reusing the same ioctl and vfio structure for
pasid_alloc/free, bind/unbind_gpasid, and cache_inv. What's your
opinion?

https://lkml.org/lkml/2020/4/3/833

Regards,
Yi Liu
Jean-Philippe Brucker April 9, 2020, 9:38 a.m. UTC | #14
On Thu, Apr 09, 2020 at 09:15:29AM +0000, Liu, Yi L wrote:
> > From: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Sent: Thursday, April 9, 2020 4:29 PM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > 
> > On Tue, Apr 07, 2020 at 10:33:25AM +0000, Liu, Yi L wrote:
> > > Hi Jean,
> > >
> > > > From: Jean-Philippe Brucker < jean-philippe@linaro.org >
> > > > Sent: Friday, April 3, 2020 4:35 PM
> > > > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> > > >
> > > > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote:
> > > > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > > > > > >  		default:
> > > > > > > > >  			return -EINVAL;
> > > > > > > > >  		}
> > > > > > > > > +
> > > > > > > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > > > > > >
> > > > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.
> > > > > > >
> > > > > > > Emm, it's up to the flags to indicate bind what. It was proposed to
> > > > > > > cover the three cases below:
> > > > > > > a) BIND/UNBIND_GPASID
> > > > > > > b) BIND/UNBIND_GPASID_TABLE
> > > > > > > c) BIND/UNBIND_PROCESS
> > > > > > > <only a) is covered in this patch>
> > > > > > > So it's called VFIO_IOMMU_BIND.
> > > > > >
> > > > > > but aren't they all about PASID related binding?
> > > > >
> > > > > yeah, I can rename it. :-)
> > > >
> > > > I don't know if anyone intends to implement it, but SMMUv2 supports
> > > > nesting translation without any PASID support. For that case the name
> > > > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more
> > sense.
> > > > Ideally we'd also use a neutral name for the IOMMU API instead of
> > > > bind_gpasid(), but that's easier to change later.
> > >
> > > I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may
> > > cause confusion when thinking about VFIO_SET_IOMMU. How about using
> > > VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another
> > > VFIO_BIND_PROCESS in future for the SVA bind case.
> > 
> > I think minimizing the number of ioctls is more important than finding the
> > ideal name. VFIO_IOMMU_BIND was fine to me, but if it's too vague then
> > rename it to VFIO_IOMMU_BIND_PASID and we'll just piggy-back on it for
> > non-PASID things (they should be rare enough).
> maybe we can start with VFIO_IOMMU_BIND_PASID. Actually, there is
> also a discussion on reusing the same ioctl and vfio structure for
> pasid_alloc/free, bind/unbind_gpasid. and cache_inv. how about your
> opinion?

Merging bind with unbind and alloc with free makes sense. I'd leave at
least invalidate as a separate ioctl, because alloc/bind/unbind/free
are setup functions while invalidate is a runtime thing and
performance-sensitive. But I can't see a good reason not to merge them
all together, so either way is fine by me.

Thanks,
Jean
Yi Liu April 11, 2020, 5:52 a.m. UTC | #15
Hi Alex,

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, April 3, 2020 3:57 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> 
> On Sun, 22 Mar 2020 05:32:03 -0700
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > From: Liu Yi L <yi.l.liu@intel.com>
> >
> > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
[...]
> > +/**
> > + * Unbind specific gpasid, caller of this function requires hold
> > + * vfio_iommu->lock
> > + */
> > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	return vfio_iommu_for_each_dev(iommu,
> > +				vfio_unbind_gpasid_fn, gbind_data);
> > +}
> > +
> > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> > +				struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	ret = vfio_iommu_for_each_dev(iommu,
> > +			vfio_bind_gpasid_fn, gbind_data);
> > +	/*
> > +	 * If bind failed, it may not be a total failure. Some devices
> > +	 * within the iommu group may have bind successfully. Although
> > +	 * we don't enable pasid capability for non-singletion iommu
> > +	 * groups, a unbind operation would be helpful to ensure no
> > +	 * partial binding for an iommu group.
> 
> Where was the non-singleton group restriction done, I missed that.
> 
> > +	 */
> > +	if (ret)
> > +		/*
> > +		 * Undo all binds that already succeeded, no need to
> > +		 * check the return value here since some device within
> > +		 * the group has no successful bind when coming to this
> > +		 * place switch.
> > +		 */
> > +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
> 
> However, the for_each_dev function stops when the callback function
> returns error, are we just assuming we stop at the same device as we
> faulted on the first time and that we traverse the same set of devices
> the second time?  It seems strange to me that unbind should be able to
> fail.

I think the code needs enhancement. Although one group per container
and one device per group is the most typical and desired case, the
code here loops over domains, groups, and devices. It should be able
to unwind the prior binds when the loop fails for a device. So I plan
to make the change below for the bind path.

list_for_each_entry(domain, &iommu->domain_list, next) {
	list_for_each_entry(group, &domain->group_list, next) {
		/*
		 * If bind failed on a certain device, the prior successful
		 * binds should be undone. iommu_group_for_each_dev() should be
		 * modified to take two callbacks: one for the forward loop and
		 * one for a reverse loop run on failure.
		 * "return_upon_failure" indicates whether to return upon a
		 * failure during the forward loop; if so,
		 * iommu_group_for_each_dev() should unwind the prior binds in
		 * this iommu group before returning.
		 */
		ret = iommu_group_for_each_dev(iommu_group, bind_gpasid_fn,
					unbind_gpasid_fn, data, return_upon_failure);
		if (ret)
			break;
	}
	if (ret) {
		/* unwind bindings with prior groups */
		list_for_each_entry_continue_reverse(group,
							&domain->group_list, next) {
			iommu_group_for_each_dev(iommu_group, unbind_gpasid_fn,
						NULL, data, ignore_intermediate_failure);
		}
		break;
	}
}

if (ret) {
	/* unwind bindings with prior domains */
	list_for_each_entry_continue_reverse(domain, &iommu->domain_list, next) {
		iommu_group_for_each_dev(iommu_group, unbind_gpasid_fn,
					 NULL, data, ignore_intermediate_failure);
	}
}

return ret;
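
A rough sketch of how the two-callback idea might track progress within
a group (hypothetical helpers, not part of the series; this assumes
iommu_group_for_each_dev() visits devices in a stable order, which is
exactly the concern raised earlier in the thread):

struct domain_capsule_undo {
	struct domain_capsule dc;
	int bound;	/* devices successfully bound so far */
	int seen;	/* devices visited during the unwind pass */
};

static int bind_count_fn(struct device *dev, void *data)
{
	struct domain_capsule_undo *undo = data;
	int ret = vfio_bind_gpasid_fn(dev, &undo->dc);

	if (!ret)
		undo->bound++;
	return ret;
}

static int unbind_first_n_fn(struct device *dev, void *data)
{
	struct domain_capsule_undo *undo = data;

	if (undo->seen++ >= undo->bound)
		return 0;	/* past the last successful bind */
	return vfio_unbind_gpasid_fn(dev, &undo->dc);
}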

Regards,
Yi Liu

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 82a9e0b..a877747 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -130,6 +130,33 @@  struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
+struct domain_capsule {
+	struct iommu_domain *domain;
+	void *data;
+};
+
+/* iommu->lock must be held */
+static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
+		      int (*fn)(struct device *dev, void *data),
+		      void *data)
+{
+	struct domain_capsule dc = {.data = data};
+	struct vfio_domain *d;
+	struct vfio_group *g;
+	int ret = 0;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		dc.domain = d->domain;
+		list_for_each_entry(g, &d->group_list, next) {
+			ret = iommu_group_for_each_dev(g->iommu_group,
+						       &dc, fn);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -2314,6 +2341,88 @@  static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
 	return 0;
 }
 
+static int vfio_bind_gpasid_fn(struct device *dev, void *data)
+{
+	struct domain_capsule *dc = (struct domain_capsule *)data;
+	struct iommu_gpasid_bind_data *gbind_data =
+		(struct iommu_gpasid_bind_data *) dc->data;
+
+	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
+}
+
+static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
+{
+	struct domain_capsule *dc = (struct domain_capsule *)data;
+	struct iommu_gpasid_bind_data *gbind_data =
+		(struct iommu_gpasid_bind_data *) dc->data;
+
+	return iommu_sva_unbind_gpasid(dc->domain, dev,
+					gbind_data->hpasid);
+}
+
+/**
+ * Unbind a specific gpasid; the caller of this function is required
+ * to hold vfio_iommu->lock
+ */
+static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
+				struct iommu_gpasid_bind_data *gbind_data)
+{
+	return vfio_iommu_for_each_dev(iommu,
+				vfio_unbind_gpasid_fn, gbind_data);
+}
+
+static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
+				struct iommu_gpasid_bind_data *gbind_data)
+{
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = vfio_iommu_for_each_dev(iommu,
+			vfio_bind_gpasid_fn, gbind_data);
+	/*
+	 * If bind failed, it may not be a total failure. Some devices
+	 * within the iommu group may have bound successfully. Although
+	 * we don't enable the pasid capability for non-singleton iommu
+	 * groups, an unbind operation is helpful to ensure there is no
+	 * partial binding for an iommu group.
+	 */
+	if (ret)
+		/*
+		 * Undo all binds that already succeeded; no need to
+		 * check the return value here, since some device within
+		 * the group had no successful bind when we reach this
+		 * point.
+		 */
+		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
+
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
+				struct iommu_gpasid_bind_data *gbind_data)
+{
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
+
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -2471,6 +2580,55 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 		default:
 			return -EINVAL;
 		}
+
+	} else if (cmd == VFIO_IOMMU_BIND) {
+		struct vfio_iommu_type1_bind bind;
+		u32 version;
+		int data_size;
+		void *gbind_data;
+		int ret;
+
+		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
+
+		if (copy_from_user(&bind, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (bind.argsz < minsz)
+			return -EINVAL;
+
+		/* Get the version of struct iommu_gpasid_bind_data */
+		if (copy_from_user(&version,
+			(void __user *) (arg + minsz),
+					sizeof(version)))
+			return -EFAULT;
+
+		data_size = iommu_uapi_get_data_size(
+				IOMMU_UAPI_BIND_GPASID, version);
+		/* Reject unknown versions rather than allocating a bogus size */
+		if (data_size <= 0)
+			return -EINVAL;
+		gbind_data = kzalloc(data_size, GFP_KERNEL);
+		if (!gbind_data)
+			return -ENOMEM;
+
+		if (copy_from_user(gbind_data,
+			 (void __user *) (arg + minsz), data_size)) {
+			kfree(gbind_data);
+			return -EFAULT;
+		}
+
+		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
+		case VFIO_IOMMU_BIND_GUEST_PGTBL:
+			ret = vfio_iommu_type1_bind_gpasid(iommu,
+							   gbind_data);
+			break;
+		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
+			ret = vfio_iommu_type1_unbind_gpasid(iommu,
+							     gbind_data);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+		kfree(gbind_data);
+		return ret;
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ebeaf3e..2235bc6 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@ 
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/iommu.h>
 
 #define VFIO_API_VERSION	0
 
@@ -853,6 +854,51 @@  struct vfio_iommu_type1_pasid_request {
  */
 #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 22)
 
+/**
+ * Supported flags:
+ *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to the host
+ *			for nesting type IOMMUs. The @data field carries a
+ *			struct iommu_gpasid_bind_data.
+ *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a guest page table bind
+ *			operation performed with VFIO_IOMMU_BIND_GUEST_PGTBL.
+ *
+ */
+struct vfio_iommu_type1_bind {
+	__u32		argsz;
+	__u32		flags;
+#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)
+#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)
+	__u8		data[];
+};
+
+#define VFIO_IOMMU_BIND_MASK	(VFIO_IOMMU_BIND_GUEST_PGTBL | \
+					VFIO_IOMMU_UNBIND_GUEST_PGTBL)
+
+/**
+ * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
+ *				struct vfio_iommu_type1_bind)
+ *
+ * Manage address spaces of devices in this container. Initially a TYPE1
+ * container can only have one address space, managed with
+ * VFIO_IOMMU_MAP/UNMAP_DMA.
+ *
+ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
+ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
+ * tables, and BIND manages the stage-1 (guest) page tables. Other types of
+ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
+ * traffic that only requires single-stage translation while BIND controls
+ * traffic that requires nested translation. But this depends on the
+ * underlying IOMMU architecture and isn't guaranteed. An example is guest
+ * SVA traffic, which needs nested translation to obtain the gVA->gPA and
+ * then the gPA->hPA translation.
+ *
+ * Availability of this feature depends on the device, its bus, the underlying
+ * IOMMU and the CPU architecture.
+ *
+ * returns: 0 on success, -errno on failure.
+ */
+#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 23)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*
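
Unbinding reuses the same ioctl with the VFIO_IOMMU_UNBIND_GUEST_PGTBL flag. Per vfio_unbind_gpasid_fn() above, only @hpasid is consumed on this path (@version is still read by the ioctl handler to size the copy). A sketch under the same assumptions as the bind example earlier on this page:

/* Sketch: undo a bind established as in bind_guest_pgtbl() above. */
static int unbind_guest_pgtbl(int container, __u32 hpasid)
{
	unsigned char buf[sizeof(struct vfio_iommu_type1_bind) +
			  sizeof(struct iommu_gpasid_bind_data)]
			  __attribute__((aligned(8)));
	struct vfio_iommu_type1_bind *bind = (void *)buf;
	struct iommu_gpasid_bind_data *data = (void *)bind->data;

	memset(buf, 0, sizeof(buf));
	bind->argsz = sizeof(buf);
	bind->flags = VFIO_IOMMU_UNBIND_GUEST_PGTBL;
	data->version = IOMMU_GPASID_BIND_VERSION_1;	/* assumption, see above */
	data->hpasid = hpasid;	/* host PASID whose stage-1 binding is torn down */

	return ioctl(container, VFIO_IOMMU_BIND, buf);
}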