Message ID | 1584880325-10561-7-git-send-email-yi.l.liu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Series | vfio: expose virtual Shared Virtual Addressing to VMs |
Hi Yi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on vfio/next]
[also build test ERROR on v5.6-rc6 next-20200320]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Liu-Yi-L/vfio-expose-virtual-Shared-Virtual-Addressing-to-VMs/20200322-213259
base:   https://github.com/awilliam/linux-vfio.git next
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gcc (GCC) 9.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=9.2.0 make.cross ARCH=arm64

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/vfio/vfio_iommu_type1.c: In function 'vfio_iommu_get_stage1_format':
   drivers/vfio/vfio_iommu_type1.c:2300:4: error: 'DOMAIN_ATTR_PASID_FORMAT' undeclared (first use in this function)
    2300 |    DOMAIN_ATTR_PASID_FORMAT, &format)) {
         |    ^~~~~~~~~~~~~~~~~~~~~~~~
   drivers/vfio/vfio_iommu_type1.c:2300:4: note: each undeclared identifier is reported only once for each function it appears in
   drivers/vfio/vfio_iommu_type1.c: In function 'vfio_iommu_type1_ioctl':
   drivers/vfio/vfio_iommu_type1.c:2464:11: error: implicit declaration of function 'iommu_get_uapi_version' [-Werror=implicit-function-declaration]
    2464 |   return iommu_get_uapi_version();
         |          ^~~~~~~~~~~~~~~~~~~~~~
>> drivers/vfio/vfio_iommu_type1.c:2626:15: error: implicit declaration of function 'iommu_uapi_get_data_size' [-Werror=implicit-function-declaration]
    2626 |   data_size = iommu_uapi_get_data_size(
         |               ^~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/vfio/vfio_iommu_type1.c:2627:5: error: 'IOMMU_UAPI_BIND_GPASID' undeclared (first use in this function)
    2627 |     IOMMU_UAPI_BIND_GPASID, version);
         |     ^~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/iommu_uapi_get_data_size +2626 drivers/vfio/vfio_iommu_type1.c

  2446
  2447  static long vfio_iommu_type1_ioctl(void *iommu_data,
  2448                                     unsigned int cmd, unsigned long arg)
  2449  {
  2450          struct vfio_iommu *iommu = iommu_data;
  2451          unsigned long minsz;
  2452
  2453          if (cmd == VFIO_CHECK_EXTENSION) {
  2454                  switch (arg) {
  2455                  case VFIO_TYPE1_IOMMU:
  2456                  case VFIO_TYPE1v2_IOMMU:
  2457                  case VFIO_TYPE1_NESTING_IOMMU:
  2458                          return 1;
  2459                  case VFIO_DMA_CC_IOMMU:
  2460                          if (!iommu)
  2461                                  return 0;
  2462                          return vfio_domains_have_iommu_cache(iommu);
  2463                  case VFIO_NESTING_IOMMU_UAPI:
  2464                          return iommu_get_uapi_version();
  2465                  default:
  2466                          return 0;
  2467                  }
  2468          } else if (cmd == VFIO_IOMMU_GET_INFO) {
  2469                  struct vfio_iommu_type1_info info;
  2470                  struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
  2471                  unsigned long capsz;
  2472                  int ret;
  2473
  2474                  minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
  2475
  2476                  /* For backward compatibility, cannot require this */
  2477                  capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
  2478
  2479                  if (copy_from_user(&info, (void __user *)arg, minsz))
  2480                          return -EFAULT;
  2481
  2482                  if (info.argsz < minsz)
  2483                          return -EINVAL;
  2484
  2485                  if (info.argsz >= capsz) {
  2486                          minsz = capsz;
  2487                          info.cap_offset = 0; /* output, no-recopy necessary */
  2488                  }
  2489
  2490                  info.flags = VFIO_IOMMU_INFO_PGSIZES;
  2491
  2492                  info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
  2493
  2494                  ret = vfio_iommu_iova_build_caps(iommu, &caps);
  2495                  if (ret)
  2496                          return ret;
  2497
  2498                  ret = vfio_iommu_info_add_nesting_cap(iommu, &caps);
  2499                  if (ret)
  2500                          return ret;
  2501
  2502                  if (caps.size) {
  2503                          info.flags |= VFIO_IOMMU_INFO_CAPS;
  2504
  2505                          if (info.argsz < sizeof(info) + caps.size) {
  2506                                  info.argsz = sizeof(info) + caps.size;
  2507                          } else {
  2508                                  vfio_info_cap_shift(&caps, sizeof(info));
  2509                                  if (copy_to_user((void __user *)arg +
  2510                                                   sizeof(info), caps.buf,
  2511                                                   caps.size)) {
  2512                                          kfree(caps.buf);
  2513                                          return -EFAULT;
  2514                                  }
  2515                                  info.cap_offset = sizeof(info);
  2516                          }
  2517
  2518                          kfree(caps.buf);
  2519                  }
  2520
  2521                  return copy_to_user((void __user *)arg, &info, minsz) ?
  2522                          -EFAULT : 0;
  2523
  2524          } else if (cmd == VFIO_IOMMU_MAP_DMA) {
  2525                  struct vfio_iommu_type1_dma_map map;
  2526                  uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
  2527                                  VFIO_DMA_MAP_FLAG_WRITE;
  2528
  2529                  minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
  2530
  2531                  if (copy_from_user(&map, (void __user *)arg, minsz))
  2532                          return -EFAULT;
  2533
  2534                  if (map.argsz < minsz || map.flags & ~mask)
  2535                          return -EINVAL;
  2536
  2537                  return vfio_dma_do_map(iommu, &map);
  2538
  2539          } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
  2540                  struct vfio_iommu_type1_dma_unmap unmap;
  2541                  long ret;
  2542
  2543                  minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
  2544
  2545                  if (copy_from_user(&unmap, (void __user *)arg, minsz))
  2546                          return -EFAULT;
  2547
  2548                  if (unmap.argsz < minsz || unmap.flags)
  2549                          return -EINVAL;
  2550
  2551                  ret = vfio_dma_do_unmap(iommu, &unmap);
  2552                  if (ret)
  2553                          return ret;
  2554
  2555                  return copy_to_user((void __user *)arg, &unmap, minsz) ?
  2556                          -EFAULT : 0;
  2557
  2558          } else if (cmd == VFIO_IOMMU_PASID_REQUEST) {
  2559                  struct vfio_iommu_type1_pasid_request req;
  2560                  unsigned long offset;
  2561
  2562                  minsz = offsetofend(struct vfio_iommu_type1_pasid_request,
  2563                                      flags);
  2564
  2565                  if (copy_from_user(&req, (void __user *)arg, minsz))
  2566                          return -EFAULT;
  2567
  2568                  if (req.argsz < minsz ||
  2569                      !vfio_iommu_type1_pasid_req_valid(req.flags))
  2570                          return -EINVAL;
  2571
  2572                  if (copy_from_user((void *)&req + minsz,
  2573                                     (void __user *)arg + minsz,
  2574                                     sizeof(req) - minsz))
  2575                          return -EFAULT;
  2576
  2577                  switch (req.flags & VFIO_PASID_REQUEST_MASK) {
  2578                  case VFIO_IOMMU_PASID_ALLOC:
  2579                  {
  2580                          int ret = 0, result;
  2581
  2582                          result = vfio_iommu_type1_pasid_alloc(iommu,
  2583                                          req.alloc_pasid.min,
  2584                                          req.alloc_pasid.max);
  2585                          if (result > 0) {
  2586                                  offset = offsetof(
  2587                                          struct vfio_iommu_type1_pasid_request,
  2588                                          alloc_pasid.result);
  2589                                  ret = copy_to_user(
  2590                                          (void __user *) (arg + offset),
  2591                                          &result, sizeof(result));
  2592                          } else {
  2593                                  pr_debug("%s: PASID alloc failed\n", __func__);
  2594                                  ret = -EFAULT;
  2595                          }
  2596                          return ret;
  2597                  }
  2598                  case VFIO_IOMMU_PASID_FREE:
  2599                          return vfio_iommu_type1_pasid_free(iommu,
  2600                                                             req.free_pasid);
  2601                  default:
  2602                          return -EINVAL;
  2603                  }
  2604
  2605          } else if (cmd == VFIO_IOMMU_BIND) {
  2606                  struct vfio_iommu_type1_bind bind;
  2607                  u32 version;
  2608                  int data_size;
  2609                  void *gbind_data;
  2610                  int ret;
  2611
  2612                  minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
  2613
  2614                  if (copy_from_user(&bind, (void __user *)arg, minsz))
  2615                          return -EFAULT;
  2616
  2617                  if (bind.argsz < minsz)
  2618                          return -EINVAL;
  2619
  2620                  /* Get the version of struct iommu_gpasid_bind_data */
  2621                  if (copy_from_user(&version,
  2622                                     (void __user *) (arg + minsz),
  2623                                     sizeof(version)))
  2624                          return -EFAULT;
  2625
> 2626                  data_size = iommu_uapi_get_data_size(
> 2627                                  IOMMU_UAPI_BIND_GPASID, version);
  2628                  gbind_data = kzalloc(data_size, GFP_KERNEL);
  2629                  if (!gbind_data)
  2630                          return -ENOMEM;
  2631
  2632                  if (copy_from_user(gbind_data,
  2633                                     (void __user *) (arg + minsz), data_size)) {
  2634                          kfree(gbind_data);
  2635                          return -EFAULT;
  2636                  }
  2637
  2638                  switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
  2639                  case VFIO_IOMMU_BIND_GUEST_PGTBL:
  2640                          ret = vfio_iommu_type1_bind_gpasid(iommu,
  2641                                                             gbind_data);
  2642                          break;
  2643                  case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
  2644                          ret = vfio_iommu_type1_unbind_gpasid(iommu,
  2645                                                               gbind_data);
  2646                          break;
  2647                  default:
  2648                          ret = -EINVAL;
  2649                          break;
  2650                  }
  2651                  kfree(gbind_data);
  2652                  return ret;
  2653          }
  2654
  2655          return -ENOTTY;
  2656  }
  2657

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
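All four undeclared symbols in the report above (DOMAIN_ATTR_PASID_FORMAT, iommu_get_uapi_version(), iommu_uapi_get_data_size() and IOMMU_UAPI_BIND_GPASID) belong to the IOMMU UAPI side of the vSVA work that this series builds on, so the failure points at a missing prerequisite series in the tested tree rather than at a bug inside this patch alone. As a rough sketch, the build expects declarations along these lines in include/linux/iommu.h; the signatures are inferred from the call sites above, not copied from the actual dependency series:

	/* Sketch only: inferred from the call sites in vfio_iommu_type1.c. */
	enum iommu_uapi_data_type {
		IOMMU_UAPI_BIND_GPASID,
		IOMMU_UAPI_NR_DATA_TYPES,
	};

	/*
	 * DOMAIN_ATTR_PASID_FORMAT would be a new enum iommu_attr value,
	 * presumably queried in vfio_iommu_get_stage1_format().
	 */

	int iommu_get_uapi_version(void);
	int iommu_uapi_get_data_size(int type, u32 version);

With declarations (and definitions) of this shape in the tree, the arm64-defconfig build above would get past the implicit-declaration errors.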
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Sunday, March 22, 2020 8:32 PM > > From: Liu Yi L <yi.l.liu@intel.com> > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by > hardware > IOMMUs that have nesting DMA translation (a.k.a dual stage address > translation). For such hardware IOMMUs, there are two stages/levels of > address translation, and software may let userspace/VM to own the first- > level/stage-1 translation structures. Example of such usage is vSVA ( > virtual Shared Virtual Addressing). VM owns the first-level/stage-1 > translation structures and bind the structures to host, then hardware > IOMMU would utilize nesting translation when doing DMA translation fo > the devices behind such hardware IOMMU. > > This patch adds vfio support for binding guest translation (a.k.a stage 1) > structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only > bind > guest page table is needed, it also requires to expose interface to guest > for iommu cache invalidation when guest modified the first-level/stage-1 > translation structures since hardware needs to be notified to flush stale > iotlbs. This would be introduced in next patch. > > In this patch, guest page table bind and unbind are done by using flags > VFIO_IOMMU_BIND_GUEST_PGTBL and > VFIO_IOMMU_UNBIND_GUEST_PGTBL under IOCTL > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by > struct iommu_gpasid_bind_data. Before binding guest page table to host, > VM should have got a PASID allocated by host via > VFIO_IOMMU_PASID_REQUEST. > > Bind guest translation structures (here is guest page table) to host Bind -> Binding > are the first step to setup vSVA (Virtual Shared Virtual Addressing). are -> is. and you already explained vSVA earlier. > > Cc: Kevin Tian <kevin.tian@intel.com> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > Cc: Alex Williamson <alex.williamson@redhat.com> > Cc: Eric Auger <eric.auger@redhat.com> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > --- > drivers/vfio/vfio_iommu_type1.c | 158 > ++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 46 ++++++++++++ > 2 files changed, 204 insertions(+) > > diff --git a/drivers/vfio/vfio_iommu_type1.c > b/drivers/vfio/vfio_iommu_type1.c > index 82a9e0b..a877747 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -130,6 +130,33 @@ struct vfio_regions { > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > (!list_empty(&iommu->domain_list)) > > +struct domain_capsule { > + struct iommu_domain *domain; > + void *data; > +}; > + > +/* iommu->lock must be held */ > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > + int (*fn)(struct device *dev, void *data), > + void *data) > +{ > + struct domain_capsule dc = {.data = data}; > + struct vfio_domain *d; > + struct vfio_group *g; > + int ret = 0; > + > + list_for_each_entry(d, &iommu->domain_list, next) { > + dc.domain = d->domain; > + list_for_each_entry(g, &d->group_list, next) { > + ret = iommu_group_for_each_dev(g->iommu_group, > + &dc, fn); > + if (ret) > + break; > + } > + } > + return ret; > +} > + > static int put_pfn(unsigned long pfn, int prot); > > /* > @@ -2314,6 +2341,88 @@ static int > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu, > return 0; > } > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > +{ > + struct 
domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_gpasid_bind_data *gbind_data = > + (struct iommu_gpasid_bind_data *) dc->data; > + In Jacob's vSVA iommu series, [PATCH 06/11]: + /* REVISIT: upper layer/VFIO can track host process that bind the PASID. + * ioasid_set = mm might be sufficient for vfio to check pasid VMM + * ownership. + */ I asked him who exactly should be responsible for tracking the pasid ownership. Although no response yet, I expect vfio/iommu can have a clear policy and also documented here to provide consistent message. > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); > +} > + > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_gpasid_bind_data *gbind_data = > + (struct iommu_gpasid_bind_data *) dc->data; > + > + return iommu_sva_unbind_gpasid(dc->domain, dev, > + gbind_data->hpasid); curious why we have to share the same bind_data structure between bind and unbind, especially when unbind requires only one field? I didn't see a clear reason, and just similar to earlier ALLOC/FREE which don't share structure either. Current way simply wastes space for unbind operation... > +} > + > +/** > + * Unbind specific gpasid, caller of this function requires hold > + * vfio_iommu->lock > + */ > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu > *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + return vfio_iommu_for_each_dev(iommu, > + vfio_unbind_gpasid_fn, gbind_data); > +} > + > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + int ret = 0; > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + ret = vfio_iommu_for_each_dev(iommu, > + vfio_bind_gpasid_fn, gbind_data); > + /* > + * If bind failed, it may not be a total failure. Some devices > + * within the iommu group may have bind successfully. Although > + * we don't enable pasid capability for non-singletion iommu > + * groups, a unbind operation would be helpful to ensure no > + * partial binding for an iommu group. > + */ > + if (ret) > + /* > + * Undo all binds that already succeeded, no need to binds -> bindings > + * check the return value here since some device within > + * the group has no successful bind when coming to this > + * place switch. > + */ remove 'switch' > + vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + int ret = 0; > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > static long vfio_iommu_type1_ioctl(void *iommu_data, > unsigned int cmd, unsigned long arg) > { > @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > default: > return -EINVAL; > } > + > + } else if (cmd == VFIO_IOMMU_BIND) { BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me. 
> + struct vfio_iommu_type1_bind bind; > + u32 version; > + int data_size; > + void *gbind_data; > + int ret; > + > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz) > + return -EINVAL; > + > + /* Get the version of struct iommu_gpasid_bind_data */ > + if (copy_from_user(&version, > + (void __user *) (arg + minsz), > + sizeof(version))) > + return -EFAULT; > + > + data_size = iommu_uapi_get_data_size( > + IOMMU_UAPI_BIND_GPASID, version); > + gbind_data = kzalloc(data_size, GFP_KERNEL); > + if (!gbind_data) > + return -ENOMEM; > + > + if (copy_from_user(gbind_data, > + (void __user *) (arg + minsz), data_size)) { > + kfree(gbind_data); > + return -EFAULT; > + } > + > + switch (bind.flags & VFIO_IOMMU_BIND_MASK) { > + case VFIO_IOMMU_BIND_GUEST_PGTBL: > + ret = vfio_iommu_type1_bind_gpasid(iommu, > + gbind_data); > + break; > + case VFIO_IOMMU_UNBIND_GUEST_PGTBL: > + ret = vfio_iommu_type1_unbind_gpasid(iommu, > + gbind_data); > + break; > + default: > + ret = -EINVAL; > + break; > + } > + kfree(gbind_data); > + return ret; > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index ebeaf3e..2235bc6 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -14,6 +14,7 @@ > > #include <linux/types.h> > #include <linux/ioctl.h> > +#include <linux/iommu.h> > > #define VFIO_API_VERSION 0 > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request { > */ > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + > 22) > > +/** > + * Supported flags: > + * - VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host > for > + * nesting type IOMMUs. In @data field It takes struct > + * iommu_gpasid_bind_data. > + * - VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page > table operation > + * invoked by VFIO_IOMMU_BIND_GUEST_PGTBL. > + * > + */ > +struct vfio_iommu_type1_bind { > + __u32 argsz; > + __u32 flags; > +#define VFIO_IOMMU_BIND_GUEST_PGTBL (1 << 0) > +#define VFIO_IOMMU_UNBIND_GUEST_PGTBL (1 << 1) > + __u8 data[]; > +}; > + > +#define VFIO_IOMMU_BIND_MASK (VFIO_IOMMU_BIND_GUEST_PGTBL > | \ > + > VFIO_IOMMU_UNBIND_GUEST_PGTBL) > + > +/** > + * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23, > + * struct vfio_iommu_type1_bind) > + * > + * Manage address spaces of devices in this container. Initially a TYPE1 > + * container can only have one address space, managed with > + * VFIO_IOMMU_MAP/UNMAP_DMA. the last sentence seems irrelevant and more suitable in commit msg. > + * > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by > both MAP/UNMAP > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) > page > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of Are "other types" the counterpart to VFIO_TYPE1_NESTING_IOMMU? What are those types? I thought only NESTING_IOMMU allows two stage translation... > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where The first sentence said the same thing. Then what is the exact difference? > MAP/UNMAP controls > + * the traffics only require single stage translation while BIND controls the > + * traffics require nesting translation. But this depends on the underlying > + * IOMMU architecture and isn't guaranteed. Example of this is the guest > SVA > + * traffics, such traffics need nesting translation to gain gVA->gPA and then > + * gPA->hPA translation. 
I'm a bit confused about the content since "other types of". Are they trying
to state some exceptions/corner cases that this API cannot resolve or explain
the desired behavior of the API? Especially the last example, which is worded
as if the example for "isn't guaranteed" but isn't guest SVA the main purpose
of this API?

> + *
> + * Availability of this feature depends on the device, its bus, the underlying
> + * IOMMU and the CPU architecture.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>
>  /*
> --
> 2.7.4
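To make the flow under review concrete, here is a hedged userspace sketch of how a VMM might drive this interface once a host PASID has been obtained through VFIO_IOMMU_PASID_REQUEST. The layout of struct vfio_iommu_type1_bind and its flags come from the patch above; struct iommu_gpasid_bind_data belongs to the separate IOMMU UAPI series, so only the few fields used here are assumed (the field names and the version value 1 are illustrative, not authoritative):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/ioctl.h>

	#define VFIO_TYPE		(';')
	#define VFIO_BASE		100
	#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 23)

	/* Abbreviated, assumed layout of the bind data from the IOMMU UAPI
	 * series; only the fields this sketch touches are listed.
	 */
	struct iommu_gpasid_bind_data {
		uint32_t version;	/* first field: the kernel reads it to size the copy */
		uint32_t format;
		uint64_t flags;
		uint64_t gpgd;		/* guest stage-1 page-table root */
		uint64_t hpasid;	/* host PASID from VFIO_IOMMU_PASID_REQUEST */
		uint64_t gpasid;	/* PASID used by the guest */
		/* ...the real struct carries further, vendor-specific fields... */
	};

	/* From the patch: fixed header followed by the versioned @data[]. */
	struct vfio_iommu_type1_bind {
		uint32_t argsz;
		uint32_t flags;
	#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)
	#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)
		uint8_t data[];
	};

	static int bind_guest_pgtbl(int container_fd, uint64_t hpasid,
				    uint64_t gpasid, uint64_t gpgd)
	{
		_Alignas(uint64_t) char buf[sizeof(struct vfio_iommu_type1_bind) +
					    sizeof(struct iommu_gpasid_bind_data)] = { 0 };
		struct vfio_iommu_type1_bind *bind = (void *)buf;
		struct iommu_gpasid_bind_data *gbind = (void *)bind->data;

		bind->argsz = sizeof(buf);
		bind->flags = VFIO_IOMMU_BIND_GUEST_PGTBL;
		gbind->version = 1;	/* assumed current UAPI version */
		gbind->hpasid = hpasid;
		gbind->gpasid = gpasid;
		gbind->gpgd = gpgd;

		return ioctl(container_fd, VFIO_IOMMU_BIND, bind);
	}

The same buffer with VFIO_IOMMU_UNBIND_GUEST_PGTBL in bind->flags would undo the binding, which is why the rest of the thread debates whether unbind really needs the full structure.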
> From: Tian, Kevin <kevin.tian@intel.com> > Sent: Monday, March 30, 2020 8:46 PM > Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > > From: Liu, Yi L <yi.l.liu@intel.com> > > Sent: Sunday, March 22, 2020 8:32 PM > > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware > > IOMMUs that have nesting DMA translation (a.k.a dual stage address > > translation). For such hardware IOMMUs, there are two stages/levels of > > address translation, and software may let userspace/VM to own the > > first- > > level/stage-1 translation structures. Example of such usage is vSVA ( > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1 > > translation structures and bind the structures to host, then hardware > > IOMMU would utilize nesting translation when doing DMA translation fo > > the devices behind such hardware IOMMU. > > > > This patch adds vfio support for binding guest translation (a.k.a > > stage 1) structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, > > not only bind guest page table is needed, it also requires to expose > > interface to guest for iommu cache invalidation when guest modified > > the first-level/stage-1 translation structures since hardware needs to > > be notified to flush stale iotlbs. This would be introduced in next > > patch. > > > > In this patch, guest page table bind and unbind are done by using > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and > VFIO_IOMMU_UNBIND_GUEST_PGTBL > > under IOCTL VFIO_IOMMU_BIND, the bind/unbind data are conveyed by > > struct iommu_gpasid_bind_data. Before binding guest page table to > > host, VM should have got a PASID allocated by host via > > VFIO_IOMMU_PASID_REQUEST. > > > > Bind guest translation structures (here is guest page table) to host > > Bind -> Binding got it. > > are the first step to setup vSVA (Virtual Shared Virtual Addressing). > > are -> is. and you already explained vSVA earlier. oh yes, it is. 
> > > > Cc: Kevin Tian <kevin.tian@intel.com> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > Cc: Alex Williamson <alex.williamson@redhat.com> > > Cc: Eric Auger <eric.auger@redhat.com> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > --- > > drivers/vfio/vfio_iommu_type1.c | 158 > > ++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/vfio.h | 46 ++++++++++++ > > 2 files changed, 204 insertions(+) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -130,6 +130,33 @@ struct vfio_regions { > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > (!list_empty(&iommu->domain_list)) > > > > +struct domain_capsule { > > + struct iommu_domain *domain; > > + void *data; > > +}; > > + > > +/* iommu->lock must be held */ > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > > + int (*fn)(struct device *dev, void *data), > > + void *data) > > +{ > > + struct domain_capsule dc = {.data = data}; > > + struct vfio_domain *d; > > + struct vfio_group *g; > > + int ret = 0; > > + > > + list_for_each_entry(d, &iommu->domain_list, next) { > > + dc.domain = d->domain; > > + list_for_each_entry(g, &d->group_list, next) { > > + ret = iommu_group_for_each_dev(g->iommu_group, > > + &dc, fn); > > + if (ret) > > + break; > > + } > > + } > > + return ret; > > +} > > + > > static int put_pfn(unsigned long pfn, int prot); > > > > /* > > @@ -2314,6 +2341,88 @@ static int > > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu, > > return 0; > > } > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) { > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_gpasid_bind_data *gbind_data = > > + (struct iommu_gpasid_bind_data *) dc->data; > > + > > In Jacob's vSVA iommu series, [PATCH 06/11]: > > + /* REVISIT: upper layer/VFIO can track host process that bind the > PASID. > + * ioasid_set = mm might be sufficient for vfio to check pasid VMM > + * ownership. > + */ > > I asked him who exactly should be responsible for tracking the pasid ownership. > Although no response yet, I expect vfio/iommu can have a clear policy and also > documented here to provide consistent message. yep. > > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); } > > + > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) { > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_gpasid_bind_data *gbind_data = > > + (struct iommu_gpasid_bind_data *) dc->data; > > + > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > + gbind_data->hpasid); > > curious why we have to share the same bind_data structure between bind and > unbind, especially when unbind requires only one field? I didn't see a clear reason, > and just similar to earlier ALLOC/FREE which don't share structure either. > Current way simply wastes space for unbind operation... no special reason today. But the gIOVA support over nested translation is in plan, it may require a flag to indicate it as guest iommu driver may user a single PASID value(RID2PASID) for all devices in guest side. Especially if the RID2PASID value used for IOVA the the same with host side. 
So adding a flag to indicate the binding is for IOVA is helpful. For PF/VF, iommu driver just bind with the host side's RID2PASID. While for ADI (Assignable Device Interface), vfio layer needs to figure out the default PASID stored in the aux-domain, and then iommu driver bind gIOVA table to the default PASID. The potential flag is required in both bind and unbind path. As such, it would be better to share the structure. > > +} > > + > > +/** > > + * Unbind specific gpasid, caller of this function requires hold > > + * vfio_iommu->lock > > + */ > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu > > *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) { > > + return vfio_iommu_for_each_dev(iommu, > > + vfio_unbind_gpasid_fn, gbind_data); } > > + > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) { > > + int ret = 0; > > + > > + mutex_lock(&iommu->lock); > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + ret = vfio_iommu_for_each_dev(iommu, > > + vfio_bind_gpasid_fn, gbind_data); > > + /* > > + * If bind failed, it may not be a total failure. Some devices > > + * within the iommu group may have bind successfully. Although > > + * we don't enable pasid capability for non-singletion iommu > > + * groups, a unbind operation would be helpful to ensure no > > + * partial binding for an iommu group. > > + */ > > + if (ret) > > + /* > > + * Undo all binds that already succeeded, no need to > > binds -> bindings got it. > > > + * check the return value here since some device within > > + * the group has no successful bind when coming to this > > + * place switch. > > + */ > > remove 'switch' oh, yes. > > > + vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > + > > +out_unlock: > > + mutex_unlock(&iommu->lock); > > + return ret; > > +} > > + > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) { > > + int ret = 0; > > + > > + mutex_lock(&iommu->lock); > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > + > > +out_unlock: > > + mutex_unlock(&iommu->lock); > > + return ret; > > +} > > + > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > unsigned int cmd, unsigned long arg) { @@ - > 2471,6 +2580,55 @@ > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > default: > > return -EINVAL; > > } > > + > > + } else if (cmd == VFIO_IOMMU_BIND) { > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me. Emm, it's up to the flags to indicate bind what. It was proposed to cover the three cases below: a) BIND/UNBIND_GPASID b) BIND/UNBIND_GPASID_TABLE c) BIND/UNBIND_PROCESS <only a) is covered in this patch> So it's called VFIO_IOMMU_BIND. 
> > > + struct vfio_iommu_type1_bind bind; > > + u32 version; > > + int data_size; > > + void *gbind_data; > > + int ret; > > + > > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz) > > + return -EINVAL; > > + > > + /* Get the version of struct iommu_gpasid_bind_data */ > > + if (copy_from_user(&version, > > + (void __user *) (arg + minsz), > > + sizeof(version))) > > + return -EFAULT; > > + > > + data_size = iommu_uapi_get_data_size( > > + IOMMU_UAPI_BIND_GPASID, version); > > + gbind_data = kzalloc(data_size, GFP_KERNEL); > > + if (!gbind_data) > > + return -ENOMEM; > > + > > + if (copy_from_user(gbind_data, > > + (void __user *) (arg + minsz), data_size)) { > > + kfree(gbind_data); > > + return -EFAULT; > > + } > > + > > + switch (bind.flags & VFIO_IOMMU_BIND_MASK) { > > + case VFIO_IOMMU_BIND_GUEST_PGTBL: > > + ret = vfio_iommu_type1_bind_gpasid(iommu, > > + gbind_data); > > + break; > > + case VFIO_IOMMU_UNBIND_GUEST_PGTBL: > > + ret = vfio_iommu_type1_unbind_gpasid(iommu, > > + gbind_data); > > + break; > > + default: > > + ret = -EINVAL; > > + break; > > + } > > + kfree(gbind_data); > > + return ret; > > } > > > > return -ENOTTY; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index ebeaf3e..2235bc6 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -14,6 +14,7 @@ > > > > #include <linux/types.h> > > #include <linux/ioctl.h> > > +#include <linux/iommu.h> > > > > #define VFIO_API_VERSION 0 > > > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request { > > */ > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + > > 22) > > > > +/** > > + * Supported flags: > > + * - VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host > > for > > + * nesting type IOMMUs. In @data field It takes struct > > + * iommu_gpasid_bind_data. > > + * - VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page > > table operation > > + * invoked by VFIO_IOMMU_BIND_GUEST_PGTBL. > > + * > > + */ > > +struct vfio_iommu_type1_bind { > > + __u32 argsz; > > + __u32 flags; > > +#define VFIO_IOMMU_BIND_GUEST_PGTBL (1 << 0) > > +#define VFIO_IOMMU_UNBIND_GUEST_PGTBL (1 << 1) > > + __u8 data[]; > > +}; > > + > > +#define VFIO_IOMMU_BIND_MASK (VFIO_IOMMU_BIND_GUEST_PGTBL > > | \ > > + > > VFIO_IOMMU_UNBIND_GUEST_PGTBL) > > + > > +/** > > + * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23, > > + * struct vfio_iommu_type1_bind) > > + * > > + * Manage address spaces of devices in this container. Initially a > > +TYPE1 > > + * container can only have one address space, managed with > > + * VFIO_IOMMU_MAP/UNMAP_DMA. > > the last sentence seems irrelevant and more suitable in commit msg. oh, I could remove it. > > + * > > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by > > both MAP/UNMAP > > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 > > + (host) > > page > > + * tables, and BIND manages the stage-1 (guest) page tables. Other > > + types of > > Are "other types" the counterpart to VFIO_TYPE1_NESTING_IOMMU? > What are those types? I thought only NESTING_IOMMU allows two stage > translation... it's a mistake... please ignore this message. would correct it in next version. > > > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where > > The first sentence said the same thing. Then what is the exact difference? this sentence were added by mistake. will correct it. 
> > MAP/UNMAP controls
> > + * the traffics only require single stage translation while BIND controls the
> > + * traffics require nesting translation. But this depends on the underlying
> > + * IOMMU architecture and isn't guaranteed. Example of this is the guest SVA
> > + * traffics, such traffics need nesting translation to gain gVA->gPA and then
> > + * gPA->hPA translation.
>
> I'm a bit confused about the content since "other types of". Are they trying
> to state some exceptions/corner cases that this API cannot resolve or explain
> the desired behavior of the API? Especially the last example, which is worded
> as if the example for "isn't guaranteed" but isn't guest SVA the main purpose
> of this API?

I think the description in the original patch is bad, especially with the
"other types" phrase. How about the below description?

/**
 * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
 *			struct vfio_iommu_type1_bind)
 *
 * Manage address spaces of devices in this container when it's an IOMMU
 * of type VFIO_TYPE1_NESTING_IOMMU. Such type IOMMU allows MAP/UNMAP and
 * BIND to coexist, where MAP/UNMAP controls the traffics only require
 * single stage translation while BIND controls the traffics require nesting
 * translation.
 *
 * Availability of this feature depends on the device, its bus, the underlying
 * IOMMU and the CPU architecture.
 *
 * returns: 0 on success, -errno on failure.
 */

Regards,
Yi Liu
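Earlier in this reply the VFIO_IOMMU_BIND name is defended by listing three prospective uses. Sketched as a flag space, that plan could look like the following, where only the first pair exists in this patch and the other two values are invented here purely for illustration:

	/* Flags for VFIO_IOMMU_BIND; only case a) is real in this series. */
	#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)	/* a) bind guest page table */
	#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)	/* a) undo the bind         */
	#define VFIO_IOMMU_BIND_PASID_TABLE	(1 << 2)	/* b) hypothetical          */
	#define VFIO_IOMMU_BIND_PROCESS		(1 << 3)	/* c) hypothetical          */

A single ioctl with a flag-selected payload also matches how the handler above dispatches on bind.flags & VFIO_IOMMU_BIND_MASK.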
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Wednesday, April 1, 2020 5:13 PM > > > From: Tian, Kevin <kevin.tian@intel.com> > > Sent: Monday, March 30, 2020 8:46 PM > > Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > > > > From: Liu, Yi L <yi.l.liu@intel.com> > > > Sent: Sunday, March 22, 2020 8:32 PM > > > > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by > hardware > > > IOMMUs that have nesting DMA translation (a.k.a dual stage address > > > translation). For such hardware IOMMUs, there are two stages/levels of > > > address translation, and software may let userspace/VM to own the > > > first- > > > level/stage-1 translation structures. Example of such usage is vSVA ( > > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1 > > > translation structures and bind the structures to host, then hardware > > > IOMMU would utilize nesting translation when doing DMA translation fo > > > the devices behind such hardware IOMMU. > > > > > > This patch adds vfio support for binding guest translation (a.k.a > > > stage 1) structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, > > > not only bind guest page table is needed, it also requires to expose > > > interface to guest for iommu cache invalidation when guest modified > > > the first-level/stage-1 translation structures since hardware needs to > > > be notified to flush stale iotlbs. This would be introduced in next > > > patch. > > > > > > In this patch, guest page table bind and unbind are done by using > > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and > > VFIO_IOMMU_UNBIND_GUEST_PGTBL > > > under IOCTL VFIO_IOMMU_BIND, the bind/unbind data are conveyed by > > > struct iommu_gpasid_bind_data. Before binding guest page table to > > > host, VM should have got a PASID allocated by host via > > > VFIO_IOMMU_PASID_REQUEST. > > > > > > Bind guest translation structures (here is guest page table) to host > > > > Bind -> Binding > got it. > > > are the first step to setup vSVA (Virtual Shared Virtual Addressing). > > > > are -> is. and you already explained vSVA earlier. > oh yes, it is. 
> > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > Cc: Alex Williamson <alex.williamson@redhat.com> > > > Cc: Eric Auger <eric.auger@redhat.com> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > --- > > > drivers/vfio/vfio_iommu_type1.c | 158 > > > ++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/vfio.h | 46 ++++++++++++ > > > 2 files changed, 204 insertions(+) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -130,6 +130,33 @@ struct vfio_regions { > > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > > (!list_empty(&iommu->domain_list)) > > > > > > +struct domain_capsule { > > > + struct iommu_domain *domain; > > > + void *data; > > > +}; > > > + > > > +/* iommu->lock must be held */ > > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > > > + int (*fn)(struct device *dev, void *data), > > > + void *data) > > > +{ > > > + struct domain_capsule dc = {.data = data}; > > > + struct vfio_domain *d; > > > + struct vfio_group *g; > > > + int ret = 0; > > > + > > > + list_for_each_entry(d, &iommu->domain_list, next) { > > > + dc.domain = d->domain; > > > + list_for_each_entry(g, &d->group_list, next) { > > > + ret = iommu_group_for_each_dev(g->iommu_group, > > > + &dc, fn); > > > + if (ret) > > > + break; > > > + } > > > + } > > > + return ret; > > > +} > > > + > > > static int put_pfn(unsigned long pfn, int prot); > > > > > > /* > > > @@ -2314,6 +2341,88 @@ static int > > > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu, > > > return 0; > > > } > > > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) { > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_gpasid_bind_data *gbind_data = > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > + > > > > In Jacob's vSVA iommu series, [PATCH 06/11]: > > > > + /* REVISIT: upper layer/VFIO can track host process that bind > the > > PASID. > > + * ioasid_set = mm might be sufficient for vfio to check pasid > VMM > > + * ownership. > > + */ > > > > I asked him who exactly should be responsible for tracking the pasid > ownership. > > Although no response yet, I expect vfio/iommu can have a clear policy and > also > > documented here to provide consistent message. > > yep. > > > > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); } > > > + > > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) { > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_gpasid_bind_data *gbind_data = > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > + > > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > > + gbind_data->hpasid); > > > > curious why we have to share the same bind_data structure between bind > and > > unbind, especially when unbind requires only one field? I didn't see a clear > reason, > > and just similar to earlier ALLOC/FREE which don't share structure either. > > Current way simply wastes space for unbind operation... > > no special reason today. 
But the gIOVA support over nested translation
> is in plan, it may require a flag to indicate it as guest iommu driver
> may user a single PASID value(RID2PASID) for all devices in guest side.
> Especially if the RID2PASID value used for IOVA the the same with host
> side. So adding a flag to indicate the binding is for IOVA is helpful.
> For PF/VF, iommu driver just bind with the host side's RID2PASID. While
> for ADI (Assignable Device Interface), vfio layer needs to figure out
> the default PASID stored in the aux-domain, and then iommu driver bind
> gIOVA table to the default PASID. The potential flag is required in both
> bind and unbind path. As such, it would be better to share the structure.

I'm fine with it if you are pretty sure that more extension will be required
in the future, though I didn't fully understand above explanations.
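For comparison, the dedicated unbind structure Kevin is hinting at would only need the host PASID plus the usual versioning, roughly the sketch below; this struct is invented here to illustrate the trade-off and exists in neither series:

	#include <linux/types.h>

	/* Hypothetical minimal unbind payload, for size comparison only. */
	struct iommu_gpasid_unbind_data {
		__u32	version;
		__u32	flags;		/* reserved, must be 0 */
		__u64	hpasid;		/* host PASID to unbind */
	};

Yi's counter-argument is that the planned gIOVA/RID2PASID flag would be needed on both the bind and unbind paths anyway, at which point one shared structure keeps the two paths symmetric.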
On Sun, 22 Mar 2020 05:32:03 -0700 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > From: Liu Yi L <yi.l.liu@intel.com> > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware > IOMMUs that have nesting DMA translation (a.k.a dual stage address > translation). For such hardware IOMMUs, there are two stages/levels of > address translation, and software may let userspace/VM to own the first- > level/stage-1 translation structures. Example of such usage is vSVA ( > virtual Shared Virtual Addressing). VM owns the first-level/stage-1 > translation structures and bind the structures to host, then hardware > IOMMU would utilize nesting translation when doing DMA translation fo > the devices behind such hardware IOMMU. > > This patch adds vfio support for binding guest translation (a.k.a stage 1) > structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only bind > guest page table is needed, it also requires to expose interface to guest > for iommu cache invalidation when guest modified the first-level/stage-1 > translation structures since hardware needs to be notified to flush stale > iotlbs. This would be introduced in next patch. > > In this patch, guest page table bind and unbind are done by using flags > VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL under IOCTL > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by > struct iommu_gpasid_bind_data. Before binding guest page table to host, > VM should have got a PASID allocated by host via VFIO_IOMMU_PASID_REQUEST. > > Bind guest translation structures (here is guest page table) to host > are the first step to setup vSVA (Virtual Shared Virtual Addressing). > > Cc: Kevin Tian <kevin.tian@intel.com> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > Cc: Alex Williamson <alex.williamson@redhat.com> > Cc: Eric Auger <eric.auger@redhat.com> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > --- > drivers/vfio/vfio_iommu_type1.c | 158 ++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 46 ++++++++++++ > 2 files changed, 204 insertions(+) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 82a9e0b..a877747 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -130,6 +130,33 @@ struct vfio_regions { > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > (!list_empty(&iommu->domain_list)) > > +struct domain_capsule { > + struct iommu_domain *domain; > + void *data; > +}; > + > +/* iommu->lock must be held */ > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > + int (*fn)(struct device *dev, void *data), > + void *data) > +{ > + struct domain_capsule dc = {.data = data}; > + struct vfio_domain *d; > + struct vfio_group *g; > + int ret = 0; > + > + list_for_each_entry(d, &iommu->domain_list, next) { > + dc.domain = d->domain; > + list_for_each_entry(g, &d->group_list, next) { > + ret = iommu_group_for_each_dev(g->iommu_group, > + &dc, fn); > + if (ret) > + break; > + } > + } > + return ret; > +} > + > static int put_pfn(unsigned long pfn, int prot); > > /* > @@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu, > return 0; > } > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_gpasid_bind_data 
*gbind_data = > + (struct iommu_gpasid_bind_data *) dc->data; > + > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); > +} > + > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_gpasid_bind_data *gbind_data = > + (struct iommu_gpasid_bind_data *) dc->data; > + > + return iommu_sva_unbind_gpasid(dc->domain, dev, > + gbind_data->hpasid); > +} > + > +/** > + * Unbind specific gpasid, caller of this function requires hold > + * vfio_iommu->lock > + */ > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + return vfio_iommu_for_each_dev(iommu, > + vfio_unbind_gpasid_fn, gbind_data); > +} > + > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + int ret = 0; > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + ret = vfio_iommu_for_each_dev(iommu, > + vfio_bind_gpasid_fn, gbind_data); > + /* > + * If bind failed, it may not be a total failure. Some devices > + * within the iommu group may have bind successfully. Although > + * we don't enable pasid capability for non-singletion iommu > + * groups, a unbind operation would be helpful to ensure no > + * partial binding for an iommu group. Where was the non-singleton group restriction done, I missed that. > + */ > + if (ret) > + /* > + * Undo all binds that already succeeded, no need to > + * check the return value here since some device within > + * the group has no successful bind when coming to this > + * place switch. > + */ > + vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); However, the for_each_dev function stops when the callback function returns error, are we just assuming we stop at the same device as we faulted on the first time and that we traverse the same set of devices the second time? It seems strange to me that unbind should be able to fail. > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + int ret = 0; > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); How is a user supposed to respond to their unbind failing? > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > static long vfio_iommu_type1_ioctl(void *iommu_data, > unsigned int cmd, unsigned long arg) > { > @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > default: > return -EINVAL; > } > + > + } else if (cmd == VFIO_IOMMU_BIND) { > + struct vfio_iommu_type1_bind bind; > + u32 version; > + int data_size; > + void *gbind_data; > + int ret; > + > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz) > + return -EINVAL; > + > + /* Get the version of struct iommu_gpasid_bind_data */ > + if (copy_from_user(&version, > + (void __user *) (arg + minsz), > + sizeof(version))) > + return -EFAULT; Why are we coping things from beyond the size we've validated that the user has provided again? 
> + > + data_size = iommu_uapi_get_data_size( > + IOMMU_UAPI_BIND_GPASID, version); > + gbind_data = kzalloc(data_size, GFP_KERNEL); > + if (!gbind_data) > + return -ENOMEM; > + > + if (copy_from_user(gbind_data, > + (void __user *) (arg + minsz), data_size)) { > + kfree(gbind_data); > + return -EFAULT; > + } And again. argsz isn't just for minsz. > + > + switch (bind.flags & VFIO_IOMMU_BIND_MASK) { > + case VFIO_IOMMU_BIND_GUEST_PGTBL: > + ret = vfio_iommu_type1_bind_gpasid(iommu, > + gbind_data); > + break; > + case VFIO_IOMMU_UNBIND_GUEST_PGTBL: > + ret = vfio_iommu_type1_unbind_gpasid(iommu, > + gbind_data); > + break; > + default: > + ret = -EINVAL; > + break; > + } > + kfree(gbind_data); > + return ret; > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index ebeaf3e..2235bc6 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -14,6 +14,7 @@ > > #include <linux/types.h> > #include <linux/ioctl.h> > +#include <linux/iommu.h> > > #define VFIO_API_VERSION 0 > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request { > */ > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 22) > > +/** > + * Supported flags: > + * - VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for > + * nesting type IOMMUs. In @data field It takes struct > + * iommu_gpasid_bind_data. > + * - VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table operation > + * invoked by VFIO_IOMMU_BIND_GUEST_PGTBL. This must require iommu_gpasid_bind_data in the data field as well, right? > + * > + */ > +struct vfio_iommu_type1_bind { > + __u32 argsz; > + __u32 flags; > +#define VFIO_IOMMU_BIND_GUEST_PGTBL (1 << 0) > +#define VFIO_IOMMU_UNBIND_GUEST_PGTBL (1 << 1) > + __u8 data[]; > +}; > + > +#define VFIO_IOMMU_BIND_MASK (VFIO_IOMMU_BIND_GUEST_PGTBL | \ > + VFIO_IOMMU_UNBIND_GUEST_PGTBL) > + > +/** > + * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23, > + * struct vfio_iommu_type1_bind) > + * > + * Manage address spaces of devices in this container. Initially a TYPE1 > + * container can only have one address space, managed with > + * VFIO_IOMMU_MAP/UNMAP_DMA. > + * > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls > + * the traffics only require single stage translation while BIND controls the > + * traffics require nesting translation. But this depends on the underlying > + * IOMMU architecture and isn't guaranteed. Example of this is the guest SVA > + * traffics, such traffics need nesting translation to gain gVA->gPA and then > + * gPA->hPA translation. > + * > + * Availability of this feature depends on the device, its bus, the underlying > + * IOMMU and the CPU architecture. > + * > + * returns: 0 on success, -errno on failure. > + */ > +#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > + > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ > > /*
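Alex's argsz complaints boil down to a missing bounds check: the handler validates bind.argsz against minsz only, then copies sizeof(version) and later data_size bytes from beyond minsz without confirming that argsz covers either. A hedged sketch of the validation a later revision would presumably need, reusing names from the patch above:

	/*
	 * Sketch: reject the ioctl unless the user's argsz covers both the
	 * 'version' field and the full versioned payload behind the header.
	 * minsz is offsetofend(struct vfio_iommu_type1_bind, flags).
	 */
	static int vfio_bind_check_argsz(__u32 argsz, unsigned long minsz,
					 int data_size)
	{
		if (data_size <= 0)			/* also catches lookup failure */
			return -EINVAL;
		if (argsz < minsz + sizeof(__u32))	/* room for 'version' */
			return -EINVAL;
		if (argsz < minsz + data_size)		/* room for the payload */
			return -EINVAL;
		return 0;
	}

Called between the iommu_uapi_get_data_size() lookup and the second copy_from_user(), this would also catch the unchecked data_size that the current code passes straight to kzalloc().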
On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote:
> > > > >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > > >  		default:
> > > > >  			return -EINVAL;
> > > > >  		}
> > > > > +
> > > > > +	} else if (cmd == VFIO_IOMMU_BIND) {
> > > >
> > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me.
> > >
> > > Emm, it's up to the flags to indicate bind what. It was proposed to
> > > cover the three cases below:
> > > a) BIND/UNBIND_GPASID
> > > b) BIND/UNBIND_GPASID_TABLE
> > > c) BIND/UNBIND_PROCESS
> > > <only a) is covered in this patch>
> > > So it's called VFIO_IOMMU_BIND.
> >
> > but aren't they all about PASID related binding?
>
> yeah, I can rename it. :-)

I don't know if anyone intends to implement it, but SMMUv2 supports nesting
translation without any PASID support. For that case the name
VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more sense.
Ideally we'd also use a neutral name for the IOMMU API instead of
bind_gpasid(), but that's easier to change later.

Thanks,
Jean
Hi Alex, > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, April 3, 2020 3:57 AM > To: Liu, Yi L <yi.l.liu@intel.com> > > On Sun, 22 Mar 2020 05:32:03 -0700 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware > > IOMMUs that have nesting DMA translation (a.k.a dual stage address > > translation). For such hardware IOMMUs, there are two stages/levels of > > address translation, and software may let userspace/VM to own the first- > > level/stage-1 translation structures. Example of such usage is vSVA ( > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1 > > translation structures and bind the structures to host, then hardware > > IOMMU would utilize nesting translation when doing DMA translation fo > > the devices behind such hardware IOMMU. > > > > This patch adds vfio support for binding guest translation (a.k.a stage 1) > > structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only bind > > guest page table is needed, it also requires to expose interface to guest > > for iommu cache invalidation when guest modified the first-level/stage-1 > > translation structures since hardware needs to be notified to flush stale > > iotlbs. This would be introduced in next patch. > > > > In this patch, guest page table bind and unbind are done by using flags > > VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL > under IOCTL > > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by > > struct iommu_gpasid_bind_data. Before binding guest page table to host, > > VM should have got a PASID allocated by host via VFIO_IOMMU_PASID_REQUEST. > > > > Bind guest translation structures (here is guest page table) to host > > are the first step to setup vSVA (Virtual Shared Virtual Addressing). 
> > > > Cc: Kevin Tian <kevin.tian@intel.com> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > Cc: Alex Williamson <alex.williamson@redhat.com> > > Cc: Eric Auger <eric.auger@redhat.com> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > --- > > drivers/vfio/vfio_iommu_type1.c | 158 > ++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/vfio.h | 46 ++++++++++++ > > 2 files changed, 204 insertions(+) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > index 82a9e0b..a877747 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -130,6 +130,33 @@ struct vfio_regions { > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > (!list_empty(&iommu->domain_list)) > > > > +struct domain_capsule { > > + struct iommu_domain *domain; > > + void *data; > > +}; > > + > > +/* iommu->lock must be held */ > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > > + int (*fn)(struct device *dev, void *data), > > + void *data) > > +{ > > + struct domain_capsule dc = {.data = data}; > > + struct vfio_domain *d; > > + struct vfio_group *g; > > + int ret = 0; > > + > > + list_for_each_entry(d, &iommu->domain_list, next) { > > + dc.domain = d->domain; > > + list_for_each_entry(g, &d->group_list, next) { > > + ret = iommu_group_for_each_dev(g->iommu_group, > > + &dc, fn); > > + if (ret) > > + break; > > + } > > + } > > + return ret; > > +} > > + > > static int put_pfn(unsigned long pfn, int prot); > > > > /* > > @@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct > vfio_iommu *iommu, > > return 0; > > } > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > > +{ > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_gpasid_bind_data *gbind_data = > > + (struct iommu_gpasid_bind_data *) dc->data; > > + > > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); > > +} > > + > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > > +{ > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_gpasid_bind_data *gbind_data = > > + (struct iommu_gpasid_bind_data *) dc->data; > > + > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > + gbind_data->hpasid); > > +} > > + > > +/** > > + * Unbind specific gpasid, caller of this function requires hold > > + * vfio_iommu->lock > > + */ > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) > > +{ > > + return vfio_iommu_for_each_dev(iommu, > > + vfio_unbind_gpasid_fn, gbind_data); > > +} > > + > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) > > +{ > > + int ret = 0; > > + > > + mutex_lock(&iommu->lock); > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + ret = vfio_iommu_for_each_dev(iommu, > > + vfio_bind_gpasid_fn, gbind_data); > > + /* > > + * If bind failed, it may not be a total failure. Some devices > > + * within the iommu group may have bind successfully. Although > > + * we don't enable pasid capability for non-singletion iommu > > + * groups, a unbind operation would be helpful to ensure no > > + * partial binding for an iommu group. 
> > Where was the non-singleton group restriction done, I missed that. Hmm, it's missed. thanks for spotting it. How about adding this check in the vfio_iommu_for_each_dev()? If looped a non-singleton group, just skip it. It applies to the cache_inv path all the same. > > + */ > > + if (ret) > > + /* > > + * Undo all binds that already succeeded, no need to > > + * check the return value here since some device within > > + * the group has no successful bind when coming to this > > + * place switch. > > + */ > > + vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > However, the for_each_dev function stops when the callback function > returns error, are we just assuming we stop at the same device as we > faulted on the first time and that we traverse the same set of devices > the second time? It seems strange to me that unbind should be able to > fail. unbind can fail if a user attempts to unbind a pasid which is not belonged to it or a pasid which hasn't ever been bound. Otherwise, I didn't see a reason to fail. > > + > > +out_unlock: > > + mutex_unlock(&iommu->lock); > > + return ret; > > +} > > + > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) > > +{ > > + int ret = 0; > > + > > + mutex_lock(&iommu->lock); > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > How is a user supposed to respond to their unbind failing? If it's a malicious unbind (e.g. unbind a not yet bound pasid or unbind a pasid which doesn't belong to current user). > > + > > +out_unlock: > > + mutex_unlock(&iommu->lock); > > + return ret; > > +} > > + > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > unsigned int cmd, unsigned long arg) > > { > > @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > default: > > return -EINVAL; > > } > > + > > + } else if (cmd == VFIO_IOMMU_BIND) { > > + struct vfio_iommu_type1_bind bind; > > + u32 version; > > + int data_size; > > + void *gbind_data; > > + int ret; > > + > > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz) > > + return -EINVAL; > > + > > + /* Get the version of struct iommu_gpasid_bind_data */ > > + if (copy_from_user(&version, > > + (void __user *) (arg + minsz), > > + sizeof(version))) > > + return -EFAULT; > > Why are we coping things from beyond the size we've validated that the > user has provided again? let me wait for the result in Jacob's thread below. looks like need to have a decision from you and Joreg. If using argsze is good, then I guess we don't need the version-to-size mapping. right? Actually, the version-to-size mapping is added to ensure vfio copy data correctly. https://lkml.org/lkml/2020/4/2/876 > > + > > + data_size = iommu_uapi_get_data_size( > > + IOMMU_UAPI_BIND_GPASID, version); > > + gbind_data = kzalloc(data_size, GFP_KERNEL); > > + if (!gbind_data) > > + return -ENOMEM; > > + > > + if (copy_from_user(gbind_data, > > + (void __user *) (arg + minsz), data_size)) { > > + kfree(gbind_data); > > + return -EFAULT; > > + } > > And again. argsz isn't just for minsz. 
> > > + > > + switch (bind.flags & VFIO_IOMMU_BIND_MASK) { > > + case VFIO_IOMMU_BIND_GUEST_PGTBL: > > + ret = vfio_iommu_type1_bind_gpasid(iommu, > > + gbind_data); > > + break; > > + case VFIO_IOMMU_UNBIND_GUEST_PGTBL: > > + ret = vfio_iommu_type1_unbind_gpasid(iommu, > > + gbind_data); > > + break; > > + default: > > + ret = -EINVAL; > > + break; > > + } > > + kfree(gbind_data); > > + return ret; > > } > > > > return -ENOTTY; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index ebeaf3e..2235bc6 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -14,6 +14,7 @@ > > > > #include <linux/types.h> > > #include <linux/ioctl.h> > > +#include <linux/iommu.h> > > > > #define VFIO_API_VERSION 0 > > > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request { > > */ > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 22) > > > > +/** > > + * Supported flags: > > + * - VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for > > + * nesting type IOMMUs. In @data field It takes struct > > + * iommu_gpasid_bind_data. > > + * - VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table > operation > > + * invoked by VFIO_IOMMU_BIND_GUEST_PGTBL. > > This must require iommu_gpasid_bind_data in the data field as well, > right? yes. Regards, Yi Liu
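The version-to-size mapping mentioned above could be shaped roughly as below. This is an assumption about Jacob's helper, sketched only to show why the kernel can size the copy from the version field alone: a v1 userspace running on a newer kernel still gets exactly the v1 size copied.

enum iommu_uapi_data_type {
	IOMMU_UAPI_BIND_GPASID,
	IOMMU_UAPI_CACHE_INVALIDATE,
	NR_IOMMU_UAPI_TYPE,
};

/* one row per UAPI data type, one column per published version */
static const int iommu_uapi_data_size[NR_IOMMU_UAPI_TYPE][1] = {
	[IOMMU_UAPI_BIND_GPASID]      = { sizeof(struct iommu_gpasid_bind_data) },
	[IOMMU_UAPI_CACHE_INVALIDATE] = { sizeof(struct iommu_cache_invalidate_info) },
};

int iommu_uapi_get_data_size(int type, u32 version)
{
	if (type < 0 || type >= NR_IOMMU_UAPI_TYPE || version != 1)
		return -EINVAL;
	return iommu_uapi_data_size[type][version - 1];
}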
On Fri, 3 Apr 2020 13:30:49 +0000 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > Hi Alex, > > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Friday, April 3, 2020 3:57 AM > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > On Sun, 22 Mar 2020 05:32:03 -0700 > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware > > > IOMMUs that have nesting DMA translation (a.k.a dual stage address > > > translation). For such hardware IOMMUs, there are two stages/levels of > > > address translation, and software may let userspace/VM to own the first- > > > level/stage-1 translation structures. Example of such usage is vSVA ( > > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1 > > > translation structures and bind the structures to host, then hardware > > > IOMMU would utilize nesting translation when doing DMA translation fo > > > the devices behind such hardware IOMMU. > > > > > > This patch adds vfio support for binding guest translation (a.k.a stage 1) > > > structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU, not only bind > > > guest page table is needed, it also requires to expose interface to guest > > > for iommu cache invalidation when guest modified the first-level/stage-1 > > > translation structures since hardware needs to be notified to flush stale > > > iotlbs. This would be introduced in next patch. > > > > > > In this patch, guest page table bind and unbind are done by using flags > > > VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL > > under IOCTL > > > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by > > > struct iommu_gpasid_bind_data. Before binding guest page table to host, > > > VM should have got a PASID allocated by host via VFIO_IOMMU_PASID_REQUEST. > > > > > > Bind guest translation structures (here is guest page table) to host > > > are the first step to setup vSVA (Virtual Shared Virtual Addressing). 
> > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > Cc: Alex Williamson <alex.williamson@redhat.com> > > > Cc: Eric Auger <eric.auger@redhat.com> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > --- > > > drivers/vfio/vfio_iommu_type1.c | 158 > > ++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/vfio.h | 46 ++++++++++++ > > > 2 files changed, 204 insertions(+) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > > index 82a9e0b..a877747 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -130,6 +130,33 @@ struct vfio_regions { > > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > > (!list_empty(&iommu->domain_list)) > > > > > > +struct domain_capsule { > > > + struct iommu_domain *domain; > > > + void *data; > > > +}; > > > + > > > +/* iommu->lock must be held */ > > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > > > + int (*fn)(struct device *dev, void *data), > > > + void *data) > > > +{ > > > + struct domain_capsule dc = {.data = data}; > > > + struct vfio_domain *d; > > > + struct vfio_group *g; > > > + int ret = 0; > > > + > > > + list_for_each_entry(d, &iommu->domain_list, next) { > > > + dc.domain = d->domain; > > > + list_for_each_entry(g, &d->group_list, next) { > > > + ret = iommu_group_for_each_dev(g->iommu_group, > > > + &dc, fn); > > > + if (ret) > > > + break; > > > + } > > > + } > > > + return ret; > > > +} > > > + > > > static int put_pfn(unsigned long pfn, int prot); > > > > > > /* > > > @@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct > > vfio_iommu *iommu, > > > return 0; > > > } > > > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > > > +{ > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_gpasid_bind_data *gbind_data = > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > + > > > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); > > > +} > > > + > > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > > > +{ > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_gpasid_bind_data *gbind_data = > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > + > > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > > + gbind_data->hpasid); > > > +} > > > + > > > +/** > > > + * Unbind specific gpasid, caller of this function requires hold > > > + * vfio_iommu->lock > > > + */ > > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > > > + struct iommu_gpasid_bind_data *gbind_data) > > > +{ > > > + return vfio_iommu_for_each_dev(iommu, > > > + vfio_unbind_gpasid_fn, gbind_data); > > > +} > > > + > > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > > > + struct iommu_gpasid_bind_data *gbind_data) > > > +{ > > > + int ret = 0; > > > + > > > + mutex_lock(&iommu->lock); > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > + ret = -EINVAL; > > > + goto out_unlock; > > > + } > > > + > > > + ret = vfio_iommu_for_each_dev(iommu, > > > + vfio_bind_gpasid_fn, gbind_data); > > > + /* > > > + * If bind failed, it may not be a total failure. 
Some devices > > > + * within the iommu group may have bind successfully. Although > > > + * we don't enable pasid capability for non-singletion iommu > > > + * groups, a unbind operation would be helpful to ensure no > > > + * partial binding for an iommu group. > > > > Where was the non-singleton group restriction done, I missed that. > > Hmm, it's missed. thanks for spotting it. How about adding this > check in the vfio_iommu_for_each_dev()? If looped a non-singleton > group, just skip it. It applies to the cache_inv path all the > same. I don't really understand the singleton issue, which is why I was surprised to see this since I didn't see a discussion previously. Skipping a singleton group seems like unpredictable behavior to the user though. > > > + */ > > > + if (ret) > > > + /* > > > + * Undo all binds that already succeeded, no need to > > > + * check the return value here since some device within > > > + * the group has no successful bind when coming to this > > > + * place switch. > > > + */ > > > + vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > > > However, the for_each_dev function stops when the callback function > > returns error, are we just assuming we stop at the same device as we > > faulted on the first time and that we traverse the same set of devices > > the second time? It seems strange to me that unbind should be able to > > fail. > > unbind can fail if a user attempts to unbind a pasid which is not belonged > to it or a pasid which hasn't ever been bound. Otherwise, I didn't see a > reason to fail. Even if so, this doesn't address the first part of the question. If our for_each_dev() callback returns error then the loop stops and we can't be sure we've triggered it everywhere that it needs to be triggered. There are also aspects of whether it's an error to unbind something that is not bound because the result is still that the pasid is unbound, right? > > > + > > > +out_unlock: > > > + mutex_unlock(&iommu->lock); > > > + return ret; > > > +} > > > + > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > + struct iommu_gpasid_bind_data *gbind_data) > > > +{ > > > + int ret = 0; > > > + > > > + mutex_lock(&iommu->lock); > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > + ret = -EINVAL; > > > + goto out_unlock; > > > + } > > > + > > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > > > How is a user supposed to respond to their unbind failing? > > If it's a malicious unbind (e.g. unbind a not yet bound pasid or unbind > a pasid which doesn't belong to current user). And if it's not a malicious unbind? To me this is similar semantics to free() failing. Is there any remedy other than to abort? 
Thanks, Alex > > > + > > > +out_unlock: > > > + mutex_unlock(&iommu->lock); > > > + return ret; > > > +} > > > + > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > unsigned int cmd, unsigned long arg) > > > { > > > @@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void > > *iommu_data, > > > default: > > > return -EINVAL; > > > } > > > + > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > + struct vfio_iommu_type1_bind bind; > > > + u32 version; > > > + int data_size; > > > + void *gbind_data; > > > + int ret; > > > + > > > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > > > + > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (bind.argsz < minsz) > > > + return -EINVAL; > > > + > > > + /* Get the version of struct iommu_gpasid_bind_data */ > > > + if (copy_from_user(&version, > > > + (void __user *) (arg + minsz), > > > + sizeof(version))) > > > + return -EFAULT; > > > > Why are we coping things from beyond the size we've validated that the > > user has provided again? > > let me wait for the result in Jacob's thread below. looks like need > to have a decision from you and Joreg. If using argsze is good, then > I guess we don't need the version-to-size mapping. right? Actually, > the version-to-size mapping is added to ensure vfio copy data correctly. > https://lkml.org/lkml/2020/4/2/876 > > > > + > > > + data_size = iommu_uapi_get_data_size( > > > + IOMMU_UAPI_BIND_GPASID, version); > > > + gbind_data = kzalloc(data_size, GFP_KERNEL); > > > + if (!gbind_data) > > > + return -ENOMEM; > > > + > > > + if (copy_from_user(gbind_data, > > > + (void __user *) (arg + minsz), data_size)) { > > > + kfree(gbind_data); > > > + return -EFAULT; > > > + } > > > > And again. argsz isn't just for minsz. > > > > > + > > > + switch (bind.flags & VFIO_IOMMU_BIND_MASK) { > > > + case VFIO_IOMMU_BIND_GUEST_PGTBL: > > > + ret = vfio_iommu_type1_bind_gpasid(iommu, > > > + gbind_data); > > > + break; > > > + case VFIO_IOMMU_UNBIND_GUEST_PGTBL: > > > + ret = vfio_iommu_type1_unbind_gpasid(iommu, > > > + gbind_data); > > > + break; > > > + default: > > > + ret = -EINVAL; > > > + break; > > > + } > > > + kfree(gbind_data); > > > + return ret; > > > } > > > > > > return -ENOTTY; > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > index ebeaf3e..2235bc6 100644 > > > --- a/include/uapi/linux/vfio.h > > > +++ b/include/uapi/linux/vfio.h > > > @@ -14,6 +14,7 @@ > > > > > > #include <linux/types.h> > > > #include <linux/ioctl.h> > > > +#include <linux/iommu.h> > > > > > > #define VFIO_API_VERSION 0 > > > > > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request { > > > */ > > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 22) > > > > > > +/** > > > + * Supported flags: > > > + * - VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for > > > + * nesting type IOMMUs. In @data field It takes struct > > > + * iommu_gpasid_bind_data. > > > + * - VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table > > operation > > > + * invoked by VFIO_IOMMU_BIND_GUEST_PGTBL. > > > > This must require iommu_gpasid_bind_data in the data field as well, > > right? > > yes. > > Regards, > Yi Liu >
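One way to resolve the traversal concern raised above is to make the unwind callback infallible, so iommu_group_for_each_dev() can never stop early while undoing binds. A sketch of that semantic, assuming unbind is treated as best-effort cleanup; this is a proposal prompted by the review, not code from the series.

static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
{
	struct domain_capsule *dc = (struct domain_capsule *)data;
	struct iommu_gpasid_bind_data *gbind_data =
		(struct iommu_gpasid_bind_data *)dc->data;

	/*
	 * Swallow the result: unbind is unwind/cleanup, and returning an
	 * error here would abort the for_each_dev() walk before every
	 * device in every group has been visited.
	 */
	iommu_sva_unbind_gpasid(dc->domain, dev, gbind_data->hpasid);
	return 0;
}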
Hi Alex, > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Saturday, April 4, 2020 2:11 AM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > On Fri, 3 Apr 2020 13:30:49 +0000 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > Hi Alex, > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Friday, April 3, 2020 3:57 AM > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > > > On Sun, 22 Mar 2020 05:32:03 -0700 > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > > > From: Liu Yi L <yi.l.liu@intel.com> > > > > > > > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by > > > > hardware IOMMUs that have nesting DMA translation (a.k.a dual > > > > stage address translation). For such hardware IOMMUs, there are > > > > two stages/levels of address translation, and software may let > > > > userspace/VM to own the first- > > > > level/stage-1 translation structures. Example of such usage is > > > > vSVA ( virtual Shared Virtual Addressing). VM owns the > > > > first-level/stage-1 translation structures and bind the structures > > > > to host, then hardware IOMMU would utilize nesting translation > > > > when doing DMA translation fo the devices behind such hardware IOMMU. > > > > > > > > This patch adds vfio support for binding guest translation (a.k.a > > > > stage 1) structure to host iommu. And for > > > > VFIO_TYPE1_NESTING_IOMMU, not only bind guest page table is > > > > needed, it also requires to expose interface to guest for iommu > > > > cache invalidation when guest modified the first-level/stage-1 > > > > translation structures since hardware needs to be notified to flush stale iotlbs. > This would be introduced in next patch. > > > > > > > > In this patch, guest page table bind and unbind are done by using > > > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and > > > > VFIO_IOMMU_UNBIND_GUEST_PGTBL > > > under IOCTL > > > > VFIO_IOMMU_BIND, the bind/unbind data are conveyed by struct > > > > iommu_gpasid_bind_data. Before binding guest page table to host, > > > > VM should have got a PASID allocated by host via > VFIO_IOMMU_PASID_REQUEST. > > > > > > > > Bind guest translation structures (here is guest page table) to > > > > host are the first step to setup vSVA (Virtual Shared Virtual Addressing). 
> > > > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > > Cc: Alex Williamson <alex.williamson@redhat.com> > > > > Cc: Eric Auger <eric.auger@redhat.com> > > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > > > Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.com> > > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > > --- > > > > drivers/vfio/vfio_iommu_type1.c | 158 > > > ++++++++++++++++++++++++++++++++++++++++ > > > > include/uapi/linux/vfio.h | 46 ++++++++++++ > > > > 2 files changed, 204 insertions(+) > > > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644 > > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > > @@ -130,6 +130,33 @@ struct vfio_regions { > > > > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ > > > > (!list_empty(&iommu->domain_list)) > > > > > > > > +struct domain_capsule { > > > > + struct iommu_domain *domain; > > > > + void *data; > > > > +}; > > > > + > > > > +/* iommu->lock must be held */ > > > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu, > > > > + int (*fn)(struct device *dev, void *data), > > > > + void *data) > > > > +{ > > > > + struct domain_capsule dc = {.data = data}; > > > > + struct vfio_domain *d; > > > > + struct vfio_group *g; > > > > + int ret = 0; > > > > + > > > > + list_for_each_entry(d, &iommu->domain_list, next) { > > > > + dc.domain = d->domain; > > > > + list_for_each_entry(g, &d->group_list, next) { > > > > + ret = iommu_group_for_each_dev(g->iommu_group, > > > > + &dc, fn); > > > > + if (ret) > > > > + break; > > > > + } > > > > + } > > > > + return ret; > > > > +} > > > > + > > > > static int put_pfn(unsigned long pfn, int prot); > > > > > > > > /* > > > > @@ -2314,6 +2341,88 @@ static int > > > > vfio_iommu_info_add_nesting_cap(struct > > > vfio_iommu *iommu, > > > > return 0; > > > > } > > > > > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) { > > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > > + struct iommu_gpasid_bind_data *gbind_data = > > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > > + > > > > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); } > > > > + > > > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > > > > +{ > > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > > + struct iommu_gpasid_bind_data *gbind_data = > > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > > + > > > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > > > + gbind_data->hpasid); > > > > +} > > > > + > > > > +/** > > > > + * Unbind specific gpasid, caller of this function requires hold > > > > + * vfio_iommu->lock > > > > + */ > > > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > > > > + struct iommu_gpasid_bind_data *gbind_data) { > > > > + return vfio_iommu_for_each_dev(iommu, > > > > + vfio_unbind_gpasid_fn, gbind_data); } > > > > + > > > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > > > > + struct iommu_gpasid_bind_data *gbind_data) { > > > > + int ret = 0; > > > > + > > > > + mutex_lock(&iommu->lock); > > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > > + ret = -EINVAL; > > > > + goto out_unlock; > > > > + } > > > > + > > > > + ret = vfio_iommu_for_each_dev(iommu, > > > > + 
vfio_bind_gpasid_fn, gbind_data); > > > > + /* > > > > + * If bind failed, it may not be a total failure. Some devices > > > > + * within the iommu group may have bind successfully. Although > > > > + * we don't enable pasid capability for non-singletion iommu > > > > + * groups, a unbind operation would be helpful to ensure no > > > > + * partial binding for an iommu group. > > > > > > Where was the non-singleton group restriction done, I missed that. > > > > Hmm, it's missed. thanks for spotting it. How about adding this check > > in the vfio_iommu_for_each_dev()? If looped a non-singleton group, > > just skip it. It applies to the cache_inv path all the same. > > I don't really understand the singleton issue, which is why I was surprised to see this > since I didn't see a discussion previously. > Skipping a singleton group seems like unpredictable behavior to the user though. There is a discussion on the SVA availability in the below link. There was a conclusion to only support SVA for singleton group. I think bind guest page table also needs to apply this rule. https://patchwork.kernel.org/patch/10213877/ > > > > + */ > > > > + if (ret) > > > > + /* > > > > + * Undo all binds that already succeeded, no need to > > > > + * check the return value here since some device within > > > > + * the group has no successful bind when coming to this > > > > + * place switch. > > > > + */ > > > > + vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > > > > > However, the for_each_dev function stops when the callback function > > > returns error, are we just assuming we stop at the same device as we > > > faulted on the first time and that we traverse the same set of > > > devices the second time? It seems strange to me that unbind should > > > be able to fail. > > > > unbind can fail if a user attempts to unbind a pasid which is not > > belonged to it or a pasid which hasn't ever been bound. Otherwise, I > > didn't see a reason to fail. > > Even if so, this doesn't address the first part of the question. > If our for_each_dev() > callback returns error then the loop stops and we can't be sure we've > triggered it > everywhere that it needs to be triggered. Hmm, let me pull back a little. Back to the code, in the attempt to do bind, the code uses for_each_dev() to loop devices. If failed then uses for_each_dev() again to do unbind. Your question is can the second for_each_dev() be able to undo the bind correctly as the second for_each_dev() call has no idea where it failed in the bind phase. is it? Actually, this is why I added the comment that no need to check the return value of vfio_iommu_type1_do_guest_unbind(). > There are also aspects of whether it's an > error to unbind something that is not bound because the result is still > that the pasid > is unbound, right? agreed, as you mentioned in the below comment, no need to fail unbind unless user is trying to unbind a pasid which doesn't belong to it. > > > > + > > > > +out_unlock: > > > > + mutex_unlock(&iommu->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > > + struct iommu_gpasid_bind_data *gbind_data) { > > > > + int ret = 0; > > > > + > > > > + mutex_lock(&iommu->lock); > > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > > + ret = -EINVAL; > > > > + goto out_unlock; > > > > + } > > > > + > > > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data); > > > > > > How is a user supposed to respond to their unbind failing? 
> > > > If it's a malicious unbind (e.g. unbind a not yet bound pasid or > > unbind a pasid which doesn't belong to current user). > > And if it's not a malicious unbind? To me this is similar semantics to > free() failing. Is there any remedy other than to abort? Thanks, got it. so if user is trying to unbind a pasid which doesn't belong to it, should kernel return error to user or just abort it? Regards, Yi Liu > Alex > > > > > + > > > > +out_unlock: > > > > + mutex_unlock(&iommu->lock); > > > > + return ret; > > > > +} > > > > + > > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > > unsigned int cmd, unsigned long arg) { @@ - > 2471,6 > > > > +2580,55 @@ static long vfio_iommu_type1_ioctl(void > > > *iommu_data, > > > > default: > > > > return -EINVAL; > > > > } > > > > + > > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > > + struct vfio_iommu_type1_bind bind; > > > > + u32 version; > > > > + int data_size; > > > > + void *gbind_data; > > > > + int ret; > > > > + > > > > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > > > > + > > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > > + return -EFAULT; > > > > + > > > > + if (bind.argsz < minsz) > > > > + return -EINVAL; > > > > + > > > > + /* Get the version of struct iommu_gpasid_bind_data */ > > > > + if (copy_from_user(&version, > > > > + (void __user *) (arg + minsz), > > > > + sizeof(version))) > > > > + return -EFAULT; > > > > > > Why are we coping things from beyond the size we've validated that > > > the user has provided again? > > > > let me wait for the result in Jacob's thread below. looks like need to > > have a decision from you and Joreg. If using argsze is good, then I > > guess we don't need the version-to-size mapping. right? Actually, the > > version-to-size mapping is added to ensure vfio copy data correctly. > > https://lkml.org/lkml/2020/4/2/876 > > > > > > + > > > > + data_size = iommu_uapi_get_data_size( > > > > + IOMMU_UAPI_BIND_GPASID, version); > > > > + gbind_data = kzalloc(data_size, GFP_KERNEL); > > > > + if (!gbind_data) > > > > + return -ENOMEM; > > > > + > > > > + if (copy_from_user(gbind_data, > > > > + (void __user *) (arg + minsz), data_size)) { > > > > + kfree(gbind_data); > > > > + return -EFAULT; > > > > + } > > > > > > And again. argsz isn't just for minsz. > > > > > > > + > > > > + switch (bind.flags & VFIO_IOMMU_BIND_MASK) { > > > > + case VFIO_IOMMU_BIND_GUEST_PGTBL: > > > > + ret = vfio_iommu_type1_bind_gpasid(iommu, > > > > + gbind_data); > > > > + break; > > > > + case VFIO_IOMMU_UNBIND_GUEST_PGTBL: > > > > + ret = vfio_iommu_type1_unbind_gpasid(iommu, > > > > + gbind_data); > > > > + break; > > > > + default: > > > > + ret = -EINVAL; > > > > + break; > > > > + } > > > > + kfree(gbind_data); > > > > + return ret; > > > > } > > > > > > > > return -ENOTTY; > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > > index ebeaf3e..2235bc6 100644 > > > > --- a/include/uapi/linux/vfio.h > > > > +++ b/include/uapi/linux/vfio.h > > > > @@ -14,6 +14,7 @@ > > > > > > > > #include <linux/types.h> > > > > #include <linux/ioctl.h> > > > > +#include <linux/iommu.h> > > > > > > > > #define VFIO_API_VERSION 0 > > > > > > > > @@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request { > > > > */ > > > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 22) > > > > > > > > +/** > > > > + * Supported flags: > > > > + * - VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to > host for > > > > + * nesting type IOMMUs. 
In @data field It takes struct > > > > + * iommu_gpasid_bind_data. > > > > + * - VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page > table > > > operation > > > > + * invoked by VFIO_IOMMU_BIND_GUEST_PGTBL. > > > > > > This must require iommu_gpasid_bind_data in the data field as well, > > > right? > > > > yes. > > > > Regards, > > Yi Liu > >
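Regarding the singleton-group restriction settled on above, a sketch of how the type1 code could detect non-singleton groups; the kernel does not export a device-count helper to VFIO at this point, so the count is taken with a trivial callback (helper names are hypothetical):

static int vfio_dev_count_fn(struct device *dev, void *data)
{
	int *count = data;

	(*count)++;
	return 0;
}

static bool vfio_iommu_group_is_singleton(struct iommu_group *group)
{
	int count = 0;

	iommu_group_for_each_dev(group, &count, vfio_dev_count_fn);
	return count == 1;
}

Whether to skip such groups silently or fail the ioctl is the open question Alex raises; failing is the more predictable UAPI behavior.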
Hi Jean, > From: Jean-Philippe Brucker < jean-philippe@linaro.org > > Sent: Friday, April 3, 2020 4:35 PM > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote: > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > > > > default: > > > > > > return -EINVAL; > > > > > > } > > > > > > + > > > > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > > > > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me. > > > > > > > > Emm, it's up to the flags to indicate bind what. It was proposed to > > > > cover the three cases below: > > > > a) BIND/UNBIND_GPASID > > > > b) BIND/UNBIND_GPASID_TABLE > > > > c) BIND/UNBIND_PROCESS > > > > <only a) is covered in this patch> > > > > So it's called VFIO_IOMMU_BIND. > > > > > > but aren't they all about PASID related binding? > > > > yeah, I can rename it. :-) > > I don't know if anyone intends to implement it, but SMMUv2 supports > nesting translation without any PASID support. For that case the name > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more sense. > Ideally we'd also use a neutral name for the IOMMU API instead of > bind_gpasid(), but that's easier to change later. I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may cause confusion when thinking about VFIO_SET_IOMMU. How about using VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another VFIO_BIND_PROCESS in future for the SVA bind case. Regards, Yi Liu
On Tue, Apr 07, 2020 at 10:33:25AM +0000, Liu, Yi L wrote: > Hi Jean, > > > From: Jean-Philippe Brucker < jean-philippe@linaro.org > > > Sent: Friday, April 3, 2020 4:35 PM > > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > > > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote: > > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > > > > > default: > > > > > > > return -EINVAL; > > > > > > > } > > > > > > > + > > > > > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > > > > > > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me. > > > > > > > > > > Emm, it's up to the flags to indicate bind what. It was proposed to > > > > > cover the three cases below: > > > > > a) BIND/UNBIND_GPASID > > > > > b) BIND/UNBIND_GPASID_TABLE > > > > > c) BIND/UNBIND_PROCESS > > > > > <only a) is covered in this patch> > > > > > So it's called VFIO_IOMMU_BIND. > > > > > > > > but aren't they all about PASID related binding? > > > > > > yeah, I can rename it. :-) > > > > I don't know if anyone intends to implement it, but SMMUv2 supports > > nesting translation without any PASID support. For that case the name > > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more sense. > > Ideally we'd also use a neutral name for the IOMMU API instead of > > bind_gpasid(), but that's easier to change later. > > I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may > cause confusion when thinking about VFIO_SET_IOMMU. How about using > VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another > VFIO_BIND_PROCESS in future for the SVA bind case. I think minimizing the number of ioctls is more important than finding the ideal name. VFIO_IOMMU_BIND was fine to me, but if it's too vague then rename it to VFIO_IOMMU_BIND_PASID and we'll just piggy-back on it for non-PASID things (they should be rare enough). Thanks, Jean
> From: Jean-Philippe Brucker <jean-philippe@linaro.org> > Sent: Thursday, April 9, 2020 4:29 PM > To: Liu, Yi L <yi.l.liu@intel.com> > > On Tue, Apr 07, 2020 at 10:33:25AM +0000, Liu, Yi L wrote: > > Hi Jean, > > > > > From: Jean-Philippe Brucker < jean-philippe@linaro.org > > > > Sent: Friday, April 3, 2020 4:35 PM > > > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > > > > > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote: > > > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > > > > > > default: > > > > > > > > return -EINVAL; > > > > > > > > } > > > > > > > > + > > > > > > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > > > > > > > > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me. > > > > > > > > > > > > Emm, it's up to the flags to indicate bind what. It was proposed to > > > > > > cover the three cases below: > > > > > > a) BIND/UNBIND_GPASID > > > > > > b) BIND/UNBIND_GPASID_TABLE > > > > > > c) BIND/UNBIND_PROCESS > > > > > > <only a) is covered in this patch> > > > > > > So it's called VFIO_IOMMU_BIND. > > > > > > > > > > but aren't they all about PASID related binding? > > > > > > > > yeah, I can rename it. :-) > > > > > > I don't know if anyone intends to implement it, but SMMUv2 supports > > > nesting translation without any PASID support. For that case the name > > > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more > sense. > > > Ideally we'd also use a neutral name for the IOMMU API instead of > > > bind_gpasid(), but that's easier to change later. > > > > I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may > > cause confusion when thinking about VFIO_SET_IOMMU. How about using > > VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another > > VFIO_BIND_PROCESS in future for the SVA bind case. > > I think minimizing the number of ioctls is more important than finding the > ideal name. VFIO_IOMMU_BIND was fine to me, but if it's too vague then > rename it to VFIO_IOMMU_BIND_PASID and we'll just piggy-back on it for > non-PASID things (they should be rare enough). maybe we can start with VFIO_IOMMU_BIND_PASID. Actually, there is also a discussion on reusing the same ioctl and vfio structure for pasid_alloc/free, bind/unbind_gpasid. and cache_inv. how about your opinion? https://lkml.org/lkml/2020/4/3/833 Regards, Yi Liu
On Thu, Apr 09, 2020 at 09:15:29AM +0000, Liu, Yi L wrote: > > From: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Sent: Thursday, April 9, 2020 4:29 PM > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > On Tue, Apr 07, 2020 at 10:33:25AM +0000, Liu, Yi L wrote: > > > Hi Jean, > > > > > > > From: Jean-Philippe Brucker < jean-philippe@linaro.org > > > > > Sent: Friday, April 3, 2020 4:35 PM > > > > Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host > > > > > > > > On Thu, Apr 02, 2020 at 08:05:29AM +0000, Liu, Yi L wrote: > > > > > > > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > > > > > > > default: > > > > > > > > > return -EINVAL; > > > > > > > > > } > > > > > > > > > + > > > > > > > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > > > > > > > > > > > > > > BIND what? VFIO_IOMMU_BIND_PASID sounds clearer to me. > > > > > > > > > > > > > > Emm, it's up to the flags to indicate bind what. It was proposed to > > > > > > > cover the three cases below: > > > > > > > a) BIND/UNBIND_GPASID > > > > > > > b) BIND/UNBIND_GPASID_TABLE > > > > > > > c) BIND/UNBIND_PROCESS > > > > > > > <only a) is covered in this patch> > > > > > > > So it's called VFIO_IOMMU_BIND. > > > > > > > > > > > > but aren't they all about PASID related binding? > > > > > > > > > > yeah, I can rename it. :-) > > > > > > > > I don't know if anyone intends to implement it, but SMMUv2 supports > > > > nesting translation without any PASID support. For that case the name > > > > VFIO_IOMMU_BIND_GUEST_PGTBL without "PASID" anywhere makes more > > sense. > > > > Ideally we'd also use a neutral name for the IOMMU API instead of > > > > bind_gpasid(), but that's easier to change later. > > > > > > I agree VFIO_IOMMU_BIND is somehow not straight-forward. Especially, it may > > > cause confusion when thinking about VFIO_SET_IOMMU. How about using > > > VFIO_NESTING_IOMMU_BIND_STAGE1 to cover a) and b)? And has another > > > VFIO_BIND_PROCESS in future for the SVA bind case. > > > > I think minimizing the number of ioctls is more important than finding the > > ideal name. VFIO_IOMMU_BIND was fine to me, but if it's too vague then > > rename it to VFIO_IOMMU_BIND_PASID and we'll just piggy-back on it for > > non-PASID things (they should be rare enough). > maybe we can start with VFIO_IOMMU_BIND_PASID. Actually, there is > also a discussion on reusing the same ioctl and vfio structure for > pasid_alloc/free, bind/unbind_gpasid. and cache_inv. how about your > opinion? Merging bind with unbind and alloc with free makes sense. I'd leave at least invalidate a separate ioctl, because alloc/bind/unbind/free are setup functions while invalidate is a runtime thing and performance sensitive. But I can't see a good reason not to merge them all together, so either way is fine by me. Thanks, Jean
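A hypothetical layout for the merged ioctl discussed here, with an @op field selecting bind or unbind (and room for alloc/free later) while cache invalidation keeps its own fast-path ioctl; none of these names are from the posted series:

struct vfio_iommu_type1_nesting_op {
	__u32	argsz;
	__u32	flags;
	__u32	op;
#define VFIO_IOMMU_NESTING_OP_BIND_PGTBL	(0)
#define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL	(1)
	__u8	data[];	/* e.g. struct iommu_gpasid_bind_data */
};

#define VFIO_IOMMU_NESTING_OP	_IO(VFIO_TYPE, VFIO_BASE + 24)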
Hi Alex,

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, April 3, 2020 3:57 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
>
> On Sun, 22 Mar 2020 05:32:03 -0700
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
>
> > From: Liu Yi L <yi.l.liu@intel.com>
> >
> > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
[...]
> > +/**
> > + * Unbind specific gpasid, caller of this function requires hold
> > + * vfio_iommu->lock
> > + */
> > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
> > +		struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	return vfio_iommu_for_each_dev(iommu,
> > +				vfio_unbind_gpasid_fn, gbind_data);
> > +}
> > +
> > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
> > +		struct iommu_gpasid_bind_data *gbind_data)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iommu->lock);
> > +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	ret = vfio_iommu_for_each_dev(iommu,
> > +			vfio_bind_gpasid_fn, gbind_data);
> > +	/*
> > +	 * If bind failed, it may not be a total failure. Some devices
> > +	 * within the iommu group may have bind successfully. Although
> > +	 * we don't enable pasid capability for non-singletion iommu
> > +	 * groups, a unbind operation would be helpful to ensure no
> > +	 * partial binding for an iommu group.
>
> Where was the non-singleton group restriction done, I missed that.
>
> > +	 */
> > +	if (ret)
> > +		/*
> > +		 * Undo all binds that already succeeded, no need to
> > +		 * check the return value here since some device within
> > +		 * the group has no successful bind when coming to this
> > +		 * place switch.
> > +		 */
> > +		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
>
> However, the for_each_dev function stops when the callback function
> returns error, are we just assuming we stop at the same device as we
> faulted on the first time and that we traverse the same set of devices
> the second time? It seems strange to me that unbind should be able to
> fail.

I think the code needs enhancement. Although one group per container
and one device per group is the most typical and desired case, the code
here loops over domains, groups and devices, so it should be able to
unwind the prior binds when the loop fails for a device. I plan to make
the change below for the bind path.

ret = 0;
list_for_each_entry(domain, &iommu->domain_list, next) {
	list_for_each_entry(group, &domain->group_list, next) {
		/*
		 * If bind failed on a certain device, unbind the prior
		 * successful binds. iommu_group_for_each_dev() should be
		 * modified to take two callbacks: one for the forward
		 * loop and one for the reverse loop on failure.
		 * "return_upon_failure" indicates whether to return upon
		 * failure during the forward loop or not. If yes,
		 * iommu_group_for_each_dev() should unwind the prior
		 * binds in this iommu group before returning.
		 */
		ret = iommu_group_for_each_dev(group->iommu_group,
					       bind_gpasid_fn,
					       unbind_gpasid_fn, data,
					       return_upon_failure);
		if (ret)
			break;
	}
	if (ret) {
		/* unwind bindings with prior groups of this domain */
		list_for_each_entry_continue_reverse(group,
						     &domain->group_list,
						     next) {
			iommu_group_for_each_dev(group->iommu_group,
						 unbind_gpasid_fn, NULL,
						 data,
						 ignore_intermediate_failure);
		}
		break;
	}
}

if (ret) {
	/* unwind bindings with prior domains */
	list_for_each_entry_continue_reverse(domain,
					     &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			iommu_group_for_each_dev(group->iommu_group,
						 unbind_gpasid_fn, NULL,
						 data,
						 ignore_intermediate_failure);
		}
	}
}

return ret;

Regards,
Yi Liu
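For illustration, the two-callback iommu_group_for_each_dev() variant sketched above might look as follows inside the IOMMU core; struct group_device mirrors the core's internal bookkeeping, but the signature and unwind semantics are assumptions taken from this thread, not an existing API:

static int iommu_group_for_each_dev_undo(struct iommu_group *group,
					 void *data,
					 int (*fn)(struct device *, void *),
					 int (*undo_fn)(struct device *, void *))
{
	struct group_device *device;
	int ret = 0;

	list_for_each_entry(device, &group->devices, list) {
		ret = fn(device->dev, data);
		if (ret)
			break;
	}
	if (ret && undo_fn) {
		/*
		 * Unwind only the devices that fn() already succeeded on,
		 * ignoring their return values: this is best-effort cleanup.
		 */
		list_for_each_entry_continue_reverse(device, &group->devices,
						     list)
			undo_fn(device->dev, data);
	}
	return ret;
}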
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 82a9e0b..a877747 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -130,6 +130,33 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
+struct domain_capsule {
+	struct iommu_domain *domain;
+	void *data;
+};
+
+/* iommu->lock must be held */
+static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
+		      int (*fn)(struct device *dev, void *data),
+		      void *data)
+{
+	struct domain_capsule dc = {.data = data};
+	struct vfio_domain *d;
+	struct vfio_group *g;
+	int ret = 0;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		dc.domain = d->domain;
+		list_for_each_entry(g, &d->group_list, next) {
+			ret = iommu_group_for_each_dev(g->iommu_group,
+						       &dc, fn);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -2314,6 +2341,88 @@ static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
 	return 0;
 }
 
+static int vfio_bind_gpasid_fn(struct device *dev, void *data)
+{
+	struct domain_capsule *dc = (struct domain_capsule *)data;
+	struct iommu_gpasid_bind_data *gbind_data =
+		(struct iommu_gpasid_bind_data *) dc->data;
+
+	return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
+}
+
+static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
+{
+	struct domain_capsule *dc = (struct domain_capsule *)data;
+	struct iommu_gpasid_bind_data *gbind_data =
+		(struct iommu_gpasid_bind_data *) dc->data;
+
+	return iommu_sva_unbind_gpasid(dc->domain, dev,
+					gbind_data->hpasid);
+}
+
+/**
+ * Unbind a specific gpasid; the caller of this function is required
+ * to hold vfio_iommu->lock.
+ */
+static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu,
+				struct iommu_gpasid_bind_data *gbind_data)
+{
+	return vfio_iommu_for_each_dev(iommu,
+				vfio_unbind_gpasid_fn, gbind_data);
+}
+
+static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu,
+				struct iommu_gpasid_bind_data *gbind_data)
+{
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = vfio_iommu_for_each_dev(iommu,
+			vfio_bind_gpasid_fn, gbind_data);
+	/*
+	 * If bind failed, it may not be a total failure. Some devices
+	 * within the iommu group may have been bound successfully.
+	 * Although we don't enable the pasid capability for non-singleton
+	 * iommu groups, an unbind operation would be helpful to ensure
+	 * there is no partial binding for an iommu group.
+	 */
+	if (ret)
+		/*
+		 * Undo all binds that already succeeded, no need to
+		 * check the return value here since some device within
+		 * the group has no successful bind when coming to this
+		 * place.
+		 */
+		vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
+
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
+				struct iommu_gpasid_bind_data *gbind_data)
+{
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = vfio_iommu_type1_do_guest_unbind(iommu, gbind_data);
+
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -2471,6 +2580,55 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		default:
 			return -EINVAL;
 		}
+
+	} else if (cmd == VFIO_IOMMU_BIND) {
+		struct vfio_iommu_type1_bind bind;
+		u32 version;
+		int data_size;
+		void *gbind_data;
+		int ret;
+
+		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
+
+		if (copy_from_user(&bind, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (bind.argsz < minsz)
+			return -EINVAL;
+
+		/* Get the version of struct iommu_gpasid_bind_data */
+		if (copy_from_user(&version,
+			(void __user *) (arg + minsz),
+					sizeof(version)))
+			return -EFAULT;
+
+		data_size = iommu_uapi_get_data_size(
+				IOMMU_UAPI_BIND_GPASID, version);
+		gbind_data = kzalloc(data_size, GFP_KERNEL);
+		if (!gbind_data)
+			return -ENOMEM;
+
+		if (copy_from_user(gbind_data,
+			 (void __user *) (arg + minsz), data_size)) {
+			kfree(gbind_data);
+			return -EFAULT;
+		}
+
+		switch (bind.flags & VFIO_IOMMU_BIND_MASK) {
+		case VFIO_IOMMU_BIND_GUEST_PGTBL:
+			ret = vfio_iommu_type1_bind_gpasid(iommu,
+							   gbind_data);
+			break;
+		case VFIO_IOMMU_UNBIND_GUEST_PGTBL:
+			ret = vfio_iommu_type1_unbind_gpasid(iommu,
+							     gbind_data);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+		kfree(gbind_data);
+		return ret;
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ebeaf3e..2235bc6 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/iommu.h>
 
 #define VFIO_API_VERSION	0
 
@@ -853,6 +854,51 @@ struct vfio_iommu_type1_pasid_request {
  */
 #define VFIO_IOMMU_PASID_REQUEST	_IO(VFIO_TYPE, VFIO_BASE + 22)
 
+/**
+ * Supported flags:
+ *	- VFIO_IOMMU_BIND_GUEST_PGTBL: bind guest page tables to host for
+ *			nesting type IOMMUs. The @data field takes a
+ *			struct iommu_gpasid_bind_data.
+ *	- VFIO_IOMMU_UNBIND_GUEST_PGTBL: undo a bind guest page table
+ *			operation invoked by VFIO_IOMMU_BIND_GUEST_PGTBL.
+ *			The @data field takes a struct iommu_gpasid_bind_data
+ *			as well.
+ */
+struct vfio_iommu_type1_bind {
+	__u32		argsz;
+	__u32		flags;
+#define VFIO_IOMMU_BIND_GUEST_PGTBL	(1 << 0)
+#define VFIO_IOMMU_UNBIND_GUEST_PGTBL	(1 << 1)
+	__u8		data[];
+};
+
+#define VFIO_IOMMU_BIND_MASK	(VFIO_IOMMU_BIND_GUEST_PGTBL | \
+					VFIO_IOMMU_UNBIND_GUEST_PGTBL)
+
+/**
+ * VFIO_IOMMU_BIND - _IOW(VFIO_TYPE, VFIO_BASE + 23,
+ *				struct vfio_iommu_type1_bind)
+ *
+ * Manage address spaces of devices in this container. Initially a TYPE1
+ * container can only have one address space, managed with
+ * VFIO_IOMMU_MAP/UNMAP_DMA.
+ *
+ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
+ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
+ * tables, and BIND manages the stage-1 (guest) page tables. Other types of
+ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
+ * the traffic that only requires single-stage translation while BIND controls
+ * the traffic that requires nesting translation. But this depends on the
+ * underlying IOMMU architecture and isn't guaranteed. An example of this is
+ * guest SVA traffic, which needs nesting translation to gain the gVA->gPA and
+ * then gPA->hPA translation.
+ *
+ * Availability of this feature depends on the device, its bus, the underlying
+ * IOMMU and the CPU architecture.
+ *
+ * returns: 0 on success, -errno on failure.
+ */
+#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 23)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*
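To show the UAPI end to end, a userspace sketch of issuing VFIO_IOMMU_BIND on a container fd; it assumes the host PASID was already obtained via VFIO_IOMMU_PASID_REQUEST and that struct iommu_gpasid_bind_data has the layout this series builds on, with most error handling elided:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

static int bind_guest_pgtbl(int container, __u64 gpgd, __u64 hpasid,
			    __u64 gpasid, __u32 addr_width)
{
	struct vfio_iommu_type1_bind *bind;
	struct iommu_gpasid_bind_data *data;
	size_t argsz = sizeof(*bind) + sizeof(*data);
	int ret;

	bind = calloc(1, argsz);
	if (!bind)
		return -1;
	bind->argsz = argsz;
	bind->flags = VFIO_IOMMU_BIND_GUEST_PGTBL;

	data = (struct iommu_gpasid_bind_data *)bind->data;
	data->version = IOMMU_GPASID_BIND_VERSION_1;
	data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
	data->gpgd = gpgd;		/* guest stage-1 page table pointer */
	data->hpasid = hpasid;		/* host PASID from PASID_REQUEST */
	data->gpasid = gpasid;		/* PASID as seen by the guest */
	data->addr_width = addr_width;	/* guest address width, e.g. 48 */

	ret = ioctl(container, VFIO_IOMMU_BIND, bind);
	free(bind);
	return ret;
}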