Message ID | 1584880325-10561-8-git-send-email-yi.l.liu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfio: expose virtual Shared Virtual Addressing to VMs | expand |
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Sunday, March 22, 2020 8:32 PM > > From: Liu Yi L <yi.l.liu@linux.intel.com> > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest > "owns" the > first-level/stage-1 translation structures, the host IOMMU driver has no > knowledge of first-level/stage-1 structure cache updates unless the guest > invalidation requests are trapped and propagated to the host. > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to > propagate guest > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU > cache > correctness. > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely > as the host IOMMU iotlb correctness are ensured. > > Cc: Kevin Tian <kevin.tian@intel.com> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > Cc: Alex Williamson <alex.williamson@redhat.com> > Cc: Eric Auger <eric.auger@redhat.com> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > Signed-off-by: Liu Yi L <yi.l.liu@linux.intel.com> > Signed-off-by: Eric Auger <eric.auger@redhat.com> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > --- > drivers/vfio/vfio_iommu_type1.c | 49 > +++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 22 ++++++++++++++++++ > 2 files changed, 71 insertions(+) > > diff --git a/drivers/vfio/vfio_iommu_type1.c > b/drivers/vfio/vfio_iommu_type1.c > index a877747..937ec3f 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -2423,6 +2423,15 @@ static long > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > return ret; > } > > +static int vfio_cache_inv_fn(struct device *dev, void *data) vfio_iommu_cache_inv_fn > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_cache_invalidate_info *cache_inv_info = > + (struct iommu_cache_invalidate_info *) dc->data; > + > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); > +} > + > static long 
vfio_iommu_type1_ioctl(void *iommu_data, > unsigned int cmd, unsigned long arg) > { > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > } > kfree(gbind_data); > return ret; > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > + struct vfio_iommu_type1_cache_invalidate cache_inv; > + u32 version; > + int info_size; > + void *cache_info; > + int ret; > + > + minsz = offsetofend(struct > vfio_iommu_type1_cache_invalidate, > + flags); > + > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (cache_inv.argsz < minsz || cache_inv.flags) > + return -EINVAL; > + > + /* Get the version of struct iommu_cache_invalidate_info */ > + if (copy_from_user(&version, > + (void __user *) (arg + minsz), sizeof(version))) > + return -EFAULT; > + > + info_size = iommu_uapi_get_data_size( > + IOMMU_UAPI_CACHE_INVAL, > version); > + > + cache_info = kzalloc(info_size, GFP_KERNEL); > + if (!cache_info) > + return -ENOMEM; > + > + if (copy_from_user(cache_info, > + (void __user *) (arg + minsz), info_size)) { > + kfree(cache_info); > + return -EFAULT; > + } > + > + mutex_lock(&iommu->lock); > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, > + cache_info); > + mutex_unlock(&iommu->lock); > + kfree(cache_info); > + return ret; > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 2235bc6..62ca791 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > */ > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > > +/** > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > + * struct vfio_iommu_type1_cache_invalidate) > + * > + * Propagate guest IOMMU cache invalidation to the host. The cache > + * invalidation information is conveyed by @cache_info, the content > + * format would be structures defined in uapi/linux/iommu.h. 
User > + * should be aware of that the struct iommu_cache_invalidate_info > + * has a @version field, vfio needs to parse this field before getting > + * data from userspace. > + * > + * Availability of this IOCTL is after VFIO_SET_IOMMU. > + * > + * returns: 0 on success, -errno on failure. > + */ > +struct vfio_iommu_type1_cache_invalidate { > + __u32 argsz; > + __u32 flags; > + struct iommu_cache_invalidate_info cache_info; > +}; > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + > 24) > + > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ > > /* > -- > 2.7.4 This patch looks good to me in general. But since there is still a major open about version compatibility, I'll hold my r-b until that open is closed.
> @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > } > kfree(gbind_data); > return ret; > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { Please refactor the spaghetti in this ioctl handler to use a switch statement and a helper function per command before growing it even more. > + /* Get the version of struct iommu_cache_invalidate_info */ > + if (copy_from_user(&version, > + (void __user *) (arg + minsz), sizeof(version))) > + return -EFAULT; > + > + info_size = iommu_uapi_get_data_size( > + IOMMU_UAPI_CACHE_INVAL, version); > + > + cache_info = kzalloc(info_size, GFP_KERNEL); > + if (!cache_info) > + return -ENOMEM; > + > + if (copy_from_user(cache_info, > + (void __user *) (arg + minsz), info_size)) { The user might have changed the version while you were allocating and freeing the memory, introducing potentially exploitable racing conditions.
Hi Hellwig, Thanks for your review, Hellwig. :-) inline reply. > From: Christoph Hellwig <hch@infradead.org> > Sent: Tuesday, March 31, 2020 3:56 PM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE > > > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > } > > kfree(gbind_data); > > return ret; > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > > Please refactor the spaghetti in this ioctl handler to use a switch statement and a > helper function per command before growing it even more. got it. I may get a separate refactor patch before adding my changes. > > > + /* Get the version of struct iommu_cache_invalidate_info */ > > + if (copy_from_user(&version, > > + (void __user *) (arg + minsz), sizeof(version))) > > + return -EFAULT; > > + > > + info_size = iommu_uapi_get_data_size( > > + IOMMU_UAPI_CACHE_INVAL, version); > > + > > + cache_info = kzalloc(info_size, GFP_KERNEL); > > + if (!cache_info) > > + return -ENOMEM; > > + > > + if (copy_from_user(cache_info, > > + (void __user *) (arg + minsz), info_size)) { > > The user might have changed the version while you were allocating and > freeing the > memory, introducing potentially exploitable racing conditions. yeah, I know the @version is not welcomed in the thread Jacob is driving. I'll adjust the code here once the open in that thread has been solved. But regardless of the version, I'm not sure if I 100% got your point. Could you elaborate a bit? BTW. The code somehow referenced the code below. The basic flow is copying partial data from __arg and then copy the rest data after figuring out how much left. The difference between below code and my code is just different way to figure out left data size. Since I'm not sure if I got your point. If the racing is true in such flow, I guess there are quite a few places need to enhance. vfio_pci_ioctl(){ { ... 
} else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; u8 *data = NULL; int max, ret = 0; size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; max = vfio_pci_get_irq_count(vdev, hdr.index); ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS, &data_size); if (ret) return ret; if (data_size) { data = memdup_user((void __user *)(arg + minsz), data_size); if (IS_ERR(data)) return PTR_ERR(data); } mutex_lock(&vdev->igate); ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start, hdr.count, data); mutex_unlock(&vdev->igate); kfree(data); return ret; } else if (cmd == VFIO_DEVICE_RESET) { ... } Regards, Yi Liu
> From: Tian, Kevin <kevin.tian@intel.com> > Sent: Monday, March 30, 2020 8:58 PM > To: Liu, Yi L <yi.l.liu@intel.com>; alex.williamson@redhat.com; > Subject: RE: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE > > > From: Liu, Yi L <yi.l.liu@intel.com> > > Sent: Sunday, March 22, 2020 8:32 PM > > > > From: Liu Yi L <yi.l.liu@linux.intel.com> > > > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest "owns" > > the > > first-level/stage-1 translation structures, the host IOMMU driver has > > no knowledge of first-level/stage-1 structure cache updates unless the > > guest invalidation requests are trapped and propagated to the host. > > > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to propagate > > guest > > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU > > cache correctness. > > > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used > > safely as the host IOMMU iotlb correctness are ensured. > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > Cc: Alex Williamson <alex.williamson@redhat.com> > > Cc: Eric Auger <eric.auger@redhat.com> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Signed-off-by: Liu Yi L <yi.l.liu@linux.intel.com> > > Signed-off-by: Eric Auger <eric.auger@redhat.com> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > --- > > drivers/vfio/vfio_iommu_type1.c | 49 > > +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/vfio.h | 22 ++++++++++++++++++ > > 2 files changed, 71 insertions(+) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c index a877747..937ec3f 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -2423,6 +2423,15 @@ static long > > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > return ret; > > } > > > > +static int vfio_cache_inv_fn(struct device *dev, void *data) > > 
vfio_iommu_cache_inv_fn got it. > > +{ > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_cache_invalidate_info *cache_inv_info = > > + (struct iommu_cache_invalidate_info *) dc->data; > > + > > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); } > > + > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > unsigned int cmd, unsigned long arg) { @@ - > 2629,6 +2638,46 @@ > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > } > > kfree(gbind_data); > > return ret; > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > > + struct vfio_iommu_type1_cache_invalidate cache_inv; > > + u32 version; > > + int info_size; > > + void *cache_info; > > + int ret; > > + > > + minsz = offsetofend(struct > > vfio_iommu_type1_cache_invalidate, > > + flags); > > + > > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (cache_inv.argsz < minsz || cache_inv.flags) > > + return -EINVAL; > > + > > + /* Get the version of struct iommu_cache_invalidate_info */ > > + if (copy_from_user(&version, > > + (void __user *) (arg + minsz), sizeof(version))) > > + return -EFAULT; > > + > > + info_size = iommu_uapi_get_data_size( > > + IOMMU_UAPI_CACHE_INVAL, > > version); > > + > > + cache_info = kzalloc(info_size, GFP_KERNEL); > > + if (!cache_info) > > + return -ENOMEM; > > + > > + if (copy_from_user(cache_info, > > + (void __user *) (arg + minsz), info_size)) { > > + kfree(cache_info); > > + return -EFAULT; > > + } > > + > > + mutex_lock(&iommu->lock); > > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, > > + cache_info); > > + mutex_unlock(&iommu->lock); > > + kfree(cache_info); > > + return ret; > > } > > > > return -ENOTTY; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index 2235bc6..62ca791 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > > */ > > #define 
VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > > > > +/** > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > > + * struct vfio_iommu_type1_cache_invalidate) > > + * > > + * Propagate guest IOMMU cache invalidation to the host. The cache > > + * invalidation information is conveyed by @cache_info, the content > > + * format would be structures defined in uapi/linux/iommu.h. User > > + * should be aware of that the struct iommu_cache_invalidate_info > > + * has a @version field, vfio needs to parse this field before > > +getting > > + * data from userspace. > > + * > > + * Availability of this IOCTL is after VFIO_SET_IOMMU. > > + * > > + * returns: 0 on success, -errno on failure. > > + */ > > +struct vfio_iommu_type1_cache_invalidate { > > + __u32 argsz; > > + __u32 flags; > > + struct iommu_cache_invalidate_info cache_info; > > +}; > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + > > 24) > > + > > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU > > -------- */ > > > > /* > > -- > > 2.7.4 > > This patch looks good to me in general. But since there is still a major open about > version compatibility, I'll hold my r-b until that open is closed.
On Sun, 22 Mar 2020 05:32:04 -0700 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > From: Liu Yi L <yi.l.liu@linux.intel.com> > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest "owns" the > first-level/stage-1 translation structures, the host IOMMU driver has no > knowledge of first-level/stage-1 structure cache updates unless the guest > invalidation requests are trapped and propagated to the host. > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to propagate guest > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU cache > correctness. > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely > as the host IOMMU iotlb correctness are ensured. > > Cc: Kevin Tian <kevin.tian@intel.com> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > Cc: Alex Williamson <alex.williamson@redhat.com> > Cc: Eric Auger <eric.auger@redhat.com> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > Signed-off-by: Liu Yi L <yi.l.liu@linux.intel.com> > Signed-off-by: Eric Auger <eric.auger@redhat.com> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > --- > drivers/vfio/vfio_iommu_type1.c | 49 +++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 22 ++++++++++++++++++ > 2 files changed, 71 insertions(+) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index a877747..937ec3f 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -2423,6 +2423,15 @@ static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > return ret; > } > > +static int vfio_cache_inv_fn(struct device *dev, void *data) > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_cache_invalidate_info *cache_inv_info = > + (struct iommu_cache_invalidate_info *) dc->data; > + > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); > +} > + > static long vfio_iommu_type1_ioctl(void *iommu_data, > unsigned int cmd, 
unsigned long arg) > { > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > } > kfree(gbind_data); > return ret; > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > + struct vfio_iommu_type1_cache_invalidate cache_inv; > + u32 version; > + int info_size; > + void *cache_info; > + int ret; > + > + minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate, > + flags); This breaks backward compatibility as soon as struct iommu_cache_invalidate_info changes size by its defined versioning scheme. ie. a field gets added, the version is bumped, all existing userspace breaks. Our minsz is offsetofend to the version field, interpret the version to size, then reevaluate argsz. > + > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (cache_inv.argsz < minsz || cache_inv.flags) > + return -EINVAL; > + > + /* Get the version of struct iommu_cache_invalidate_info */ > + if (copy_from_user(&version, > + (void __user *) (arg + minsz), sizeof(version))) > + return -EFAULT; > + > + info_size = iommu_uapi_get_data_size( > + IOMMU_UAPI_CACHE_INVAL, version); > + > + cache_info = kzalloc(info_size, GFP_KERNEL); > + if (!cache_info) > + return -ENOMEM; > + > + if (copy_from_user(cache_info, > + (void __user *) (arg + minsz), info_size)) { > + kfree(cache_info); > + return -EFAULT; > + } > + > + mutex_lock(&iommu->lock); > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, > + cache_info); How does a user respond when their cache invalidate fails? Isn't this also another case where our for_each_dev can fail at an arbitrary point leaving us with no idea whether each device even had the opportunity to perform the invalidation request. I don't see how we have any chance to maintain coherency after this faults. 
> + mutex_unlock(&iommu->lock); > + kfree(cache_info); > + return ret; > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 2235bc6..62ca791 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > */ > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > > +/** > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > + * struct vfio_iommu_type1_cache_invalidate) > + * > + * Propagate guest IOMMU cache invalidation to the host. The cache > + * invalidation information is conveyed by @cache_info, the content > + * format would be structures defined in uapi/linux/iommu.h. User > + * should be aware of that the struct iommu_cache_invalidate_info > + * has a @version field, vfio needs to parse this field before getting > + * data from userspace. > + * > + * Availability of this IOCTL is after VFIO_SET_IOMMU. Is this a necessary qualifier? A user can try to call this ioctl at any point, it only makes sense in certain configurations, but it should always "do the right thing" relative to the container iommu config. Also, I don't see anything in these last few patches testing the operating IOMMU model, what happens when a user calls them when not using the nesting IOMMU? Is this ioctl and the previous BIND ioctl only valid when configured for the nesting IOMMU type? > + * > + * returns: 0 on success, -errno on failure. > + */ > +struct vfio_iommu_type1_cache_invalidate { > + __u32 argsz; > + __u32 flags; > + struct iommu_cache_invalidate_info cache_info; > +}; > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 24) The future extension capabilities of this ioctl worry me, I wonder if we should do another data[] with flag defining that data as CACHE_INFO. > + > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ > > /*
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, April 3, 2020 4:24 AM > > On Sun, 22 Mar 2020 05:32:04 -0700 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > From: Liu Yi L <yi.l.liu@linux.intel.com> > > > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest > "owns" the > > first-level/stage-1 translation structures, the host IOMMU driver has no > > knowledge of first-level/stage-1 structure cache updates unless the guest > > invalidation requests are trapped and propagated to the host. > > > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to > propagate guest > > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU > cache > > correctness. > > > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely > > as the host IOMMU iotlb correctness are ensured. > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > Cc: Alex Williamson <alex.williamson@redhat.com> > > Cc: Eric Auger <eric.auger@redhat.com> > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > Signed-off-by: Liu Yi L <yi.l.liu@linux.intel.com> > > Signed-off-by: Eric Auger <eric.auger@redhat.com> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > --- > > drivers/vfio/vfio_iommu_type1.c | 49 > +++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/vfio.h | 22 ++++++++++++++++++ > > 2 files changed, 71 insertions(+) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > b/drivers/vfio/vfio_iommu_type1.c > > index a877747..937ec3f 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -2423,6 +2423,15 @@ static long > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > return ret; > > } > > > > +static int vfio_cache_inv_fn(struct device *dev, void *data) > > +{ > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_cache_invalidate_info *cache_inv_info = > > + (struct 
iommu_cache_invalidate_info *) dc->data; > > + > > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); > > +} > > + > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > unsigned int cmd, unsigned long arg) > > { > > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > } > > kfree(gbind_data); > > return ret; > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > > + struct vfio_iommu_type1_cache_invalidate cache_inv; > > + u32 version; > > + int info_size; > > + void *cache_info; > > + int ret; > > + > > + minsz = offsetofend(struct > vfio_iommu_type1_cache_invalidate, > > + flags); > > This breaks backward compatibility as soon as struct > iommu_cache_invalidate_info changes size by its defined versioning > scheme. ie. a field gets added, the version is bumped, all existing > userspace breaks. Our minsz is offsetofend to the version field, > interpret the version to size, then reevaluate argsz. btw the version scheme is challenged by Christoph Hellwig. After some discussions, we need your guidance how to move forward. 
Jacob summarized available options below: https://lkml.org/lkml/2020/4/2/876 > > > + > > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (cache_inv.argsz < minsz || cache_inv.flags) > > + return -EINVAL; > > + > > + /* Get the version of struct iommu_cache_invalidate_info */ > > + if (copy_from_user(&version, > > + (void __user *) (arg + minsz), sizeof(version))) > > + return -EFAULT; > > + > > + info_size = iommu_uapi_get_data_size( > > + IOMMU_UAPI_CACHE_INVAL, > version); > > + > > + cache_info = kzalloc(info_size, GFP_KERNEL); > > + if (!cache_info) > > + return -ENOMEM; > > + > > + if (copy_from_user(cache_info, > > + (void __user *) (arg + minsz), info_size)) { > > + kfree(cache_info); > > + return -EFAULT; > > + } > > + > > + mutex_lock(&iommu->lock); > > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, > > + cache_info); > > How does a user respond when their cache invalidate fails? Isn't this > also another case where our for_each_dev can fail at an arbitrary point > leaving us with no idea whether each device even had the opportunity to > perform the invalidation request. I don't see how we have any chance > to maintain coherency after this faults. Then can we make it simple to support singleton group only? > > > + mutex_unlock(&iommu->lock); > > + kfree(cache_info); > > + return ret; > > } > > > > return -ENOTTY; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index 2235bc6..62ca791 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > > */ > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > > > > +/** > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > > + * struct vfio_iommu_type1_cache_invalidate) > > + * > > + * Propagate guest IOMMU cache invalidation to the host. 
The cache > > + * invalidation information is conveyed by @cache_info, the content > > + * format would be structures defined in uapi/linux/iommu.h. User > > + * should be aware of that the struct iommu_cache_invalidate_info > > + * has a @version field, vfio needs to parse this field before getting > > + * data from userspace. > > + * > > + * Availability of this IOCTL is after VFIO_SET_IOMMU. > > Is this a necessary qualifier? A user can try to call this ioctl at > any point, it only makes sense in certain configurations, but it should > always "do the right thing" relative to the container iommu config. > > Also, I don't see anything in these last few patches testing the > operating IOMMU model, what happens when a user calls them when not > using the nesting IOMMU? > > Is this ioctl and the previous BIND ioctl only valid when configured > for the nesting IOMMU type? I think so. We should add the nesting check in those new ioctls. > > > + * > > + * returns: 0 on success, -errno on failure. > > + */ > > +struct vfio_iommu_type1_cache_invalidate { > > + __u32 argsz; > > + __u32 flags; > > + struct iommu_cache_invalidate_info cache_info; > > +}; > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE > + 24) > > The future extension capabilities of this ioctl worry me, I wonder if > we should do another data[] with flag defining that data as CACHE_INFO. Can you elaborate? Does it mean with this way we don't rely on iommu driver to provide version_to_size conversion and instead we just pass data[] to iommu driver for further audit? > > > + > > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- > */ > > > > /* Thanks Kevin
On Fri, 3 Apr 2020 06:39:22 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Friday, April 3, 2020 4:24 AM > > > > On Sun, 22 Mar 2020 05:32:04 -0700 > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > From: Liu Yi L <yi.l.liu@linux.intel.com> > > > > > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest > > "owns" the > > > first-level/stage-1 translation structures, the host IOMMU driver > > > has no knowledge of first-level/stage-1 structure cache updates > > > unless the guest invalidation requests are trapped and propagated > > > to the host. > > > > > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to > > propagate guest > > > first-level/stage-1 IOMMU cache invalidations to host to ensure > > > IOMMU > > cache > > > correctness. > > > > > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be > > > used safely as the host IOMMU iotlb correctness are ensured. > > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > Cc: Alex Williamson <alex.williamson@redhat.com> > > > Cc: Eric Auger <eric.auger@redhat.com> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > > Signed-off-by: Liu Yi L <yi.l.liu@linux.intel.com> > > > Signed-off-by: Eric Auger <eric.auger@redhat.com> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > --- > > > drivers/vfio/vfio_iommu_type1.c | 49 > > +++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/vfio.h | 22 ++++++++++++++++++ > > > 2 files changed, 71 insertions(+) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c > > > index a877747..937ec3f 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -2423,6 +2423,15 @@ static long > > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > return ret; > > > } > > > > > > +static int 
vfio_cache_inv_fn(struct device *dev, void *data) > > > +{ > > > + struct domain_capsule *dc = (struct domain_capsule > > > *)data; > > > + struct iommu_cache_invalidate_info *cache_inv_info = > > > + (struct iommu_cache_invalidate_info *) dc->data; > > > + > > > + return iommu_cache_invalidate(dc->domain, dev, > > > cache_inv_info); +} > > > + > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > unsigned int cmd, unsigned > > > long arg) { > > > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void > > *iommu_data, > > > } > > > kfree(gbind_data); > > > return ret; > > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > > > + struct vfio_iommu_type1_cache_invalidate > > > cache_inv; > > > + u32 version; > > > + int info_size; > > > + void *cache_info; > > > + int ret; > > > + > > > + minsz = offsetofend(struct > > vfio_iommu_type1_cache_invalidate, > > > + flags); > > > > This breaks backward compatibility as soon as struct > > iommu_cache_invalidate_info changes size by its defined versioning > > scheme. ie. a field gets added, the version is bumped, all existing > > userspace breaks. Our minsz is offsetofend to the version field, > > interpret the version to size, then reevaluate argsz. > > btw the version scheme is challenged by Christoph Hellwig. After > some discussions, we need your guidance how to move forward. > Jacob summarized available options below: > https://lkml.org/lkml/2020/4/2/876 > For this particular case, I don't quite get the difference between minsz=flags and minsz=version. Our IOMMU version scheme will only change size at the end where the variable union is used for vendor specific extensions. Version bump does not change size (only re-purpose padding) from the start of the UAPI structure to the union, i.e. version will __always__ be after struct vfio_iommu_type1_cache_invalidate.flags. 
> > > > > + > > > + if (copy_from_user(&cache_inv, (void __user > > > *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (cache_inv.argsz < minsz || cache_inv.flags) > > > + return -EINVAL; > > > + > > > + /* Get the version of struct > > > iommu_cache_invalidate_info */ > > > + if (copy_from_user(&version, > > > + (void __user *) (arg + minsz), > > > sizeof(version))) > > > + return -EFAULT; > > > + > > > + info_size = iommu_uapi_get_data_size( > > > + IOMMU_UAPI_CACHE_INVAL, > > version); > > > + > > > + cache_info = kzalloc(info_size, GFP_KERNEL); > > > + if (!cache_info) > > > + return -ENOMEM; > > > + > > > + if (copy_from_user(cache_info, > > > + (void __user *) (arg + minsz), > > > info_size)) { > > > + kfree(cache_info); > > > + return -EFAULT; > > > + } > > > + > > > + mutex_lock(&iommu->lock); > > > + ret = vfio_iommu_for_each_dev(iommu, > > > vfio_cache_inv_fn, > > > + cache_info); > > > > How does a user respond when their cache invalidate fails? Isn't > > this also another case where our for_each_dev can fail at an > > arbitrary point leaving us with no idea whether each device even > > had the opportunity to perform the invalidation request. I don't > > see how we have any chance to maintain coherency after this > > faults. > > Then can we make it simple to support singleton group only? > > > > > > + mutex_unlock(&iommu->lock); > > > + kfree(cache_info); > > > + return ret; > > > } > > > > > > return -ENOTTY; > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > index 2235bc6..62ca791 100644 > > > --- a/include/uapi/linux/vfio.h > > > +++ b/include/uapi/linux/vfio.h > > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > > > */ > > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE > > > + 23) > > > > > > +/** > > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > > > + * struct > > > vfio_iommu_type1_cache_invalidate) > > > + * > > > + * Propagate guest IOMMU cache invalidation to the host. 
The > > > cache > > > + * invalidation information is conveyed by @cache_info, the > > > content > > > + * format would be structures defined in uapi/linux/iommu.h. User > > > + * should be aware of that the struct > > > iommu_cache_invalidate_info > > > + * has a @version field, vfio needs to parse this field before > > > getting > > > + * data from userspace. > > > + * > > > + * Availability of this IOCTL is after VFIO_SET_IOMMU. > > > > Is this a necessary qualifier? A user can try to call this ioctl at > > any point, it only makes sense in certain configurations, but it > > should always "do the right thing" relative to the container iommu > > config. > > > > Also, I don't see anything in these last few patches testing the > > operating IOMMU model, what happens when a user calls them when not > > using the nesting IOMMU? > > > > Is this ioctl and the previous BIND ioctl only valid when configured > > for the nesting IOMMU type? > > I think so. We should add the nesting check in those new ioctls. > I also added nesting domain attribute check in IOMMU driver, so bind guest PASID will fail if nesting mode is not supported. There will be no invalidation w/o bind. > > > > > + * > > > + * returns: 0 on success, -errno on failure. > > > + */ > > > +struct vfio_iommu_type1_cache_invalidate { > > > + __u32 argsz; > > > + __u32 flags; > > > + struct iommu_cache_invalidate_info cache_info; > > > +}; > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > > > VFIO_BASE > > + 24) > > > > The future extension capabilities of this ioctl worry me, I wonder > > if we should do another data[] with flag defining that data as > > CACHE_INFO. > > Can you elaborate? Does it mean with this way we don't rely on iommu > driver to provide version_to_size conversion and instead we just pass > data[] to iommu driver for further audit? 
> I guess Alex meant we do something similar to: struct vfio_irq_set { __u32 argsz; __u32 flags; #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */ #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */ #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */ #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */ #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */ #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */ __u32 index; __u32 start; __u32 count; __u8 data[]; }; So we could do: struct vfio_iommu_type1_cache_invalidate { __u32 argsz; #define VFIO_INVL_DATA_NONE #define VFIO_INVL_DATA_CACHE_INFO (1 << 1) __u32 flags; __u8 data[]; } We still need the version_to_size conversion, but under if (flag & VFIO_INVL_DATA_CACHE_INFO) get_size_from_version(); > > > > > + > > > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU > > > -------- > > */ > > > > > > /* > > Thanks > Kevin [Jacob Pan]
On Fri, 3 Apr 2020 06:39:22 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Friday, April 3, 2020 4:24 AM > > > > On Sun, 22 Mar 2020 05:32:04 -0700 > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > From: Liu Yi L <yi.l.liu@linux.intel.com> > > > > > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest > > "owns" the > > > first-level/stage-1 translation structures, the host IOMMU driver has no > > > knowledge of first-level/stage-1 structure cache updates unless the guest > > > invalidation requests are trapped and propagated to the host. > > > > > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to > > propagate guest > > > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU > > cache > > > correctness. > > > > > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely > > > as the host IOMMU iotlb correctness are ensured. > > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > CC: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > Cc: Alex Williamson <alex.williamson@redhat.com> > > > Cc: Eric Auger <eric.auger@redhat.com> > > > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> > > > Signed-off-by: Liu Yi L <yi.l.liu@linux.intel.com> > > > Signed-off-by: Eric Auger <eric.auger@redhat.com> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > --- > > > drivers/vfio/vfio_iommu_type1.c | 49 > > +++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/vfio.h | 22 ++++++++++++++++++ > > > 2 files changed, 71 insertions(+) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c > > > index a877747..937ec3f 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -2423,6 +2423,15 @@ static long > > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > return ret; > > > } > > > > > > +static int vfio_cache_inv_fn(struct 
device *dev, void *data) > > > +{ > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_cache_invalidate_info *cache_inv_info = > > > + (struct iommu_cache_invalidate_info *) dc->data; > > > + > > > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); > > > +} > > > + > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > unsigned int cmd, unsigned long arg) > > > { > > > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void > > *iommu_data, > > > } > > > kfree(gbind_data); > > > return ret; > > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { > > > + struct vfio_iommu_type1_cache_invalidate cache_inv; > > > + u32 version; > > > + int info_size; > > > + void *cache_info; > > > + int ret; > > > + > > > + minsz = offsetofend(struct > > vfio_iommu_type1_cache_invalidate, > > > + flags); > > > > This breaks backward compatibility as soon as struct > > iommu_cache_invalidate_info changes size by its defined versioning > > scheme. ie. a field gets added, the version is bumped, all existing > > userspace breaks. Our minsz is offsetofend to the version field, > > interpret the version to size, then reevaluate argsz. > > btw the version scheme is challenged by Christoph Hellwig. After > some discussions, we need your guidance how to move forward. 
> Jacob summarized available options below: > https://lkml.org/lkml/2020/4/2/876 Ok > > > > > + > > > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (cache_inv.argsz < minsz || cache_inv.flags) > > > + return -EINVAL; > > > + > > > + /* Get the version of struct iommu_cache_invalidate_info */ > > > + if (copy_from_user(&version, > > > + (void __user *) (arg + minsz), sizeof(version))) > > > + return -EFAULT; > > > + > > > + info_size = iommu_uapi_get_data_size( > > > + IOMMU_UAPI_CACHE_INVAL, > > version); > > > + > > > + cache_info = kzalloc(info_size, GFP_KERNEL); > > > + if (!cache_info) > > > + return -ENOMEM; > > > + > > > + if (copy_from_user(cache_info, > > > + (void __user *) (arg + minsz), info_size)) { > > > + kfree(cache_info); > > > + return -EFAULT; > > > + } > > > + > > > + mutex_lock(&iommu->lock); > > > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, > > > + cache_info); > > > > How does a user respond when their cache invalidate fails? Isn't this > > also another case where our for_each_dev can fail at an arbitrary point > > leaving us with no idea whether each device even had the opportunity to > > perform the invalidation request. I don't see how we have any chance > > to maintain coherency after this faults. > > Then can we make it simple to support singleton group only? Are you suggesting a single group per container or a single device per group? Unless we have both, aren't we always going to have this issue. OTOH, why should a cache invalidate fail? 
> > > + mutex_unlock(&iommu->lock); > > > + kfree(cache_info); > > > + return ret; > > > } > > > > > > return -ENOTTY; > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > index 2235bc6..62ca791 100644 > > > --- a/include/uapi/linux/vfio.h > > > +++ b/include/uapi/linux/vfio.h > > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > > > */ > > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > > > > > > +/** > > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > > > + * struct vfio_iommu_type1_cache_invalidate) > > > + * > > > + * Propagate guest IOMMU cache invalidation to the host. The cache > > > + * invalidation information is conveyed by @cache_info, the content > > > + * format would be structures defined in uapi/linux/iommu.h. User > > > + * should be aware of that the struct iommu_cache_invalidate_info > > > + * has a @version field, vfio needs to parse this field before getting > > > + * data from userspace. > > > + * > > > + * Availability of this IOCTL is after VFIO_SET_IOMMU. > > > > Is this a necessary qualifier? A user can try to call this ioctl at > > any point, it only makes sense in certain configurations, but it should > > always "do the right thing" relative to the container iommu config. > > > > Also, I don't see anything in these last few patches testing the > > operating IOMMU model, what happens when a user calls them when not > > using the nesting IOMMU? > > > > Is this ioctl and the previous BIND ioctl only valid when configured > > for the nesting IOMMU type? > > I think so. We should add the nesting check in those new ioctls. > > > > > > + * > > > + * returns: 0 on success, -errno on failure. 
> > > + */ > > > +struct vfio_iommu_type1_cache_invalidate { > > > + __u32 argsz; > > > + __u32 flags; > > > + struct iommu_cache_invalidate_info cache_info; > > > +}; > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE > > + 24) > > > > The future extension capabilities of this ioctl worry me, I wonder if > > we should do another data[] with flag defining that data as CACHE_INFO. > > Can you elaborate? Does it mean with this way we don't rely on iommu > driver to provide version_to_size conversion and instead we just pass > data[] to iommu driver for further audit? No, my concern is that this ioctl has a single function, strictly tied to the iommu uapi. If we replace cache_info with data[] then we can define a flag to specify that data[] is struct iommu_cache_invalidate_info, and if we need to, a different flag to identify data[] as something else. For example if we get stuck expanding cache_info to meet new demands and develop a new uapi to solve that, how would we expand this ioctl to support it rather than also create a new ioctl? There's also a trade-off in making the ioctl usage more difficult for the user. I'd still expect the vfio layer to check the flag and interpret data[] as indicated by the flag rather than just passing a blob of opaque data to the iommu layer though. Thanks, Alex
Hi Alex, > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, April 3, 2020 11:35 PM > Subject: Re: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE > > On Fri, 3 Apr 2020 06:39:22 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Friday, April 3, 2020 4:24 AM > > > > > > On Sun, 22 Mar 2020 05:32:04 -0700 > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > > > From: Liu Yi L <yi.l.liu@linux.intel.com> > > > > [...] > > > > > > > > + > > > > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) > > > > + return -EFAULT; > > > > + > > > > + if (cache_inv.argsz < minsz || cache_inv.flags) > > > > + return -EINVAL; > > > > + > > > > + /* Get the version of struct iommu_cache_invalidate_info */ > > > > + if (copy_from_user(&version, > > > > + (void __user *) (arg + minsz), sizeof(version))) > > > > + return -EFAULT; > > > > + > > > > + info_size = iommu_uapi_get_data_size( > > > > + IOMMU_UAPI_CACHE_INVAL, > > > version); > > > > + > > > > + cache_info = kzalloc(info_size, GFP_KERNEL); > > > > + if (!cache_info) > > > > + return -ENOMEM; > > > > + > > > > + if (copy_from_user(cache_info, > > > > + (void __user *) (arg + minsz), info_size)) { > > > > + kfree(cache_info); > > > > + return -EFAULT; > > > > + } > > > > + > > > > + mutex_lock(&iommu->lock); > > > > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, > > > > + cache_info); > > > > > > How does a user respond when their cache invalidate fails? Isn't this > > > also another case where our for_each_dev can fail at an arbitrary point > > > leaving us with no idea whether each device even had the opportunity to > > > perform the invalidation request. I don't see how we have any chance > > > to maintain coherency after this faults. > > > > Then can we make it simple to support singleton group only? > > Are you suggesting a single group per container or a single device per > group? 
Unless we have both, aren't we always going to have this issue. Agreed. We need both to avoid the potential for_each_dev() loop issue. I suppose this is also the most typical and desired config for vSVA support. I think it makes sense with below items: a) one group per container PASID and nested translation gives user-space a chance to attach their page table (e.g. guest process page table) to host IOMMU, this is vSVA. If adding multiple groups to a vSVA-capable container, then a SVA bind on this container means bind it with all groups (devices are included) within the container. This doesn't make sense for three reasons: for one, the passthru devices are not necessarily manipulated by the same guest application; for two, passthru devices are not necessarily added to a single guest group; for three, not all passthru devices (either from different group or same group) are sva capable. As above, enforcing one group per container makes sense to me. b) one device per group SVA support is limited to singleton group so far in bare-metal bind per Jean's series. I think it'd be good to follow it in the passthru case. https://patchwork.kernel.org/patch/10213877/ https://lkml.org/lkml/2019/4/10/663 As mentioned in a), a group may have both SVA-capable devices and non-SVA -capable devices, so it would be a problem for VFIO to figure out a way to isolate them. > OTOH, why should a cache invalidate fail? there are sanity checks done by the vendor iommu driver against the invalidate request from userspace. so it may fail if a sanity check fails. But I guess it may be better to do something like abort instead of failing the request, isn't it? 
> > > > > + mutex_unlock(&iommu->lock); > > > > + kfree(cache_info); > > > > + return ret; > > > > } > > > > > > > > return -ENOTTY; > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > > index 2235bc6..62ca791 100644 > > > > --- a/include/uapi/linux/vfio.h > > > > +++ b/include/uapi/linux/vfio.h > > > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind { > > > > */ > > > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) > > > > > > > > +/** > > > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, > > > > + * struct vfio_iommu_type1_cache_invalidate) > > > > + * > > > > + * Propagate guest IOMMU cache invalidation to the host. The cache > > > > + * invalidation information is conveyed by @cache_info, the content > > > > + * format would be structures defined in uapi/linux/iommu.h. User > > > > + * should be aware of that the struct iommu_cache_invalidate_info > > > > + * has a @version field, vfio needs to parse this field before getting > > > > + * data from userspace. > > > > + * > > > > + * Availability of this IOCTL is after VFIO_SET_IOMMU. > > > > > > Is this a necessary qualifier? A user can try to call this ioctl at > > > any point, it only makes sense in certain configurations, but it should > > > always "do the right thing" relative to the container iommu config. > > > > > > Also, I don't see anything in these last few patches testing the > > > operating IOMMU model, what happens when a user calls them when not > > > using the nesting IOMMU? > > > > > > Is this ioctl and the previous BIND ioctl only valid when configured > > > for the nesting IOMMU type? > > > > I think so. We should add the nesting check in those new ioctls. > > > > > > > > > + * > > > > + * returns: 0 on success, -errno on failure. 
> > > > + */ > > > > +struct vfio_iommu_type1_cache_invalidate { > > > > + __u32 argsz; > > > > + __u32 flags; > > > > + struct iommu_cache_invalidate_info cache_info; > > > > +}; > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE > > > + 24) > > > > > > The future extension capabilities of this ioctl worry me, I wonder if > > > we should do another data[] with flag defining that data as CACHE_INFO. > > > > Can you elaborate? Does it mean with this way we don't rely on iommu > > driver to provide version_to_size conversion and instead we just pass > > data[] to iommu driver for further audit? > > No, my concern is that this ioctl has a single function, strictly tied > to the iommu uapi. If we replace cache_info with data[] then we can > define a flag to specify that data[] is struct > iommu_cache_invalidate_info, and if we need to, a different flag to > identify data[] as something else. For example if we get stuck > expanding cache_info to meet new demands and develop a new uapi to > solve that, how would we expand this ioctl to support it rather than > also create a new ioctl? There's also a trade-off in making the ioctl > usage more difficult for the user. I'd still expect the vfio layer to > check the flag and interpret data[] as indicated by the flag rather > than just passing a blob of opaque data to the iommu layer though. Ok, I think data[] is acceptable. BTW. Do you have any decision on the uapi version open in Jacob's thread? I'd like to re-work my patch based on your decision. https://lkml.org/lkml/2020/4/2/876 thanks again for your help. :-) Regards, Yi Liu
Hi Alex, Still have a direction question with you. Better get agreement with you before heading forward. > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, April 3, 2020 11:35 PM [...] > > > > + * > > > > + * returns: 0 on success, -errno on failure. > > > > + */ > > > > +struct vfio_iommu_type1_cache_invalidate { > > > > + __u32 argsz; > > > > + __u32 flags; > > > > + struct iommu_cache_invalidate_info cache_info; > > > > +}; > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > VFIO_BASE > > > + 24) > > > > > > The future extension capabilities of this ioctl worry me, I wonder if > > > we should do another data[] with flag defining that data as CACHE_INFO. > > > > Can you elaborate? Does it mean with this way we don't rely on iommu > > driver to provide version_to_size conversion and instead we just pass > > data[] to iommu driver for further audit? > > No, my concern is that this ioctl has a single function, strictly tied > to the iommu uapi. If we replace cache_info with data[] then we can > define a flag to specify that data[] is struct > iommu_cache_invalidate_info, and if we need to, a different flag to > identify data[] as something else. For example if we get stuck > expanding cache_info to meet new demands and develop a new uapi to > solve that, how would we expand this ioctl to support it rather than > also create a new ioctl? There's also a trade-off in making the ioctl > usage more difficult for the user. I'd still expect the vfio layer to > check the flag and interpret data[] as indicated by the flag rather > than just passing a blob of opaque data to the iommu layer though. > Thanks, Based on your comments about defining a single ioctl and a unified vfio structure (with a @data[] field) for pasid_alloc/free, bind/ unbind_gpasid, cache_inv. After some offline trying, I think it would be good for bind/unbind_gpasid and cache_inv as both of them use the iommu uapi definition. While the pasid alloc/free operation doesn't. 
It would be weird to put all of them together. So pasid alloc/free may have a separate ioctl. It would look as below. Does this direction look good per your opinion? ioctl #22: VFIO_IOMMU_PASID_REQUEST /** * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID * specify a pasid to be freed when flags == FREE_PASID * @range: specify the allocation range when flags == ALLOC_PASID */ struct vfio_iommu_pasid_request { __u32 argsz; #define VFIO_IOMMU_ALLOC_PASID (1 << 0) #define VFIO_IOMMU_FREE_PASID (1 << 1) __u32 flags; __u32 pasid; struct { __u32 min; __u32 max; } range; }; ioctl #23: VFIO_IOMMU_NESTING_OP struct vfio_iommu_type1_nesting_op { __u32 argsz; __u32 flags; __u32 op; __u8 data[]; }; /* Nesting Ops */ #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 Thanks, Yi Liu
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Thursday, April 16, 2020 6:40 PM > > Hi Alex, > Still have a direction question with you. Better get agreement with you > before heading forward. > > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Friday, April 3, 2020 11:35 PM > [...] > > > > > + * > > > > > + * returns: 0 on success, -errno on failure. > > > > > + */ > > > > > +struct vfio_iommu_type1_cache_invalidate { > > > > > + __u32 argsz; > > > > > + __u32 flags; > > > > > + struct iommu_cache_invalidate_info cache_info; > > > > > +}; > > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > > VFIO_BASE > > > > + 24) > > > > > > > > The future extension capabilities of this ioctl worry me, I wonder if > > > > we should do another data[] with flag defining that data as > CACHE_INFO. > > > > > > Can you elaborate? Does it mean with this way we don't rely on iommu > > > driver to provide version_to_size conversion and instead we just pass > > > data[] to iommu driver for further audit? > > > > No, my concern is that this ioctl has a single function, strictly tied > > to the iommu uapi. If we replace cache_info with data[] then we can > > define a flag to specify that data[] is struct > > iommu_cache_invalidate_info, and if we need to, a different flag to > > identify data[] as something else. For example if we get stuck > > expanding cache_info to meet new demands and develop a new uapi to > > solve that, how would we expand this ioctl to support it rather than > > also create a new ioctl? There's also a trade-off in making the ioctl > > usage more difficult for the user. I'd still expect the vfio layer to > > check the flag and interpret data[] as indicated by the flag rather > > than just passing a blob of opaque data to the iommu layer though. > > Thanks, > > Based on your comments about defining a single ioctl and a unified > vfio structure (with a @data[] field) for pasid_alloc/free, bind/ > unbind_gpasid, cache_inv. 
After some offline trying, I think it would > be good for bind/unbind_gpasid and cache_inv as both of them use the > iommu uapi definition. While the pasid alloc/free operation doesn't. > It would be weird to put all of them together. So pasid alloc/free > may have a separate ioctl. It would look as below. Does this direction > look good per your opinion? > > ioctl #22: VFIO_IOMMU_PASID_REQUEST > /** > * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID > * specify a pasid to be freed when flags == FREE_PASID > * @range: specify the allocation range when flags == ALLOC_PASID > */ > struct vfio_iommu_pasid_request { > __u32 argsz; > #define VFIO_IOMMU_ALLOC_PASID (1 << 0) > #define VFIO_IOMMU_FREE_PASID (1 << 1) > __u32 flags; > __u32 pasid; > struct { > __u32 min; > __u32 max; > } range; > }; > > ioctl #23: VFIO_IOMMU_NESTING_OP > struct vfio_iommu_type1_nesting_op { > __u32 argsz; > __u32 flags; > __u32 op; > __u8 data[]; > }; > > /* Nesting Ops */ > #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 > #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 > #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 > Then why cannot we just put PASID into the header since the majority of nested usage is associated with a pasid? ioctl #23: VFIO_IOMMU_NESTING_OP struct vfio_iommu_type1_nesting_op { __u32 argsz; __u32 flags; __u32 op; __u32 pasid; __u8 data[]; }; In case of SMMUv2 which supports nested w/o PASID, this field can be ignored for that specific case. Thanks Kevin
Hi Kevin, On 4/16/20 2:09 PM, Tian, Kevin wrote: >> From: Liu, Yi L <yi.l.liu@intel.com> >> Sent: Thursday, April 16, 2020 6:40 PM >> >> Hi Alex, >> Still have a direction question with you. Better get agreement with you >> before heading forward. >> >>> From: Alex Williamson <alex.williamson@redhat.com> >>> Sent: Friday, April 3, 2020 11:35 PM >> [...] >>>>>> + * >>>>>> + * returns: 0 on success, -errno on failure. >>>>>> + */ >>>>>> +struct vfio_iommu_type1_cache_invalidate { >>>>>> + __u32 argsz; >>>>>> + __u32 flags; >>>>>> + struct iommu_cache_invalidate_info cache_info; >>>>>> +}; >>>>>> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, >>> VFIO_BASE >>>>> + 24) >>>>> >>>>> The future extension capabilities of this ioctl worry me, I wonder if >>>>> we should do another data[] with flag defining that data as >> CACHE_INFO. >>>> >>>> Can you elaborate? Does it mean with this way we don't rely on iommu >>>> driver to provide version_to_size conversion and instead we just pass >>>> data[] to iommu driver for further audit? >>> >>> No, my concern is that this ioctl has a single function, strictly tied >>> to the iommu uapi. If we replace cache_info with data[] then we can >>> define a flag to specify that data[] is struct >>> iommu_cache_invalidate_info, and if we need to, a different flag to >>> identify data[] as something else. For example if we get stuck >>> expanding cache_info to meet new demands and develop a new uapi to >>> solve that, how would we expand this ioctl to support it rather than >>> also create a new ioctl? There's also a trade-off in making the ioctl >>> usage more difficult for the user. I'd still expect the vfio layer to >>> check the flag and interpret data[] as indicated by the flag rather >>> than just passing a blob of opaque data to the iommu layer though. 
>>> Thanks, >> >> Based on your comments about defining a single ioctl and a unified >> vfio structure (with a @data[] field) for pasid_alloc/free, bind/ >> unbind_gpasid, cache_inv. After some offline trying, I think it would >> be good for bind/unbind_gpasid and cache_inv as both of them use the >> iommu uapi definition. While the pasid alloc/free operation doesn't. >> It would be weird to put all of them together. So pasid alloc/free >> may have a separate ioctl. It would look as below. Does this direction >> look good per your opinion? >> >> ioctl #22: VFIO_IOMMU_PASID_REQUEST >> /** >> * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID >> * specify a pasid to be freed when flags == FREE_PASID >> * @range: specify the allocation range when flags == ALLOC_PASID >> */ >> struct vfio_iommu_pasid_request { >> __u32 argsz; >> #define VFIO_IOMMU_ALLOC_PASID (1 << 0) >> #define VFIO_IOMMU_FREE_PASID (1 << 1) >> __u32 flags; >> __u32 pasid; >> struct { >> __u32 min; >> __u32 max; >> } range; >> }; >> >> ioctl #23: VFIO_IOMMU_NESTING_OP >> struct vfio_iommu_type1_nesting_op { >> __u32 argsz; >> __u32 flags; >> __u32 op; >> __u8 data[]; >> }; >> >> /* Nesting Ops */ >> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 >> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 >> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 >> > > Then why cannot we just put PASID into the header since the > majority of nested usage is associated with a pasid? > > ioctl #23: VFIO_IOMMU_NESTING_OP > struct vfio_iommu_type1_nesting_op { > __u32 argsz; > __u32 flags; > __u32 op; > __u32 pasid; > __u8 data[]; > }; > > In case of SMMUv2 which supports nested w/o PASID, this field can > be ignored for that specific case. On my side I would prefer keeping the pasid in the data[]. This is not always used. For instance, in iommu_cache_invalidate_info/iommu_inv_pasid_info we devised flags to tell whether the PASID is used. Thanks Eric > > Thanks > Kevin >
> From: Auger Eric <eric.auger@redhat.com> > Sent: Thursday, April 16, 2020 8:43 PM > > Hi Kevin, > On 4/16/20 2:09 PM, Tian, Kevin wrote: > >> From: Liu, Yi L <yi.l.liu@intel.com> > >> Sent: Thursday, April 16, 2020 6:40 PM > >> > >> Hi Alex, > >> Still have a direction question with you. Better get agreement with you > >> before heading forward. > >> > >>> From: Alex Williamson <alex.williamson@redhat.com> > >>> Sent: Friday, April 3, 2020 11:35 PM > >> [...] > >>>>>> + * > >>>>>> + * returns: 0 on success, -errno on failure. > >>>>>> + */ > >>>>>> +struct vfio_iommu_type1_cache_invalidate { > >>>>>> + __u32 argsz; > >>>>>> + __u32 flags; > >>>>>> + struct iommu_cache_invalidate_info cache_info; > >>>>>> +}; > >>>>>> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > >>> VFIO_BASE > >>>>> + 24) > >>>>> > >>>>> The future extension capabilities of this ioctl worry me, I wonder if > >>>>> we should do another data[] with flag defining that data as > >> CACHE_INFO. > >>>> > >>>> Can you elaborate? Does it mean with this way we don't rely on iommu > >>>> driver to provide version_to_size conversion and instead we just pass > >>>> data[] to iommu driver for further audit? > >>> > >>> No, my concern is that this ioctl has a single function, strictly tied > >>> to the iommu uapi. If we replace cache_info with data[] then we can > >>> define a flag to specify that data[] is struct > >>> iommu_cache_invalidate_info, and if we need to, a different flag to > >>> identify data[] as something else. For example if we get stuck > >>> expanding cache_info to meet new demands and develop a new uapi to > >>> solve that, how would we expand this ioctl to support it rather than > >>> also create a new ioctl? There's also a trade-off in making the ioctl > >>> usage more difficult for the user. I'd still expect the vfio layer to > >>> check the flag and interpret data[] as indicated by the flag rather > >>> than just passing a blob of opaque data to the iommu layer though. 
> >>> Thanks, > >> > >> Based on your comments about defining a single ioctl and a unified > >> vfio structure (with a @data[] field) for pasid_alloc/free, bind/ > >> unbind_gpasid, cache_inv. After some offline trying, I think it would > >> be good for bind/unbind_gpasid and cache_inv as both of them use the > >> iommu uapi definition. While the pasid alloc/free operation doesn't. > >> It would be weird to put all of them together. So pasid alloc/free > >> may have a separate ioctl. It would look as below. Does this direction > >> look good per your opinion? > >> > >> ioctl #22: VFIO_IOMMU_PASID_REQUEST > >> /** > >> * @pasid: used to return the pasid alloc result when flags == > ALLOC_PASID > >> * specify a pasid to be freed when flags == FREE_PASID > >> * @range: specify the allocation range when flags == ALLOC_PASID > >> */ > >> struct vfio_iommu_pasid_request { > >> __u32 argsz; > >> #define VFIO_IOMMU_ALLOC_PASID (1 << 0) > >> #define VFIO_IOMMU_FREE_PASID (1 << 1) > >> __u32 flags; > >> __u32 pasid; > >> struct { > >> __u32 min; > >> __u32 max; > >> } range; > >> }; > >> > >> ioctl #23: VFIO_IOMMU_NESTING_OP > >> struct vfio_iommu_type1_nesting_op { > >> __u32 argsz; > >> __u32 flags; > >> __u32 op; > >> __u8 data[]; > >> }; > >> > >> /* Nesting Ops */ > >> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 > >> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 > >> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 > >> > > > > Then why cannot we just put PASID into the header since the > > majority of nested usage is associated with a pasid? > > > > ioctl #23: VFIO_IOMMU_NESTING_OP > > struct vfio_iommu_type1_nesting_op { > > __u32 argsz; > > __u32 flags; > > __u32 op; > > __u32 pasid; > > __u8 data[]; > > }; > > > > In case of SMMUv2 which supports nested w/o PASID, this field can > > be ignored for that specific case. > On my side I would prefer keeping the pasid in the data[]. This is not > always used. 
> > For instance, in iommu_cache_invalidate_info/iommu_inv_pasid_info we > devised flags to tell whether the PASID is used. > But don't we include a PASID in both invalidate structures already? struct iommu_inv_addr_info { #define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) #define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) #define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) __u32 flags; __u32 archid; __u64 pasid; __u64 addr; __u64 granule_size; __u64 nb_granules; }; struct iommu_inv_pasid_info { #define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) #define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) __u32 flags; __u32 archid; __u64 pasid; }; then consolidating the pasid field into generic header doesn't hurt. the specific handler still rely on flags to tell whether it is used? Thanks Kevin
On Thu, 16 Apr 2020 10:40:03 +0000 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > Hi Alex, > Still have a direction question with you. Better get agreement with you > before heading forward. > > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Friday, April 3, 2020 11:35 PM > [...] > > > > > + * > > > > > + * returns: 0 on success, -errno on failure. > > > > > + */ > > > > > +struct vfio_iommu_type1_cache_invalidate { > > > > > + __u32 argsz; > > > > > + __u32 flags; > > > > > + struct iommu_cache_invalidate_info cache_info; > > > > > +}; > > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > > VFIO_BASE > > > > + 24) > > > > > > > > The future extension capabilities of this ioctl worry me, I wonder if > > > > we should do another data[] with flag defining that data as CACHE_INFO. > > > > > > Can you elaborate? Does it mean with this way we don't rely on iommu > > > driver to provide version_to_size conversion and instead we just pass > > > data[] to iommu driver for further audit? > > > > No, my concern is that this ioctl has a single function, strictly tied > > to the iommu uapi. If we replace cache_info with data[] then we can > > define a flag to specify that data[] is struct > > iommu_cache_invalidate_info, and if we need to, a different flag to > > identify data[] as something else. For example if we get stuck > > expanding cache_info to meet new demands and develop a new uapi to > > solve that, how would we expand this ioctl to support it rather than > > also create a new ioctl? There's also a trade-off in making the ioctl > > usage more difficult for the user. I'd still expect the vfio layer to > > check the flag and interpret data[] as indicated by the flag rather > > than just passing a blob of opaque data to the iommu layer though. > > Thanks, > > Based on your comments about defining a single ioctl and a unified > vfio structure (with a @data[] field) for pasid_alloc/free, bind/ > unbind_gpasid, cache_inv. 
After some offline trying, I think it would > be good for bind/unbind_gpasid and cache_inv as both of them use the > iommu uapi definition. While the pasid alloc/free operation doesn't. > It would be weird to put all of them together. So pasid alloc/free > may have a separate ioctl. It would look as below. Does this direction > look good per your opinion? > > ioctl #22: VFIO_IOMMU_PASID_REQUEST > /** > * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID > * specify a pasid to be freed when flags == FREE_PASID > * @range: specify the allocation range when flags == ALLOC_PASID > */ > struct vfio_iommu_pasid_request { > __u32 argsz; > #define VFIO_IOMMU_ALLOC_PASID (1 << 0) > #define VFIO_IOMMU_FREE_PASID (1 << 1) > __u32 flags; > __u32 pasid; > struct { > __u32 min; > __u32 max; > } range; > }; Can't the ioctl return the pasid valid on alloc (like GET_DEVICE_FD)? Would it be useful to support freeing a range of pasids? If so then we could simply use range for both, ie. allocate a pasid from this range and return it, or free all pasids in this range? vfio already needs to track pasids to free them on release, so presumably this is something we could support easily. > ioctl #23: VFIO_IOMMU_NESTING_OP > struct vfio_iommu_type1_nesting_op { > __u32 argsz; > __u32 flags; > __u32 op; > __u8 data[]; > }; data only has 4-byte alignment, I think we really want it at an 8-byte alignment. This is why I embedded the "op" into the flag for DEVICE_FEATURE. Thanks, Alex > > /* Nesting Ops */ > #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 > #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 > #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 > > Thanks, > Yi Liu >
On Thu, 16 Apr 2020 08:40:31 -0600 Alex Williamson <alex.williamson@redhat.com> wrote: > On Thu, 16 Apr 2020 10:40:03 +0000 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > Hi Alex, > > Still have a direction question with you. Better get agreement with you > > before heading forward. > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Friday, April 3, 2020 11:35 PM > > [...] > > > > > > + * > > > > > > + * returns: 0 on success, -errno on failure. > > > > > > + */ > > > > > > +struct vfio_iommu_type1_cache_invalidate { > > > > > > + __u32 argsz; > > > > > > + __u32 flags; > > > > > > + struct iommu_cache_invalidate_info cache_info; > > > > > > +}; > > > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > > > VFIO_BASE > > > > > + 24) > > > > > > > > > > The future extension capabilities of this ioctl worry me, I wonder if > > > > > we should do another data[] with flag defining that data as CACHE_INFO. > > > > > > > > Can you elaborate? Does it mean with this way we don't rely on iommu > > > > driver to provide version_to_size conversion and instead we just pass > > > > data[] to iommu driver for further audit? > > > > > > No, my concern is that this ioctl has a single function, strictly tied > > > to the iommu uapi. If we replace cache_info with data[] then we can > > > define a flag to specify that data[] is struct > > > iommu_cache_invalidate_info, and if we need to, a different flag to > > > identify data[] as something else. For example if we get stuck > > > expanding cache_info to meet new demands and develop a new uapi to > > > solve that, how would we expand this ioctl to support it rather than > > > also create a new ioctl? There's also a trade-off in making the ioctl > > > usage more difficult for the user. I'd still expect the vfio layer to > > > check the flag and interpret data[] as indicated by the flag rather > > > than just passing a blob of opaque data to the iommu layer though. 
> > > Thanks, > > > > Based on your comments about defining a single ioctl and a unified > > vfio structure (with a @data[] field) for pasid_alloc/free, bind/ > > unbind_gpasid, cache_inv. After some offline trying, I think it would > > be good for bind/unbind_gpasid and cache_inv as both of them use the > > iommu uapi definition. While the pasid alloc/free operation doesn't. > > It would be weird to put all of them together. So pasid alloc/free > > may have a separate ioctl. It would look as below. Does this direction > > look good per your opinion? > > > > ioctl #22: VFIO_IOMMU_PASID_REQUEST > > /** > > * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID > > * specify a pasid to be freed when flags == FREE_PASID > > * @range: specify the allocation range when flags == ALLOC_PASID > > */ > > struct vfio_iommu_pasid_request { > > __u32 argsz; > > #define VFIO_IOMMU_ALLOC_PASID (1 << 0) > > #define VFIO_IOMMU_FREE_PASID (1 << 1) > > __u32 flags; > > __u32 pasid; > > struct { > > __u32 min; > > __u32 max; > > } range; > > }; > > Can't the ioctl return the pasid valid on alloc (like GET_DEVICE_FD)? s/valid/value/ > Would it be useful to support freeing a range of pasids? If so then we > could simply use range for both, ie. allocate a pasid from this range > and return it, or free all pasids in this range? vfio already needs to > track pasids to free them on release, so presumably this is something > we could support easily. > > > ioctl #23: VFIO_IOMMU_NESTING_OP > > struct vfio_iommu_type1_nesting_op { > > __u32 argsz; > > __u32 flags; > > __u32 op; > > __u8 data[]; > > }; > > data only has 4-byte alignment, I think we really want it at an 8-byte > alignment. This is why I embedded the "op" into the flag for > DEVICE_FEATURE. 
Thanks, > > Alex > > > > > /* Nesting Ops */ > > #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 > > #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 > > #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 > > > > Thanks, > > Yi Liu > > > > _______________________________________________ > iommu mailing list > iommu@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu >
Hi Kevin, On 4/16/20 3:28 PM, Tian, Kevin wrote: >> From: Auger Eric <eric.auger@redhat.com> >> Sent: Thursday, April 16, 2020 8:43 PM >> >> Hi Kevin, >> On 4/16/20 2:09 PM, Tian, Kevin wrote: >>>> From: Liu, Yi L <yi.l.liu@intel.com> >>>> Sent: Thursday, April 16, 2020 6:40 PM >>>> >>>> Hi Alex, >>>> Still have a direction question with you. Better get agreement with you >>>> before heading forward. >>>> >>>>> From: Alex Williamson <alex.williamson@redhat.com> >>>>> Sent: Friday, April 3, 2020 11:35 PM >>>> [...] >>>>>>>> + * >>>>>>>> + * returns: 0 on success, -errno on failure. >>>>>>>> + */ >>>>>>>> +struct vfio_iommu_type1_cache_invalidate { >>>>>>>> + __u32 argsz; >>>>>>>> + __u32 flags; >>>>>>>> + struct iommu_cache_invalidate_info cache_info; >>>>>>>> +}; >>>>>>>> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, >>>>> VFIO_BASE >>>>>>> + 24) >>>>>>> >>>>>>> The future extension capabilities of this ioctl worry me, I wonder if >>>>>>> we should do another data[] with flag defining that data as >>>> CACHE_INFO. >>>>>> >>>>>> Can you elaborate? Does it mean with this way we don't rely on iommu >>>>>> driver to provide version_to_size conversion and instead we just pass >>>>>> data[] to iommu driver for further audit? >>>>> >>>>> No, my concern is that this ioctl has a single function, strictly tied >>>>> to the iommu uapi. If we replace cache_info with data[] then we can >>>>> define a flag to specify that data[] is struct >>>>> iommu_cache_invalidate_info, and if we need to, a different flag to >>>>> identify data[] as something else. For example if we get stuck >>>>> expanding cache_info to meet new demands and develop a new uapi to >>>>> solve that, how would we expand this ioctl to support it rather than >>>>> also create a new ioctl? There's also a trade-off in making the ioctl >>>>> usage more difficult for the user. 
I'd still expect the vfio layer to >>>>> check the flag and interpret data[] as indicated by the flag rather >>>>> than just passing a blob of opaque data to the iommu layer though. >>>>> Thanks, >>>> >>>> Based on your comments about defining a single ioctl and a unified >>>> vfio structure (with a @data[] field) for pasid_alloc/free, bind/ >>>> unbind_gpasid, cache_inv. After some offline trying, I think it would >>>> be good for bind/unbind_gpasid and cache_inv as both of them use the >>>> iommu uapi definition. While the pasid alloc/free operation doesn't. >>>> It would be weird to put all of them together. So pasid alloc/free >>>> may have a separate ioctl. It would look as below. Does this direction >>>> look good per your opinion? >>>> >>>> ioctl #22: VFIO_IOMMU_PASID_REQUEST >>>> /** >>>> * @pasid: used to return the pasid alloc result when flags == >> ALLOC_PASID >>>> * specify a pasid to be freed when flags == FREE_PASID >>>> * @range: specify the allocation range when flags == ALLOC_PASID >>>> */ >>>> struct vfio_iommu_pasid_request { >>>> __u32 argsz; >>>> #define VFIO_IOMMU_ALLOC_PASID (1 << 0) >>>> #define VFIO_IOMMU_FREE_PASID (1 << 1) >>>> __u32 flags; >>>> __u32 pasid; >>>> struct { >>>> __u32 min; >>>> __u32 max; >>>> } range; >>>> }; >>>> >>>> ioctl #23: VFIO_IOMMU_NESTING_OP >>>> struct vfio_iommu_type1_nesting_op { >>>> __u32 argsz; >>>> __u32 flags; >>>> __u32 op; >>>> __u8 data[]; >>>> }; >>>> >>>> /* Nesting Ops */ >>>> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 >>>> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 >>>> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 >>>> >>> >>> Then why cannot we just put PASID into the header since the >>> majority of nested usage is associated with a pasid? 
>>> >>> ioctl #23: VFIO_IOMMU_NESTING_OP >>> struct vfio_iommu_type1_nesting_op { >>> __u32 argsz; >>> __u32 flags; >>> __u32 op; >>> __u32 pasid; >>> __u8 data[]; >>> }; >>> >>> In case of SMMUv2 which supports nested w/o PASID, this field can >>> be ignored for that specific case. >> On my side I would prefer keeping the pasid in the data[]. This is not >> always used. >> >> For instance, in iommu_cache_invalidate_info/iommu_inv_pasid_info we >> devised flags to tell whether the PASID is used. >> > > But don't we include a PASID in both invalidate structures already? The pasid presence is indicated by the IOMMU_INV_ADDR_FLAGS_PASID flag. For instance, for nested-stage SMMUv3 I currently perform an ARCHID (asid) based invalidation only. Eric > > struct iommu_inv_addr_info { > #define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) > #define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) > #define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) > __u32 flags; > __u32 archid; > __u64 pasid; > __u64 addr; > __u64 granule_size; > __u64 nb_granules; > }; > > struct iommu_inv_pasid_info { > #define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) > #define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) > __u32 flags; > __u32 archid; > __u64 pasid; > }; > > then consolidating the pasid field into generic header doesn't > hurt. the specific handler still rely on flags to tell whether it > is used? > > Thanks > Kevin >
Hi Alex, > From: Alex Williamson <alex.williamson@redhat.com> > Sent: Thursday, April 16, 2020 10:41 PM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE > > On Thu, 16 Apr 2020 10:40:03 +0000 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > Hi Alex, > > Still have a direction question with you. Better get agreement with you > > before heading forward. > > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Friday, April 3, 2020 11:35 PM > > [...] > > > > > > + * > > > > > > + * returns: 0 on success, -errno on failure. > > > > > > + */ > > > > > > +struct vfio_iommu_type1_cache_invalidate { > > > > > > + __u32 argsz; > > > > > > + __u32 flags; > > > > > > + struct iommu_cache_invalidate_info cache_info; > > > > > > +}; > > > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, > > > VFIO_BASE > > > > > + 24) > > > > > > > > > > The future extension capabilities of this ioctl worry me, I wonder if > > > > > we should do another data[] with flag defining that data as CACHE_INFO. > > > > > > > > Can you elaborate? Does it mean with this way we don't rely on iommu > > > > driver to provide version_to_size conversion and instead we just pass > > > > data[] to iommu driver for further audit? > > > > > > No, my concern is that this ioctl has a single function, strictly tied > > > to the iommu uapi. If we replace cache_info with data[] then we can > > > define a flag to specify that data[] is struct > > > iommu_cache_invalidate_info, and if we need to, a different flag to > > > identify data[] as something else. For example if we get stuck > > > expanding cache_info to meet new demands and develop a new uapi to > > > solve that, how would we expand this ioctl to support it rather than > > > also create a new ioctl? There's also a trade-off in making the ioctl > > > usage more difficult for the user. 
I'd still expect the vfio layer to > > > check the flag and interpret data[] as indicated by the flag rather > > > than just passing a blob of opaque data to the iommu layer though. > > > Thanks, > > > > Based on your comments about defining a single ioctl and a unified > > vfio structure (with a @data[] field) for pasid_alloc/free, bind/ > > unbind_gpasid, cache_inv. After some offline trying, I think it would > > be good for bind/unbind_gpasid and cache_inv as both of them use the > > iommu uapi definition. While the pasid alloc/free operation doesn't. > > It would be weird to put all of them together. So pasid alloc/free > > may have a separate ioctl. It would look as below. Does this direction > > look good per your opinion? > > > > ioctl #22: VFIO_IOMMU_PASID_REQUEST > > /** > > * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID > > * specify a pasid to be freed when flags == FREE_PASID > > * @range: specify the allocation range when flags == ALLOC_PASID > > */ > > struct vfio_iommu_pasid_request { > > __u32 argsz; > > #define VFIO_IOMMU_ALLOC_PASID (1 << 0) > > #define VFIO_IOMMU_FREE_PASID (1 << 1) > > __u32 flags; > > __u32 pasid; > > struct { > > __u32 min; > > __u32 max; > > } range; > > }; > > Can't the ioctl return the pasid valid on alloc (like GET_DEVICE_FD)? Yep, I think you mentioned it before. At that time, I believed it would be better to return the result via a __u32 buffer so as to make full use of the 32 bits. But looks like it doesn't make much difference. I'll follow your suggestion. > Would it be useful to support freeing a range of pasids? If so then we > could simply use range for both, ie. allocate a pasid from this range > and return it, or free all pasids in this range? vfio already needs to > track pasids to free them on release, so presumably this is something > we could support easily. yes, I think it is a nice thing. then I can remove the @pasid field. will do it. 
> > ioctl #23: VFIO_IOMMU_NESTING_OP > > struct vfio_iommu_type1_nesting_op { > > __u32 argsz; > > __u32 flags; > > __u32 op; > > __u8 data[]; > > }; > > data only has 4-byte alignment, I think we really want it at an 8-byte > alignment. This is why I embedded the "op" into the flag for > DEVICE_FEATURE. Thanks, got it. I may also merge the op into flags (maybe the lower 16 bits for op). Thanks, Yi Liu > Alex > > > > > /* Nesting Ops */ > > #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0 > > #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1 > > #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2 > > > > Thanks, > > Yi Liu > >
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a877747..937ec3f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2423,6 +2423,15 @@ static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, return ret; } +static int vfio_cache_inv_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + struct iommu_cache_invalidate_info *cache_inv_info = + (struct iommu_cache_invalidate_info *) dc->data; + + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } kfree(gbind_data); return ret; + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) { + struct vfio_iommu_type1_cache_invalidate cache_inv; + u32 version; + int info_size; + void *cache_info; + int ret; + + minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate, + flags); + + if (copy_from_user(&cache_inv, (void __user *)arg, minsz)) + return -EFAULT; + + if (cache_inv.argsz < minsz || cache_inv.flags) + return -EINVAL; + + /* Get the version of struct iommu_cache_invalidate_info */ + if (copy_from_user(&version, + (void __user *) (arg + minsz), sizeof(version))) + return -EFAULT; + + info_size = iommu_uapi_get_data_size( + IOMMU_UAPI_CACHE_INVAL, version); + + cache_info = kzalloc(info_size, GFP_KERNEL); + if (!cache_info) + return -ENOMEM; + + if (copy_from_user(cache_info, + (void __user *) (arg + minsz), info_size)) { + kfree(cache_info); + return -EFAULT; + } + + mutex_lock(&iommu->lock); + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn, + cache_info); + mutex_unlock(&iommu->lock); + kfree(cache_info); + return ret; } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2235bc6..62ca791 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ 
-899,6 +899,28 @@ struct vfio_iommu_type1_bind { */ #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23) +/** + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24, + * struct vfio_iommu_type1_cache_invalidate) + * + * Propagate guest IOMMU cache invalidation to the host. The cache + * invalidation information is conveyed by @cache_info, the content + * format would be structures defined in uapi/linux/iommu.h. User + * should be aware of that the struct iommu_cache_invalidate_info + * has a @version field, vfio needs to parse this field before getting + * data from userspace. + * + * Availability of this IOCTL is after VFIO_SET_IOMMU. + * + * returns: 0 on success, -errno on failure. + */ +struct vfio_iommu_type1_cache_invalidate { + __u32 argsz; + __u32 flags; + struct iommu_cache_invalidate_info cache_info; +}; +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 24) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /*