Message ID | 1592988927-48009-4-git-send-email-yi.l.liu@intel.com
---|---
State | New, archived
Series | vfio: expose virtual Shared Virtual Addressing to VMs
On Wed, 24 Jun 2020 01:55:16 -0700
Liu Yi L <yi.l.liu@intel.com> wrote:

> This patch exports iommu nesting capability info to user space through
> VFIO. User space is expected to check this info for supported uAPIs (e.g.
> PASID alloc/free, bind page table, and cache invalidation) and the vendor
> specific format information for the first level/stage page table that
> will be bound to.
>
> The nesting info is available only after the nesting iommu type is set
> for a container. The current implementation imposes one limitation - one
> nesting container should include at most one group. The philosophy of a
> vfio container is having all groups/devices within the container share
> the same IOMMU context. When vSVA is enabled, one IOMMU context could
> include one 2nd-level address space and multiple 1st-level address spaces.
> While the 2nd-level address space is reasonably sharable by multiple
> groups, blindly sharing 1st-level address spaces across all groups within
> the container might instead break the guest expectation. In the future,
> sub/super container concepts might be introduced to allow partial address
> space sharing within an IOMMU context. But for now let's go with this
> restriction by requiring a singleton container for using nesting iommu
> features. The link below has the related discussion about this decision.
>
> https://lkml.org/lkml/2020/5/15/1028
>
> Cc: Kevin Tian <kevin.tian@intel.com>
> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 73 +++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       |  9 +++++
>  2 files changed, 82 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 7accb59..8c143d5 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -72,6 +72,7 @@ struct vfio_iommu {
>  	uint64_t		pgsize_bitmap;
>  	bool			v2;
>  	bool			nesting;
> +	struct iommu_nesting_info *nesting_info;
>  	bool			dirty_page_tracking;
>  	bool			pinned_page_dirty_scope;
>  };

Mind the structure packing and alignment; placing a pointer in the middle
of a section of bools is going to create wasteful holes in the data
structure.
> @@ -130,6 +131,9 @@ struct vfio_regions {
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>
> +#define IS_DOMAIN_IN_CONTAINER(iommu)	((iommu->external_domain) || \
> +					 (!list_empty(&iommu->domain_list)))
> +
>  #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
>
>  /*
> @@ -1959,6 +1963,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		}
>  	}
>
> +	/* Nesting type container can include only one group */
> +	if (iommu->nesting && IS_DOMAIN_IN_CONTAINER(iommu)) {
> +		mutex_unlock(&iommu->lock);
> +		return -EINVAL;
> +	}
> +
>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>  	if (!group || !domain) {
> @@ -2029,6 +2039,36 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_domain;
>
> +	/* Nesting cap info is available only after attaching */
> +	if (iommu->nesting) {
> +		struct iommu_nesting_info tmp;
> +		struct iommu_nesting_info *info;
> +
> +		/* First get the size of vendor specific nesting info */
> +		ret = iommu_domain_get_attr(domain->domain,
> +					    DOMAIN_ATTR_NESTING,
> +					    &tmp);
> +		if (ret)
> +			goto out_detach;
> +
> +		info = kzalloc(tmp.size, GFP_KERNEL);
> +		if (!info) {
> +			ret = -ENOMEM;
> +			goto out_detach;
> +		}
> +
> +		/* Now get the nesting info */
> +		info->size = tmp.size;
> +		ret = iommu_domain_get_attr(domain->domain,
> +					    DOMAIN_ATTR_NESTING,
> +					    info);
> +		if (ret) {
> +			kfree(info);
> +			goto out_detach;
> +		}
> +		iommu->nesting_info = info;
> +	}
> +
>  	/* Get aperture info */
>  	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
>
> @@ -2138,6 +2178,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	return 0;
>
>  out_detach:
> +	kfree(iommu->nesting_info);

This looks prone to a use-after-free.

>  	vfio_iommu_detach_group(domain, group);
>  out_domain:
>  	iommu_domain_free(domain->domain);
> @@ -2338,6 +2379,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  			vfio_iommu_unmap_unpin_all(iommu);
>  		else
>  			vfio_iommu_unmap_unpin_reaccount(iommu);
> +
> +		kfree(iommu->nesting_info);

As does this. Set to NULL since get_info tests the pointer before trying
to use it.
> } > iommu_domain_free(domain->domain); > list_del(&domain->next); > @@ -2546,6 +2589,30 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, > return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); > } > > +static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu, > + struct vfio_info_cap *caps) > +{ > + struct vfio_info_cap_header *header; > + struct vfio_iommu_type1_info_cap_nesting *nesting_cap; > + size_t size; > + > + size = sizeof(*nesting_cap) + iommu->nesting_info->size; > + > + header = vfio_info_cap_add(caps, size, > + VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1); > + if (IS_ERR(header)) > + return PTR_ERR(header); > + > + nesting_cap = container_of(header, > + struct vfio_iommu_type1_info_cap_nesting, > + header); > + > + memcpy(&nesting_cap->info, iommu->nesting_info, > + iommu->nesting_info->size); > + > + return 0; > +} > + > static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, > unsigned long arg) > { > @@ -2586,6 +2653,12 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, > if (ret) > return ret; > > + if (iommu->nesting_info) { > + ret = vfio_iommu_info_add_nesting_cap(iommu, &caps); > + if (ret) > + return ret; > + } > + > if (caps.size) { > info.flags |= VFIO_IOMMU_INFO_CAPS; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index eca66926..f1f39e1 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -14,6 +14,7 @@ > > #include <linux/types.h> > #include <linux/ioctl.h> > +#include <linux/iommu.h> Why? We're not directly referencing any IOMMU UAPI structures here. > > #define VFIO_API_VERSION 0 > > @@ -1039,6 +1040,14 @@ struct vfio_iommu_type1_info_cap_migration { > __u64 max_dirty_bitmap_size; /* in bytes */ > }; > > +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING 3 > + > +struct vfio_iommu_type1_info_cap_nesting { > + struct vfio_info_cap_header header; > + __u32 flags; I think there's an alignment issue here for a uapi. The header field is 8-bytes total and info[] should start at an 8-byte alignment to allow data[] within info to have 8-byte alignment. This could lead to the structure having a compiler dependent size and offsets. We should add a 4-byte reserved field here to resolve. > + __u8 info[]; > +}; This should have a lot more description around it, a user could not infer that info[] is including a struct iommu_nesting_info from the information provided here. Thanks, Alex > + > #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > > /**
Hi Alex,

> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, July 3, 2020 2:39 AM
>
> On Wed, 24 Jun 2020 01:55:16 -0700
> Liu Yi L <yi.l.liu@intel.com> wrote:
>
> > This patch exports iommu nesting capability info to user space through
> > VFIO. User space is expected to check this info for supported uAPIs (e.g.
> > PASID alloc/free, bind page table, and cache invalidation) and the
> > vendor specific format information for the first level/stage page table
> > that will be bound to.
> >
> > The nesting info is available only after the nesting iommu type is set
> > for a container. The current implementation imposes one limitation - one
> > nesting container should include at most one group. The philosophy of a
> > vfio container is having all groups/devices within the container share
> > the same IOMMU context. When vSVA is enabled, one IOMMU context could
> > include one 2nd-level address space and multiple 1st-level address spaces.
> > While the 2nd-level address space is reasonably sharable by multiple
> > groups, blindly sharing 1st-level address spaces across all groups
> > within the container might instead break the guest expectation. In the
> > future, sub/super container concepts might be introduced to allow
> > partial address space sharing within an IOMMU context. But for now
> > let's go with this restriction by requiring a singleton container for
> > using nesting iommu features. The link below has the related discussion
> > about this decision.
> >
> > https://lkml.org/lkml/2020/5/15/1028
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 73 +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       |  9 +++++
> >  2 files changed, 82 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 7accb59..8c143d5 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -72,6 +72,7 @@ struct vfio_iommu {
> >  	uint64_t		pgsize_bitmap;
> >  	bool			v2;
> >  	bool			nesting;
> > +	struct iommu_nesting_info *nesting_info;
> >  	bool			dirty_page_tracking;
> >  	bool			pinned_page_dirty_scope;
> >  };
>
> Mind the structure packing and alignment; placing a pointer in the middle
> of a section of bools is going to create wasteful holes in the data
> structure.

How about the layout below, which adds @nesting_info and @vmm at the end of
the struct? I have two questions: first, where should the comment for
@external_domain be placed; second, do you want me to move the @nesting
field so it sits next to @nesting_info and @vmm? :) Please let me know your
preference.
struct vfio_iommu {
	struct list_head	domain_list;
	struct list_head	iova_list;
	struct vfio_domain	*external_domain; /* domain for external user */
	struct mutex		lock;
	struct rb_root		dma_list;
	struct blocking_notifier_head notifier;
	unsigned int		dma_avail;
	uint64_t		pgsize_bitmap;
	bool			v2;
	bool			nesting;
	bool			dirty_page_tracking;
	bool			pinned_page_dirty_scope;
	struct iommu_nesting_info *nesting_info;
	struct vfio_mm		*vmm;
};

> > @@ -130,6 +131,9 @@ struct vfio_regions {
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >  					(!list_empty(&iommu->domain_list))
> >
> > +#define IS_DOMAIN_IN_CONTAINER(iommu)	((iommu->external_domain) || \
> > +					 (!list_empty(&iommu->domain_list)))
> > +
> >  #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
> >
> >  /*
> > @@ -1959,6 +1963,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  		}
> >  	}
> >
> > +	/* Nesting type container can include only one group */
> > +	if (iommu->nesting && IS_DOMAIN_IN_CONTAINER(iommu)) {
> > +		mutex_unlock(&iommu->lock);
> > +		return -EINVAL;
> > +	}
> > +
> >  	group = kzalloc(sizeof(*group), GFP_KERNEL);
> >  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> >  	if (!group || !domain) {
> > @@ -2029,6 +2039,36 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  	if (ret)
> >  		goto out_domain;
> >
> > +	/* Nesting cap info is available only after attaching */
> > +	if (iommu->nesting) {
> > +		struct iommu_nesting_info tmp;
> > +		struct iommu_nesting_info *info;
> > +
> > +		/* First get the size of vendor specific nesting info */
> > +		ret = iommu_domain_get_attr(domain->domain,
> > +					    DOMAIN_ATTR_NESTING,
> > +					    &tmp);
> > +		if (ret)
> > +			goto out_detach;
> > +
> > +		info = kzalloc(tmp.size, GFP_KERNEL);
> > +		if (!info) {
> > +			ret = -ENOMEM;
> > +			goto out_detach;
> > +		}
> > +
> > +		/* Now get the nesting info */
> > +		info->size = tmp.size;
> > +		ret = iommu_domain_get_attr(domain->domain,
> > +					    DOMAIN_ATTR_NESTING,
> > +					    info);
> > +		if (ret) {
> > +			kfree(info);
> > +			goto out_detach;
> > +		}
> > +		iommu->nesting_info = info;
> > +	}
> > +
> >  	/* Get aperture info */
> >  	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
> >
> > @@ -2138,6 +2178,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  	return 0;
> >
> >  out_detach:
> > +	kfree(iommu->nesting_info);
>
> This looks prone to a use-after-free.

How about setting iommu->nesting_info to NULL here, just as in your next
comment?

> >  	vfio_iommu_detach_group(domain, group);
> >  out_domain:
> >  	iommu_domain_free(domain->domain);
> > @@ -2338,6 +2379,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> >  			vfio_iommu_unmap_unpin_all(iommu);
> >  		else
> >  			vfio_iommu_unmap_unpin_reaccount(iommu);
> > +
> > +		kfree(iommu->nesting_info);
>
> As does this. Set to NULL since get_info tests the pointer before trying
> to use it.

got it.
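For reference, the hole Alex describes is easy to measure in isolation. A
minimal sketch for a typical LP64 target (these are hypothetical cut-down
layouts, not the real struct vfio_iommu); the reordered layout carries an
extra pointer in the same 24 bytes:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct layout_old {		/* pointer in the middle of the bools */
		bool v2;
		bool nesting;
		void *nesting_info;	/* compiler inserts a 6-byte hole before this */
		bool dirty_page_tracking;
		bool pinned_page_dirty_scope;
	};				/* sizeof == 24 with one pointer */

	struct layout_new {		/* pointers grouped after the bools */
		bool v2;
		bool nesting;
		bool dirty_page_tracking;
		bool pinned_page_dirty_scope;
		void *nesting_info;	/* only a 4-byte hole remains before this */
		void *vmm;
	};				/* sizeof == 24 with two pointers */

	int main(void)
	{
		printf("old: sizeof=%zu, nesting_info at %zu\n",
		       sizeof(struct layout_old),
		       offsetof(struct layout_old, nesting_info));
		printf("new: sizeof=%zu, nesting_info at %zu\n",
		       sizeof(struct layout_new),
		       offsetof(struct layout_new, nesting_info));
		return 0;
	}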
> >  	}
> >  	iommu_domain_free(domain->domain);
> >  	list_del(&domain->next);
> > @@ -2546,6 +2589,30 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
> >  	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
> >  }
> >
> > +static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > +					   struct vfio_info_cap *caps)
> > +{
> > +	struct vfio_info_cap_header *header;
> > +	struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> > +	size_t size;
> > +
> > +	size = sizeof(*nesting_cap) + iommu->nesting_info->size;
> > +
> > +	header = vfio_info_cap_add(caps, size,
> > +				   VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
> > +	if (IS_ERR(header))
> > +		return PTR_ERR(header);
> > +
> > +	nesting_cap = container_of(header,
> > +				   struct vfio_iommu_type1_info_cap_nesting,
> > +				   header);
> > +
> > +	memcpy(&nesting_cap->info, iommu->nesting_info,
> > +	       iommu->nesting_info->size);
> > +
> > +	return 0;
> > +}
> > +
> >  static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
> >  				     unsigned long arg)
> >  {
> > @@ -2586,6 +2653,12 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
> >  	if (ret)
> >  		return ret;
> >
> > +	if (iommu->nesting_info) {
> > +		ret = vfio_iommu_info_add_nesting_cap(iommu, &caps);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> >  	if (caps.size) {
> >  		info.flags |= VFIO_IOMMU_INFO_CAPS;
> >
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index eca66926..f1f39e1 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <linux/types.h>
> >  #include <linux/ioctl.h>
> > +#include <linux/iommu.h>
>
> Why? We're not directly referencing any IOMMU UAPI structures here.

oh, yes. will remove it.

> >
> >  #define VFIO_API_VERSION	0
> >
> > @@ -1039,6 +1040,14 @@ struct vfio_iommu_type1_info_cap_migration {
> >  	__u64	max_dirty_bitmap_size;	/* in bytes */
> >  };
> >
> > +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING  3
> > +
> > +struct vfio_iommu_type1_info_cap_nesting {
> > +	struct vfio_info_cap_header header;
> > +	__u32	flags;
>
> I think there's an alignment issue here for a uapi. The header field is
> 8 bytes total, and info[] should start at an 8-byte alignment to allow
> data[] within info to have 8-byte alignment. This could lead to the
> structure having a compiler-dependent size and offsets. We should add a
> 4-byte reserved field here to resolve it.

got it. Or how about defining the flags as __u64?

> > +	__u8	info[];
> > +};
>
> This should have a lot more description around it; a user could not infer
> that info[] is including a struct iommu_nesting_info from the information
> provided here. Thanks,

sure. BTW, do you think it is necessary to add a flag indicating that
info[] is a struct iommu_nesting_info, or is that unnecessary as a start?

Regards,
Yi Liu

> Alex
>
> > +
> >  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> >
> >  /**
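For completeness, the fix the reviewers converge on would presumably look
like the fragment below - a sketch of the expected follow-up, not the
actual next revision. Clearing the pointer after kfree() means get_info,
which tests iommu->nesting_info before use, can never see a dangling
pointer:

	 out_detach:
	 	kfree(iommu->nesting_info);
	+	iommu->nesting_info = NULL;
	 	vfio_iommu_detach_group(domain, group);

and likewise in the detach path:

	 			vfio_iommu_unmap_unpin_reaccount(iommu);

	 		kfree(iommu->nesting_info);
	+		iommu->nesting_info = NULL;
	 	}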
Hi Alex,

> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, July 3, 2020 2:06 PM
[...]
> > > +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING  3
> > > +
> > > +struct vfio_iommu_type1_info_cap_nesting {
> > > +	struct vfio_info_cap_header header;
> > > +	__u32	flags;
> >
> > I think there's an alignment issue here for a uapi. The header field
> > is 8 bytes total, and info[] should start at an 8-byte alignment to
> > allow data[] within info to have 8-byte alignment. This could lead to
> > the structure having a compiler-dependent size and offsets. We should
> > add a 4-byte reserved field here to resolve it.
>
> got it. Or how about defining the flags as __u64?
>
> > > +	__u8	info[];
> > > +};
> >
> > This should have a lot more description around it; a user could not
> > infer that info[] is including a struct iommu_nesting_info from the
> > information provided here. Thanks,
>
> sure. BTW, do you think it is necessary to add a flag indicating that
> info[] is a struct iommu_nesting_info, or is that unnecessary as a start?

It seems I misunderstood your comment. Does the description below suit it?

/*
 * Reporting nesting info to user space.
 *
 * @info:	the nesting info provided by the IOMMU driver. Today
 *		it is expected to be a struct iommu_nesting_info.
 */
struct vfio_iommu_type1_info_cap_nesting {
	struct vfio_info_cap_header header;
	__u32	flags;
	__u32	padding;
	__u8	info[];
};

Thanks,
Yi Liu

> Regards,
> Yi Liu
>
> > Alex
> >
> > > +
> > >  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> > >
> > >  /**
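To show how a consumer would actually reach info[], here is a minimal user
space sketch that probes VFIO_IOMMU_GET_INFO for its full size and then
walks the standard VFIO capability chain. It assumes the padded layout
above (the cap id and struct come from this series, not a released
header), a container fd with the nesting IOMMU type already set, and trims
error handling:

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Find a capability in the VFIO_IOMMU_GET_INFO capability chain. */
	static struct vfio_info_cap_header *
	vfio_find_cap(struct vfio_iommu_type1_info *info, __u16 id)
	{
		struct vfio_info_cap_header *hdr;
		__u32 off;

		if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
			return NULL;

		/* 'next' is an offset from the start of the info buffer;
		 * 0 terminates the chain. */
		for (off = info->cap_offset; off; off = hdr->next) {
			hdr = (struct vfio_info_cap_header *)((char *)info + off);
			if (hdr->id == id)
				return hdr;
		}
		return NULL;
	}

	/* Return the nesting info blob (a struct iommu_nesting_info per this
	 * proposal), or NULL. The result points into the allocated buffer,
	 * which the caller must keep alive while using it. */
	static void *get_nesting_info(int container)
	{
		struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
		struct vfio_iommu_type1_info *info;
		struct vfio_iommu_type1_info_cap_nesting *cap;

		/* First call: the kernel updates argsz to the size needed
		 * to hold all capabilities. */
		if (ioctl(container, VFIO_IOMMU_GET_INFO, &probe))
			return NULL;

		info = calloc(1, probe.argsz);
		if (!info)
			return NULL;
		info->argsz = probe.argsz;

		/* Second call: the capability chain is filled in past the
		 * fixed header. */
		if (ioctl(container, VFIO_IOMMU_GET_INFO, info))
			return NULL;

		cap = (void *)vfio_find_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_NESTING);
		return cap ? cap->info : NULL;
	}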
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 7accb59..8c143d5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
 	uint64_t		pgsize_bitmap;
 	bool			v2;
 	bool			nesting;
+	struct iommu_nesting_info *nesting_info;
 	bool			dirty_page_tracking;
 	bool			pinned_page_dirty_scope;
 };
@@ -130,6 +131,9 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
+#define IS_DOMAIN_IN_CONTAINER(iommu)	((iommu->external_domain) || \
+					 (!list_empty(&iommu->domain_list)))
+
 #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
 
 /*
@@ -1959,6 +1963,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	/* Nesting type container can include only one group */
+	if (iommu->nesting && IS_DOMAIN_IN_CONTAINER(iommu)) {
+		mutex_unlock(&iommu->lock);
+		return -EINVAL;
+	}
+
 	group = kzalloc(sizeof(*group), GFP_KERNEL);
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!group || !domain) {
@@ -2029,6 +2039,36 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_domain;
 
+	/* Nesting cap info is available only after attaching */
+	if (iommu->nesting) {
+		struct iommu_nesting_info tmp;
+		struct iommu_nesting_info *info;
+
+		/* First get the size of vendor specific nesting info */
+		ret = iommu_domain_get_attr(domain->domain,
+					    DOMAIN_ATTR_NESTING,
+					    &tmp);
+		if (ret)
+			goto out_detach;
+
+		info = kzalloc(tmp.size, GFP_KERNEL);
+		if (!info) {
+			ret = -ENOMEM;
+			goto out_detach;
+		}
+
+		/* Now get the nesting info */
+		info->size = tmp.size;
+		ret = iommu_domain_get_attr(domain->domain,
+					    DOMAIN_ATTR_NESTING,
+					    info);
+		if (ret) {
+			kfree(info);
+			goto out_detach;
+		}
+		iommu->nesting_info = info;
+	}
+
 	/* Get aperture info */
 	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
 
@@ -2138,6 +2178,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	return 0;
 
 out_detach:
+	kfree(iommu->nesting_info);
 	vfio_iommu_detach_group(domain, group);
 out_domain:
 	iommu_domain_free(domain->domain);
@@ -2338,6 +2379,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 			vfio_iommu_unmap_unpin_all(iommu);
 		else
 			vfio_iommu_unmap_unpin_reaccount(iommu);
+
+		kfree(iommu->nesting_info);
 	}
 	iommu_domain_free(domain->domain);
 	list_del(&domain->next);
@@ -2546,6 +2589,30 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
 	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
 }
 
+static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
+					   struct vfio_info_cap *caps)
+{
+	struct vfio_info_cap_header *header;
+	struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
+	size_t size;
+
+	size = sizeof(*nesting_cap) + iommu->nesting_info->size;
+
+	header = vfio_info_cap_add(caps, size,
+				   VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
+	if (IS_ERR(header))
+		return PTR_ERR(header);
+
+	nesting_cap = container_of(header,
+				   struct vfio_iommu_type1_info_cap_nesting,
+				   header);
+
+	memcpy(&nesting_cap->info, iommu->nesting_info,
+	       iommu->nesting_info->size);
+
+	return 0;
+}
+
 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
 				     unsigned long arg)
 {
@@ -2586,6 +2653,12 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
 	if (ret)
 		return ret;
 
+	if (iommu->nesting_info) {
+		ret = vfio_iommu_info_add_nesting_cap(iommu, &caps);
+		if (ret)
+			return ret;
+	}
+
 	if (caps.size) {
 		info.flags |= VFIO_IOMMU_INFO_CAPS;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index eca66926..f1f39e1 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/iommu.h>
 
 #define VFIO_API_VERSION	0
 
@@ -1039,6 +1040,14 @@ struct vfio_iommu_type1_info_cap_migration {
 	__u64	max_dirty_bitmap_size;	/* in bytes */
 };
 
+#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING  3
+
+struct vfio_iommu_type1_info_cap_nesting {
+	struct vfio_info_cap_header header;
+	__u32	flags;
+	__u8	info[];
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
This patch exports iommu nesting capability info to user space through
VFIO. User space is expected to check this info for supported uAPIs (e.g.
PASID alloc/free, bind page table, and cache invalidation) and the vendor
specific format information for the first level/stage page table that will
be bound to.

The nesting info is available only after the nesting iommu type is set for
a container. The current implementation imposes one limitation - one
nesting container should include at most one group. The philosophy of a
vfio container is having all groups/devices within the container share the
same IOMMU context. When vSVA is enabled, one IOMMU context could include
one 2nd-level address space and multiple 1st-level address spaces. While
the 2nd-level address space is reasonably sharable by multiple groups,
blindly sharing 1st-level address spaces across all groups within the
container might instead break the guest expectation. In the future,
sub/super container concepts might be introduced to allow partial address
space sharing within an IOMMU context. But for now let's go with this
restriction by requiring a singleton container for using nesting iommu
features. The link below has the related discussion about this decision.

https://lkml.org/lkml/2020/5/15/1028

Cc: Kevin Tian <kevin.tian@intel.com>
CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
---
 drivers/vfio/vfio_iommu_type1.c | 73 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       |  9 +++++
 2 files changed, 82 insertions(+)
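Since the nesting info only appears after the nesting iommu type is set,
the container setup preceding the VFIO_IOMMU_GET_INFO query would look
roughly like the sketch below. The group path is an illustrative example
and error handling is trimmed; VFIO_TYPE1_NESTING_IOMMU is the existing
type constant in <linux/vfio.h>:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Sketch: set up a container with the nesting IOMMU type. Per this
	 * series the container must stay a singleton - exactly one group. */
	int open_nesting_container(const char *group_path)
	{
		int container, group;

		container = open("/dev/vfio/vfio", O_RDWR);
		if (container < 0)
			return -1;

		/* The nesting type must be supported before it can be set. */
		if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
		    !ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU))
			return -1;

		/* A group must be attached before VFIO_SET_IOMMU is allowed. */
		group = open(group_path, O_RDWR);	/* e.g. "/dev/vfio/26" */
		if (group < 0 ||
		    ioctl(group, VFIO_GROUP_SET_CONTAINER, &container))
			return -1;

		/* Selecting the nesting type makes the nesting capability
		 * available through VFIO_IOMMU_GET_INFO. */
		if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU))
			return -1;

		return container;
	}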