Message ID | 20210913071606.2966-12-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Series | [01/14] vfio: Move vfio_iommu_group_get() to vfio_register_group_dev() |
On 9/13/2021 12:46 PM, Christoph Hellwig wrote:
> Pass the group flags to ->attach_group and remove the messy check for
> the bus type.

I like the way vfio_group_type is used in this patch; it removes the
messy way of calling symbol_get(mdev_bus_type).

Any thoughts on how VFIO_IOMMU, i.e. IOMMU backed mdev, can be
implemented?

For IOMMU backed mdev, mdev->dev->iommu_group should be the same as
mdev->type->parent->dev->iommu_group, or in other words, the parent
device would be a DMA alias for the mdev device with the restriction
that a single mdev device can be created for the physical device. Is it
possible to link the iommu_groups of these two devices some way?

Thanks,
Kirti

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> ---
>  drivers/vfio/vfio.c                 | 32 +++++------------------------
>  drivers/vfio/vfio.h                 | 27 +++++++++++++++++++++++-
>  drivers/vfio/vfio_iommu_spapr_tce.c |  2 +-
>  drivers/vfio/vfio_iommu_type1.c     | 19 ++---------------
>  4 files changed, 34 insertions(+), 46 deletions(-)
On Fri, Sep 17, 2021 at 01:25:59AM +0530, Kirti Wankhede wrote:
> On 9/13/2021 12:46 PM, Christoph Hellwig wrote:
> > Pass the group flags to ->attach_group and remove the messy check for
> > the bus type.
>
> I like the way vfio_group_type is used in this patch; it removes the
> messy way of calling symbol_get(mdev_bus_type).
>
> Any thoughts on how VFIO_IOMMU, i.e. IOMMU backed mdev, can be
> implemented?
>
> For IOMMU backed mdev, mdev->dev->iommu_group should be the same as
> mdev->type->parent->dev->iommu_group, or in other words, the parent
> device would be a DMA alias for the mdev device with the restriction
> that a single mdev device can be created for the physical device. Is it
> possible to link the iommu_groups of these two devices some way?

You just use the new style mdev API and directly call
vfio_register_group_dev() and it will pick up the
parent->dev->iommu_group naturally like everything else using physical
iommu groups.

Jason
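[Editor's note: for readers unfamiliar with the new-style API, here is a
minimal, hedged sketch of the flow Jason describes, assuming the
5.15-era vfio_init_group_dev()/vfio_register_group_dev() interfaces. The
my_* names and the surrounding driver structure are hypothetical, not
code from this series.]

```c
/*
 * Hedged sketch, not code from this series: an mdev-style driver that
 * registers the *parent* physical device with VFIO so the parent's real
 * iommu_group (the VFIO_IOMMU case) is picked up directly.
 */
#include <linux/mdev.h>
#include <linux/slab.h>
#include <linux/vfio.h>

struct my_vdev {
	struct vfio_device vdev;
	struct mdev_device *mdev;
};

static const struct vfio_device_ops my_ops = {
	.name = "my-mdev",
	/* .open_device/.read/.write/.ioctl/... elided */
};

static int my_probe(struct mdev_device *mdev)
{
	struct my_vdev *mvdev;
	int ret;

	mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL);
	if (!mvdev)
		return -ENOMEM;
	mvdev->mdev = mdev;

	/*
	 * Initialize against mdev_parent_dev(mdev) (the thread's
	 * mdev->type->parent->dev), not &mdev->dev, so the vfio core
	 * finds the parent's physical iommu_group.
	 */
	vfio_init_group_dev(&mvdev->vdev, mdev_parent_dev(mdev), &my_ops);
	ret = vfio_register_group_dev(&mvdev->vdev);
	if (ret) {
		vfio_uninit_group_dev(&mvdev->vdev);
		kfree(mvdev);
		return ret;
	}
	dev_set_drvdata(&mdev->dev, mvdev);
	return 0;
}
```

As Kirti's follow-up below points out, even with this the mdev's own
sysfs node still has no iommu_group link, which userspace such as QEMU
expects.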
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Friday, September 17, 2021 6:19 AM
>
> On Fri, Sep 17, 2021 at 01:25:59AM +0530, Kirti Wankhede wrote:
> > Any thoughts on how VFIO_IOMMU, i.e. IOMMU backed mdev, can be
> > implemented?
> >
> > For IOMMU backed mdev, mdev->dev->iommu_group should be the same as
> > mdev->type->parent->dev->iommu_group, or in other words, the parent
> > device would be a DMA alias for the mdev device with the restriction
> > that a single mdev device can be created for the physical device. Is
> > it possible to link the iommu_groups of these two devices some way?
>
> You just use the new style mdev API and directly call
> vfio_register_group_dev() and it will pick up the
> parent->dev->iommu_group naturally like everything else using physical
> iommu groups.

For the above usage (wrap pdev into mdev), isn't the right way to
directly add a vendor vfio-pci driver since vfio-pci-core has been split
out now? It's not necessary to fake a mdev just for adding some
mediation in the r/w path...

Another type of IOMMU-backed mdev is one with PASID support. But for
that case we discussed earlier that it doesn't have a group and will
follow the new /dev/iommu proposal instead.

Thanks
Kevin
On Fri, Sep 17, 2021 at 04:49:41AM +0000, Tian, Kevin wrote:
> > You just use the new style mdev API and directly call
> > vfio_register_group_dev() and it will pick up the
> > parent->dev->iommu_group naturally like everything else using
> > physical iommu groups.
>
> For the above usage (wrap pdev into mdev), isn't the right way to
> directly add a vendor vfio-pci driver since vfio-pci-core has been
> split out now? It's not necessary to fake a mdev just for adding some
> mediation in the r/w path...

Exactly.

> Another type of IOMMU-backed mdev is one with PASID support. But for
> that case we discussed earlier that it doesn't have a group and will
> follow the new /dev/iommu proposal instead.

Indeed.
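[Editor's note: since vfio-pci-core is brand new at this point, a hedged
sketch may help show what "a vendor vfio-pci driver that only mediates
the r/w path" could look like. This assumes the exports added by the
5.15-era vfio-pci-core split (vfio_pci_core_register_device() and
friends); the my_* names and PCI IDs are hypothetical.]

```c
/*
 * Hedged sketch of Kevin's suggestion, not a real driver: a vfio-pci
 * variant driver that binds to the physical PCI device and wraps only
 * the write path, delegating everything else to vfio-pci-core.
 */
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/vfio_pci_core.h>

static int my_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static ssize_t my_write(struct vfio_device *core_vdev,
			const char __user *buf, size_t count, loff_t *ppos)
{
	/* Mediate/filter the access here, then hand off to the core. */
	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

static const struct vfio_device_ops my_vfio_pci_ops = {
	.name		= "my-vfio-pci",
	.open_device	= my_open_device,
	.close_device	= vfio_pci_core_close_device,
	.ioctl		= vfio_pci_core_ioctl,
	.read		= vfio_pci_core_read,
	.write		= my_write,		/* mediated r/w path */
	.mmap		= vfio_pci_core_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
};

static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_core_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	vfio_pci_core_init_device(vdev, pdev, &my_vfio_pci_ops);
	ret = vfio_pci_core_register_device(vdev);
	if (ret) {
		vfio_pci_core_uninit_device(vdev);
		kfree(vdev);
		return ret;
	}
	dev_set_drvdata(&pdev->dev, vdev);
	return 0;
}

static const struct pci_device_id my_ids[] = {
	{ PCI_DEVICE(0x1234, 0x5678) },	/* hypothetical vendor/device */
	{ }
};

static struct pci_driver my_vfio_pci_driver = {
	.name		= "my-vfio-pci",
	.id_table	= my_ids,
	.probe		= my_probe,
	/* .remove would unregister/uninit/kfree the core device */
};
module_pci_driver(my_vfio_pci_driver);
MODULE_LICENSE("GPL");
```

The design point of the split is that a variant driver keeps the full
vfio-pci uABI while interposing only where mediation is needed, instead
of faking an mdev around the physical device.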
On 9/17/2021 10:35 AM, Christoph Hellwig wrote:
> On Fri, Sep 17, 2021 at 04:49:41AM +0000, Tian, Kevin wrote:
> > > You just use the new style mdev API and directly call
> > > vfio_register_group_dev and it will pick up the
> > > parent->dev->iommu_group naturally like everything else using
> > > physical iommu groups.

Directly calling vfio_register_group_dev() doesn't work without linking
mdev->dev->iommu_group to mdev->type->parent->dev->iommu_group. When an
mdev device is created, mdev->dev->iommu_group is NULL. Then if
vfio_register_group_dev() is called with mdev->dev as the vfio device,
it fails because mdev->dev->iommu_group is NULL. So create the
vfio_device with the mdev parent's dev as below:

    if (IOMMU backed mdev)
        vfio_init_group_dev(&vfio_dev, &mdev->type->parent->dev, &fops);
    else
        vfio_init_group_dev(&vfio_dev, &mdev->dev, &fops);

    if (IOMMU backed mdev)
        vfio_register_group_dev(&vfio_dev);
    else
        vfio_register_emulated_iommu_dev(&vfio_dev);

But still mdev->dev->iommu_group is NULL and
/sys/bus/mdev/devices/<mdev_uuid>/iommu_group is not present.

For QEMU, the input parameter is the mdev device's UUID. QEMU checks the
/sys/bus/mdev/devices/<mdev_uuid>/iommu_group path and fails with the
error "no iommu_group found". There has to be a symlink
/mdev/devices/<mdev_uuid>/iommu_group to its parent device's
iommu_group.

iommu_group_add_device(parent_iommu_group, mdev->dev) fails because the
mdev device is not a PCI device or ACPI device. Can it be allowed to add
the mdev device to its parent's iommu group here?

> > For above usage (wrap pdev into mdev), isn't the right way to
> > directly add a vendor vfio-pci driver since vfio-pci-core has been
> > split out now? It's not necessary to fake a mdev just for adding
> > some mediation in the r/w path...
>
> Exactly.

vfio-pci doesn't provide a way to configure the device as the mdev
framework provides using mdev_types.

Thanks,
Kirti
On Fri, Sep 17, 2021 at 12:21:07PM +0530, Kirti Wankhede wrote:
> On 9/17/2021 10:35 AM, Christoph Hellwig wrote:
> > On Fri, Sep 17, 2021 at 04:49:41AM +0000, Tian, Kevin wrote:
> > > > You just use the new style mdev API and directly call
> > > > vfio_register_group_dev and it will pick up the
> > > > parent->dev->iommu_group naturally like everything else using
> > > > physical iommu groups.
>
> Directly calling vfio_register_group_dev() doesn't work without linking
> mdev->dev->iommu_group to mdev->type->parent->dev->iommu_group.

You pass the PCI device, not the mdev, to vfio_register_group_dev().

> > > For above usage (wrap pdev into mdev), isn't the right way to
> > > directly add a vendor vfio-pci driver since vfio-pci-core has been
> > > split out now? It's not necessary to fake a mdev just for adding
> > > some mediation in the r/w path...
> >
> > Exactly.
>
> vfio-pci doesn't provide a way to configure the device as the mdev
> framework provides using mdev_types.

The Linux standard is for a PCI PF driver to configure its VFs, not a
mdev or vfio.

Jason
On Fri, 17 Sep 2021 09:53:01 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Fri, Sep 17, 2021 at 12:21:07PM +0530, Kirti Wankhede wrote:
> > vfio-pci doesn't provide a way to configure the device as the mdev
> > framework provides using mdev_types.
>
> The Linux standard is for a PCI PF driver to configure its VFs, not a
> mdev or vfio.

Hi Jason,

I'm only aware that the PF driver enables basic SR-IOV configuration of
VFs, ie. the number of enabled VFs. The mdev interface enables not only
management of the number of child devices, but the flavor of each child,
for example the non-homogeneous slice of resources allocated per child
device. I think that's the sort of configuration Kirti is asking about
here, and I'm not aware of any standard mechanism for a PF driver to
apply a configuration per VF.

A mediated path to a physical device isn't exclusive to providing
features like migration; it can also be used to create these sorts of
device flavors. For example, we might expose NIC VFs where
administrative configuration should restrict VF1 to a 1Gbit interface
while VF2 gets 10Gbit. Are we left to driver specific sysfs attributes
to achieve this, or can we create some form of standardization like mdev
provides? Thanks,

Alex
On Thu, Sep 30, 2021 at 10:46:20AM -0600, Alex Williamson wrote:
> I'm only aware that the PF driver enables basic SR-IOV configuration of
> VFs, ie. the number of enabled VFs.

This is quite common in the netdev world, for instance you use the PF
driver to set the MAC addresses, QOS and other details on the VF
devices.

> only management of the number of child devices, but the flavor of each
> child, for example the non-homogeneous slice of resources allocated
> per child device.

Since the devices are PCI VFs they should be able to be used, with
configuration, any place a PCI VF is usable, e.g. vfio-pci, a kernel
driver, etc.

This is why the PF needs to provide the configuration to support all
the use cases.

> I'm not aware of any standard mechanism for a PF driver to apply a
> configuration per VF.

devlink is the standard way these days. It can model the PCI devices
and puts a control point in the PF driver. I'd defer to Jiri, Leon and
others to explain the details of this though :)

> create these sorts of device flavors. For example, we might expose NIC
> VFs where administrative configuration should restrict VF1 to a 1Gbit
> interface while VF2 gets 10Gbit.

This is all being done already today through the PF driver using either
netlink or devlink interfaces.

Jason
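[Editor's note: a hedged illustration of the "PF driver configures its
VFs" model Jason describes, not code from this thread. The long-standing
netlink path is the ndo_set_vf_* callbacks in net_device_ops, which is
what commands like `ip link set <pf> vf 0 mac ...` reach; devlink is the
newer control point he recommends. The my_* names and the per-VF state
layout are hypothetical.]

```c
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical per-VF state kept by the PF driver. */
struct my_vf_info {
	u8  mac[ETH_ALEN];
	int max_tx_rate;	/* Mbps, 0 = unlimited */
};

struct my_pf_priv {
	struct my_vf_info *vf;	/* array of num_vfs entries */
	int num_vfs;
};

/* Reached by "ip link set <pf> vf <n> mac <addr>". */
static int my_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
{
	struct my_pf_priv *priv = netdev_priv(netdev);

	if (vf >= priv->num_vfs || !is_valid_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(priv->vf[vf].mac, mac);
	/* ...program the device's MAC filter for this VF here... */
	return 0;
}

/* Reached by "ip link set <pf> vf <n> min_tx_rate/max_tx_rate ...". */
static int my_ndo_set_vf_rate(struct net_device *netdev, int vf,
			      int min_tx_rate, int max_tx_rate)
{
	struct my_pf_priv *priv = netdev_priv(netdev);

	if (vf >= priv->num_vfs)
		return -EINVAL;

	priv->vf[vf].max_tx_rate = max_tx_rate;
	/* ...apply the QOS/rate limit in hardware for this VF here... */
	return 0;
}

static const struct net_device_ops my_netdev_ops = {
	/* ...usual ndo_open/ndo_start_xmit/etc. for the PF itself... */
	.ndo_set_vf_mac		= my_ndo_set_vf_mac,
	.ndo_set_vf_rate	= my_ndo_set_vf_rate,
};
```

This is Alex's 1Gbit-vs-10Gbit example expressed through the standard
interface: the flavor lives in the PF driver, and the VF stays a plain
PCI VF usable by vfio-pci or any kernel driver.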
On Thu, Sep 30, 2021 at 01:57:07PM -0300, Jason Gunthorpe wrote:
> On Thu, Sep 30, 2021 at 10:46:20AM -0600, Alex Williamson wrote:
> > I'm only aware that the PF driver enables basic SR-IOV configuration
> > of VFs, ie. the number of enabled VFs.
>
> This is quite common in the netdev world, for instance you use the PF
> driver to set the MAC addresses, QOS and other details on the VF
> devices.

The NVMe spec also supports it using the Virtualization extensions,
although I'm not aware of any device that actually implements it so far.

> > only management of the number of child devices, but the flavor of
> > each child, for example the non-homogeneous slice of resources
> > allocated per child device.
>
> Since the devices are PCI VFs they should be able to be used, with
> configuration, any place a PCI VF is usable, e.g. vfio-pci, a kernel
> driver, etc.
>
> This is why the PF needs to provide the configuration to support all
> the use cases.

Exactly.
```diff
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6589e296ef348c..08b27b64f0f935 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -68,30 +68,6 @@ struct vfio_unbound_dev {
 	struct list_head	unbound_next;
 };
 
-enum vfio_group_type {
-	/*
-	 * Physical device with IOMMU backing.
-	 */
-	VFIO_IOMMU,
-
-	/*
-	 * Virtual device without IOMMU backing. The VFIO core fakes up an
-	 * iommu_group as the iommu_group sysfs interface is part of the
-	 * userspace ABI. The user of these devices must not be able to
-	 * directly trigger unmediated DMA.
-	 */
-	VFIO_EMULATED_IOMMU,
-
-	/*
-	 * Physical device without IOMMU backing. The VFIO core fakes up an
-	 * iommu_group as the iommu_group sysfs interface is part of the
-	 * userspace ABI. Users can trigger unmediated DMA by the device,
-	 * usage is highly dangerous, requires an explicit opt-in and will
-	 * taint the kernel.
-	 */
-	VFIO_NO_IOMMU,
-};
-
 struct vfio_group {
 	struct kref		kref;
 	int			minor;
@@ -219,7 +195,7 @@ static long vfio_noiommu_ioctl(void *iommu_data,
 }
 
 static int vfio_noiommu_attach_group(void *iommu_data,
-		struct iommu_group *iommu_group)
+		struct iommu_group *iommu_group, enum vfio_group_type type)
 {
 	return 0;
 }
@@ -1129,7 +1105,8 @@ static int __vfio_container_attach_groups(struct vfio_container *container,
 	int ret = -ENODEV;
 
 	list_for_each_entry(group, &container->group_list, container_next) {
-		ret = driver->ops->attach_group(data, group->iommu_group);
+		ret = driver->ops->attach_group(data, group->iommu_group,
+						group->type);
 		if (ret)
 			goto unwind;
 	}
@@ -1387,7 +1364,8 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	driver = container->iommu_driver;
 	if (driver) {
 		ret = driver->ops->attach_group(container->iommu_data,
-						group->iommu_group);
+						group->iommu_group,
+						group->type);
 		if (ret)
 			goto unlock_out;
 	}
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index a78de649eb2f16..a6713022115155 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -4,6 +4,30 @@
  * Author: Alex Williamson <alex.williamson@redhat.com>
  */
 
+enum vfio_group_type {
+	/*
+	 * Physical device with IOMMU backing.
+	 */
+	VFIO_IOMMU,
+
+	/*
+	 * Virtual device without IOMMU backing. The VFIO core fakes up an
+	 * iommu_group as the iommu_group sysfs interface is part of the
+	 * userspace ABI. The user of these devices must not be able to
+	 * directly trigger unmediated DMA.
+	 */
+	VFIO_EMULATED_IOMMU,
+
+	/*
+	 * Physical device without IOMMU backing. The VFIO core fakes up an
+	 * iommu_group as the iommu_group sysfs interface is part of the
+	 * userspace ABI. Users can trigger unmediated DMA by the device,
+	 * usage is highly dangerous, requires an explicit opt-in and will
+	 * taint the kernel.
+	 */
+	VFIO_NO_IOMMU,
+};
+
 /* events for the backend driver notify callback */
 enum vfio_iommu_notify_type {
 	VFIO_IOMMU_CONTAINER_CLOSE = 0,
@@ -20,7 +44,8 @@ struct vfio_iommu_driver_ops {
 	long		(*ioctl)(void *iommu_data, unsigned int cmd,
 				 unsigned long arg);
 	int		(*attach_group)(void *iommu_data,
-					struct iommu_group *group);
+					struct iommu_group *group,
+					enum vfio_group_type);
 	void		(*detach_group)(void *iommu_data,
 					struct iommu_group *group);
 	int		(*pin_pages)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 3efd09faeca4a8..936a26b13c0b01 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -1239,7 +1239,7 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container,
 }
 
 static int tce_iommu_attach_group(void *iommu_data,
-		struct iommu_group *iommu_group)
+		struct iommu_group *iommu_group, enum vfio_group_type type)
 {
 	int ret = 0;
 	struct tce_container *container = iommu_data;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 42a6be1fb7265e..a48e9f597cb213 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,7 +36,6 @@
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
-#include <linux/mdev.h>
 #include <linux/notifier.h>
 #include <linux/dma-iommu.h>
 #include <linux/irqdomain.h>
@@ -1934,20 +1933,6 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
 	return ret;
 }
 
-static bool vfio_bus_is_mdev(struct bus_type *bus)
-{
-	struct bus_type *mdev_bus;
-	bool ret = false;
-
-	mdev_bus = symbol_get(mdev_bus_type);
-	if (mdev_bus) {
-		ret = (bus == mdev_bus);
-		symbol_put(mdev_bus_type);
-	}
-
-	return ret;
-}
-
 /*
  * This is a helper function to insert an address range to iova list.
  * The list is initially created with a single entry corresponding to
@@ -2172,7 +2157,7 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
 }
 
 static int vfio_iommu_type1_attach_group(void *iommu_data,
-		struct iommu_group *iommu_group)
+		struct iommu_group *iommu_group, enum vfio_group_type type)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_iommu_group *group;
@@ -2207,7 +2192,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
-	if (vfio_bus_is_mdev(bus)) {
+	if (type == VFIO_EMULATED_IOMMU) {
 		if (!iommu->external_domain) {
 			INIT_LIST_HEAD(&domain->group_list);
 			iommu->external_domain = domain;
```