diff mbox series

[rfcv1,4/6] vfio: initialize IOMMUFDDevice and pass to vIOMMU

Message ID 20240115101313.131139-5-zhenzhong.duan@intel.com (mailing list archive)
State New, archived
Headers show
Series Check and sync host IOMMU cap/ecap with vIOMMU | expand

Commit Message

Duan, Zhenzhong Jan. 15, 2024, 10:13 a.m. UTC
Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
could get hw IOMMU information.

In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to vIOMMU,
in case vIOMMU needs some processing for VFIO legacy backend mode.

Originally-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 include/hw/vfio/vfio-common.h |  2 ++
 hw/vfio/iommufd.c             |  2 ++
 hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
 3 files changed, 23 insertions(+), 5 deletions(-)

Comments

Joao Martins Jan. 17, 2024, 3:37 p.m. UTC | #1
On 15/01/2024 10:13, Zhenzhong Duan wrote:
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 9bfddc1360..cbd035f148 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
>      VFIOContainerBase *bcontainer;
>      VFIOIOMMUFDContainer *container;
>      VFIOAddressSpace *space;
> +    IOMMUFDDevice *idev = &vbasedev->idev;
>      struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>      int ret, devfd;
>      uint32_t ioas_id;
> @@ -428,6 +429,7 @@ found_container:
>      QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
>      QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>  
> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid);
>      trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
>                                     vbasedev->num_regions, vbasedev->flags);
>      return 0;

In the dirty tracking series, I'll need to fetch out_capabilities from device
and do a bunch of stuff that is used when allocating hwpt to ask for dirty
tracking. And this means having iommufd_device_init() be called before we call
iommufd_cdev_attach_container().

Here's what it looks based on an earlier version of your patch:

https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563aa20e4f22d107219b5

I can move the call earlier in my series, unless there's something specifically
when you call it here?

	Joao
Eric Auger Jan. 17, 2024, 5:30 p.m. UTC | #2
Hi Zhenzhong,

On 1/15/24 11:13, Zhenzhong Duan wrote:
> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
> could get hw IOMMU information.
>
> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to vIOMMU,
> in case vIOMMU needs some processing for VFIO legacy backend mode.
>
> Originally-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>  include/hw/vfio/vfio-common.h |  2 ++
>  hw/vfio/iommufd.c             |  2 ++
>  hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
>  3 files changed, 23 insertions(+), 5 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 9b7ef7d02b..fde0d0ca60 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -31,6 +31,7 @@
>  #endif
>  #include "sysemu/sysemu.h"
>  #include "hw/vfio/vfio-container-base.h"
> +#include "sysemu/iommufd_device.h"
>  
>  #define VFIO_MSG_PREFIX "vfio %s: "
>  
> @@ -126,6 +127,7 @@ typedef struct VFIODevice {
>      bool dirty_tracking;
>      int devid;
>      IOMMUFDBackend *iommufd;
> +    IOMMUFDDevice idev;
This looks duplicate of existing fields:
idev.dev_id is same as above devid. by the way let's try to use the same
devid everywhere.
idev.iommufd is same as above iommufd if != NULL.
So we should at least rationalize.
>  } VFIODevice;
>  
>  struct VFIODeviceOps {
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 9bfddc1360..cbd035f148 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
>      VFIOContainerBase *bcontainer;
>      VFIOIOMMUFDContainer *container;
>      VFIOAddressSpace *space;
> +    IOMMUFDDevice *idev = &vbasedev->idev;
>      struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>      int ret, devfd;
>      uint32_t ioas_id;
> @@ -428,6 +429,7 @@ found_container:
>      QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
>      QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>  
> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid);
>      trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
>                                     vbasedev->num_regions, vbasedev->flags);
>      return 0;
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index d7fe06715c..2c3a5d267b 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>  
>      vfio_bars_register(vdev);
>  
> -    ret = vfio_add_capabilities(vdev, errp);
> +    if (vbasedev->iommufd) {
> +        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
> +    } else {
> +        ret = pci_device_set_iommu_device(pdev, 0, errp);
> +    }
>      if (ret) {
> +        error_prepend(errp, "Failed to set iommu_device: ");
at the moment it is rather an IOMMUFD device.
>          goto out_teardown;
>      }
>  
> +    ret = vfio_add_capabilities(vdev, errp);
> +    if (ret) {
> +        goto out_unset_idev;
> +    }
> +
>      if (vdev->vga) {
>          vfio_vga_quirk_setup(vdev);
>      }
> @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>              error_setg(errp,
>                         "cannot support IGD OpRegion feature on hotplugged "
>                         "device");
> -            goto out_teardown;
> +            goto out_unset_idev;
>          }
>  
>          ret = vfio_get_dev_region_info(vbasedev,
> @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>          if (ret) {
>              error_setg_errno(errp, -ret,
>                               "does not support requested IGD OpRegion feature");
> -            goto out_teardown;
> +            goto out_unset_idev;
>          }
>  
>          ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
>          g_free(opregion);
>          if (ret) {
> -            goto out_teardown;
> +            goto out_unset_idev;
>          }
>      }
>  
> @@ -3229,6 +3239,8 @@ out_deregister:
>      if (vdev->intx.mmap_timer) {
>          timer_free(vdev->intx.mmap_timer);
>      }
> +out_unset_idev:
> +    pci_device_unset_iommu_device(pdev);
>  out_teardown:
>      vfio_teardown_msi(vdev);
>      vfio_bars_exit(vdev);
> @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
>  static void vfio_exitfn(PCIDevice *pdev)
>  {
>      VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>  
>      vfio_unregister_req_notifier(vdev);
>      vfio_unregister_err_notifier(vdev);
> @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
>      vfio_teardown_msi(vdev);
>      vfio_pci_disable_rp_atomics(vdev);
>      vfio_bars_exit(vdev);
> -    vfio_migration_exit(&vdev->vbasedev);
> +    vfio_migration_exit(vbasedev);
> +    pci_device_unset_iommu_device(pdev);
>  }
>  
>  static void vfio_pci_reset(DeviceState *dev)
Thanks

Eric
Duan, Zhenzhong Jan. 18, 2024, 8:17 a.m. UTC | #3
>-----Original Message-----
>From: Joao Martins <joao.m.martins@oracle.com>
>Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>vIOMMU
>
>On 15/01/2024 10:13, Zhenzhong Duan wrote:
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 9bfddc1360..cbd035f148 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name,
>VFIODevice *vbasedev,
>>      VFIOContainerBase *bcontainer;
>>      VFIOIOMMUFDContainer *container;
>>      VFIOAddressSpace *space;
>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>      struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>      int ret, devfd;
>>      uint32_t ioas_id;
>> @@ -428,6 +429,7 @@ found_container:
>>      QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>container_next);
>>      QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>
>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>devid);
>>      trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev-
>>num_irqs,
>>                                     vbasedev->num_regions, vbasedev->flags);
>>      return 0;
>
>In the dirty tracking series, I'll need to fetch out_capabilities from device
>and do a bunch of stuff that is used when allocating hwpt to ask for dirty
>tracking. And this means having iommufd_device_init() be called before we
>call
>iommufd_cdev_attach_container().
>
>Here's what it looks based on an earlier version of your patch:
>
>https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563a
>a20e4f22d107219b5
>
>I can move the call earlier in my series, unless there's something specifically
>when you call it here?

I think it's safe to move it earlier, just remember to do the same for existing
container.

Thanks
Zhenzhong
Duan, Zhenzhong Jan. 18, 2024, 9:23 a.m. UTC | #4
>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>vIOMMU
>
>Hi Zhenzhong,
>
>On 1/15/24 11:13, Zhenzhong Duan wrote:
>> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
>> could get hw IOMMU information.
>>
>> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to
>vIOMMU,
>> in case vIOMMU needs some processing for VFIO legacy backend mode.
>>
>> Originally-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>  include/hw/vfio/vfio-common.h |  2 ++
>>  hw/vfio/iommufd.c             |  2 ++
>>  hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
>>  3 files changed, 23 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>> index 9b7ef7d02b..fde0d0ca60 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -31,6 +31,7 @@
>>  #endif
>>  #include "sysemu/sysemu.h"
>>  #include "hw/vfio/vfio-container-base.h"
>> +#include "sysemu/iommufd_device.h"
>>
>>  #define VFIO_MSG_PREFIX "vfio %s: "
>>
>> @@ -126,6 +127,7 @@ typedef struct VFIODevice {
>>      bool dirty_tracking;
>>      int devid;
>>      IOMMUFDBackend *iommufd;
>> +    IOMMUFDDevice idev;
>This looks duplicate of existing fields:
>idev.dev_id is same as above devid. by the way let's try to use the same
>devid everywhere.
>idev.iommufd is same as above iommufd if != NULL.
>So we should at least rationalize.

Indeed, I'll remove devid and *iommufd. Thanks for suggestion.

>>  } VFIODevice;
>>
>>  struct VFIODeviceOps {
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 9bfddc1360..cbd035f148 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name,
>VFIODevice *vbasedev,
>>      VFIOContainerBase *bcontainer;
>>      VFIOIOMMUFDContainer *container;
>>      VFIOAddressSpace *space;
>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>      struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>      int ret, devfd;
>>      uint32_t ioas_id;
>> @@ -428,6 +429,7 @@ found_container:
>>      QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>container_next);
>>      QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>
>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>devid);
>>      trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev-
>>num_irqs,
>>                                     vbasedev->num_regions, vbasedev->flags);
>>      return 0;
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index d7fe06715c..2c3a5d267b 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev,
>Error **errp)
>>
>>      vfio_bars_register(vdev);
>>
>> -    ret = vfio_add_capabilities(vdev, errp);
>> +    if (vbasedev->iommufd) {
>> +        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
>> +    } else {
>> +        ret = pci_device_set_iommu_device(pdev, 0, errp);
>> +    }
>>      if (ret) {
>> +        error_prepend(errp, "Failed to set iommu_device: ");
>at the moment it is rather an IOMMUFD device.

Will use "Failed to set IOMMUFD device: "

Thanks
Zhenzhong

>>          goto out_teardown;
>>      }
>>
>> +    ret = vfio_add_capabilities(vdev, errp);
>> +    if (ret) {
>> +        goto out_unset_idev;
>> +    }
>> +
>>      if (vdev->vga) {
>>          vfio_vga_quirk_setup(vdev);
>>      }
>> @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error
>**errp)
>>              error_setg(errp,
>>                         "cannot support IGD OpRegion feature on hotplugged "
>>                         "device");
>> -            goto out_teardown;
>> +            goto out_unset_idev;
>>          }
>>
>>          ret = vfio_get_dev_region_info(vbasedev,
>> @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev,
>Error **errp)
>>          if (ret) {
>>              error_setg_errno(errp, -ret,
>>                               "does not support requested IGD OpRegion feature");
>> -            goto out_teardown;
>> +            goto out_unset_idev;
>>          }
>>
>>          ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
>>          g_free(opregion);
>>          if (ret) {
>> -            goto out_teardown;
>> +            goto out_unset_idev;
>>          }
>>      }
>>
>> @@ -3229,6 +3239,8 @@ out_deregister:
>>      if (vdev->intx.mmap_timer) {
>>          timer_free(vdev->intx.mmap_timer);
>>      }
>> +out_unset_idev:
>> +    pci_device_unset_iommu_device(pdev);
>>  out_teardown:
>>      vfio_teardown_msi(vdev);
>>      vfio_bars_exit(vdev);
>> @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
>>  static void vfio_exitfn(PCIDevice *pdev)
>>  {
>>      VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>
>>      vfio_unregister_req_notifier(vdev);
>>      vfio_unregister_err_notifier(vdev);
>> @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
>>      vfio_teardown_msi(vdev);
>>      vfio_pci_disable_rp_atomics(vdev);
>>      vfio_bars_exit(vdev);
>> -    vfio_migration_exit(&vdev->vbasedev);
>> +    vfio_migration_exit(vbasedev);
>> +    pci_device_unset_iommu_device(pdev);
>>  }
>>
>>  static void vfio_pci_reset(DeviceState *dev)
>Thanks
>
>Eric
Yi Liu Jan. 18, 2024, 10:17 a.m. UTC | #5
On 2024/1/18 16:17, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Joao Martins <joao.m.martins@oracle.com>
>> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>> vIOMMU
>>
>> On 15/01/2024 10:13, Zhenzhong Duan wrote:
>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>>> index 9bfddc1360..cbd035f148 100644
>>> --- a/hw/vfio/iommufd.c
>>> +++ b/hw/vfio/iommufd.c
>>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name,
>> VFIODevice *vbasedev,
>>>       VFIOContainerBase *bcontainer;
>>>       VFIOIOMMUFDContainer *container;
>>>       VFIOAddressSpace *space;
>>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>>       struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>>       int ret, devfd;
>>>       uint32_t ioas_id;
>>> @@ -428,6 +429,7 @@ found_container:
>>>       QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>> container_next);
>>>       QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>>
>>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>> devid);
>>>       trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev-
>>> num_irqs,
>>>                                      vbasedev->num_regions, vbasedev->flags);
>>>       return 0;
>>
>> In the dirty tracking series, I'll need to fetch out_capabilities from device
>> and do a bunch of stuff that is used when allocating hwpt to ask for dirty
>> tracking. And this means having iommufd_device_init() be called before we
>> call
>> iommufd_cdev_attach_container().
>>
>> Here's what it looks based on an earlier version of your patch:
>>
>> https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563a
>> a20e4f22d107219b5
>>
>> I can move the call earlier in my series, unless there's something specifically
>> when you call it here?
> 
> I think it's safe to move it earlier, just remember to do the same for existing
> container.

yes, as long as the input of iommufd_device_init() are available in the new 
place. And remember to destroy it if the code failed after initializing
iommufd_device.
Joao Martins Jan. 18, 2024, 10:20 a.m. UTC | #6
On 18/01/2024 10:17, Yi Liu wrote:
> On 2024/1/18 16:17, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Joao Martins <joao.m.martins@oracle.com>
>>> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>>> vIOMMU
>>>
>>> On 15/01/2024 10:13, Zhenzhong Duan wrote:
>>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>>>> index 9bfddc1360..cbd035f148 100644
>>>> --- a/hw/vfio/iommufd.c
>>>> +++ b/hw/vfio/iommufd.c
>>>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name,
>>> VFIODevice *vbasedev,
>>>>       VFIOContainerBase *bcontainer;
>>>>       VFIOIOMMUFDContainer *container;
>>>>       VFIOAddressSpace *space;
>>>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>>>       struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>>>       int ret, devfd;
>>>>       uint32_t ioas_id;
>>>> @@ -428,6 +429,7 @@ found_container:
>>>>       QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>>> container_next);
>>>>       QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>>>
>>>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>>> devid);
>>>>       trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev-
>>>> num_irqs,
>>>>                                      vbasedev->num_regions, vbasedev->flags);
>>>>       return 0;
>>>
>>> In the dirty tracking series, I'll need to fetch out_capabilities from device
>>> and do a bunch of stuff that is used when allocating hwpt to ask for dirty
>>> tracking. And this means having iommufd_device_init() be called before we
>>> call
>>> iommufd_cdev_attach_container().
>>>
>>> Here's what it looks based on an earlier version of your patch:
>>>
>>> https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563a
>>> a20e4f22d107219b5
>>>
>>> I can move the call earlier in my series, unless there's something specifically
>>> when you call it here?
>>
>> I think it's safe to move it earlier, just remember to do the same for existing
>> container.
> 
> yes, as long as the input of iommufd_device_init() are available in the new
> place. And remember to destroy it if the code failed after initializing
> iommufd_device.
> 
In the way I am using I don't think there's any teardown as no new resources or
things are done. We essentially just initialize @idev and fetch some
capabilities thus nothing needs teardown. But I am not sure is the nesting
needs; perhaps destruction of resources is related to something to that?
Cédric Le Goater Jan. 22, 2024, 5:15 p.m. UTC | #7
On 1/15/24 11:13, Zhenzhong Duan wrote:
> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
> could get hw IOMMU information.
> 
> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to vIOMMU,
> in case vIOMMU needs some processing for VFIO legacy backend mode.
> 
> Originally-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>   include/hw/vfio/vfio-common.h |  2 ++
>   hw/vfio/iommufd.c             |  2 ++
>   hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
>   3 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 9b7ef7d02b..fde0d0ca60 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -31,6 +31,7 @@
>   #endif
>   #include "sysemu/sysemu.h"
>   #include "hw/vfio/vfio-container-base.h"
> +#include "sysemu/iommufd_device.h"
>   
>   #define VFIO_MSG_PREFIX "vfio %s: "
>   
> @@ -126,6 +127,7 @@ typedef struct VFIODevice {
>       bool dirty_tracking;
>       int devid;
>       IOMMUFDBackend *iommufd;
> +    IOMMUFDDevice idev;
>   } VFIODevice;
>   
>   struct VFIODeviceOps {
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 9bfddc1360..cbd035f148 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
>       VFIOContainerBase *bcontainer;
>       VFIOIOMMUFDContainer *container;
>       VFIOAddressSpace *space;
> +    IOMMUFDDevice *idev = &vbasedev->idev;
>       struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>       int ret, devfd;
>       uint32_t ioas_id;
> @@ -428,6 +429,7 @@ found_container:
>       QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
>       QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>   
> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid);
>       trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
>                                      vbasedev->num_regions, vbasedev->flags);
>       return 0;
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index d7fe06715c..2c3a5d267b 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>   
>       vfio_bars_register(vdev);
>   
> -    ret = vfio_add_capabilities(vdev, errp);
> +    if (vbasedev->iommufd) {
> +        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
> +    } else {
> +        ret = pci_device_set_iommu_device(pdev, 0, errp);


AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice will do
nothing. Why call it ?


Thanks,

C.



> +    }
>       if (ret) {
> +        error_prepend(errp, "Failed to set iommu_device: ");
>           goto out_teardown;
>       }
>   
> +    ret = vfio_add_capabilities(vdev, errp);
> +    if (ret) {
> +        goto out_unset_idev;
> +    }
> +
>       if (vdev->vga) {
>           vfio_vga_quirk_setup(vdev);
>       }
> @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>               error_setg(errp,
>                          "cannot support IGD OpRegion feature on hotplugged "
>                          "device");
> -            goto out_teardown;
> +            goto out_unset_idev;
>           }
>   
>           ret = vfio_get_dev_region_info(vbasedev,
> @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>           if (ret) {
>               error_setg_errno(errp, -ret,
>                                "does not support requested IGD OpRegion feature");
> -            goto out_teardown;
> +            goto out_unset_idev;
>           }
>   
>           ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
>           g_free(opregion);
>           if (ret) {
> -            goto out_teardown;
> +            goto out_unset_idev;
>           }
>       }
>   
> @@ -3229,6 +3239,8 @@ out_deregister:
>       if (vdev->intx.mmap_timer) {
>           timer_free(vdev->intx.mmap_timer);
>       }
> +out_unset_idev:
> +    pci_device_unset_iommu_device(pdev);
>   out_teardown:
>       vfio_teardown_msi(vdev);
>       vfio_bars_exit(vdev);
> @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
>   static void vfio_exitfn(PCIDevice *pdev)
>   {
>       VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>   
>       vfio_unregister_req_notifier(vdev);
>       vfio_unregister_err_notifier(vdev);
> @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
>       vfio_teardown_msi(vdev);
>       vfio_pci_disable_rp_atomics(vdev);
>       vfio_bars_exit(vdev);
> -    vfio_migration_exit(&vdev->vbasedev);
> +    vfio_migration_exit(vbasedev);
> +    pci_device_unset_iommu_device(pdev);
>   }
>   
>   static void vfio_pci_reset(DeviceState *dev)
Duan, Zhenzhong Jan. 23, 2024, 9:46 a.m. UTC | #8
>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>vIOMMU
>
>On 1/15/24 11:13, Zhenzhong Duan wrote:
>> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
>> could get hw IOMMU information.
>>
>> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to
>vIOMMU,
>> in case vIOMMU needs some processing for VFIO legacy backend mode.
>>
>> Originally-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>   include/hw/vfio/vfio-common.h |  2 ++
>>   hw/vfio/iommufd.c             |  2 ++
>>   hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
>>   3 files changed, 23 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>> index 9b7ef7d02b..fde0d0ca60 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -31,6 +31,7 @@
>>   #endif
>>   #include "sysemu/sysemu.h"
>>   #include "hw/vfio/vfio-container-base.h"
>> +#include "sysemu/iommufd_device.h"
>>
>>   #define VFIO_MSG_PREFIX "vfio %s: "
>>
>> @@ -126,6 +127,7 @@ typedef struct VFIODevice {
>>       bool dirty_tracking;
>>       int devid;
>>       IOMMUFDBackend *iommufd;
>> +    IOMMUFDDevice idev;
>>   } VFIODevice;
>>
>>   struct VFIODeviceOps {
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 9bfddc1360..cbd035f148 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name,
>VFIODevice *vbasedev,
>>       VFIOContainerBase *bcontainer;
>>       VFIOIOMMUFDContainer *container;
>>       VFIOAddressSpace *space;
>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>       struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>       int ret, devfd;
>>       uint32_t ioas_id;
>> @@ -428,6 +429,7 @@ found_container:
>>       QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>container_next);
>>       QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>
>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>devid);
>>       trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev-
>>num_irqs,
>>                                      vbasedev->num_regions, vbasedev->flags);
>>       return 0;
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index d7fe06715c..2c3a5d267b 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev,
>Error **errp)
>>
>>       vfio_bars_register(vdev);
>>
>> -    ret = vfio_add_capabilities(vdev, errp);
>> +    if (vbasedev->iommufd) {
>> +        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
>> +    } else {
>> +        ret = pci_device_set_iommu_device(pdev, 0, errp);
>
>
>AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice will
>do
>nothing. Why call it ?

We will do something in nesting series, see https://github.com/yiliu1765/qemu/commit/7f0bb59575bb5cf38618ae950f68a8661676e881

Another choice is to call pci_device_set_iommu_device() no matter which backend
is used and check idev->iommufd in vtd_dev_set_iommu_device(). Is this better
for you?

Thanks
Zhenzhong
Cédric Le Goater Jan. 23, 2024, 12:54 p.m. UTC | #9
On 1/23/24 10:46, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Cédric Le Goater <clg@redhat.com>
>> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>> vIOMMU
>>
>> On 1/15/24 11:13, Zhenzhong Duan wrote:
>>> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
>>> could get hw IOMMU information.
>>>
>>> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to
>> vIOMMU,
>>> in case vIOMMU needs some processing for VFIO legacy backend mode.
>>>
>>> Originally-by: Yi Liu <yi.l.liu@intel.com>
>>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>> ---
>>>    include/hw/vfio/vfio-common.h |  2 ++
>>>    hw/vfio/iommufd.c             |  2 ++
>>>    hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
>>>    3 files changed, 23 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>> common.h
>>> index 9b7ef7d02b..fde0d0ca60 100644
>>> --- a/include/hw/vfio/vfio-common.h
>>> +++ b/include/hw/vfio/vfio-common.h
>>> @@ -31,6 +31,7 @@
>>>    #endif
>>>    #include "sysemu/sysemu.h"
>>>    #include "hw/vfio/vfio-container-base.h"
>>> +#include "sysemu/iommufd_device.h"
>>>
>>>    #define VFIO_MSG_PREFIX "vfio %s: "
>>>
>>> @@ -126,6 +127,7 @@ typedef struct VFIODevice {
>>>        bool dirty_tracking;
>>>        int devid;
>>>        IOMMUFDBackend *iommufd;
>>> +    IOMMUFDDevice idev;
>>>    } VFIODevice;
>>>
>>>    struct VFIODeviceOps {
>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>>> index 9bfddc1360..cbd035f148 100644
>>> --- a/hw/vfio/iommufd.c
>>> +++ b/hw/vfio/iommufd.c
>>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name,
>> VFIODevice *vbasedev,
>>>        VFIOContainerBase *bcontainer;
>>>        VFIOIOMMUFDContainer *container;
>>>        VFIOAddressSpace *space;
>>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>>        struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>>        int ret, devfd;
>>>        uint32_t ioas_id;
>>> @@ -428,6 +429,7 @@ found_container:
>>>        QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>> container_next);
>>>        QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>>
>>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>> devid);
>>>        trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev-
>>> num_irqs,
>>>                                       vbasedev->num_regions, vbasedev->flags);
>>>        return 0;
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index d7fe06715c..2c3a5d267b 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev,
>> Error **errp)
>>>
>>>        vfio_bars_register(vdev);
>>>
>>> -    ret = vfio_add_capabilities(vdev, errp);
>>> +    if (vbasedev->iommufd) {
>>> +        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
>>> +    } else {
>>> +        ret = pci_device_set_iommu_device(pdev, 0, errp);
>>
>>
>> AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice will
>> do
>> nothing. Why call it ?
> 
> We will do something in nesting series, see https://github.com/yiliu1765/qemu/commit/7f0bb59575bb5cf38618ae950f68a8661676e881

ok, that's not much. idev is used as a capability bool and later on
to pass the /dev/iommu fd.  We don't need to support the legacy mode ?

> Another choice is to call pci_device_set_iommu_device() no matter which backend
> is used and check idev->iommufd in vtd_dev_set_iommu_device(). Is this better
> for you?

yes. Should be fine. There is more to it though.

IIUC, what will determine most of the requirements, is the legacy
mode. We also need the host iommu info in that case. As said Eric,
ideally, we should introduce a common abstract "host-iommu-info" struct
and sub structs associated with the iommu backends (iommufd + legacy)
which would be allocated accordingly.

So, IOMMUFDDevice usage should be limited to the iommufd files. All PCI
files should use the common abstract type. We should define these data
structures first. They could be simple C struct for now. We will see if
QOM applies after.

Will take a look at Eric's patchset next.

Thanks,

C.
Duan, Zhenzhong Jan. 24, 2024, 9:26 a.m. UTC | #10
>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to
>vIOMMU
>
>On 1/23/24 10:46, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Cédric Le Goater <clg@redhat.com>
>>> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass
>to
>>> vIOMMU
>>>
>>> On 1/15/24 11:13, Zhenzhong Duan wrote:
>>>> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU
>>>> could get hw IOMMU information.
>>>>
>>>> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to
>>> vIOMMU,
>>>> in case vIOMMU needs some processing for VFIO legacy backend mode.
>>>>
>>>> Originally-by: Yi Liu <yi.l.liu@intel.com>
>>>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>>> ---
>>>>    include/hw/vfio/vfio-common.h |  2 ++
>>>>    hw/vfio/iommufd.c             |  2 ++
>>>>    hw/vfio/pci.c                 | 24 +++++++++++++++++++-----
>>>>    3 files changed, 23 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>>> common.h
>>>> index 9b7ef7d02b..fde0d0ca60 100644
>>>> --- a/include/hw/vfio/vfio-common.h
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -31,6 +31,7 @@
>>>>    #endif
>>>>    #include "sysemu/sysemu.h"
>>>>    #include "hw/vfio/vfio-container-base.h"
>>>> +#include "sysemu/iommufd_device.h"
>>>>
>>>>    #define VFIO_MSG_PREFIX "vfio %s: "
>>>>
>>>> @@ -126,6 +127,7 @@ typedef struct VFIODevice {
>>>>        bool dirty_tracking;
>>>>        int devid;
>>>>        IOMMUFDBackend *iommufd;
>>>> +    IOMMUFDDevice idev;
>>>>    } VFIODevice;
>>>>
>>>>    struct VFIODeviceOps {
>>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>>>> index 9bfddc1360..cbd035f148 100644
>>>> --- a/hw/vfio/iommufd.c
>>>> +++ b/hw/vfio/iommufd.c
>>>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char
>*name,
>>> VFIODevice *vbasedev,
>>>>        VFIOContainerBase *bcontainer;
>>>>        VFIOIOMMUFDContainer *container;
>>>>        VFIOAddressSpace *space;
>>>> +    IOMMUFDDevice *idev = &vbasedev->idev;
>>>>        struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>>>        int ret, devfd;
>>>>        uint32_t ioas_id;
>>>> @@ -428,6 +429,7 @@ found_container:
>>>>        QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev,
>>> container_next);
>>>>        QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
>>>>
>>>> +    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev-
>>>> devid);
>>>>        trace_iommufd_cdev_device_info(vbasedev->name, devfd,
>vbasedev-
>>>> num_irqs,
>>>>                                       vbasedev->num_regions, vbasedev->flags);
>>>>        return 0;
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index d7fe06715c..2c3a5d267b 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev,
>>> Error **errp)
>>>>
>>>>        vfio_bars_register(vdev);
>>>>
>>>> -    ret = vfio_add_capabilities(vdev, errp);
>>>> +    if (vbasedev->iommufd) {
>>>> +        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
>>>> +    } else {
>>>> +        ret = pci_device_set_iommu_device(pdev, 0, errp);
>>>
>>>
>>> AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice
>will
>>> do
>>> nothing. Why call it ?
>>
>> We will do something in nesting series, see
>https://github.com/yiliu1765/qemu/commit/7f0bb59575bb5cf38618ae950
>f68a8661676e881
>
>ok, that's not much. idev is used as a capability bool and later on
>to pass the /dev/iommu fd.  We don't need to support the legacy mode ?

It's better to have for legacy mode. Especially when we support address
width 57 to QEMU Intel_iommu in future.

>
>> Another choice is to call pci_device_set_iommu_device() no matter which
>backend
>> is used and check idev->iommufd in vtd_dev_set_iommu_device(). Is this
>better
>> for you?
>
>yes. Should be fine. There is more to it though.
>
>IIUC, what will determine most of the requirements, is the legacy
>mode. We also need the host iommu info in that case. As said Eric,
>ideally, we should introduce a common abstract "host-iommu-info" struct
>and sub structs associated with the iommu backends (iommufd + legacy)
>which would be allocated accordingly.

I see, I'll make a rfcv2 as you and Eric suggested and discuss further
with Eric what elements he needs in legacy sub structs.

>
>So, IOMMUFDDevice usage should be limited to the iommufd files. All PCI
>files should use the common abstract type. We should define these data
>structures first. They could be simple C struct for now. We will see if
>QOM applies after.

Got it.

Thanks
Zhenzhong
diff mbox series

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9b7ef7d02b..fde0d0ca60 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@ 
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/iommufd_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -126,6 +127,7 @@  typedef struct VFIODevice {
     bool dirty_tracking;
     int devid;
     IOMMUFDBackend *iommufd;
+    IOMMUFDDevice idev;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 9bfddc1360..cbd035f148 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -309,6 +309,7 @@  static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
     VFIOContainerBase *bcontainer;
     VFIOIOMMUFDContainer *container;
     VFIOAddressSpace *space;
+    IOMMUFDDevice *idev = &vbasedev->idev;
     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
     int ret, devfd;
     uint32_t ioas_id;
@@ -428,6 +429,7 @@  found_container:
     QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
     QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
 
+    iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid);
     trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
                                    vbasedev->num_regions, vbasedev->flags);
     return 0;
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d7fe06715c..2c3a5d267b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3107,11 +3107,21 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
 
     vfio_bars_register(vdev);
 
-    ret = vfio_add_capabilities(vdev, errp);
+    if (vbasedev->iommufd) {
+        ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp);
+    } else {
+        ret = pci_device_set_iommu_device(pdev, 0, errp);
+    }
     if (ret) {
+        error_prepend(errp, "Failed to set iommu_device: ");
         goto out_teardown;
     }
 
+    ret = vfio_add_capabilities(vdev, errp);
+    if (ret) {
+        goto out_unset_idev;
+    }
+
     if (vdev->vga) {
         vfio_vga_quirk_setup(vdev);
     }
@@ -3128,7 +3138,7 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
             error_setg(errp,
                        "cannot support IGD OpRegion feature on hotplugged "
                        "device");
-            goto out_teardown;
+            goto out_unset_idev;
         }
 
         ret = vfio_get_dev_region_info(vbasedev,
@@ -3137,13 +3147,13 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
         if (ret) {
             error_setg_errno(errp, -ret,
                              "does not support requested IGD OpRegion feature");
-            goto out_teardown;
+            goto out_unset_idev;
         }
 
         ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
         g_free(opregion);
         if (ret) {
-            goto out_teardown;
+            goto out_unset_idev;
         }
     }
 
@@ -3229,6 +3239,8 @@  out_deregister:
     if (vdev->intx.mmap_timer) {
         timer_free(vdev->intx.mmap_timer);
     }
+out_unset_idev:
+    pci_device_unset_iommu_device(pdev);
 out_teardown:
     vfio_teardown_msi(vdev);
     vfio_bars_exit(vdev);
@@ -3257,6 +3269,7 @@  static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
 
     vfio_unregister_req_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
@@ -3271,7 +3284,8 @@  static void vfio_exitfn(PCIDevice *pdev)
     vfio_teardown_msi(vdev);
     vfio_pci_disable_rp_atomics(vdev);
     vfio_bars_exit(vdev);
-    vfio_migration_exit(&vdev->vbasedev);
+    vfio_migration_exit(vbasedev);
+    pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)