Message ID | 20240115101313.131139-5-zhenzhong.duan@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Check and sync host IOMMU cap/ecap with vIOMMU | expand |
On 15/01/2024 10:13, Zhenzhong Duan wrote: > diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c > index 9bfddc1360..cbd035f148 100644 > --- a/hw/vfio/iommufd.c > +++ b/hw/vfio/iommufd.c > @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, > VFIOContainerBase *bcontainer; > VFIOIOMMUFDContainer *container; > VFIOAddressSpace *space; > + IOMMUFDDevice *idev = &vbasedev->idev; > struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; > int ret, devfd; > uint32_t ioas_id; > @@ -428,6 +429,7 @@ found_container: > QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); > QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); > > + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid); > trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, > vbasedev->num_regions, vbasedev->flags); > return 0; In the dirty tracking series, I'll need to fetch out_capabilities from device and do a bunch of stuff that is used when allocating hwpt to ask for dirty tracking. And this means having iommufd_device_init() be called before we call iommufd_cdev_attach_container(). Here's what it looks based on an earlier version of your patch: https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563aa20e4f22d107219b5 I can move the call earlier in my series, unless there's something specifically when you call it here? Joao
Hi Zhenzhong, On 1/15/24 11:13, Zhenzhong Duan wrote: > Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU > could get hw IOMMU information. > > In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to vIOMMU, > in case vIOMMU needs some processing for VFIO legacy backend mode. > > Originally-by: Yi Liu <yi.l.liu@intel.com> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > --- > include/hw/vfio/vfio-common.h | 2 ++ > hw/vfio/iommufd.c | 2 ++ > hw/vfio/pci.c | 24 +++++++++++++++++++----- > 3 files changed, 23 insertions(+), 5 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 9b7ef7d02b..fde0d0ca60 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -31,6 +31,7 @@ > #endif > #include "sysemu/sysemu.h" > #include "hw/vfio/vfio-container-base.h" > +#include "sysemu/iommufd_device.h" > > #define VFIO_MSG_PREFIX "vfio %s: " > > @@ -126,6 +127,7 @@ typedef struct VFIODevice { > bool dirty_tracking; > int devid; > IOMMUFDBackend *iommufd; > + IOMMUFDDevice idev; This looks duplicate of existing fields: idev.dev_id is same as above devid. by the way let's try to use the same devid everywhere. idev.iommufd is same as above iommufd if != NULL. So we should at least rationalize. > } VFIODevice; > > struct VFIODeviceOps { > diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c > index 9bfddc1360..cbd035f148 100644 > --- a/hw/vfio/iommufd.c > +++ b/hw/vfio/iommufd.c > @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, > VFIOContainerBase *bcontainer; > VFIOIOMMUFDContainer *container; > VFIOAddressSpace *space; > + IOMMUFDDevice *idev = &vbasedev->idev; > struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; > int ret, devfd; > uint32_t ioas_id; > @@ -428,6 +429,7 @@ found_container: > QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); > QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); > > + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid); > trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, > vbasedev->num_regions, vbasedev->flags); > return 0; > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index d7fe06715c..2c3a5d267b 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > > vfio_bars_register(vdev); > > - ret = vfio_add_capabilities(vdev, errp); > + if (vbasedev->iommufd) { > + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); > + } else { > + ret = pci_device_set_iommu_device(pdev, 0, errp); > + } > if (ret) { > + error_prepend(errp, "Failed to set iommu_device: "); at the moment it is rather an IOMMUFD device. > goto out_teardown; > } > > + ret = vfio_add_capabilities(vdev, errp); > + if (ret) { > + goto out_unset_idev; > + } > + > if (vdev->vga) { > vfio_vga_quirk_setup(vdev); > } > @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > error_setg(errp, > "cannot support IGD OpRegion feature on hotplugged " > "device"); > - goto out_teardown; > + goto out_unset_idev; > } > > ret = vfio_get_dev_region_info(vbasedev, > @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > if (ret) { > error_setg_errno(errp, -ret, > "does not support requested IGD OpRegion feature"); > - goto out_teardown; > + goto out_unset_idev; > } > > ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); > g_free(opregion); > if (ret) { > - goto out_teardown; > + goto out_unset_idev; > } > } > > @@ -3229,6 +3239,8 @@ out_deregister: > if (vdev->intx.mmap_timer) { > timer_free(vdev->intx.mmap_timer); > } > +out_unset_idev: > + pci_device_unset_iommu_device(pdev); > out_teardown: > vfio_teardown_msi(vdev); > vfio_bars_exit(vdev); > @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj) > static void vfio_exitfn(PCIDevice *pdev) > { > VFIOPCIDevice *vdev = VFIO_PCI(pdev); > + VFIODevice *vbasedev = &vdev->vbasedev; > > vfio_unregister_req_notifier(vdev); > vfio_unregister_err_notifier(vdev); > @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev) > vfio_teardown_msi(vdev); > vfio_pci_disable_rp_atomics(vdev); > vfio_bars_exit(vdev); > - vfio_migration_exit(&vdev->vbasedev); > + vfio_migration_exit(vbasedev); > + pci_device_unset_iommu_device(pdev); > } > > static void vfio_pci_reset(DeviceState *dev) Thanks Eric
>-----Original Message----- >From: Joao Martins <joao.m.martins@oracle.com> >Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >vIOMMU > >On 15/01/2024 10:13, Zhenzhong Duan wrote: >> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >> index 9bfddc1360..cbd035f148 100644 >> --- a/hw/vfio/iommufd.c >> +++ b/hw/vfio/iommufd.c >> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, >VFIODevice *vbasedev, >> VFIOContainerBase *bcontainer; >> VFIOIOMMUFDContainer *container; >> VFIOAddressSpace *space; >> + IOMMUFDDevice *idev = &vbasedev->idev; >> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >> int ret, devfd; >> uint32_t ioas_id; >> @@ -428,6 +429,7 @@ found_container: >> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >container_next); >> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >> >> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>devid); >> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev- >>num_irqs, >> vbasedev->num_regions, vbasedev->flags); >> return 0; > >In the dirty tracking series, I'll need to fetch out_capabilities from device >and do a bunch of stuff that is used when allocating hwpt to ask for dirty >tracking. And this means having iommufd_device_init() be called before we >call >iommufd_cdev_attach_container(). > >Here's what it looks based on an earlier version of your patch: > >https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563a >a20e4f22d107219b5 > >I can move the call earlier in my series, unless there's something specifically >when you call it here? I think it's safe to move it earlier, just remember to do the same for existing container. Thanks Zhenzhong
>-----Original Message----- >From: Eric Auger <eric.auger@redhat.com> >Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >vIOMMU > >Hi Zhenzhong, > >On 1/15/24 11:13, Zhenzhong Duan wrote: >> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU >> could get hw IOMMU information. >> >> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to >vIOMMU, >> in case vIOMMU needs some processing for VFIO legacy backend mode. >> >> Originally-by: Yi Liu <yi.l.liu@intel.com> >> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> --- >> include/hw/vfio/vfio-common.h | 2 ++ >> hw/vfio/iommufd.c | 2 ++ >> hw/vfio/pci.c | 24 +++++++++++++++++++----- >> 3 files changed, 23 insertions(+), 5 deletions(-) >> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio- >common.h >> index 9b7ef7d02b..fde0d0ca60 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -31,6 +31,7 @@ >> #endif >> #include "sysemu/sysemu.h" >> #include "hw/vfio/vfio-container-base.h" >> +#include "sysemu/iommufd_device.h" >> >> #define VFIO_MSG_PREFIX "vfio %s: " >> >> @@ -126,6 +127,7 @@ typedef struct VFIODevice { >> bool dirty_tracking; >> int devid; >> IOMMUFDBackend *iommufd; >> + IOMMUFDDevice idev; >This looks duplicate of existing fields: >idev.dev_id is same as above devid. by the way let's try to use the same >devid everywhere. >idev.iommufd is same as above iommufd if != NULL. >So we should at least rationalize. Indeed, I'll remove devid and *iommufd. Thanks for suggestion. >> } VFIODevice; >> >> struct VFIODeviceOps { >> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >> index 9bfddc1360..cbd035f148 100644 >> --- a/hw/vfio/iommufd.c >> +++ b/hw/vfio/iommufd.c >> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, >VFIODevice *vbasedev, >> VFIOContainerBase *bcontainer; >> VFIOIOMMUFDContainer *container; >> VFIOAddressSpace *space; >> + IOMMUFDDevice *idev = &vbasedev->idev; >> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >> int ret, devfd; >> uint32_t ioas_id; >> @@ -428,6 +429,7 @@ found_container: >> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >container_next); >> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >> >> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>devid); >> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev- >>num_irqs, >> vbasedev->num_regions, vbasedev->flags); >> return 0; >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index d7fe06715c..2c3a5d267b 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, >Error **errp) >> >> vfio_bars_register(vdev); >> >> - ret = vfio_add_capabilities(vdev, errp); >> + if (vbasedev->iommufd) { >> + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); >> + } else { >> + ret = pci_device_set_iommu_device(pdev, 0, errp); >> + } >> if (ret) { >> + error_prepend(errp, "Failed to set iommu_device: "); >at the moment it is rather an IOMMUFD device. Will use "Failed to set IOMMUFD device: " Thanks Zhenzhong >> goto out_teardown; >> } >> >> + ret = vfio_add_capabilities(vdev, errp); >> + if (ret) { >> + goto out_unset_idev; >> + } >> + >> if (vdev->vga) { >> vfio_vga_quirk_setup(vdev); >> } >> @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error >**errp) >> error_setg(errp, >> "cannot support IGD OpRegion feature on hotplugged " >> "device"); >> - goto out_teardown; >> + goto out_unset_idev; >> } >> >> ret = vfio_get_dev_region_info(vbasedev, >> @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, >Error **errp) >> if (ret) { >> error_setg_errno(errp, -ret, >> "does not support requested IGD OpRegion feature"); >> - goto out_teardown; >> + goto out_unset_idev; >> } >> >> ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); >> g_free(opregion); >> if (ret) { >> - goto out_teardown; >> + goto out_unset_idev; >> } >> } >> >> @@ -3229,6 +3239,8 @@ out_deregister: >> if (vdev->intx.mmap_timer) { >> timer_free(vdev->intx.mmap_timer); >> } >> +out_unset_idev: >> + pci_device_unset_iommu_device(pdev); >> out_teardown: >> vfio_teardown_msi(vdev); >> vfio_bars_exit(vdev); >> @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj) >> static void vfio_exitfn(PCIDevice *pdev) >> { >> VFIOPCIDevice *vdev = VFIO_PCI(pdev); >> + VFIODevice *vbasedev = &vdev->vbasedev; >> >> vfio_unregister_req_notifier(vdev); >> vfio_unregister_err_notifier(vdev); >> @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev) >> vfio_teardown_msi(vdev); >> vfio_pci_disable_rp_atomics(vdev); >> vfio_bars_exit(vdev); >> - vfio_migration_exit(&vdev->vbasedev); >> + vfio_migration_exit(vbasedev); >> + pci_device_unset_iommu_device(pdev); >> } >> >> static void vfio_pci_reset(DeviceState *dev) >Thanks > >Eric
On 2024/1/18 16:17, Duan, Zhenzhong wrote: > > >> -----Original Message----- >> From: Joao Martins <joao.m.martins@oracle.com> >> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >> vIOMMU >> >> On 15/01/2024 10:13, Zhenzhong Duan wrote: >>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >>> index 9bfddc1360..cbd035f148 100644 >>> --- a/hw/vfio/iommufd.c >>> +++ b/hw/vfio/iommufd.c >>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, >> VFIODevice *vbasedev, >>> VFIOContainerBase *bcontainer; >>> VFIOIOMMUFDContainer *container; >>> VFIOAddressSpace *space; >>> + IOMMUFDDevice *idev = &vbasedev->idev; >>> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >>> int ret, devfd; >>> uint32_t ioas_id; >>> @@ -428,6 +429,7 @@ found_container: >>> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >> container_next); >>> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >>> >>> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>> devid); >>> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev- >>> num_irqs, >>> vbasedev->num_regions, vbasedev->flags); >>> return 0; >> >> In the dirty tracking series, I'll need to fetch out_capabilities from device >> and do a bunch of stuff that is used when allocating hwpt to ask for dirty >> tracking. And this means having iommufd_device_init() be called before we >> call >> iommufd_cdev_attach_container(). >> >> Here's what it looks based on an earlier version of your patch: >> >> https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563a >> a20e4f22d107219b5 >> >> I can move the call earlier in my series, unless there's something specifically >> when you call it here? > > I think it's safe to move it earlier, just remember to do the same for existing > container. yes, as long as the input of iommufd_device_init() are available in the new place. And remember to destroy it if the code failed after initializing iommufd_device.
On 18/01/2024 10:17, Yi Liu wrote: > On 2024/1/18 16:17, Duan, Zhenzhong wrote: >> >> >>> -----Original Message----- >>> From: Joao Martins <joao.m.martins@oracle.com> >>> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >>> vIOMMU >>> >>> On 15/01/2024 10:13, Zhenzhong Duan wrote: >>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >>>> index 9bfddc1360..cbd035f148 100644 >>>> --- a/hw/vfio/iommufd.c >>>> +++ b/hw/vfio/iommufd.c >>>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, >>> VFIODevice *vbasedev, >>>> VFIOContainerBase *bcontainer; >>>> VFIOIOMMUFDContainer *container; >>>> VFIOAddressSpace *space; >>>> + IOMMUFDDevice *idev = &vbasedev->idev; >>>> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >>>> int ret, devfd; >>>> uint32_t ioas_id; >>>> @@ -428,6 +429,7 @@ found_container: >>>> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >>> container_next); >>>> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >>>> >>>> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>>> devid); >>>> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev- >>>> num_irqs, >>>> vbasedev->num_regions, vbasedev->flags); >>>> return 0; >>> >>> In the dirty tracking series, I'll need to fetch out_capabilities from device >>> and do a bunch of stuff that is used when allocating hwpt to ask for dirty >>> tracking. And this means having iommufd_device_init() be called before we >>> call >>> iommufd_cdev_attach_container(). >>> >>> Here's what it looks based on an earlier version of your patch: >>> >>> https://github.com/jpemartins/qemu/commit/433f97a05e0cdd8e3b8563a >>> a20e4f22d107219b5 >>> >>> I can move the call earlier in my series, unless there's something specifically >>> when you call it here? >> >> I think it's safe to move it earlier, just remember to do the same for existing >> container. > > yes, as long as the input of iommufd_device_init() are available in the new > place. And remember to destroy it if the code failed after initializing > iommufd_device. > In the way I am using I don't think there's any teardown as no new resources or things are done. We essentially just initialize @idev and fetch some capabilities thus nothing needs teardown. But I am not sure is the nesting needs; perhaps destruction of resources is related to something to that?
On 1/15/24 11:13, Zhenzhong Duan wrote: > Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU > could get hw IOMMU information. > > In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to vIOMMU, > in case vIOMMU needs some processing for VFIO legacy backend mode. > > Originally-by: Yi Liu <yi.l.liu@intel.com> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > --- > include/hw/vfio/vfio-common.h | 2 ++ > hw/vfio/iommufd.c | 2 ++ > hw/vfio/pci.c | 24 +++++++++++++++++++----- > 3 files changed, 23 insertions(+), 5 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 9b7ef7d02b..fde0d0ca60 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -31,6 +31,7 @@ > #endif > #include "sysemu/sysemu.h" > #include "hw/vfio/vfio-container-base.h" > +#include "sysemu/iommufd_device.h" > > #define VFIO_MSG_PREFIX "vfio %s: " > > @@ -126,6 +127,7 @@ typedef struct VFIODevice { > bool dirty_tracking; > int devid; > IOMMUFDBackend *iommufd; > + IOMMUFDDevice idev; > } VFIODevice; > > struct VFIODeviceOps { > diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c > index 9bfddc1360..cbd035f148 100644 > --- a/hw/vfio/iommufd.c > +++ b/hw/vfio/iommufd.c > @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, > VFIOContainerBase *bcontainer; > VFIOIOMMUFDContainer *container; > VFIOAddressSpace *space; > + IOMMUFDDevice *idev = &vbasedev->idev; > struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; > int ret, devfd; > uint32_t ioas_id; > @@ -428,6 +429,7 @@ found_container: > QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); > QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); > > + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid); > trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, > vbasedev->num_regions, vbasedev->flags); > return 0; > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index d7fe06715c..2c3a5d267b 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > > vfio_bars_register(vdev); > > - ret = vfio_add_capabilities(vdev, errp); > + if (vbasedev->iommufd) { > + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); > + } else { > + ret = pci_device_set_iommu_device(pdev, 0, errp); AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice will do nothing. Why call it ? Thanks, C. > + } > if (ret) { > + error_prepend(errp, "Failed to set iommu_device: "); > goto out_teardown; > } > > + ret = vfio_add_capabilities(vdev, errp); > + if (ret) { > + goto out_unset_idev; > + } > + > if (vdev->vga) { > vfio_vga_quirk_setup(vdev); > } > @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > error_setg(errp, > "cannot support IGD OpRegion feature on hotplugged " > "device"); > - goto out_teardown; > + goto out_unset_idev; > } > > ret = vfio_get_dev_region_info(vbasedev, > @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > if (ret) { > error_setg_errno(errp, -ret, > "does not support requested IGD OpRegion feature"); > - goto out_teardown; > + goto out_unset_idev; > } > > ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); > g_free(opregion); > if (ret) { > - goto out_teardown; > + goto out_unset_idev; > } > } > > @@ -3229,6 +3239,8 @@ out_deregister: > if (vdev->intx.mmap_timer) { > timer_free(vdev->intx.mmap_timer); > } > +out_unset_idev: > + pci_device_unset_iommu_device(pdev); > out_teardown: > vfio_teardown_msi(vdev); > vfio_bars_exit(vdev); > @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj) > static void vfio_exitfn(PCIDevice *pdev) > { > VFIOPCIDevice *vdev = VFIO_PCI(pdev); > + VFIODevice *vbasedev = &vdev->vbasedev; > > vfio_unregister_req_notifier(vdev); > vfio_unregister_err_notifier(vdev); > @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev) > vfio_teardown_msi(vdev); > vfio_pci_disable_rp_atomics(vdev); > vfio_bars_exit(vdev); > - vfio_migration_exit(&vdev->vbasedev); > + vfio_migration_exit(vbasedev); > + pci_device_unset_iommu_device(pdev); > } > > static void vfio_pci_reset(DeviceState *dev)
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >vIOMMU > >On 1/15/24 11:13, Zhenzhong Duan wrote: >> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU >> could get hw IOMMU information. >> >> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to >vIOMMU, >> in case vIOMMU needs some processing for VFIO legacy backend mode. >> >> Originally-by: Yi Liu <yi.l.liu@intel.com> >> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> --- >> include/hw/vfio/vfio-common.h | 2 ++ >> hw/vfio/iommufd.c | 2 ++ >> hw/vfio/pci.c | 24 +++++++++++++++++++----- >> 3 files changed, 23 insertions(+), 5 deletions(-) >> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio- >common.h >> index 9b7ef7d02b..fde0d0ca60 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -31,6 +31,7 @@ >> #endif >> #include "sysemu/sysemu.h" >> #include "hw/vfio/vfio-container-base.h" >> +#include "sysemu/iommufd_device.h" >> >> #define VFIO_MSG_PREFIX "vfio %s: " >> >> @@ -126,6 +127,7 @@ typedef struct VFIODevice { >> bool dirty_tracking; >> int devid; >> IOMMUFDBackend *iommufd; >> + IOMMUFDDevice idev; >> } VFIODevice; >> >> struct VFIODeviceOps { >> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >> index 9bfddc1360..cbd035f148 100644 >> --- a/hw/vfio/iommufd.c >> +++ b/hw/vfio/iommufd.c >> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, >VFIODevice *vbasedev, >> VFIOContainerBase *bcontainer; >> VFIOIOMMUFDContainer *container; >> VFIOAddressSpace *space; >> + IOMMUFDDevice *idev = &vbasedev->idev; >> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >> int ret, devfd; >> uint32_t ioas_id; >> @@ -428,6 +429,7 @@ found_container: >> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >container_next); >> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >> >> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>devid); >> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev- >>num_irqs, >> vbasedev->num_regions, vbasedev->flags); >> return 0; >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index d7fe06715c..2c3a5d267b 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, >Error **errp) >> >> vfio_bars_register(vdev); >> >> - ret = vfio_add_capabilities(vdev, errp); >> + if (vbasedev->iommufd) { >> + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); >> + } else { >> + ret = pci_device_set_iommu_device(pdev, 0, errp); > > >AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice will >do >nothing. Why call it ? We will do something in nesting series, see https://github.com/yiliu1765/qemu/commit/7f0bb59575bb5cf38618ae950f68a8661676e881 Another choice is to call pci_device_set_iommu_device() no matter which backend is used and check idev->iommufd in vtd_dev_set_iommu_device(). Is this better for you? Thanks Zhenzhong
On 1/23/24 10:46, Duan, Zhenzhong wrote: > > >> -----Original Message----- >> From: Cédric Le Goater <clg@redhat.com> >> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >> vIOMMU >> >> On 1/15/24 11:13, Zhenzhong Duan wrote: >>> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU >>> could get hw IOMMU information. >>> >>> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to >> vIOMMU, >>> in case vIOMMU needs some processing for VFIO legacy backend mode. >>> >>> Originally-by: Yi Liu <yi.l.liu@intel.com> >>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> >>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>> --- >>> include/hw/vfio/vfio-common.h | 2 ++ >>> hw/vfio/iommufd.c | 2 ++ >>> hw/vfio/pci.c | 24 +++++++++++++++++++----- >>> 3 files changed, 23 insertions(+), 5 deletions(-) >>> >>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio- >> common.h >>> index 9b7ef7d02b..fde0d0ca60 100644 >>> --- a/include/hw/vfio/vfio-common.h >>> +++ b/include/hw/vfio/vfio-common.h >>> @@ -31,6 +31,7 @@ >>> #endif >>> #include "sysemu/sysemu.h" >>> #include "hw/vfio/vfio-container-base.h" >>> +#include "sysemu/iommufd_device.h" >>> >>> #define VFIO_MSG_PREFIX "vfio %s: " >>> >>> @@ -126,6 +127,7 @@ typedef struct VFIODevice { >>> bool dirty_tracking; >>> int devid; >>> IOMMUFDBackend *iommufd; >>> + IOMMUFDDevice idev; >>> } VFIODevice; >>> >>> struct VFIODeviceOps { >>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >>> index 9bfddc1360..cbd035f148 100644 >>> --- a/hw/vfio/iommufd.c >>> +++ b/hw/vfio/iommufd.c >>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, >> VFIODevice *vbasedev, >>> VFIOContainerBase *bcontainer; >>> VFIOIOMMUFDContainer *container; >>> VFIOAddressSpace *space; >>> + IOMMUFDDevice *idev = &vbasedev->idev; >>> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >>> int ret, devfd; >>> uint32_t ioas_id; >>> @@ -428,6 +429,7 @@ found_container: >>> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >> container_next); >>> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >>> >>> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>> devid); >>> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev- >>> num_irqs, >>> vbasedev->num_regions, vbasedev->flags); >>> return 0; >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >>> index d7fe06715c..2c3a5d267b 100644 >>> --- a/hw/vfio/pci.c >>> +++ b/hw/vfio/pci.c >>> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, >> Error **errp) >>> >>> vfio_bars_register(vdev); >>> >>> - ret = vfio_add_capabilities(vdev, errp); >>> + if (vbasedev->iommufd) { >>> + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); >>> + } else { >>> + ret = pci_device_set_iommu_device(pdev, 0, errp); >> >> >> AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice will >> do >> nothing. Why call it ? > > We will do something in nesting series, see https://github.com/yiliu1765/qemu/commit/7f0bb59575bb5cf38618ae950f68a8661676e881 ok, that's not much. idev is used as a capability bool and later on to pass the /dev/iommu fd. We don't need to support the legacy mode ? > Another choice is to call pci_device_set_iommu_device() no matter which backend > is used and check idev->iommufd in vtd_dev_set_iommu_device(). Is this better > for you? yes. Should be fine. There is more to it though. IIUC, what will determine most of the requirements, is the legacy mode. We also need the host iommu info in that case. As said Eric, ideally, we should introduce a common abstract "host-iommu-info" struct and sub structs associated with the iommu backends (iommufd + legacy) which would be allocated accordingly. So, IOMMUFDDevice usage should be limited to the iommufd files. All PCI files should use the common abstract type. We should define these data structures first. They could be simple C struct for now. We will see if QOM applies after. Will take a look at Eric's patchset next. Thanks, C.
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass to >vIOMMU > >On 1/23/24 10:46, Duan, Zhenzhong wrote: >> >> >>> -----Original Message----- >>> From: Cédric Le Goater <clg@redhat.com> >>> Subject: Re: [PATCH rfcv1 4/6] vfio: initialize IOMMUFDDevice and pass >to >>> vIOMMU >>> >>> On 1/15/24 11:13, Zhenzhong Duan wrote: >>>> Initialize IOMMUFDDevice in vfio and pass to vIOMMU, so that vIOMMU >>>> could get hw IOMMU information. >>>> >>>> In VFIO legacy backend mode, we still pass a NULL IOMMUFDDevice to >>> vIOMMU, >>>> in case vIOMMU needs some processing for VFIO legacy backend mode. >>>> >>>> Originally-by: Yi Liu <yi.l.liu@intel.com> >>>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> >>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>>> --- >>>> include/hw/vfio/vfio-common.h | 2 ++ >>>> hw/vfio/iommufd.c | 2 ++ >>>> hw/vfio/pci.c | 24 +++++++++++++++++++----- >>>> 3 files changed, 23 insertions(+), 5 deletions(-) >>>> >>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio- >>> common.h >>>> index 9b7ef7d02b..fde0d0ca60 100644 >>>> --- a/include/hw/vfio/vfio-common.h >>>> +++ b/include/hw/vfio/vfio-common.h >>>> @@ -31,6 +31,7 @@ >>>> #endif >>>> #include "sysemu/sysemu.h" >>>> #include "hw/vfio/vfio-container-base.h" >>>> +#include "sysemu/iommufd_device.h" >>>> >>>> #define VFIO_MSG_PREFIX "vfio %s: " >>>> >>>> @@ -126,6 +127,7 @@ typedef struct VFIODevice { >>>> bool dirty_tracking; >>>> int devid; >>>> IOMMUFDBackend *iommufd; >>>> + IOMMUFDDevice idev; >>>> } VFIODevice; >>>> >>>> struct VFIODeviceOps { >>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >>>> index 9bfddc1360..cbd035f148 100644 >>>> --- a/hw/vfio/iommufd.c >>>> +++ b/hw/vfio/iommufd.c >>>> @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char >*name, >>> VFIODevice *vbasedev, >>>> VFIOContainerBase *bcontainer; >>>> VFIOIOMMUFDContainer *container; >>>> VFIOAddressSpace *space; >>>> + IOMMUFDDevice *idev = &vbasedev->idev; >>>> struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; >>>> int ret, devfd; >>>> uint32_t ioas_id; >>>> @@ -428,6 +429,7 @@ found_container: >>>> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, >>> container_next); >>>> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); >>>> >>>> + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev- >>>> devid); >>>> trace_iommufd_cdev_device_info(vbasedev->name, devfd, >vbasedev- >>>> num_irqs, >>>> vbasedev->num_regions, vbasedev->flags); >>>> return 0; >>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >>>> index d7fe06715c..2c3a5d267b 100644 >>>> --- a/hw/vfio/pci.c >>>> +++ b/hw/vfio/pci.c >>>> @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, >>> Error **errp) >>>> >>>> vfio_bars_register(vdev); >>>> >>>> - ret = vfio_add_capabilities(vdev, errp); >>>> + if (vbasedev->iommufd) { >>>> + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); >>>> + } else { >>>> + ret = pci_device_set_iommu_device(pdev, 0, errp); >>> >>> >>> AFAICT, pci_device_set_iommu_device() with a NULL IOMMUFDDevice >will >>> do >>> nothing. Why call it ? >> >> We will do something in nesting series, see >https://github.com/yiliu1765/qemu/commit/7f0bb59575bb5cf38618ae950 >f68a8661676e881 > >ok, that's not much. idev is used as a capability bool and later on >to pass the /dev/iommu fd. We don't need to support the legacy mode ? It's better to have for legacy mode. Especially when we support address width 57 to QEMU Intel_iommu in future. > >> Another choice is to call pci_device_set_iommu_device() no matter which >backend >> is used and check idev->iommufd in vtd_dev_set_iommu_device(). Is this >better >> for you? > >yes. Should be fine. There is more to it though. > >IIUC, what will determine most of the requirements, is the legacy >mode. We also need the host iommu info in that case. As said Eric, >ideally, we should introduce a common abstract "host-iommu-info" struct >and sub structs associated with the iommu backends (iommufd + legacy) >which would be allocated accordingly. I see, I'll make a rfcv2 as you and Eric suggested and discuss further with Eric what elements he needs in legacy sub structs. > >So, IOMMUFDDevice usage should be limited to the iommufd files. All PCI >files should use the common abstract type. We should define these data >structures first. They could be simple C struct for now. We will see if >QOM applies after. Got it. Thanks Zhenzhong
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9b7ef7d02b..fde0d0ca60 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -31,6 +31,7 @@ #endif #include "sysemu/sysemu.h" #include "hw/vfio/vfio-container-base.h" +#include "sysemu/iommufd_device.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -126,6 +127,7 @@ typedef struct VFIODevice { bool dirty_tracking; int devid; IOMMUFDBackend *iommufd; + IOMMUFDDevice idev; } VFIODevice; struct VFIODeviceOps { diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 9bfddc1360..cbd035f148 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -309,6 +309,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, VFIOContainerBase *bcontainer; VFIOIOMMUFDContainer *container; VFIOAddressSpace *space; + IOMMUFDDevice *idev = &vbasedev->idev; struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; int ret, devfd; uint32_t ioas_id; @@ -428,6 +429,7 @@ found_container: QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + iommufd_device_init(idev, sizeof(*idev), container->be, vbasedev->devid); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); return 0; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d7fe06715c..2c3a5d267b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3107,11 +3107,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); - ret = vfio_add_capabilities(vdev, errp); + if (vbasedev->iommufd) { + ret = pci_device_set_iommu_device(pdev, &vbasedev->idev, errp); + } else { + ret = pci_device_set_iommu_device(pdev, 0, errp); + } if (ret) { + error_prepend(errp, "Failed to set iommu_device: "); goto out_teardown; } + ret = vfio_add_capabilities(vdev, errp); + if (ret) { + goto out_unset_idev; + } + if (vdev->vga) { vfio_vga_quirk_setup(vdev); } @@ -3128,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) error_setg(errp, "cannot support IGD OpRegion feature on hotplugged " "device"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_get_dev_region_info(vbasedev, @@ -3137,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (ret) { error_setg_errno(errp, -ret, "does not support requested IGD OpRegion feature"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); g_free(opregion); if (ret) { - goto out_teardown; + goto out_unset_idev; } } @@ -3229,6 +3239,8 @@ out_deregister: if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } +out_unset_idev: + pci_device_unset_iommu_device(pdev); out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3257,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); @@ -3271,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); - vfio_migration_exit(&vdev->vbasedev); + vfio_migration_exit(vbasedev); + pci_device_unset_iommu_device(pdev); } static void vfio_pci_reset(DeviceState *dev)