Message ID | 20230622214845.3980-13-joao.m.martins@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfio: VFIO migration support with vIOMMU | expand |
On 23/06/2023 0:48, Joao Martins wrote: > External email: Use caution opening links or attachments > > > Currently, device dirty page tracking with vIOMMU is not supported, > and a blocker is added and the migration is prevented. > > When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as > requesting by the vIOMMU. These IOVA ranges can potentially be mapped > anywhere in the vIOMMU IOVA space as advertised by the VMM. > > To support device dirty tracking when vIOMMU enabled instead create the > dirty ranges based on the vIOMMU provided limits, which leads to the > tracking of the whole IOVA space regardless of what devices use. > > Signed-off-by: Avihai Horon <avihaih@nvidia.com> > Signed-off-by: Joao Martins <joao.m.martins@oracle.com> > --- > include/hw/vfio/vfio-common.h | 1 + > hw/vfio/common.c | 58 +++++++++++++++++++++++++++++------ > hw/vfio/pci.c | 7 +++++ > 3 files changed, 56 insertions(+), 10 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index f41860988d6b..c4bafad084b4 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -71,6 +71,7 @@ typedef struct VFIOMigration { > typedef struct VFIOAddressSpace { > AddressSpace *as; > bool no_dma_translation; > + hwaddr max_iova; > QLIST_HEAD(, VFIOContainer) containers; > QLIST_ENTRY(VFIOAddressSpace) list; > } VFIOAddressSpace; > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index ecfb9afb3fb6..85fddef24026 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) > return false; > } > > +static int vfio_viommu_get_max_iova(hwaddr *max_iova) > +{ > + VFIOAddressSpace *space; > + > + *max_iova = 0; > + > + QLIST_FOREACH(space, &vfio_address_spaces, list) { > + if (space->as == &address_space_memory) { > + continue; > + } > + > + if (*max_iova < space->max_iova) { > + *max_iova = space->max_iova; > + } > + } Looks like max_iova is a per VFIOAddressSpace property, so why do we need to iterate over all address spaces? Thanks. > + > + return *max_iova == 0; > +} > + > int vfio_block_giommu_migration(Error **errp) > { > int ret; > @@ -1464,10 +1483,11 @@ static const MemoryListener vfio_dirty_tracking_listener = { > .region_add = vfio_listener_dirty_tracking_update, > }; > > -static void vfio_dirty_tracking_init(VFIOContainer *container, > +static int vfio_dirty_tracking_init(VFIOContainer *container, > VFIODirtyRanges *ranges) > { > VFIODirtyRangesListener dirty; > + int ret; > > memset(&dirty, 0, sizeof(dirty)); > dirty.ranges.min32 = UINT32_MAX; > @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, > dirty.listener = vfio_dirty_tracking_listener; > dirty.container = container; > > - memory_listener_register(&dirty.listener, > - container->space->as); > + if (vfio_viommu_preset()) { > + hwaddr iommu_max_iova; > + > + ret = vfio_viommu_get_max_iova(&iommu_max_iova); > + if (ret) { > + return -EINVAL; > + } > + > + vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges); > + } else { > + memory_listener_register(&dirty.listener, > + container->space->as); > + /* > + * The memory listener is synchronous, and used to calculate the range > + * to dirty tracking. Unregister it after we are done as we are not > + * interested in any follow-up updates. > + */ > + memory_listener_unregister(&dirty.listener); > + } > > *ranges = dirty.ranges; > > - /* > - * The memory listener is synchronous, and used to calculate the range > - * to dirty tracking. Unregister it after we are done as we are not > - * interested in any follow-up updates. > - */ > - memory_listener_unregister(&dirty.listener); > + return 0; > } > > static void vfio_devices_dma_logging_stop(VFIOContainer *container) > @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) > VFIOGroup *group; > int ret = 0; > > - vfio_dirty_tracking_init(container, &ranges); > + ret = vfio_dirty_tracking_init(container, &ranges); > + if (ret) { > + error_report("Failed to init DMA logging ranges, err %d", > + ret); > + return -EOPNOTSUPP; > + } > + > feature = vfio_device_feature_dma_logging_start_create(container, > &ranges); > if (!feature) { > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 8a98e6ffc480..3bda5618c5b5 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -2974,6 +2974,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > &dma_translation); > space->no_dma_translation = !dma_translation; > > + /* > + * Support for advertised IOMMU address space boundaries is optional. > + * By default, it is not advertised i.e. space::max_iova is 0. > + */ > + pci_device_iommu_get_attr(pdev, IOMMU_ATTR_MAX_IOVA, > + &space->max_iova); > + > QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { > if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { > error_setg(errp, "device is already attached"); > -- > 2.17.2 >
On 09/07/2023 16:24, Avihai Horon wrote: > On 23/06/2023 0:48, Joao Martins wrote: >> Currently, device dirty page tracking with vIOMMU is not supported, >> and a blocker is added and the migration is prevented. >> >> When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as >> requesting by the vIOMMU. These IOVA ranges can potentially be mapped >> anywhere in the vIOMMU IOVA space as advertised by the VMM. >> >> To support device dirty tracking when vIOMMU enabled instead create the >> dirty ranges based on the vIOMMU provided limits, which leads to the >> tracking of the whole IOVA space regardless of what devices use. >> >> Signed-off-by: Avihai Horon <avihaih@nvidia.com> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >> --- >> include/hw/vfio/vfio-common.h | 1 + >> hw/vfio/common.c | 58 +++++++++++++++++++++++++++++------ >> hw/vfio/pci.c | 7 +++++ >> 3 files changed, 56 insertions(+), 10 deletions(-) >> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index f41860988d6b..c4bafad084b4 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -71,6 +71,7 @@ typedef struct VFIOMigration { >> typedef struct VFIOAddressSpace { >> AddressSpace *as; >> bool no_dma_translation; >> + hwaddr max_iova; >> QLIST_HEAD(, VFIOContainer) containers; >> QLIST_ENTRY(VFIOAddressSpace) list; >> } VFIOAddressSpace; >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index ecfb9afb3fb6..85fddef24026 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) >> return false; >> } >> >> +static int vfio_viommu_get_max_iova(hwaddr *max_iova) >> +{ >> + VFIOAddressSpace *space; >> + >> + *max_iova = 0; >> + >> + QLIST_FOREACH(space, &vfio_address_spaces, list) { >> + if (space->as == &address_space_memory) { >> + continue; >> + } >> + >> + if (*max_iova < space->max_iova) { >> + *max_iova = space->max_iova; >> + } >> + } > > Looks like max_iova is a per VFIOAddressSpace property, so why do we need to > iterate over all address spaces? > This was more futureproof-ing when Qemu supports multiple vIOMMU. In theory this tracks device address space, and if two different devices stand behind different vIOMMU, then this loop would compute the highest IOVA that we would track by the host device dirty tracker. But I realize this might introduce unnecessary complexity, and we should 'obey' the advertised vIOMMU max_iova for the device. With Zhenzhong blocker cleanup I can make this just fetch the max_iova in the space and be done with it. Joao > Thanks. > >> + >> + return *max_iova == 0; >> +} >> + >> int vfio_block_giommu_migration(Error **errp) >> { >> int ret; >> @@ -1464,10 +1483,11 @@ static const MemoryListener >> vfio_dirty_tracking_listener = { >> .region_add = vfio_listener_dirty_tracking_update, >> }; >> >> -static void vfio_dirty_tracking_init(VFIOContainer *container, >> +static int vfio_dirty_tracking_init(VFIOContainer *container, >> VFIODirtyRanges *ranges) >> { >> VFIODirtyRangesListener dirty; >> + int ret; >> >> memset(&dirty, 0, sizeof(dirty)); >> dirty.ranges.min32 = UINT32_MAX; >> @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer >> *container, >> dirty.listener = vfio_dirty_tracking_listener; >> dirty.container = container; >> >> - memory_listener_register(&dirty.listener, >> - container->space->as); >> + if (vfio_viommu_preset()) { >> + hwaddr iommu_max_iova; >> + >> + ret = vfio_viommu_get_max_iova(&iommu_max_iova); >> + if (ret) { >> + return -EINVAL; >> + } >> + >> + vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges); >> + } else { >> + memory_listener_register(&dirty.listener, >> + container->space->as); >> + /* >> + * The memory listener is synchronous, and used to calculate the range >> + * to dirty tracking. Unregister it after we are done as we are not >> + * interested in any follow-up updates. >> + */ >> + memory_listener_unregister(&dirty.listener); >> + } >> >> *ranges = dirty.ranges; >> >> - /* >> - * The memory listener is synchronous, and used to calculate the range >> - * to dirty tracking. Unregister it after we are done as we are not >> - * interested in any follow-up updates. >> - */ >> - memory_listener_unregister(&dirty.listener); >> + return 0; >> } >> >> static void vfio_devices_dma_logging_stop(VFIOContainer *container) >> @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer >> *container) >> VFIOGroup *group; >> int ret = 0; >> >> - vfio_dirty_tracking_init(container, &ranges); >> + ret = vfio_dirty_tracking_init(container, &ranges); >> + if (ret) { >> + error_report("Failed to init DMA logging ranges, err %d", >> + ret); >> + return -EOPNOTSUPP; >> + } >> + >> feature = vfio_device_feature_dma_logging_start_create(container, >> &ranges); >> if (!feature) { >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index 8a98e6ffc480..3bda5618c5b5 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -2974,6 +2974,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) >> &dma_translation); >> space->no_dma_translation = !dma_translation; >> >> + /* >> + * Support for advertised IOMMU address space boundaries is optional. >> + * By default, it is not advertised i.e. space::max_iova is 0. >> + */ >> + pci_device_iommu_get_attr(pdev, IOMMU_ATTR_MAX_IOVA, >> + &space->max_iova); >> + >> QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { >> if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { >> error_setg(errp, "device is already attached"); >> -- >> 2.17.2 >>
Hi Joao, On 6/23/2023 5:48 AM, Joao Martins wrote: > Currently, device dirty page tracking with vIOMMU is not supported, > and a blocker is added and the migration is prevented. > > When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as > requesting by the vIOMMU. These IOVA ranges can potentially be mapped > anywhere in the vIOMMU IOVA space as advertised by the VMM. > > To support device dirty tracking when vIOMMU enabled instead create the > dirty ranges based on the vIOMMU provided limits, which leads to the > tracking of the whole IOVA space regardless of what devices use. > > Signed-off-by: Avihai Horon <avihaih@nvidia.com> > Signed-off-by: Joao Martins <joao.m.martins@oracle.com> > --- > include/hw/vfio/vfio-common.h | 1 + > hw/vfio/common.c | 58 +++++++++++++++++++++++++++++------ > hw/vfio/pci.c | 7 +++++ > 3 files changed, 56 insertions(+), 10 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index f41860988d6b..c4bafad084b4 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -71,6 +71,7 @@ typedef struct VFIOMigration { > typedef struct VFIOAddressSpace { > AddressSpace *as; > bool no_dma_translation; > + hwaddr max_iova; > QLIST_HEAD(, VFIOContainer) containers; > QLIST_ENTRY(VFIOAddressSpace) list; > } VFIOAddressSpace; > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index ecfb9afb3fb6..85fddef24026 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) > return false; > } > > +static int vfio_viommu_get_max_iova(hwaddr *max_iova) > +{ > + VFIOAddressSpace *space; > + > + *max_iova = 0; > + > + QLIST_FOREACH(space, &vfio_address_spaces, list) { > + if (space->as == &address_space_memory) { > + continue; > + } Just curious why address_space_memory is bypassed? Imagine two vfio devices linked to two host bridge, one has bypass_iommu set and the other not. Don't we need to include the guest memory ranges in address_space_memory? > + > + if (*max_iova < space->max_iova) { > + *max_iova = space->max_iova; > + } > + } > + > + return *max_iova == 0; > +} > + > int vfio_block_giommu_migration(Error **errp) > { > int ret; > @@ -1464,10 +1483,11 @@ static const MemoryListener vfio_dirty_tracking_listener = { > .region_add = vfio_listener_dirty_tracking_update, > }; > > -static void vfio_dirty_tracking_init(VFIOContainer *container, > +static int vfio_dirty_tracking_init(VFIOContainer *container, > VFIODirtyRanges *ranges) > { > VFIODirtyRangesListener dirty; > + int ret; > > memset(&dirty, 0, sizeof(dirty)); > dirty.ranges.min32 = UINT32_MAX; > @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, > dirty.listener = vfio_dirty_tracking_listener; > dirty.container = container; > > - memory_listener_register(&dirty.listener, > - container->space->as); > + if (vfio_viommu_preset()) { > + hwaddr iommu_max_iova; > + > + ret = vfio_viommu_get_max_iova(&iommu_max_iova); > + if (ret) { > + return -EINVAL; > + } > + > + vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges); > + } else { > + memory_listener_register(&dirty.listener, > + container->space->as); > + /* > + * The memory listener is synchronous, and used to calculate the range > + * to dirty tracking. Unregister it after we are done as we are not > + * interested in any follow-up updates. > + */ > + memory_listener_unregister(&dirty.listener); > + } > > *ranges = dirty.ranges; > > - /* > - * The memory listener is synchronous, and used to calculate the range > - * to dirty tracking. Unregister it after we are done as we are not > - * interested in any follow-up updates. > - */ > - memory_listener_unregister(&dirty.listener); > + return 0; > } > > static void vfio_devices_dma_logging_stop(VFIOContainer *container) > @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) > VFIOGroup *group; > int ret = 0; > > - vfio_dirty_tracking_init(container, &ranges); > + ret = vfio_dirty_tracking_init(container, &ranges); > + if (ret) { > + error_report("Failed to init DMA logging ranges, err %d", > + ret); > + return -EOPNOTSUPP; > + } > + > feature = vfio_device_feature_dma_logging_start_create(container, > &ranges); No clear how much dirty range size could impact device dirty tracking. Maybe some devices linking to vIOMMU with small aw_bits or bypassing vIOMMU with small guest memory could benefit if we use per address space's dirty range Thanks Zhenzhong
On 08/09/2023 07:11, Duan, Zhenzhong wrote: > Hi Joao, > > On 6/23/2023 5:48 AM, Joao Martins wrote: >> Currently, device dirty page tracking with vIOMMU is not supported, >> and a blocker is added and the migration is prevented. >> >> When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as >> requesting by the vIOMMU. These IOVA ranges can potentially be mapped >> anywhere in the vIOMMU IOVA space as advertised by the VMM. >> >> To support device dirty tracking when vIOMMU enabled instead create the >> dirty ranges based on the vIOMMU provided limits, which leads to the >> tracking of the whole IOVA space regardless of what devices use. >> >> Signed-off-by: Avihai Horon <avihaih@nvidia.com> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >> --- >> include/hw/vfio/vfio-common.h | 1 + >> hw/vfio/common.c | 58 +++++++++++++++++++++++++++++------ >> hw/vfio/pci.c | 7 +++++ >> 3 files changed, 56 insertions(+), 10 deletions(-) >> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index f41860988d6b..c4bafad084b4 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -71,6 +71,7 @@ typedef struct VFIOMigration { >> typedef struct VFIOAddressSpace { >> AddressSpace *as; >> bool no_dma_translation; >> + hwaddr max_iova; >> QLIST_HEAD(, VFIOContainer) containers; >> QLIST_ENTRY(VFIOAddressSpace) list; >> } VFIOAddressSpace; >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index ecfb9afb3fb6..85fddef24026 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) >> return false; >> } >> +static int vfio_viommu_get_max_iova(hwaddr *max_iova) >> +{ >> + VFIOAddressSpace *space; >> + >> + *max_iova = 0; >> + >> + QLIST_FOREACH(space, &vfio_address_spaces, list) { >> + if (space->as == &address_space_memory) { >> + continue; >> + } > > Just curious why address_space_memory is bypassed? > But address_space_memory part is done by memory listeners, but I see your point conceptually on not considering it > Imagine two vfio devices linked to two host bridge, one has bypass_iommu set > > and the other not. Don't we need to include the guest memory ranges in > > address_space_memory? I am probably making a bad assumption that vIOMMU maximum adress space is a superset of the CPU address space. But as you just reminded me, there is a user case where all it takes is one bypass_iommu=true on a 2T guest setup with aw-bits=39. I'll rework this to consider this. > >> + >> + if (*max_iova < space->max_iova) { >> + *max_iova = space->max_iova; >> + } >> + } >> + >> + return *max_iova == 0; >> +} >> + >> int vfio_block_giommu_migration(Error **errp) >> { >> int ret; >> @@ -1464,10 +1483,11 @@ static const MemoryListener >> vfio_dirty_tracking_listener = { >> .region_add = vfio_listener_dirty_tracking_update, >> }; >> -static void vfio_dirty_tracking_init(VFIOContainer *container, >> +static int vfio_dirty_tracking_init(VFIOContainer *container, >> VFIODirtyRanges *ranges) >> { >> VFIODirtyRangesListener dirty; >> + int ret; >> memset(&dirty, 0, sizeof(dirty)); >> dirty.ranges.min32 = UINT32_MAX; >> @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer >> *container, >> dirty.listener = vfio_dirty_tracking_listener; >> dirty.container = container; >> - memory_listener_register(&dirty.listener, >> - container->space->as); >> + if (vfio_viommu_preset()) { >> + hwaddr iommu_max_iova; >> + >> + ret = vfio_viommu_get_max_iova(&iommu_max_iova); >> + if (ret) { >> + return -EINVAL; >> + } >> + >> + vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges); >> + } else { >> + memory_listener_register(&dirty.listener, >> + container->space->as); >> + /* >> + * The memory listener is synchronous, and used to calculate the range >> + * to dirty tracking. Unregister it after we are done as we are not >> + * interested in any follow-up updates. >> + */ >> + memory_listener_unregister(&dirty.listener); >> + } >> *ranges = dirty.ranges; >> - /* >> - * The memory listener is synchronous, and used to calculate the range >> - * to dirty tracking. Unregister it after we are done as we are not >> - * interested in any follow-up updates. >> - */ >> - memory_listener_unregister(&dirty.listener); >> + return 0; >> } >> static void vfio_devices_dma_logging_stop(VFIOContainer *container) >> @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer >> *container) >> VFIOGroup *group; >> int ret = 0; >> - vfio_dirty_tracking_init(container, &ranges); >> + ret = vfio_dirty_tracking_init(container, &ranges); >> + if (ret) { >> + error_report("Failed to init DMA logging ranges, err %d", >> + ret); >> + return -EOPNOTSUPP; >> + } >> + >> feature = vfio_device_feature_dma_logging_start_create(container, >> &ranges); > > No clear how much dirty range size could impact device dirty tracking. > > Maybe some devices linking to vIOMMU with small aw_bits or bypassing vIOMMU > So, my intended usecase with this series starts with DMA_TRANSLATION=off, where vIOMMU is restricted to interrupt remapping, yet guest only uses it for interrupt remapping. Right now only supported by intel-iommu, but my understanding of AMD IOMMU is that is also possible there (haven't checked smmu-v3). This unblocks guests with more >255. I have another patch separate to this that hopefully relaxes vIOMMU blockage for these older guests. Now with real emulated vIOMMU DMA translation usage, intel-iommu there's only 39, 48, 57 address space width (mimmicing the levels). And it's true that only the first value is supportable and somewhat limiting as you say. virtio-iommu has more going there as you can limit input iova to be the size of the CPU address space. Eric latest series[0] is actually nice on that end of fixing that issue (my alternative I had there was to introduce a gaw-bits equivalent, but it's better done like [0]). [0] https://lore.kernel.org/qemu-devel/20230904080451.424731-1-eric.auger@redhat.com/ > with small guest memory could benefit if we use per address space's dirty range > Yeah, I'll need to fix that, when there's big guest memory and small vIOMMU address space. Hopefully I picked up on your comment right :)
On 9/8/2023 6:11 PM, Joao Martins wrote: > On 08/09/2023 07:11, Duan, Zhenzhong wrote: >> Hi Joao, >> >> On 6/23/2023 5:48 AM, Joao Martins wrote: >>> Currently, device dirty page tracking with vIOMMU is not supported, >>> and a blocker is added and the migration is prevented. >>> >>> When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as >>> requesting by the vIOMMU. These IOVA ranges can potentially be mapped >>> anywhere in the vIOMMU IOVA space as advertised by the VMM. >>> >>> To support device dirty tracking when vIOMMU enabled instead create the >>> dirty ranges based on the vIOMMU provided limits, which leads to the >>> tracking of the whole IOVA space regardless of what devices use. >>> >>> Signed-off-by: Avihai Horon <avihaih@nvidia.com> >>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >>> --- >>> include/hw/vfio/vfio-common.h | 1 + >>> hw/vfio/common.c | 58 +++++++++++++++++++++++++++++------ >>> hw/vfio/pci.c | 7 +++++ >>> 3 files changed, 56 insertions(+), 10 deletions(-) >>> >>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >>> index f41860988d6b..c4bafad084b4 100644 >>> --- a/include/hw/vfio/vfio-common.h >>> +++ b/include/hw/vfio/vfio-common.h >>> @@ -71,6 +71,7 @@ typedef struct VFIOMigration { >>> typedef struct VFIOAddressSpace { >>> AddressSpace *as; >>> bool no_dma_translation; >>> + hwaddr max_iova; >>> QLIST_HEAD(, VFIOContainer) containers; >>> QLIST_ENTRY(VFIOAddressSpace) list; >>> } VFIOAddressSpace; >>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>> index ecfb9afb3fb6..85fddef24026 100644 >>> --- a/hw/vfio/common.c >>> +++ b/hw/vfio/common.c >>> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) >>> return false; >>> } >>> +static int vfio_viommu_get_max_iova(hwaddr *max_iova) >>> +{ >>> + VFIOAddressSpace *space; >>> + >>> + *max_iova = 0; >>> + >>> + QLIST_FOREACH(space, &vfio_address_spaces, list) { >>> + if (space->as == &address_space_memory) { >>> + continue; >>> + } >> Just curious why address_space_memory is bypassed? >> > But address_space_memory part is done by memory listeners Only this part. Still think about the case with two vfio devices, one bypass iommu, the other not. The device bypassing iommu will get address_space_memory, the other get iommu address space. vfio_viommu_preset() return true for any device, so we never run into memory listener even for device bypassing iommu? Thanks Zhenzhong
On 08/09/2023 12:52, Duan, Zhenzhong wrote: > On 9/8/2023 6:11 PM, Joao Martins wrote: >> On 08/09/2023 07:11, Duan, Zhenzhong wrote: >>> Hi Joao, >>> >>> On 6/23/2023 5:48 AM, Joao Martins wrote: >>>> Currently, device dirty page tracking with vIOMMU is not supported, >>>> and a blocker is added and the migration is prevented. >>>> >>>> When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as >>>> requesting by the vIOMMU. These IOVA ranges can potentially be mapped >>>> anywhere in the vIOMMU IOVA space as advertised by the VMM. >>>> >>>> To support device dirty tracking when vIOMMU enabled instead create the >>>> dirty ranges based on the vIOMMU provided limits, which leads to the >>>> tracking of the whole IOVA space regardless of what devices use. >>>> >>>> Signed-off-by: Avihai Horon <avihaih@nvidia.com> >>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >>>> --- >>>> include/hw/vfio/vfio-common.h | 1 + >>>> hw/vfio/common.c | 58 +++++++++++++++++++++++++++++------ >>>> hw/vfio/pci.c | 7 +++++ >>>> 3 files changed, 56 insertions(+), 10 deletions(-) >>>> >>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >>>> index f41860988d6b..c4bafad084b4 100644 >>>> --- a/include/hw/vfio/vfio-common.h >>>> +++ b/include/hw/vfio/vfio-common.h >>>> @@ -71,6 +71,7 @@ typedef struct VFIOMigration { >>>> typedef struct VFIOAddressSpace { >>>> AddressSpace *as; >>>> bool no_dma_translation; >>>> + hwaddr max_iova; >>>> QLIST_HEAD(, VFIOContainer) containers; >>>> QLIST_ENTRY(VFIOAddressSpace) list; >>>> } VFIOAddressSpace; >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>>> index ecfb9afb3fb6..85fddef24026 100644 >>>> --- a/hw/vfio/common.c >>>> +++ b/hw/vfio/common.c >>>> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) >>>> return false; >>>> } >>>> +static int vfio_viommu_get_max_iova(hwaddr *max_iova) >>>> +{ >>>> + VFIOAddressSpace *space; >>>> + >>>> + *max_iova = 0; >>>> + >>>> + QLIST_FOREACH(space, &vfio_address_spaces, list) { >>>> + if (space->as == &address_space_memory) { >>>> + continue; >>>> + } >>> Just curious why address_space_memory is bypassed? >>> >> But address_space_memory part is done by memory listeners > > Only this part. Still think about the case with two vfio devices, one bypass > iommu, the other not. > > The device bypassing iommu will get address_space_memory, the other get iommu > > address space. vfio_viommu_preset() return true for any device, so we never run > into > > memory listener even for device bypassing iommu? Yeap, that's correct. When I said earlier 'reworking this' I meant this part exactly to do both.
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f41860988d6b..c4bafad084b4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -71,6 +71,7 @@ typedef struct VFIOMigration { typedef struct VFIOAddressSpace { AddressSpace *as; bool no_dma_translation; + hwaddr max_iova; QLIST_HEAD(, VFIOContainer) containers; QLIST_ENTRY(VFIOAddressSpace) list; } VFIOAddressSpace; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ecfb9afb3fb6..85fddef24026 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void) return false; } +static int vfio_viommu_get_max_iova(hwaddr *max_iova) +{ + VFIOAddressSpace *space; + + *max_iova = 0; + + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as == &address_space_memory) { + continue; + } + + if (*max_iova < space->max_iova) { + *max_iova = space->max_iova; + } + } + + return *max_iova == 0; +} + int vfio_block_giommu_migration(Error **errp) { int ret; @@ -1464,10 +1483,11 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_listener_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainer *container, +static int vfio_dirty_tracking_init(VFIOContainer *container, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; + int ret; memset(&dirty, 0, sizeof(dirty)); dirty.ranges.min32 = UINT32_MAX; @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.listener = vfio_dirty_tracking_listener; dirty.container = container; - memory_listener_register(&dirty.listener, - container->space->as); + if (vfio_viommu_preset()) { + hwaddr iommu_max_iova; + + ret = vfio_viommu_get_max_iova(&iommu_max_iova); + if (ret) { + return -EINVAL; + } + + vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges); + } else { + memory_listener_register(&dirty.listener, + container->space->as); + /* + * The memory listener is synchronous, and used to calculate the range + * to dirty tracking. Unregister it after we are done as we are not + * interested in any follow-up updates. + */ + memory_listener_unregister(&dirty.listener); + } *ranges = dirty.ranges; - /* - * The memory listener is synchronous, and used to calculate the range - * to dirty tracking. Unregister it after we are done as we are not - * interested in any follow-up updates. - */ - memory_listener_unregister(&dirty.listener); + return 0; } static void vfio_devices_dma_logging_stop(VFIOContainer *container) @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) VFIOGroup *group; int ret = 0; - vfio_dirty_tracking_init(container, &ranges); + ret = vfio_dirty_tracking_init(container, &ranges); + if (ret) { + error_report("Failed to init DMA logging ranges, err %d", + ret); + return -EOPNOTSUPP; + } + feature = vfio_device_feature_dma_logging_start_create(container, &ranges); if (!feature) { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 8a98e6ffc480..3bda5618c5b5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2974,6 +2974,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) &dma_translation); space->no_dma_translation = !dma_translation; + /* + * Support for advertised IOMMU address space boundaries is optional. + * By default, it is not advertised i.e. space::max_iova is 0. + */ + pci_device_iommu_get_attr(pdev, IOMMU_ATTR_MAX_IOVA, + &space->max_iova); + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { error_setg(errp, "device is already attached");