Message ID | 20230222174915.5647-18-avihaih@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Series | vfio: Add migration pre-copy support and device dirty tracking |
On Wed, 22 Feb 2023 19:49:12 +0200 Avihai Horon <avihaih@nvidia.com> wrote: > Currently, device dirty page tracking with vIOMMU is not supported - RAM > pages are perpetually marked dirty in this case. > > When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as > the vIOMMU maps/unmaps them. These IOVA ranges can potentially be mapped > anywhere in the vIOMMU IOVA space. > > Due to this dynamic nature of vIOMMU mapping/unmapping, tracking only > the currently mapped IOVA ranges, as done in the non-vIOMMU case, > doesn't work very well. > > Instead, to support device dirty tracking when vIOMMU is enabled, track > the entire vIOMMU IOVA space. If that fails (IOVA space can be rather > big and we might hit HW limitation), try tracking smaller range while > marking untracked ranges dirty. > > Signed-off-by: Avihai Horon <avihaih@nvidia.com> > --- > include/hw/vfio/vfio-common.h | 2 + > hw/vfio/common.c | 196 +++++++++++++++++++++++++++++++--- > 2 files changed, 181 insertions(+), 17 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 1f21e1fa43..1dc00cabcd 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -95,6 +95,8 @@ typedef struct VFIOContainer { > unsigned int dma_max_mappings; > IOVATree *mappings; > QemuMutex mappings_mutex; > + /* Represents the range [0, giommu_tracked_range) not inclusive */ > + hwaddr giommu_tracked_range; > QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; > QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; > QLIST_HEAD(, VFIOGroup) group_list; > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 4a7fff6eeb..1024788bcc 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -45,6 +45,8 @@ > #include "migration/qemu-file.h" > #include "sysemu/tpm.h" > #include "qemu/iova-tree.h" > +#include "hw/boards.h" > +#include "hw/mem/memory-device.h" > > VFIOGroupList vfio_group_list = > QLIST_HEAD_INITIALIZER(vfio_group_list); > @@ -430,6 +432,38 @@ void vfio_unblock_multiple_devices_migration(void) > multiple_devices_migration_blocker = NULL; > } > > +static uint64_t vfio_get_ram_size(void) > +{ > + MachineState *ms = MACHINE(qdev_get_machine()); > + uint64_t plugged_size; > + > + plugged_size = get_plugged_memory_size(); > + if (plugged_size == (uint64_t)-1) { > + plugged_size = 0; > + } > + > + return ms->ram_size + plugged_size; > +} > + > +static int vfio_iommu_get_max_iova(VFIOContainer *container, hwaddr *max_iova) > +{ > + VFIOGuestIOMMU *giommu; > + int ret; > + > + giommu = QLIST_FIRST(&container->giommu_list); > + if (!giommu) { > + return -ENOENT; > + } > + > + ret = memory_region_iommu_get_attr(giommu->iommu_mr, IOMMU_ATTR_MAX_IOVA, > + max_iova); > + if (ret) { > + return ret; > + } > + > + return 0; > +} > + > static bool vfio_have_giommu(VFIOContainer *container) > { > return !QLIST_EMPTY(&container->giommu_list); > @@ -1510,7 +1544,8 @@ static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data) > } > > static struct vfio_device_feature * > -vfio_device_feature_dma_logging_start_create(VFIOContainer *container) > +vfio_device_feature_dma_logging_start_create(VFIOContainer *container, > + bool giommu) > { > struct vfio_device_feature *feature; > size_t feature_size; > @@ -1529,6 +1564,16 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container) > control = (struct vfio_device_feature_dma_logging_control *)feature->data; > control->page_size = qemu_real_host_page_size(); > > + if (giommu) { > + ranges = g_malloc0(sizeof(*ranges)); > + 
ranges->iova = 0; > + ranges->length = container->giommu_tracked_range; > + control->num_ranges = 1; > + control->ranges = (uint64_t)ranges; > + > + return feature; > + } > + > QEMU_LOCK_GUARD(&container->mappings_mutex); > > /* > @@ -1578,12 +1623,12 @@ static void vfio_device_feature_dma_logging_start_destroy( > g_free(feature); > } > > -static int vfio_devices_dma_logging_start(VFIOContainer *container) > +static int vfio_devices_dma_logging_start(VFIOContainer *container, bool giommu) > { > struct vfio_device_feature *feature; > int ret; > > - feature = vfio_device_feature_dma_logging_start_create(container); > + feature = vfio_device_feature_dma_logging_start_create(container, giommu); > if (!feature) { > return -errno; > } > @@ -1598,18 +1643,128 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) > return ret; > } > > +typedef struct { > + hwaddr *ranges; > + unsigned int ranges_num; > +} VFIOGIOMMUDeviceDTRanges; > + > +/* > + * This value is used in the second attempt to start device dirty tracking with > + * vIOMMU, or if the giommu fails to report its max iova. > + * It should be in the middle, not too big and not too small, allowing devices > + * with HW limitations to do device dirty tracking while covering a fair amount > + * of the IOVA space. > + * > + * This arbitrary value was chosen becasue it is the minimum value of Intel > + * IOMMU max IOVA and mlx5 devices support tracking a range of this size. > + */ > +#define VFIO_IOMMU_DEFAULT_MAX_IOVA ((1ULL << 39) - 1) > + > +#define VFIO_IOMMU_RANGES_NUM 3 > +static VFIOGIOMMUDeviceDTRanges * > +vfio_iommu_device_dirty_tracking_ranges_create(VFIOContainer *container) > +{ > + hwaddr iommu_max_iova = VFIO_IOMMU_DEFAULT_MAX_IOVA; > + hwaddr retry_iova; > + hwaddr ram_size = vfio_get_ram_size(); > + VFIOGIOMMUDeviceDTRanges *dt_ranges; > + int ret; > + > + dt_ranges = g_try_new0(VFIOGIOMMUDeviceDTRanges, 1); > + if (!dt_ranges) { > + errno = ENOMEM; > + > + return NULL; > + } > + > + dt_ranges->ranges_num = VFIO_IOMMU_RANGES_NUM; > + > + dt_ranges->ranges = g_try_new0(hwaddr, dt_ranges->ranges_num); > + if (!dt_ranges->ranges) { > + g_free(dt_ranges); > + errno = ENOMEM; > + > + return NULL; > + } > + > + /* > + * With vIOMMU we try to track the entire IOVA space. As the IOVA space can > + * be rather big, devices might not be able to track it due to HW > + * limitations. In that case: > + * (1) Retry tracking a smaller part of the IOVA space. > + * (2) Retry tracking a range in the size of the physical memory. This looks really sketchy, why do we think there's a "good enough" value here? If we get it wrong, the device potentially has access to IOVA space that we're not tracking, right? I'd think the only viable fallback if the vIOMMU doesn't report its max IOVA is the full 64-bit address space, otherwise it seems like we need to add a migration blocker. BTW, virtio-iommu is actively working to support vfio devices, we should include support for it as well as VT-d. 
Thanks, Alex > + */ > + ret = vfio_iommu_get_max_iova(container, &iommu_max_iova); > + if (!ret) { > + /* Check 2^64 wrap around */ > + if (!REAL_HOST_PAGE_ALIGN(iommu_max_iova)) { > + iommu_max_iova -= qemu_real_host_page_size(); > + } > + } > + > + retry_iova = MIN(iommu_max_iova / 2, VFIO_IOMMU_DEFAULT_MAX_IOVA); > + > + dt_ranges->ranges[0] = REAL_HOST_PAGE_ALIGN(iommu_max_iova); > + dt_ranges->ranges[1] = REAL_HOST_PAGE_ALIGN(retry_iova); > + dt_ranges->ranges[2] = REAL_HOST_PAGE_ALIGN(MIN(ram_size, retry_iova / 2)); > + > + return dt_ranges; > +} > + > +static void vfio_iommu_device_dirty_tracking_ranges_destroy( > + VFIOGIOMMUDeviceDTRanges *dt_ranges) > +{ > + g_free(dt_ranges->ranges); > + g_free(dt_ranges); > +} > + > +static int vfio_devices_start_dirty_page_tracking(VFIOContainer *container) > +{ > + VFIOGIOMMUDeviceDTRanges *dt_ranges; > + int ret; > + int i; > + > + if (!vfio_have_giommu(container)) { > + return vfio_devices_dma_logging_start(container, false); > + } > + > + dt_ranges = vfio_iommu_device_dirty_tracking_ranges_create(container); > + if (!dt_ranges) { > + return -errno; > + } > + > + for (i = 0; i < dt_ranges->ranges_num; i++) { > + container->giommu_tracked_range = dt_ranges->ranges[i]; > + ret = vfio_devices_dma_logging_start(container, true); > + if (!ret) { > + break; > + } > + > + if (i < dt_ranges->ranges_num - 1) { > + warn_report("Failed to start device dirty tracking with vIOMMU " > + "with range of size 0x%" HWADDR_PRIx > + ", err: %d. Retrying with range " > + "of size 0x%" HWADDR_PRIx, > + dt_ranges->ranges[i], ret, dt_ranges->ranges[i + 1]); > + } else { > + error_report("Failed to start device dirty tracking with vIOMMU " > + "with range of size 0x%" HWADDR_PRIx ", err: %d", > + dt_ranges->ranges[i], ret); > + } > + } > + > + vfio_iommu_device_dirty_tracking_ranges_destroy(dt_ranges); > + > + return ret; > +} > + > static void vfio_listener_log_global_start(MemoryListener *listener) > { > VFIOContainer *container = container_of(listener, VFIOContainer, listener); > int ret; > > if (vfio_devices_all_device_dirty_tracking(container)) { > - if (vfio_have_giommu(container)) { > - /* Device dirty page tracking currently doesn't support vIOMMU */ > - return; > - } > - > - ret = vfio_devices_dma_logging_start(container); > + ret = vfio_devices_start_dirty_page_tracking(container); > } else { > ret = vfio_set_dirty_page_tracking(container, true); > } > @@ -1627,11 +1782,6 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) > int ret; > > if (vfio_devices_all_device_dirty_tracking(container)) { > - if (vfio_have_giommu(container)) { > - /* Device dirty page tracking currently doesn't support vIOMMU */ > - return; > - } > - > ret = vfio_devices_dma_logging_stop(container); > } else { > ret = vfio_set_dirty_page_tracking(container, false); > @@ -1670,6 +1820,17 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, > return 0; > } > > +static bool vfio_iommu_range_is_device_tracked(VFIOContainer *container, > + hwaddr iova, hwaddr size) > +{ > + /* Check for 2^64 wrap around */ > + if (!(iova + size)) { > + return false; > + } > + > + return iova + size <= container->giommu_tracked_range; > +} > + > static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, > VFIOBitmap *vbmap, hwaddr iova, > hwaddr size) > @@ -1679,10 +1840,11 @@ static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, > int ret; > > if (vfio_have_giommu(container)) { > - /* Device dirty page tracking currently doesn't 
support vIOMMU */ > - bitmap_set(vbmap->bitmap, 0, vbmap->pages); > + if (!vfio_iommu_range_is_device_tracked(container, iova, size)) { > + bitmap_set(vbmap->bitmap, 0, vbmap->pages); > > - return 0; > + return 0; > + } > } > > QLIST_FOREACH(group, &container->group_list, container_next) {
On Wed, Feb 22, 2023 at 04:34:39PM -0700, Alex Williamson wrote: > > + /* > > + * With vIOMMU we try to track the entire IOVA space. As the IOVA space can > > + * be rather big, devices might not be able to track it due to HW > > + * limitations. In that case: > > + * (1) Retry tracking a smaller part of the IOVA space. > > + * (2) Retry tracking a range in the size of the physical memory. > > This looks really sketchy, why do we think there's a "good enough" > value here? If we get it wrong, the device potentially has access to > IOVA space that we're not tracking, right? The idea was the untracked range becomes permanently dirty, so at worst this means the migration never converges. #2 is the presumption that the guest is using an identity map. > I'd think the only viable fallback if the vIOMMU doesn't report its max > IOVA is the full 64-bit address space, otherwise it seems like we need > to add a migration blocker. This is basically saying vIOMMU doesn't work with migration, and we've heard that this isn't OK. There are cases where vIOMMU is on but the guest always uses identity maps. eg for virtual interrupt remapping. We also have future problems that nested translation is incompatible with device dirty tracking.. Jason
On Wed, 22 Feb 2023 22:08:33 -0400 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Wed, Feb 22, 2023 at 04:34:39PM -0700, Alex Williamson wrote: > > > + /* > > > + * With vIOMMU we try to track the entire IOVA space. As the IOVA space can > > > + * be rather big, devices might not be able to track it due to HW > > > + * limitations. In that case: > > > + * (1) Retry tracking a smaller part of the IOVA space. > > > + * (2) Retry tracking a range in the size of the physical memory. > > > > This looks really sketchy, why do we think there's a "good enough" > > value here? If we get it wrong, the device potentially has access to > > IOVA space that we're not tracking, right? > > The idea was the untracked range becomes permanently dirty, so at > worst this means the migration never converges. I didn't spot the mechanics where that's implemented, I'll look again. > #2 is the presumption that the guest is using an identity map. This is a dangerous assumption. > > I'd think the only viable fallback if the vIOMMU doesn't report its max > > IOVA is the full 64-bit address space, otherwise it seems like we need > > to add a migration blocker. > > This is basically saying vIOMMU doesn't work with migration, and we've > heard that this isn't OK. There are cases where vIOMMU is on but the > guest always uses identity maps. eg for virtual interrupt remapping. Yes, the vIOMMU can be automatically added to a VM when we exceed 255 vCPUs, but I don't see how we can therefore deduce anything about the usage mode of the vIOMMU. Users also make use of vfio with vIOMMU for nested assignment, ie. userspace drivers running within the guest, where making assumptions about the IOVA extents of the userspace driver seems dangerous. Let's backup though, if a device doesn't support the full address width of the platform, it's the responsibility of the device driver to implement a DMA mask such that the device is never asked to DMA outside of its address space support. Therefore how could a device ever dirty pages outside of its own limitations? Isn't it reasonable to require that a device support dirty tracking for the entire extent if its DMA address width in order to support this feature? If we can make those assumptions, then the vfio driver should happily accept a range exceeding the device's DMA address width capabilities, knowing that the device cannot dirty anything beyond its addressable range. > We also have future problems that nested translation is incompatible > with device dirty tracking.. :-\ Thanks, Alex
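For context, the DMA-mask contract described above looks roughly like this in a Linux driver. This is a generic illustration, not code from this series or thread, and foo_pci_probe() is a made-up driver name:

#include <linux/pci.h>
#include <linux/dma-mapping.h>

static int foo_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
    int ret;

    /*
     * A device that can only address 48 bits declares that up front, so the
     * DMA API never hands it an address beyond its reach.
     */
    ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48));
    if (ret)
        return ret;

    /* ... rest of probe ... */
    return 0;
}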
On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: > > #2 is the presumption that the guest is using an identity map. > > This is a dangerous assumption. > > > > I'd think the only viable fallback if the vIOMMU doesn't report its max > > > IOVA is the full 64-bit address space, otherwise it seems like we need > > > to add a migration blocker. > > > > This is basically saying vIOMMU doesn't work with migration, and we've > > heard that this isn't OK. There are cases where vIOMMU is on but the > > guest always uses identity maps. eg for virtual interrupt remapping. > > Yes, the vIOMMU can be automatically added to a VM when we exceed 255 > vCPUs, but I don't see how we can therefore deduce anything about the > usage mode of the vIOMMU. We just loose optimizations. Any mappings that are established outside the dirty tracking range are permanently dirty. So at worst the guest can block migration by establishing bad mappings. It is not exactly production quality but it is still useful for a closed environment with known guest configurations. > nested assignment, ie. userspace drivers running within the guest, > where making assumptions about the IOVA extents of the userspace driver > seems dangerous. > > Let's backup though, if a device doesn't support the full address width > of the platform, it's the responsibility of the device driver to > implement a DMA mask such that the device is never asked to DMA outside > of its address space support. Therefore how could a device ever dirty > pages outside of its own limitations? The device always supports the full address space. We can't enforce any kind of limit on the VM It just can't dirty track it all. > Isn't it reasonable to require that a device support dirty tracking for > the entire extent if its DMA address width in order to support this > feature? No, 2**64 is too big a number to be reasonable. Ideally we'd work it the other way and tell the vIOMMU that the vHW only supports a limited number of address bits for the translation, eg through the ACPI tables. Then the dirty tracking could safely cover the larger of all system memory or the limited IOVA address space. Or even better figure out how to get interrupt remapping without IOMMU support :\ Jason
On 23/02/2023 20:55, Jason Gunthorpe wrote: > On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: >>> #2 is the presumption that the guest is using an identity map. >> Isn't it reasonable to require that a device support dirty tracking for >> the entire extent if its DMA address width in order to support this >> feature? > > No, 2**64 is too big a number to be reasonable. > +1 > Ideally we'd work it the other way and tell the vIOMMU that the vHW > only supports a limited number of address bits for the translation, eg > through the ACPI tables. Then the dirty tracking could safely cover > the larger of all system memory or the limited IOVA address space. > > Or even better figure out how to get interrupt remapping without IOMMU > support :\ FWIW That's generally my use of `iommu=pt` because all I want is interrupt remapping, not the DMA remapping part. And this is going to be specially relevant with these new boxes that easily surprass the >255 dedicated physical CPUs mark with just two sockets. The only other alternative I could see is to rely on IOMMU attribute for DMA translation. Today you can actually toggle that 'off' in VT-d (and I can imagine the same thing working for AMD-vIOMMU). In Intel it just omits the 39 Address-width cap. And it means it doesn't have virtual addressing. Similar to what Avihai already does for MAX_IOVA, we would do for DMA_TRANSLATION, and let each vIOMMU implementation support that. But to be honest I am not sure how robust relying on that is as that doesn't really represent a hardware implementation. Without vIOMMU you have a (KVM) PV op in new *guest* kernels that (ab)uses some unused bits in IOAPIC for a 24-bit DestID. But this is only on new guests and hypervisors, old *guests* running older < 5.15 kernels won't work. ... So iommu=pt really is the most convenient right now :/ Joao
On Thu, 23 Feb 2023 16:55:54 -0400 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: > > > #2 is the presumption that the guest is using an identity map. > > > > This is a dangerous assumption. > > > > > > I'd think the only viable fallback if the vIOMMU doesn't report its max > > > > IOVA is the full 64-bit address space, otherwise it seems like we need > > > > to add a migration blocker. > > > > > > This is basically saying vIOMMU doesn't work with migration, and we've > > > heard that this isn't OK. There are cases where vIOMMU is on but the > > > guest always uses identity maps. eg for virtual interrupt remapping. > > > > Yes, the vIOMMU can be automatically added to a VM when we exceed 255 > > vCPUs, but I don't see how we can therefore deduce anything about the > > usage mode of the vIOMMU. > > We just loose optimizations. Any mappings that are established outside > the dirty tracking range are permanently dirty. So at worst the guest > can block migration by establishing bad mappings. It is not exactly > production quality but it is still useful for a closed environment > with known guest configurations. That doesn't seem to be what happens in this series, nor does it really make sense to me that userspace would simply decide to truncate the dirty tracking ranges array. > > nested assignment, ie. userspace drivers running within the guest, > > where making assumptions about the IOVA extents of the userspace driver > > seems dangerous. > > > > Let's backup though, if a device doesn't support the full address width > > of the platform, it's the responsibility of the device driver to > > implement a DMA mask such that the device is never asked to DMA outside > > of its address space support. Therefore how could a device ever dirty > > pages outside of its own limitations? > > The device always supports the full address space. We can't enforce > any kind of limit on the VM > > It just can't dirty track it all. > > > Isn't it reasonable to require that a device support dirty tracking for > > the entire extent if its DMA address width in order to support this > > feature? > > No, 2**64 is too big a number to be reasonable. So what are the actual restrictions were dealing with here? I think it would help us collaborate on a solution if we didn't have these device specific restrictions sprinkled through the base implementation. > Ideally we'd work it the other way and tell the vIOMMU that the vHW > only supports a limited number of address bits for the translation, eg > through the ACPI tables. Then the dirty tracking could safely cover > the larger of all system memory or the limited IOVA address space. Why can't we do that? Hotplug is an obvious issue, but maybe it's not vHW telling the vIOMMU a restriction, maybe it's a QEMU machine or vIOMMU option and if it's not set to something the device can support, migration is blocked. > Or even better figure out how to get interrupt remapping without IOMMU > support :\ -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \ -device intel-iommu,caching-mode=on,intremap=on Thanks, Alex
On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote: > On Thu, 23 Feb 2023 16:55:54 -0400 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: > > > > #2 is the presumption that the guest is using an identity map. > > > > > > This is a dangerous assumption. > > > > > > > > I'd think the only viable fallback if the vIOMMU doesn't report its max > > > > > IOVA is the full 64-bit address space, otherwise it seems like we need > > > > > to add a migration blocker. > > > > > > > > This is basically saying vIOMMU doesn't work with migration, and we've > > > > heard that this isn't OK. There are cases where vIOMMU is on but the > > > > guest always uses identity maps. eg for virtual interrupt remapping. > > > > > > Yes, the vIOMMU can be automatically added to a VM when we exceed 255 > > > vCPUs, but I don't see how we can therefore deduce anything about the > > > usage mode of the vIOMMU. > > > > We just loose optimizations. Any mappings that are established outside > > the dirty tracking range are permanently dirty. So at worst the guest > > can block migration by establishing bad mappings. It is not exactly > > production quality but it is still useful for a closed environment > > with known guest configurations. > > That doesn't seem to be what happens in this series, Seems like something is missed then > nor does it really make sense to me that userspace would simply > decide to truncate the dirty tracking ranges array. Who else would do it? > > No, 2**64 is too big a number to be reasonable. > > So what are the actual restrictions were dealing with here? I think it > would help us collaborate on a solution if we didn't have these device > specific restrictions sprinkled through the base implementation. Hmm? It was always like this, the driver gets to decide if it accepts the proprosed tracking ranges or not. Given how the implementation has to work there is no device that could do 2**64... At least for mlx5 it is in the multi-TB range. Enough for physical memory on any real server. > > Ideally we'd work it the other way and tell the vIOMMU that the vHW > > only supports a limited number of address bits for the translation, eg > > through the ACPI tables. Then the dirty tracking could safely cover > > the larger of all system memory or the limited IOVA address space. > > Why can't we do that? Hotplug is an obvious issue, but maybe it's not > vHW telling the vIOMMU a restriction, maybe it's a QEMU machine or > vIOMMU option and if it's not set to something the device can support, > migration is blocked. I don't know, maybe we should if we can. > > Or even better figure out how to get interrupt remapping without IOMMU > > support :\ > > -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \ > -device intel-iommu,caching-mode=on,intremap=on Joao? If this works lets just block migration if the vIOMMU is turned on.. Jason
On 23/02/2023 23:26, Jason Gunthorpe wrote: > On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote: >> On Thu, 23 Feb 2023 16:55:54 -0400 >> Jason Gunthorpe <jgg@nvidia.com> wrote: >>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: >>> Or even better figure out how to get interrupt remapping without IOMMU >>> support :\ >> >> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \ >> -device intel-iommu,caching-mode=on,intremap=on > > Joao? > > If this works lets just block migration if the vIOMMU is turned on.. At a first glance, this looked like my regular iommu incantation. But reading the code this ::bypass_iommu (new to me) apparently tells that vIOMMU is bypassed or not for the PCI devices all the way to avoiding enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and such. You can see from the other email that all of the other options in my head were either bit inconvenient or risky. I wasn't aware of this option for what is worth -- much simpler, should work! And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live migration blocker if `bypass_iommu` is off for any PCI device. Joao
On 24/02/2023 11:25, Joao Martins wrote: > On 23/02/2023 23:26, Jason Gunthorpe wrote: >> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote: >>> On Thu, 23 Feb 2023 16:55:54 -0400 >>> Jason Gunthorpe <jgg@nvidia.com> wrote: >>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: >>>> Or even better figure out how to get interrupt remapping without IOMMU >>>> support :\ >>> >>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \ >>> -device intel-iommu,caching-mode=on,intremap=on >> >> Joao? >> >> If this works lets just block migration if the vIOMMU is turned on.. > > At a first glance, this looked like my regular iommu incantation. > > But reading the code this ::bypass_iommu (new to me) apparently tells that > vIOMMU is bypassed or not for the PCI devices all the way to avoiding > enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether > PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and > such. > > You can see from the other email that all of the other options in my head were > either bit inconvenient or risky. I wasn't aware of this option for what is > worth -- much simpler, should work! > I say *should*, but on a second thought interrupt remapping may still be required to one of these devices that are IOMMU-bypassed. Say to put affinities to vcpus above 255? I was trying this out with more than 255 vcpus with a couple VFs and at a first glance these VFs fail to probe (these are CX6 VFs). It is a working setup without the parameter, but now adding a default_bus_bypass_iommu=on fails to init VFs: [ 32.412733] mlx5_core 0000:00:02.0: Rate limit: 127 rates are supported, range: 0Mbps to 97656Mbps [ 32.416242] mlx5_core 0000:00:02.0: mlx5_load:1204:(pid 3361): Failed to alloc IRQs [ 33.227852] mlx5_core 0000:00:02.0: probe_one:1684:(pid 3361): mlx5_init_one failed with error code -19 [ 33.242182] mlx5_core 0000:00:03.0: firmware version: 22.31.1660 [ 33.415876] mlx5_core 0000:00:03.0: Rate limit: 127 rates are supported, range: 0Mbps to 97656Mbps [ 33.448016] mlx5_core 0000:00:03.0: mlx5_load:1204:(pid 3361): Failed to alloc IRQs [ 34.207532] mlx5_core 0000:00:03.0: probe_one:1684:(pid 3361): mlx5_init_one failed with error code -19 I haven't dived yet into why it fails. > And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live > migration blocker if `bypass_iommu` is off for any PCI device. > Still we could have for starters a live migration blocker until we revisit the vIOMMU case ... should we deem that the default_bus_bypass_iommu=on or the others I suggested as non-options?
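For reference, such a starter blocker would only be a few lines, modeled on the multiple-devices blocker this series already adds. A minimal sketch, assuming the QEMU 8.0-era migrate_add_blocker() signature and the vfio_have_giommu() helper from this series; vfio_giommu_migration_blocker and vfio_block_giommu_migration() are hypothetical names:

#include "qapi/error.h"
#include "migration/blocker.h"

static Error *vfio_giommu_migration_blocker;

/* Hypothetical helper: refuse migration while a vIOMMU translates the device. */
static int vfio_block_giommu_migration(VFIOContainer *container, Error **errp)
{
    int ret;

    if (!vfio_have_giommu(container)) {
        return 0;
    }

    error_setg(&vfio_giommu_migration_blocker,
               "VFIO device dirty tracking is not yet supported with vIOMMU");
    ret = migrate_add_blocker(vfio_giommu_migration_blocker, errp);
    if (ret < 0) {
        error_free(vfio_giommu_migration_blocker);
        vfio_giommu_migration_blocker = NULL;
    }

    return ret;
}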
On Fri, Feb 24, 2023 at 12:53:26PM +0000, Joao Martins wrote: > > But reading the code this ::bypass_iommu (new to me) apparently tells that > > vIOMMU is bypassed or not for the PCI devices all the way to avoiding > > enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether > > PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and > > such. > > > > You can see from the other email that all of the other options in my head were > > either bit inconvenient or risky. I wasn't aware of this option for what is > > worth -- much simpler, should work! > > > > I say *should*, but on a second thought interrupt remapping may still be > required to one of these devices that are IOMMU-bypassed. Say to put affinities > to vcpus above 255? I was trying this out with more than 255 vcpus with a couple > VFs and at a first glance these VFs fail to probe (these are CX6 > VFs). It is pretty bizarre, but the Intel iommu driver is responsible for installing the interrupt remapping irq driver on the devices. So if there is no iommu driver bound then there won't be any interrupt remapping capability for the device even if the interrupt remapping HW is otherwise setup. The only reason Avihai is touching this is to try and keep the interrupt remapping emulation usable, we could certainly punt on that for now if it looks too ugly. Jason
On Fri, 24 Feb 2023 12:53:26 +0000 Joao Martins <joao.m.martins@oracle.com> wrote: > On 24/02/2023 11:25, Joao Martins wrote: > > On 23/02/2023 23:26, Jason Gunthorpe wrote: > >> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote: > >>> On Thu, 23 Feb 2023 16:55:54 -0400 > >>> Jason Gunthorpe <jgg@nvidia.com> wrote: > >>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: > >>>> Or even better figure out how to get interrupt remapping without IOMMU > >>>> support :\ > >>> > >>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \ > >>> -device intel-iommu,caching-mode=on,intremap=on > >> > >> Joao? > >> > >> If this works lets just block migration if the vIOMMU is turned on.. > > > > At a first glance, this looked like my regular iommu incantation. > > > > But reading the code this ::bypass_iommu (new to me) apparently tells that > > vIOMMU is bypassed or not for the PCI devices all the way to avoiding > > enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether > > PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and > > such. > > > > You can see from the other email that all of the other options in my head were > > either bit inconvenient or risky. I wasn't aware of this option for what is > > worth -- much simpler, should work! > > > > I say *should*, but on a second thought interrupt remapping may still be > required to one of these devices that are IOMMU-bypassed. Say to put affinities > to vcpus above 255? I was trying this out with more than 255 vcpus with a couple > VFs and at a first glance these VFs fail to probe (these are CX6 VFs). > > It is a working setup without the parameter, but now adding a > default_bus_bypass_iommu=on fails to init VFs: > > [ 32.412733] mlx5_core 0000:00:02.0: Rate limit: 127 rates are supported, > range: 0Mbps to 97656Mbps > [ 32.416242] mlx5_core 0000:00:02.0: mlx5_load:1204:(pid 3361): Failed to > alloc IRQs > [ 33.227852] mlx5_core 0000:00:02.0: probe_one:1684:(pid 3361): mlx5_init_one > failed with error code -19 > [ 33.242182] mlx5_core 0000:00:03.0: firmware version: 22.31.1660 > [ 33.415876] mlx5_core 0000:00:03.0: Rate limit: 127 rates are supported, > range: 0Mbps to 97656Mbps > [ 33.448016] mlx5_core 0000:00:03.0: mlx5_load:1204:(pid 3361): Failed to > alloc IRQs > [ 34.207532] mlx5_core 0000:00:03.0: probe_one:1684:(pid 3361): mlx5_init_one > failed with error code -19 > > I haven't dived yet into why it fails. Hmm, I was thinking this would only affect DMA, but on second thought I think the DRHD also describes the interrupt remapping hardware and while interrupt remapping is an optional feature of the DRHD, DMA remapping is always supported afaict. I saw IR vectors in /proc/interrupts and thought it worked, but indeed an assigned device is having trouble getting vectors. > > > And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live > > migration blocker if `bypass_iommu` is off for any PCI device. > > > > Still we could have for starters a live migration blocker until we revisit the > vIOMMU case ... should we deem that the default_bus_bypass_iommu=on or the > others I suggested as non-options? I'm very uncomfortable presuming a vIOMMU usage model, especially when it leads to potentially untracked DMA if our assumptions are violated. 
We could use a MemoryListener on the IOVA space to record a high level mark, but we'd need to continue to monitor that mark while we're in pre-copy and I don't think anyone would agree that a migratable VM can suddenly become unmigratable due to a random IOVA allocation would be supportable. That leads me to think that a machine option to limit the vIOMMU address space, and testing that against the device prior to declaring migration support of the device is possibly our best option. Is that feasible? Do all the vIOMMU models have a means to limit the IOVA space? How does QEMU learn a limit for a given device? We probably need to think about whether there are devices that can even support the guest physical memory ranges when we start relocating RAM to arbitrary addresses (ex. hypertransport). Can we infer anything from the vCPU virtual address space or is that still an unreasonable range to track for devices? Thanks, Alex
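One way to picture the "test it against the device before declaring migration support" step, reusing vfio_iommu_get_max_iova() from this series and assuming the surrounding hw/vfio/common.c context. The max_trackable_iova parameter stands in for a per-device limit that the kernel does not report today, so this is a sketch of the idea rather than something that can be wired up as-is:

static bool vfio_giommu_fits_device_tracking(VFIOContainer *container,
                                             hwaddr max_trackable_iova,
                                             Error **errp)
{
    hwaddr iommu_max_iova;

    if (vfio_iommu_get_max_iova(container, &iommu_max_iova)) {
        error_setg(errp, "vIOMMU does not report its max IOVA");
        return false;
    }

    if (iommu_max_iova > max_trackable_iova) {
        error_setg(errp,
                   "vIOMMU IOVA space 0x%" HWADDR_PRIx " exceeds the device's "
                   "dirty tracking limit 0x%" HWADDR_PRIx,
                   iommu_max_iova, max_trackable_iova);
        return false;
    }

    return true;
}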
On 24/02/2023 15:56, Alex Williamson wrote: > On Fri, 24 Feb 2023 12:53:26 +0000 > Joao Martins <joao.m.martins@oracle.com> wrote: > >> On 24/02/2023 11:25, Joao Martins wrote: >>> On 23/02/2023 23:26, Jason Gunthorpe wrote: >>>> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote: >>>>> On Thu, 23 Feb 2023 16:55:54 -0400 >>>>> Jason Gunthorpe <jgg@nvidia.com> wrote: >>>>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote: >>>>>> Or even better figure out how to get interrupt remapping without IOMMU >>>>>> support :\ >>>>> >>>>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \ >>>>> -device intel-iommu,caching-mode=on,intremap=on >>>> >>>> Joao? >>>> >>>> If this works lets just block migration if the vIOMMU is turned on.. >>> >>> At a first glance, this looked like my regular iommu incantation. >>> >>> But reading the code this ::bypass_iommu (new to me) apparently tells that >>> vIOMMU is bypassed or not for the PCI devices all the way to avoiding >>> enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether >>> PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and >>> such. >>> >>> You can see from the other email that all of the other options in my head were >>> either bit inconvenient or risky. I wasn't aware of this option for what is >>> worth -- much simpler, should work! >>> >> >> I say *should*, but on a second thought interrupt remapping may still be >> required to one of these devices that are IOMMU-bypassed. Say to put affinities >> to vcpus above 255? I was trying this out with more than 255 vcpus with a couple >> VFs and at a first glance these VFs fail to probe (these are CX6 VFs). >> >> It is a working setup without the parameter, but now adding a >> default_bus_bypass_iommu=on fails to init VFs: >> >> [ 32.412733] mlx5_core 0000:00:02.0: Rate limit: 127 rates are supported, >> range: 0Mbps to 97656Mbps >> [ 32.416242] mlx5_core 0000:00:02.0: mlx5_load:1204:(pid 3361): Failed to >> alloc IRQs >> [ 33.227852] mlx5_core 0000:00:02.0: probe_one:1684:(pid 3361): mlx5_init_one >> failed with error code -19 >> [ 33.242182] mlx5_core 0000:00:03.0: firmware version: 22.31.1660 >> [ 33.415876] mlx5_core 0000:00:03.0: Rate limit: 127 rates are supported, >> range: 0Mbps to 97656Mbps >> [ 33.448016] mlx5_core 0000:00:03.0: mlx5_load:1204:(pid 3361): Failed to >> alloc IRQs >> [ 34.207532] mlx5_core 0000:00:03.0: probe_one:1684:(pid 3361): mlx5_init_one >> failed with error code -19 >> >> I haven't dived yet into why it fails. > > Hmm, I was thinking this would only affect DMA, but on second thought > I think the DRHD also describes the interrupt remapping hardware and > while interrupt remapping is an optional feature of the DRHD, DMA > remapping is always supported afaict. I saw IR vectors in > /proc/interrupts and thought it worked, but indeed an assigned device > is having trouble getting vectors. > AMD/IVRS might be a little different. I also tried disabling dma-translation from IOMMU feature as I had mentioned in another email, and that renders the same result as default_bus_bypass_iommu. So it's either this KVM pv-op (which is not really interrupt remapping and it's x86 specific) or full vIOMMU. The PV op[*] has the natural disadvantage of requiring a compatible guest kernel. [*] See, KVM_FEATURE_MSI_EXT_DEST_ID. >> >>> And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live >>> migration blocker if `bypass_iommu` is off for any PCI device. 
>>> >> >> Still we could have for starters a live migration blocker until we revisit the >> vIOMMU case ... should we deem that the default_bus_bypass_iommu=on or the >> others I suggested as non-options? > > I'm very uncomfortable presuming a vIOMMU usage model, especially when > it leads to potentially untracked DMA if our assumptions are violated. We can track DMA that got dirtied, but it doesn't mean that said DMA is mapped. I don't think VFIO ties those two in? Like you can ask to track certain ranges, but if it's in IOMMU then device gets target abort. Start dirty tracking, doesn't imply that you allow such DMA with vIOMMU it's just anything that falls outside the IOMMU mapped ranges (or identity map) get always marked as dirty if it wasn't armed in the device dirty tracker. It's a best effort basis -- as I don't think supporting vIOMMU has a ton of options without a more significant compromise. If the vIOMMU is in passthrough mode, then things work just as if no-vIOMMU is there. Avihai's code reflects that. Considering your earlier suggestion that we only start dirty tracking and record ranges *when* dirty tracking start operation happens ... then this gets further simplified. We also have to take into account that we can't have guarantees that we can change ranges under tracking to be dynamic. For improving vIOMMU case we either track the MAX_IOVA or we compose an artifical range based the max-iova of current vIOMMU maps. > We could use a MemoryListener on the IOVA space to record a high level > mark, but we'd need to continue to monitor that mark while we're in > pre-copy and I don't think anyone would agree that a migratable VM can > suddenly become unmigratable due to a random IOVA allocation would be > supportable. That leads me to think that a machine option to limit the > vIOMMU address space, and testing that against the device prior to > declaring migration support of the device is possibly our best option. > > Is that feasible? Do all the vIOMMU models have a means to limit the > IOVA space? I can say that *at least* AMD and Intel support that. Intel supports either 39 or 48 address-width modes (only those two values as I understand). AMD supposedly has a more granular management of VASize and PASize. I have no idea on smmuv3 or virtio-iommu. But isn't this is actually what Avihai does in the series, but minus the device part? The address-width is fetched directly from the vIOMMU model, via the IOMMU_ATTR_MAX_IOVA, and one of the options is to compose a range based on max vIOMMU range. > How does QEMU learn a limit for a given device? IOMMU_ATTR_MAX_IOVA for vIOMMU For device this is not described in ACPI or any place that I know :/ without getting into VF specifics > We > probably need to think about whether there are devices that can even > support the guest physical memory ranges when we start relocating RAM > to arbitrary addresses (ex. hypertransport). In theory we require one-bit more in device DMA engine. so instead of max 39bits we require 40bits for a 1T guest. GPUs and modern NICs are 64-bit DMA address capable devices, but it's a bit hard to learn this as it's device specific. > Can we infer anything > from the vCPU virtual address space or is that still an unreasonable > range to track for devices? Thanks, > We sort of rely on that for iommu=pt or no-vIOMMU case where vCPU address space matches that of IOVA space, but that not sure how much you would from vCPU address space that vIOMMU mapping doesn't give you already
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 1f21e1fa43..1dc00cabcd 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -95,6 +95,8 @@ typedef struct VFIOContainer { unsigned int dma_max_mappings; IOVATree *mappings; QemuMutex mappings_mutex; + /* Represents the range [0, giommu_tracked_range) not inclusive */ + hwaddr giommu_tracked_range; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 4a7fff6eeb..1024788bcc 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -45,6 +45,8 @@ #include "migration/qemu-file.h" #include "sysemu/tpm.h" #include "qemu/iova-tree.h" +#include "hw/boards.h" +#include "hw/mem/memory-device.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -430,6 +432,38 @@ void vfio_unblock_multiple_devices_migration(void) multiple_devices_migration_blocker = NULL; } +static uint64_t vfio_get_ram_size(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + uint64_t plugged_size; + + plugged_size = get_plugged_memory_size(); + if (plugged_size == (uint64_t)-1) { + plugged_size = 0; + } + + return ms->ram_size + plugged_size; +} + +static int vfio_iommu_get_max_iova(VFIOContainer *container, hwaddr *max_iova) +{ + VFIOGuestIOMMU *giommu; + int ret; + + giommu = QLIST_FIRST(&container->giommu_list); + if (!giommu) { + return -ENOENT; + } + + ret = memory_region_iommu_get_attr(giommu->iommu_mr, IOMMU_ATTR_MAX_IOVA, + max_iova); + if (ret) { + return ret; + } + + return 0; +} + static bool vfio_have_giommu(VFIOContainer *container) { return !QLIST_EMPTY(&container->giommu_list); @@ -1510,7 +1544,8 @@ static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data) } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainer *container) +vfio_device_feature_dma_logging_start_create(VFIOContainer *container, + bool giommu) { struct vfio_device_feature *feature; size_t feature_size; @@ -1529,6 +1564,16 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container) control = (struct vfio_device_feature_dma_logging_control *)feature->data; control->page_size = qemu_real_host_page_size(); + if (giommu) { + ranges = g_malloc0(sizeof(*ranges)); + ranges->iova = 0; + ranges->length = container->giommu_tracked_range; + control->num_ranges = 1; + control->ranges = (uint64_t)ranges; + + return feature; + } + QEMU_LOCK_GUARD(&container->mappings_mutex); /* @@ -1578,12 +1623,12 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static int vfio_devices_dma_logging_start(VFIOContainer *container) +static int vfio_devices_dma_logging_start(VFIOContainer *container, bool giommu) { struct vfio_device_feature *feature; int ret; - feature = vfio_device_feature_dma_logging_start_create(container); + feature = vfio_device_feature_dma_logging_start_create(container, giommu); if (!feature) { return -errno; } @@ -1598,18 +1643,128 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) return ret; } +typedef struct { + hwaddr *ranges; + unsigned int ranges_num; +} VFIOGIOMMUDeviceDTRanges; + +/* + * This value is used in the second attempt to start device dirty tracking with + * vIOMMU, or if the giommu fails to report its max iova. 
+ * It should be in the middle, not too big and not too small, allowing devices + * with HW limitations to do device dirty tracking while covering a fair amount + * of the IOVA space. + * + * This arbitrary value was chosen becasue it is the minimum value of Intel + * IOMMU max IOVA and mlx5 devices support tracking a range of this size. + */ +#define VFIO_IOMMU_DEFAULT_MAX_IOVA ((1ULL << 39) - 1) + +#define VFIO_IOMMU_RANGES_NUM 3 +static VFIOGIOMMUDeviceDTRanges * +vfio_iommu_device_dirty_tracking_ranges_create(VFIOContainer *container) +{ + hwaddr iommu_max_iova = VFIO_IOMMU_DEFAULT_MAX_IOVA; + hwaddr retry_iova; + hwaddr ram_size = vfio_get_ram_size(); + VFIOGIOMMUDeviceDTRanges *dt_ranges; + int ret; + + dt_ranges = g_try_new0(VFIOGIOMMUDeviceDTRanges, 1); + if (!dt_ranges) { + errno = ENOMEM; + + return NULL; + } + + dt_ranges->ranges_num = VFIO_IOMMU_RANGES_NUM; + + dt_ranges->ranges = g_try_new0(hwaddr, dt_ranges->ranges_num); + if (!dt_ranges->ranges) { + g_free(dt_ranges); + errno = ENOMEM; + + return NULL; + } + + /* + * With vIOMMU we try to track the entire IOVA space. As the IOVA space can + * be rather big, devices might not be able to track it due to HW + * limitations. In that case: + * (1) Retry tracking a smaller part of the IOVA space. + * (2) Retry tracking a range in the size of the physical memory. + */ + ret = vfio_iommu_get_max_iova(container, &iommu_max_iova); + if (!ret) { + /* Check 2^64 wrap around */ + if (!REAL_HOST_PAGE_ALIGN(iommu_max_iova)) { + iommu_max_iova -= qemu_real_host_page_size(); + } + } + + retry_iova = MIN(iommu_max_iova / 2, VFIO_IOMMU_DEFAULT_MAX_IOVA); + + dt_ranges->ranges[0] = REAL_HOST_PAGE_ALIGN(iommu_max_iova); + dt_ranges->ranges[1] = REAL_HOST_PAGE_ALIGN(retry_iova); + dt_ranges->ranges[2] = REAL_HOST_PAGE_ALIGN(MIN(ram_size, retry_iova / 2)); + + return dt_ranges; +} + +static void vfio_iommu_device_dirty_tracking_ranges_destroy( + VFIOGIOMMUDeviceDTRanges *dt_ranges) +{ + g_free(dt_ranges->ranges); + g_free(dt_ranges); +} + +static int vfio_devices_start_dirty_page_tracking(VFIOContainer *container) +{ + VFIOGIOMMUDeviceDTRanges *dt_ranges; + int ret; + int i; + + if (!vfio_have_giommu(container)) { + return vfio_devices_dma_logging_start(container, false); + } + + dt_ranges = vfio_iommu_device_dirty_tracking_ranges_create(container); + if (!dt_ranges) { + return -errno; + } + + for (i = 0; i < dt_ranges->ranges_num; i++) { + container->giommu_tracked_range = dt_ranges->ranges[i]; + ret = vfio_devices_dma_logging_start(container, true); + if (!ret) { + break; + } + + if (i < dt_ranges->ranges_num - 1) { + warn_report("Failed to start device dirty tracking with vIOMMU " + "with range of size 0x%" HWADDR_PRIx + ", err: %d. 
Retrying with range " + "of size 0x%" HWADDR_PRIx, + dt_ranges->ranges[i], ret, dt_ranges->ranges[i + 1]); + } else { + error_report("Failed to start device dirty tracking with vIOMMU " + "with range of size 0x%" HWADDR_PRIx ", err: %d", + dt_ranges->ranges[i], ret); + } + } + + vfio_iommu_device_dirty_tracking_ranges_destroy(dt_ranges); + + return ret; +} + static void vfio_listener_log_global_start(MemoryListener *listener) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); int ret; if (vfio_devices_all_device_dirty_tracking(container)) { - if (vfio_have_giommu(container)) { - /* Device dirty page tracking currently doesn't support vIOMMU */ - return; - } - - ret = vfio_devices_dma_logging_start(container); + ret = vfio_devices_start_dirty_page_tracking(container); } else { ret = vfio_set_dirty_page_tracking(container, true); } @@ -1627,11 +1782,6 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) int ret; if (vfio_devices_all_device_dirty_tracking(container)) { - if (vfio_have_giommu(container)) { - /* Device dirty page tracking currently doesn't support vIOMMU */ - return; - } - ret = vfio_devices_dma_logging_stop(container); } else { ret = vfio_set_dirty_page_tracking(container, false); @@ -1670,6 +1820,17 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } +static bool vfio_iommu_range_is_device_tracked(VFIOContainer *container, + hwaddr iova, hwaddr size) +{ + /* Check for 2^64 wrap around */ + if (!(iova + size)) { + return false; + } + + return iova + size <= container->giommu_tracked_range; +} + static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) @@ -1679,10 +1840,11 @@ static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, int ret; if (vfio_have_giommu(container)) { - /* Device dirty page tracking currently doesn't support vIOMMU */ - bitmap_set(vbmap->bitmap, 0, vbmap->pages); + if (!vfio_iommu_range_is_device_tracked(container, iova, size)) { + bitmap_set(vbmap->bitmap, 0, vbmap->pages); - return 0; + return 0; + } } QLIST_FOREACH(group, &container->group_list, container_next) {
Currently, device dirty page tracking with vIOMMU is not supported - RAM pages are perpetually marked dirty in this case. When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as the vIOMMU maps/unmaps them. These IOVA ranges can potentially be mapped anywhere in the vIOMMU IOVA space. Due to this dynamic nature of vIOMMU mapping/unmapping, tracking only the currently mapped IOVA ranges, as done in the non-vIOMMU case, doesn't work very well. Instead, to support device dirty tracking when vIOMMU is enabled, track the entire vIOMMU IOVA space. If that fails (IOVA space can be rather big and we might hit HW limitation), try tracking smaller range while marking untracked ranges dirty. Signed-off-by: Avihai Horon <avihaih@nvidia.com> --- include/hw/vfio/vfio-common.h | 2 + hw/vfio/common.c | 196 +++++++++++++++++++++++++++++++--- 2 files changed, 181 insertions(+), 17 deletions(-)
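To make the fallback ladder concrete, here is a back-of-the-envelope walk-through for a guest with a 39-bit vIOMMU address width and 64 GiB of RAM, reusing the macros from this patch plus GiB from qemu/units.h. The numbers are illustrative only and mirror the computation in vfio_iommu_device_dirty_tracking_ranges_create():

hwaddr iommu_max_iova = (1ULL << 39) - 1;        /* 39-bit vIOMMU IOVA space   */
hwaddr ram_size = 64 * GiB;                      /* guest RAM                  */
hwaddr retry_iova = MIN(iommu_max_iova / 2,      /* ~256 GiB                   */
                        VFIO_IOMMU_DEFAULT_MAX_IOVA);

hwaddr ranges[3] = {
    REAL_HOST_PAGE_ALIGN(iommu_max_iova),        /* ~512 GiB: whole IOVA space */
    REAL_HOST_PAGE_ALIGN(retry_iova),            /* ~256 GiB: first fallback   */
    REAL_HOST_PAGE_ALIGN(MIN(ram_size,           /*   64 GiB: last resort,     */
                             retry_iova / 2)),   /*   which only covers an     */
};                                               /*   identity map of RAM      */

Anything the guest maps above the range that finally sticks is simply reported as permanently dirty by vfio_iommu_range_is_device_tracked(), which is what keeps the scheme safe at the cost of migration possibly never converging.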