@@ -555,11 +555,34 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
return true;
}
+/*
+ * Find the VFIO PCI device that owns the memory region behind @section.
+ *
+ * Walks the devices linked on @bcontainer's device_list, ignores every
+ * device whose type is not VFIO_DEVICE_TYPE_PCI, and compares each
+ * remaining PCI device object against the owner of section->mr.
+ *
+ * Returns the matching VFIOPCIDevice, or NULL if no PCI device of the
+ * container owns the region.
+ */
+static VFIOPCIDevice *vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                               VFIOContainerBase *bcontainer)
+{
+    Object *region_owner = memory_region_owner(section->mr);
+    VFIODevice *dev;
+
+    QLIST_FOREACH(dev, &bcontainer->device_list, container_next) {
+        VFIOPCIDevice *candidate;
+
+        if (dev->type == VFIO_DEVICE_TYPE_PCI) {
+            candidate = container_of(dev, VFIOPCIDevice, vbasedev);
+            if (OBJECT(candidate) == region_owner) {
+                return candidate;
+            }
+        }
+    }
+
+    return NULL;
+}
+
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
+ VFIOPCIDevice *vdev;
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
@@ -630,6 +653,18 @@ static void vfio_listener_region_add(MemoryListener *listener,
/* Here we assume that memory_region_is_ram(section->mr)==true */
+ /* skip if the region is a BAR and the power state forbids DMA MAP */
+ vdev = vfio_section_is_vfio_pci(section, bcontainer);
+ if (vdev) {
+ VFIODevice *vbasedev = &vdev->vbasedev;
+ assert(vbasedev->ops->vfio_is_dma_map_allowed);
+ if (!vbasedev->ops->vfio_is_dma_map_allowed(vbasedev)) {
+ trace_vfio_listener_region_add_skip(section->mr->name);
+ return;
+ }
+ }
+
+
/*
* For RAM memory regions with a RamDiscardManager, we only want to map the
* actually populated parts - and update the mapping whenever we're notified
@@ -804,28 +839,6 @@ typedef struct VFIODirtyRangesListener {
MemoryListener listener;
} VFIODirtyRangesListener;
-static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
- VFIOContainerBase *bcontainer)
-{
- VFIOPCIDevice *pcidev;
- VFIODevice *vbasedev;
- Object *owner;
-
- owner = memory_region_owner(section->mr);
-
- QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
- if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
- continue;
- }
- pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
- if (OBJECT(pcidev) == owner) {
- return true;
- }
- }
-
- return false;
-}
-
static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
hwaddr iova, hwaddr end,
bool update_pci)
@@ -2653,6 +2653,26 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
return ret;
}
+/*
+ * BARs cannot be dma-mapped if the device is in D3hot state since
+ * linux commit 2b2c651baf1c ("vfio/pci: Invalidate mmaps and block
+ * the access in D3hot power state").
+ *
+ * Returns false when the power state field of the PM control/status
+ * register reads as D3hot, true otherwise.
+ */
+static bool vfio_pci_is_dma_map_allowed(VFIODevice *vbasedev)
+{
+    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+    uint16_t pmcsr;
+
+    /*
+     * NOTE(review): assumes pm_cap is 0 when the device exposes no PM
+     * capability. In that case there is no PMCSR to look at: the read
+     * below would hit offset PCI_PM_CTRL of the standard config header
+     * instead and could be misread as D3hot. Such a device cannot have
+     * been moved out of D0 through PMCSR, so allow the mapping.
+     */
+    if (!vdev->pm_cap) {
+        return true;
+    }
+
+    pmcsr = vfio_pci_read_config(&vdev->pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+
+    /* PMCSR power state encoding: 0 = D0, 1 = D1, 2 = D2, 3 = D3hot */
+    return (pmcsr & PCI_PM_CTRL_STATE_MASK) != 3;
+}
+
+
static VFIODeviceOps vfio_pci_ops = {
.vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
.vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
@@ -2660,6 +2680,7 @@ static VFIODeviceOps vfio_pci_ops = {
.vfio_get_object = vfio_pci_get_object,
.vfio_save_config = vfio_pci_save_config,
.vfio_load_config = vfio_pci_load_config,
+ .vfio_is_dma_map_allowed = vfio_pci_is_dma_map_allowed,
};
bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
@@ -3477,3 +3498,4 @@ static void register_vfio_pci_dev_type(void)
}
type_init(register_vfio_pci_dev_type)
+
@@ -121,6 +121,7 @@ vfio_legacy_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
vfio_reset_handler(void) ""
+vfio_listener_region_add_skip(const char *name) "DMA MAP would fail on region %s due to incompatible power state, skip it"
# platform.c
vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
Since kernel commit 2b2c651baf1c ("vfio/pci: Invalidate mmaps and block the access in D3hot power state"), any attempt to do an mmap access to a BAR when the device is in D3hot state will generate a fault. On system_powerdown, if the VFIO device is translated by an IOMMU, the device is moved to D3hot state and then the vIOMMU gets disabled by the guest. As a result of this latter operation, the address space is swapped from translated to untranslated. When re-enabling the aliased regions, the RAM regions are dma-mapped again and this causes DMA_MAP faults when attempting the operation on BARs. To avoid doing the remap on those BARs, we need to retrieve the information whether the device is in an incompatible state. Implement the vfio_is_dma_map_allowed() callback for PCI devices. If the device is in D3hot state, skip the DMA MAP in vfio_listener_region_add(). To ease the implementation, vfio_section_is_vfio_pci() now returns a VFIOPCIDevice pointer and the function is moved before the first caller. Signed-off-by: Eric Auger <eric.auger@redhat.com> --- hw/vfio/common.c | 57 +++++++++++++++++++++++++++----------------- hw/vfio/pci.c | 22 +++++++++++++++++ hw/vfio/trace-events | 1 + 3 files changed, 58 insertions(+), 22 deletions(-)