diff mbox series

[RFC,2/2] hw/vfio/pci: Prevent BARs from being dma mapped in d3hot state

Message ID 20250219175941.135390-3-eric.auger@redhat.com (mailing list archive)
State New
Headers show
Series hw/vfio/pci: Prevent BARs from being dma mapped in d3hot state | expand

Commit Message

Eric Auger Feb. 19, 2025, 5:59 p.m. UTC
Since kernel commit:
2b2c651baf1c ("vfio/pci: Invalidate mmaps and block the access
in D3hot power state")
any attempt to do an mmap access to a BAR when the device is in D3hot
state will generate a fault.

On system_powerdown, if the VFIO device is translated by an IOMMU,
the device is moved to D3hot state and then the vIOMMU gets disabled
by the guest. As a result of the latter operation, the address space is
swapped from translated to untranslated. When re-enabling the aliased
regions, the RAM regions are dma-mapped again and this causes DMA_MAP
faults when attempting the operation on BARs.

To avoid doing the remap on those BARs, we need to know whether the
device is in a power state that is incompatible with DMA mapping.

Implement the vfio_is_dma_map_allowed() callback for PCI devices.
If the device is in D3hot state, skip the DMA MAP in
vfio_listener_region_add().

To ease the implementation, vfio_section_is_vfio_pci now returns
a VFIOPCIDevice pointer and the function is moved before the first
caller.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 hw/vfio/common.c     | 57 +++++++++++++++++++++++++++-----------------
 hw/vfio/pci.c        | 22 +++++++++++++++++
 hw/vfio/trace-events |  1 +
 3 files changed, 58 insertions(+), 22 deletions(-)
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 173fb3a997..96f401f10a 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -555,11 +555,34 @@  static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
     return true;
 }
 
+/* Return the PCI device in @bcontainer that owns @section->mr, or NULL. */
+static VFIOPCIDevice *vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                               VFIOContainerBase *bcontainer)
+{
+    Object *owner = memory_region_owner(section->mr);
+    VFIODevice *vbasedev;
+
+    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+        VFIOPCIDevice *pcidev;
+
+        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+            continue;
+        }
+        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+        if (OBJECT(pcidev) == owner) {
+            return pcidev;
+        }
+    }
+
+    return NULL;
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
     VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                  listener);
+    VFIOPCIDevice *vdev;
     hwaddr iova, end;
     Int128 llend, llsize;
     void *vaddr;
@@ -630,6 +653,18 @@  static void vfio_listener_region_add(MemoryListener *listener,
 
     /* Here we assume that memory_region_is_ram(section->mr)==true */
 
+    /* skip if the region is a BAR and the power state forbids DMA MAP */
+    vdev = vfio_section_is_vfio_pci(section, bcontainer);
+    if (vdev) {
+        VFIODevice *vbasedev = &vdev->vbasedev;
+        assert(vbasedev->ops->vfio_is_dma_map_allowed);
+        if (!vbasedev->ops->vfio_is_dma_map_allowed(vbasedev)) {
+            trace_vfio_listener_region_add_skip(section->mr->name);
+            return;
+        }
+    }
+
+
     /*
      * For RAM memory regions with a RamDiscardManager, we only want to map the
      * actually populated parts - and update the mapping whenever we're notified
@@ -804,28 +839,6 @@  typedef struct VFIODirtyRangesListener {
     MemoryListener listener;
 } VFIODirtyRangesListener;
 
-static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
-                                     VFIOContainerBase *bcontainer)
-{
-    VFIOPCIDevice *pcidev;
-    VFIODevice *vbasedev;
-    Object *owner;
-
-    owner = memory_region_owner(section->mr);
-
-    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
-        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
-            continue;
-        }
-        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
-        if (OBJECT(pcidev) == owner) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                              hwaddr iova, hwaddr end,
                                              bool update_pci)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ab17a98ee5..314dddae4a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2653,6 +2653,26 @@  static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
     return ret;
 }
 
+/*
+ * BARs cannot be dma-mapped if the device is in D3hot state since
+ * linux commit 2b2c651baf1c ("vfio/pci: Invalidate mmaps and block
+ * the access in D3hot power state"). Returns false only in D3hot.
+ */
+static bool vfio_pci_is_dma_map_allowed(VFIODevice *vbasedev)
+{
+    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+    uint16_t pmcsr;
+
+    /* No PM capability (pm_cap presumably 0 then): D3hot is unreachable */
+    if (!vdev->pm_cap) {
+        return true;
+    }
+    pmcsr = vfio_pci_read_config(&vdev->pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+    /* PowerState field value 3 encodes D3hot */
+    return (pmcsr & PCI_PM_CTRL_STATE_MASK) != 3;
+}
+
+
 static VFIODeviceOps vfio_pci_ops = {
     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
@@ -2660,6 +2680,7 @@  static VFIODeviceOps vfio_pci_ops = {
     .vfio_get_object = vfio_pci_get_object,
     .vfio_save_config = vfio_pci_save_config,
     .vfio_load_config = vfio_pci_load_config,
+    .vfio_is_dma_map_allowed = vfio_pci_is_dma_map_allowed,
 };
 
 bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
@@ -3477,3 +3498,4 @@  static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
+
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index c5385e1a4f..a0d5868c2f 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -121,6 +121,7 @@  vfio_legacy_dma_unmap_overflow_workaround(void) ""
 vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
 vfio_reset_handler(void) ""
+vfio_listener_region_add_skip(const char *name) "DMA MAP would fail on region %s due to incompatible power state, skip it"
 
 # platform.c
 vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"