@@ -33,6 +33,7 @@
#include "hw/i386/apic-msidef.h"
#include "hw/qdev-properties.h"
#include "kvm/kvm_i386.h"
+#include "qemu/iova-tree.h"
/* used AMD-Vi MMIO registers */
const char *amdvi_mmio_low[] = {
@@ -73,6 +74,7 @@ struct AMDVIAddressSpace {
QLIST_ENTRY(AMDVIAddressSpace) next;
/* DMA address translation active */
bool addr_translation;
+ IOVATree *iova_tree; /* Record DMA translation ranges */
};
/* AMDVI cache entry */
@@ -103,6 +105,7 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n);
static void amdvi_address_space_sync(AMDVIAddressSpace *as);
static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as);
+static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event);
uint64_t amdvi_extended_feature_register(AMDVIState *s)
{
@@ -1366,6 +1369,7 @@ static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n)
hwaddr start = n->start;
hwaddr end = n->end;
hwaddr remain;
+ DMAMap map;
assert(start <= end);
remain = end - start + 1;
@@ -1399,6 +1403,11 @@ static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n)
}
assert(!remain);
+
+ map.iova = n->start;
+ map.size = n->end - n->start;
+
+ iova_tree_remove(as->iova_tree, map);
}
/*
@@ -1908,7 +1917,7 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
{
IOMMUTLBEvent event;
- hwaddr iova_next, page_mask, pagesize;
+ hwaddr page_mask, pagesize;
hwaddr iova = addr;
hwaddr end = iova + size - 1;
@@ -1930,7 +1939,6 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
/* PTE has been validated for major errors and pagesize is set */
assert(pagesize);
page_mask = ~(pagesize - 1);
- iova_next = (iova & page_mask) + pagesize;
if (pte == (uint64_t)-1) {
/*
@@ -1963,12 +1971,90 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
event.type = IOMMU_NOTIFIER_MAP;
}
- /* Invoke the notifiers registered for this address space */
- memory_region_notify_iommu(&as->iommu, 0, event);
+ /*
+ * The following call might adjust event.entry.addr_mask in cases where
+ * the guest unmapped a series of large pages.
+ */
+ amdvi_notify_iommu(as, &event);
+ /*
+ * In the special scenario where the guest is unmapping a large page,
+ * addr_mask has been adjusted before sending the notification. Update
+ * pagesize accordingly in order to correctly compute the next IOVA.
+ */
+ pagesize = event.entry.addr_mask + 1;
next:
- iova = iova_next;
+ iova = (iova & ~(pagesize - 1)) + pagesize;
+ }
+}
+
+/*
+ * Invoke notifiers registered for the address space, keeping the IOVA tree
+ * record of mapped ranges in sync. May skip the call or widen an UNMAP event.
+ */
+static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event)
+{
+ IOMMUTLBEntry *entry = &event->entry;
+
+ DMAMap target = {
+ .iova = entry->iova,
+ .size = entry->addr_mask, /* mask, i.e. page size - 1 */
+ .translated_addr = entry->translated_addr,
+ .perm = entry->perm,
+ };
+
+ /*
+ * Search the IOVA Tree for an existing translation for the target. On MAP,
+ * the notification is skipped if an identical mapping is already recorded.
+ * When the guest uses large pages, comparing against the record makes it
+ * possible to determine the size of the original MAP and adjust the UNMAP
+ * request to match it. This avoids failed checks against the mappings kept
+ * by the VFIO kernel driver.
+ */
+ const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+ if (event->type == IOMMU_NOTIFIER_UNMAP) {
+ if (!mapped) {
+ /* No record exists of this mapping, so no notification is sent */
+ return;
+ }
+ /*
+ * Adjust the size based on the original record. This is essential to
+ * determine when large/contiguous pages are used, since the guest has
+ * already cleared the PTE (erasing the pagesize encoded on it) before
+ * issuing the invalidation command.
+ */
+ if (mapped->size != target.size) {
+ assert(mapped->size > target.size);
+ target.size = mapped->size;
+ /* Widen the caller's event so the notifier sees the recorded range */
+ entry->addr_mask = mapped->size;
+ }
+ iova_tree_remove(as->iova_tree, target);
+ } else { /* IOMMU_NOTIFIER_MAP */
+ if (mapped) {
+ /*
+ * If a mapping is present and matches the request, skip the
+ * notification.
+ */
+ if (!memcmp(mapped, &target, sizeof(DMAMap))) {
+ return;
+ } else {
+ /*
+ * Only expected if a buggy guest OS omits or sends incorrect
+ * invalidation(s). Report it, then fall through to the insert
+ * and notification. NOTE(review): old record is not removed.
+ */
+ error_report("Found conflicting translation. This could be due "
+ "to an incorrect or missing invalidation command");
+ }
+ }
+ /* Record the new mapping; NOTE(review): the result is not checked */
+ iova_tree_insert(as->iova_tree, &target);
}
+
+ /* Invoke the notifiers registered for this address space */
+ memory_region_notify_iommu(&as->iommu, 0, *event);
}
/*
@@ -2034,6 +2120,7 @@ static void amdvi_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
return;
}
+ /* Dropping all mappings for the address space. Also clears the IOVA tree */
amdvi_address_space_unmap(as, n);
amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, false);
@@ -2062,6 +2149,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
iommu_as[devfn]->bus_num = (uint8_t)bus_num;
iommu_as[devfn]->devfn = (uint8_t)devfn;
iommu_as[devfn]->iommu_state = s;
+ iommu_as[devfn]->iova_tree = iova_tree_new();
amdvi_dev_as = iommu_as[devfn];
Keep a record of mapped IOVA ranges per address space, using the
iova_tree implementation. Besides enabling optimizations like avoiding
unnecessary notifications, a record of existing <IOVA, size> mappings
makes it possible to determine if a specific IOVA is mapped by the guest
using a large page, and adjust the size when notifying UNMAP events.

When unmapping a large page, the information in the guest PTE encoding
the page size is lost, since the guest clears the PTE before issuing the
invalidation command to the IOMMU. In such a case, the size of the
original mapping can be retrieved from the iova_tree and used to issue
the UNMAP notification. Using the correct size is essential since the
VFIO IOMMU Type1v2 driver in the host kernel will reject unmap requests
that do not fully cover previous mappings.

Signed-off-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
---
 hw/i386/amd_iommu.c | 98 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 93 insertions(+), 5 deletions(-)