Message ID | 1463847590-22782-4-git-send-email-bd.aviv@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sat, 21 May 2016 19:19:50 +0300 "Aviv B.D" <bd.aviv@gmail.com> wrote: > From: "Aviv Ben-David" <bd.aviv@gmail.com> > Some commentary about the changes necessary to achieve $SUBJECT would be nice here. > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com> > --- > hw/i386/intel_iommu.c | 69 ++++++++++++++++++++++++++++++++++++++++-- > hw/i386/intel_iommu_internal.h | 2 ++ > hw/vfio/common.c | 11 +++++-- > include/hw/i386/intel_iommu.h | 4 +++ > include/hw/vfio/vfio-common.h | 1 + > 5 files changed, 81 insertions(+), 6 deletions(-) > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 410f810..128ec7c 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR); > #define VTD_DPRINTF(what, fmt, ...) do {} while (0) > #endif > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > + uint8_t devfn, VTDContextEntry *ce); > + > static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, > uint64_t wmask, uint64_t w1cmask) > { > @@ -126,6 +129,22 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, > return new_val; > } > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id) > +{ > + VTDContextEntry ce; > + int ret_fr; > + > + assert(domain_id); > + > + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); > + if (ret_fr){ > + return -1; > + } > + > + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi); > + return 0; > +} > + > static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, > uint64_t clear, uint64_t mask) > { > @@ -724,9 +743,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > } > > if (!vtd_context_entry_present(ce)) { > - VTD_DPRINTF(GENERAL, > - "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " > - "is not present", devfn, bus_num); > return -VTD_FR_CONTEXT_ENTRY_P; > } else if ((ce->hi & 
VTD_CONTEXT_ENTRY_RSVD_HI) || > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { > @@ -1033,18 +1049,58 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) > &domain_id); > } > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id, > + hwaddr addr, uint8_t am) > +{ > + VFIOGuestIOMMU * giommu; > + VT-d parsing VFIO private data structures, nope this is not a good idea. > + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ > + VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu); VT-d needs to keep track of its own address spaces and call the iommu notifier, it should have no visibility whatsoever that there are vfio devices attached. > + uint16_t vfio_domain_id; > + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id); > + int i=0; > + if (!ret && domain_id == vfio_domain_id){ > + IOMMUTLBEntry entry; > + > + /* do vfio unmap */ > + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am); > + entry.target_as = NULL; > + entry.iova = addr & VTD_PAGE_MASK_4K; > + entry.translated_addr = 0; > + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am); > + entry.perm = IOMMU_NONE; > + memory_region_notify_iommu(giommu->iommu, entry); > + > + /* do vfio map */ > + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am); > + /* call to vtd_iommu_translate */ > + for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){ > + IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); > + if (entry.perm != IOMMU_NONE){ > + memory_region_notify_iommu(giommu->iommu, entry); > + } > + } > + } > + } > +} > + > static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, > hwaddr addr, uint8_t am) > { > VTDIOTLBPageInvInfo info; > > assert(am <= VTD_MAMV); > + > info.domain_id = domain_id; > info.addr = addr; > info.mask = ~((1 << am) - 1); > + > g_hash_table_foreach_remove(s->iotlb, 
vtd_hash_remove_by_page, &info); > + > + vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am); > } > > + > /* Flush IOTLB > * Returns the IOTLB Actual Invalidation Granularity. > * @val: the content of the IOTLB_REG > @@ -1912,6 +1968,13 @@ static Property vtd_properties[] = { > DEFINE_PROP_END_OF_LIST(), > }; > > +void vtd_register_giommu(VFIOGuestIOMMU * giommu) > +{ > + VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu); > + IntelIOMMUState *s = vtd_as->iommu_state; > + > + QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next); > +} > > VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) > { > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > index ae40f73..102e9a5 100644 > --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -339,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; > #define VTD_PAGE_SHIFT_1G 30 > #define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1)) > > +#define VTD_PAGE_MASK(shift) (~((1ULL << (shift)) - 1)) > + > struct VTDRootEntry { > uint64_t val; > uint64_t rsvd; > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 88154a1..54fc8bc 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -35,6 +35,9 @@ > #endif > #include "trace.h" > > +#include "hw/sysbus.h" > +#include "hw/i386/intel_iommu.h" > + > struct vfio_group_head vfio_group_list = > QLIST_HEAD_INITIALIZER(vfio_group_list); > struct vfio_as_head vfio_address_spaces = > @@ -315,12 +318,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) > out: > rcu_read_unlock(); > } > - > +#if 0 > static hwaddr vfio_container_granularity(VFIOContainer *container) > { > return (hwaddr)1 << ctz64(container->iova_pgsizes); > } > - > +#endif Clearly this is unacceptable, the code has a purpose. 
> static void vfio_listener_region_add(MemoryListener *listener, > MemoryRegionSection *section) > { > @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener, > giommu->n.notify = vfio_iommu_map_notify; > QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); > > + vtd_register_giommu(giommu); vfio will not assume VT-d, this is why we register the notifier below. > memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); > +#if 0 > memory_region_iommu_replay(giommu->iommu, &giommu->n, > vfio_container_granularity(container), > false); > - > +#endif Clearly this also has a purpose. > return; > } > > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index b024ffa..22f3f83 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -23,6 +23,7 @@ > #define INTEL_IOMMU_H > #include "hw/qdev.h" > #include "sysemu/dma.h" > +#include "hw/vfio/vfio-common.h" No. This header probably should not have been put under include, VT-d has no business walking our guest IOMMU list. 
> > #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" > #define INTEL_IOMMU_DEVICE(obj) \ > @@ -123,6 +124,8 @@ struct IntelIOMMUState { > MemoryRegionIOMMUOps iommu_ops; > GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ > VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ > + > + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; > }; > > /* Find the VTD Address space associated with the given bus pointer, > @@ -130,4 +133,5 @@ struct IntelIOMMUState { > */ > VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn); > > +void vtd_register_giommu(VFIOGuestIOMMU * giommu); > #endif > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index eb0e1b0..bf56a1d 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -92,6 +92,7 @@ typedef struct VFIOGuestIOMMU { > MemoryRegion *iommu; > Notifier n; > QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; > + QLIST_ENTRY(VFIOGuestIOMMU) iommu_next; No. Use the existing interfaces, create your own address space tracking in VT-d, we are not going to host a list for VT-d to use. Also note that there's no consideration of hot-unplug support in these changes. vfio already works with guest iommus on powerpc, so any change to vfio needs to be justified and generalized to a common guest iommu api. Thanks, Alex > } VFIOGuestIOMMU; > > typedef struct VFIODeviceOps VFIODeviceOps;
On Mon, 23 May 2016 11:53:42 -0600 Alex Williamson <alex.williamson@redhat.com> wrote: > On Sat, 21 May 2016 19:19:50 +0300 > "Aviv B.D" <bd.aviv@gmail.com> wrote: > > > From: "Aviv Ben-David" <bd.aviv@gmail.com> > > > > Some commentary about the changes necessary to achieve $SUBJECT would > be nice here. > > > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com> > > --- > > hw/i386/intel_iommu.c | 69 ++++++++++++++++++++++++++++++++++++++++-- > > hw/i386/intel_iommu_internal.h | 2 ++ > > hw/vfio/common.c | 11 +++++-- > > include/hw/i386/intel_iommu.h | 4 +++ > > include/hw/vfio/vfio-common.h | 1 + > > 5 files changed, 81 insertions(+), 6 deletions(-) > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > > index 410f810..128ec7c 100644 > > --- a/hw/i386/intel_iommu.c > > +++ b/hw/i386/intel_iommu.c > > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR); > > #define VTD_DPRINTF(what, fmt, ...) do {} while (0) > > #endif > > > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > + uint8_t devfn, VTDContextEntry *ce); > > + > > static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, > > uint64_t wmask, uint64_t w1cmask) > > { > > @@ -126,6 +129,22 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, > > return new_val; > > } > > > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id) > > +{ > > + VTDContextEntry ce; > > + int ret_fr; > > + > > + assert(domain_id); > > + > > + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); > > + if (ret_fr){ > > + return -1; > > + } > > + > > + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi); > > + return 0; > > +} > > + > > static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, > > uint64_t clear, uint64_t mask) > > { > > @@ -724,9 +743,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > } > > > > if 
(!vtd_context_entry_present(ce)) { > > - VTD_DPRINTF(GENERAL, > > - "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " > > - "is not present", devfn, bus_num); > > return -VTD_FR_CONTEXT_ENTRY_P; > > } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || > > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { > > @@ -1033,18 +1049,58 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) > > &domain_id); > > } > > > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id, > > + hwaddr addr, uint8_t am) > > +{ > > + VFIOGuestIOMMU * giommu; > > + > > VT-d parsing VFIO private data structures, nope this is not a good idea. > > > + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ > > + VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu); > > VT-d needs to keep track of its own address spaces and call the iommu > notifier, it should have no visibility whatsoever that there are vfio > devices attached. > > > + uint16_t vfio_domain_id; > > + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id); > > + int i=0; > > + if (!ret && domain_id == vfio_domain_id){ > > + IOMMUTLBEntry entry; > > + > > + /* do vfio unmap */ > > + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am); > > + entry.target_as = NULL; > > + entry.iova = addr & VTD_PAGE_MASK_4K; > > + entry.translated_addr = 0; > > + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am); > > + entry.perm = IOMMU_NONE; > > + memory_region_notify_iommu(giommu->iommu, entry); > > + > > + /* do vfio map */ > > + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am); > > + /* call to vtd_iommu_translate */ > > + for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){ > > + IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); > > + if (entry.perm != IOMMU_NONE){ > > + memory_region_notify_iommu(giommu->iommu, entry); > > + } > > + } > > + } > > + } > > +} > > 
+ I think I see what you're trying to do here, find the VTDAddressSpaces that vfio knows about and determine which are affected by an invalidation, but I think it really just represents a flaw in the VT-d code not really matching real hardware. As I understand the real hardware, per device translations use the bus:devfn to get the context entry. The context entry contains both the domain_id and a pointer to the page table. Note that VT-d requires that domain_id and page table pointers are consistent for all entries, there cannot be two separate context entries with the same domain_id that point to different page tables. So really, VTDAddressSpace should be based at the domain, not the context entry. Multiple context entries can point to the same domain, and thus the same VTDAddressSpace. Doing so would make the invalidation trivial, simply lookup the VTDAddressSpace based on the domain_id, get the MemoryRegion, and fire off memory_region_notify_iommu()s, which then get {un}mapped through vfio without any direct interaction. Unfortunately I think that means that intel-iommu needs some data structure surgery. With the current code, I think the best we could do would be to look through every context for matching domain_ids. For each one, use that bus:devfn to get the VTDAddressSpace and thus the MemoryRegion and call a notify for each. That's not only an inefficient lookup, but requires a notify per matching bus:devfn since each uses a separate VTDAddressSpace when we should really only need a notify per domain_id. Also, I haven't figured it out yet, but I think we're also going to need to actually populate the MemoryRegion rather than just use it to send notifies, otherwise replay won't work, which means that a device added to a domain with existing mappings wouldn't work. Thanks, Alex
Hi, Your idea to search the relevent VTDAddressSpace and call it's notifier will probably work. Next week I'll try to implement it (for now with the costly scan of each context). I still not sure if populating the MemoryRegion will suffice for hot plug vfio device but i'll try to look into it. As far as I understand the memory_region_iommu_replay function, it still scans the whole 64bit address space, and therefore may hang the VM for a long time. Aviv. On Thu, May 26, 2016 at 11:58 PM Alex Williamson <alex.williamson@redhat.com> wrote: > On Mon, 23 May 2016 11:53:42 -0600 > Alex Williamson <alex.williamson@redhat.com> wrote: > > > On Sat, 21 May 2016 19:19:50 +0300 > > "Aviv B.D" <bd.aviv@gmail.com> wrote: > > > > > From: "Aviv Ben-David" <bd.aviv@gmail.com> > > > > > > > Some commentary about the changes necessary to achieve $SUBJECT would > > be nice here. > > > > > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com> > > > --- > > > hw/i386/intel_iommu.c | 69 > ++++++++++++++++++++++++++++++++++++++++-- > > > hw/i386/intel_iommu_internal.h | 2 ++ > > > hw/vfio/common.c | 11 +++++-- > > > include/hw/i386/intel_iommu.h | 4 +++ > > > include/hw/vfio/vfio-common.h | 1 + > > > 5 files changed, 81 insertions(+), 6 deletions(-) > > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > > > index 410f810..128ec7c 100644 > > > --- a/hw/i386/intel_iommu.c > > > +++ b/hw/i386/intel_iommu.c > > > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | > VTD_DBGBIT(CSR); > > > #define VTD_DPRINTF(what, fmt, ...) 
do {} while (0) > > > #endif > > > > > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t > bus_num, > > > + uint8_t devfn, VTDContextEntry > *ce); > > > + > > > static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t > val, > > > uint64_t wmask, uint64_t w1cmask) > > > { > > > @@ -126,6 +129,22 @@ static uint32_t > vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, > > > return new_val; > > > } > > > > > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, > uint8_t devfn, uint16_t * domain_id) > > > +{ > > > + VTDContextEntry ce; > > > + int ret_fr; > > > + > > > + assert(domain_id); > > > + > > > + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); > > > + if (ret_fr){ > > > + return -1; > > > + } > > > + > > > + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi); > > > + return 0; > > > +} > > > + > > > static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr > addr, > > > uint64_t clear, uint64_t mask) > > > { > > > @@ -724,9 +743,6 @@ static int > vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > > } > > > > > > if (!vtd_context_entry_present(ce)) { > > > - VTD_DPRINTF(GENERAL, > > > - "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " > > > - "is not present", devfn, bus_num); > > > return -VTD_FR_CONTEXT_ENTRY_P; > > > } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || > > > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { > > > @@ -1033,18 +1049,58 @@ static void > vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) > > > &domain_id); > > > } > > > > > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, > uint16_t domain_id, > > > + hwaddr addr, uint8_t am) > > > +{ > > > + VFIOGuestIOMMU * giommu; > > > + > > > > VT-d parsing VFIO private data structures, nope this is not a good idea. 
> > > > > + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ > > > + VTDAddressSpace *vtd_as = container_of(giommu->iommu, > VTDAddressSpace, iommu); > > > > VT-d needs to keep track of its own address spaces and call the iommu > > notifier, it should have no visibility whatsoever that there are vfio > > devices attached. > > > > > + uint16_t vfio_domain_id; > > > + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), > vtd_as->devfn, &vfio_domain_id); > > > + int i=0; > > > + if (!ret && domain_id == vfio_domain_id){ > > > + IOMMUTLBEntry entry; > > > + > > > + /* do vfio unmap */ > > > + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", > addr, am); > > > + entry.target_as = NULL; > > > + entry.iova = addr & VTD_PAGE_MASK_4K; > > > + entry.translated_addr = 0; > > > + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am); > > > + entry.perm = IOMMU_NONE; > > > + memory_region_notify_iommu(giommu->iommu, entry); > > > + > > > + /* do vfio map */ > > > + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", > addr, am); > > > + /* call to vtd_iommu_translate */ > > > + for (i = 0; i < (1 << am); i++, addr+=(1 << > VTD_PAGE_SHIFT_4K)){ > > > + IOMMUTLBEntry entry = > s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); > > > + if (entry.perm != IOMMU_NONE){ > > > + memory_region_notify_iommu(giommu->iommu, entry); > > > + } > > > + } > > > + } > > > + } > > > +} > > > + > > > I think I see what you're trying to do here, find the VTDAddressSpaces > that vfio knows about and determine which are affected by an > invalidation, but I think it really just represents a flaw in the VT-d > code not really matching real hardware. As I understand the real > hardware, per device translations use the bus:devfn to get the context > entry. The context entry contains both the domain_id and a pointer to > the page table. 
Note that VT-d requires that domain_id and page table > pointers are consistent for all entries, there cannot be two separate > context entries with the same domain_id that point to different page > tables. So really, VTDAddressSpace should be based at the domain, not > the context entry. Multiple context entries can point to the same > domain, and thus the same VTDAddressSpace. Doing so would make the > invalidation trivial, simply lookup the VTDAddressSpace based on the > domain_id, get the MemoryRegion, and fire off > memory_region_notify_iommu()s, which then get {un}mapped through vfio > without any direct interaction. Unfortunately I think that means that > intel-iommu needs some data structure surgery. > > With the current code, I think the best we could do would be to look > through every context for matching domain_ids. For each one, use that > bus:devfn to get the VTDAddressSpace and thus the MemoryRegion and > call a notify for each. That's not only an inefficient lookup, but > requires a notify per matching bus:devfn since each uses a separate > VTDAddressSpace when we should really only need a notify per domain_id. > > Also, I haven't figured it out yet, but I think we're also going to > need to actually populate the MemoryRegion rather than just use it to > send notifies, otherwise replay won't work, which means that a device > added to a domain with existing mappings wouldn't work. Thanks, > > Alex >
On Sat, 28 May 2016 10:52:58 +0000 "Aviv B.D." <bd.aviv@gmail.com> wrote: > Hi, > Your idea to search the relevent VTDAddressSpace and call it's notifier > will > probably work. Next week I'll try to implement it (for now with the costly > scan > of each context). I think an optimization we can make is to use pci_for_each_bus() and pci_for_each_device() to scan only context entries where devices are present. Then for each context entry, retrieve the DID, if it matches the invalidation domain_id, retrieve the VTDAddressSpace and perform a memory_region_notify_iommu() using VTDAddressSpace.iommu. Still horribly inefficient, but an improvement over walking all context entries and avoids gratuitous callbacks between unrelated drivers in QEMU. Overall, I have very little faith that this will be the only change required to make this work though. For instance, if a device is added or removed from a domain, where is that accounted for? Ideally this should trigger the region_add/region_del listener callbacks, but I don't see how that works with how VT-d creates a fixed VTDAddressSpace per device, and in fact how our QEMU memory model doesn't allow the address space of a device to be dynamically aliased against other address spaces or really changed at all. > I still not sure if populating the MemoryRegion will suffice for hot plug > vfio > device but i'll try to look into it. > > As far as I understand the memory_region_iommu_replay function, it still > scans > the whole 64bit address space, and therefore may hang the VM for a long > time. Then we need to fix that problem, one option might be to make a replay callback on MemoryRegionIOMMUOps that walks the page tables for a given context entry rather than blindly traversing a 64bit address space. We can't simply ignore the issue by #ifdef'ing out the code. I suspect there's a lot more involved to make VT-d interact properly with a physical device than what's been proposed so far. 
At every invalidation, we need to figure out what's changed and update the host mappings. We also need better, more dynamic address space management to make the virtual hardware reflect physical hardware when we enable things like passthrough mode or have multiple devices sharing an iommu domain. I think we're just barely scratching the surface here. Thanks, Alex
On Sat, May 28, 2016 at 7:02 PM Alex Williamson <alex.williamson@redhat.com> wrote: > On Sat, 28 May 2016 10:52:58 +0000 > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > Hi, > > Your idea to search the relevent VTDAddressSpace and call it's notifier > > will > > probably work. Next week I'll try to implement it (for now with the > costly > > scan > > of each context). > > I think an optimization we can make is to use pci_for_each_bus() and > pci_for_each_device() to scan only context entries where devices are > present. Then for each context entry, retrieve the DID, if it matches > the invalidation domain_id, retrieve the VTDAddressSpace and perform a > memory_region_notify_iommu() using VTDAddressSpace.iommu. Still > horribly inefficient, but an improvement over walking all context > entries and avoids gratuitous callbacks between unrelated drivers in > QEMU. > Thanks for the references on how I can do it. :) > > Overall, I have very little faith that this will be the only change > required to make this work though. For instance, if a device is added > or removed from a domain, where is that accounted for? Ideally this > should trigger the region_add/region_del listener callbacks, but I > don't see how that works with how VT-d creates a fixed VTDAddressSpace > per device, and in fact how our QEMU memory model doesn't allow the > address space of a device to be dynamically aliased against other > address spaces or really changed at all. > > > I still not sure if populating the MemoryRegion will suffice for hot plug > > vfio > > device but i'll try to look into it. > > > > As far as I understand the memory_region_iommu_replay function, it still > > scans > > the whole 64bit address space, and therefore may hang the VM for a long > > time. > > Then we need to fix that problem, one option might be to make a replay > callback on MemoryRegionIOMMUOps that walks the page tables for a given > context entry rather than blindly traversing a 64bit address space. 
We > can't simply ignore the issue by #ifdef'ing out the code. I suspect > there's a lot more involved to make VT-d interact properly with a > physical device than what's been proposed so far. At every > invalidation, we need to figure out what's changed and update the host > mappings. We also need better, more dynamic address space management > to make the virtual hardware reflect physical hardware when we enable > things like passthrough mode or have multiple devices sharing an iommu > domain. I think we're just barely scratching the surface here. Thanks, > > Alex > I agree with you regarding hotplug, therefore I only ifdef this code out and didn't delete it. With the call to memory_region_iommu_replay QEMU hangs on startup with a very long loop that prevents any device assignment with vIOMMU enabled. I'm hoping not to enlarge the scope of this patch to include hotplug device assignment with iommu enabled. Thanks, Aviv.
On Sat, 28 May 2016 16:10:55 +0000 "Aviv B.D." <bd.aviv@gmail.com> wrote: > On Sat, May 28, 2016 at 7:02 PM Alex Williamson <alex.williamson@redhat.com> > wrote: > > > On Sat, 28 May 2016 10:52:58 +0000 > > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > > > Hi, > > > Your idea to search the relevent VTDAddressSpace and call it's notifier > > > will > > > probably work. Next week I'll try to implement it (for now with the > > costly > > > scan > > > of each context). > > > > I think an optimization we can make is to use pci_for_each_bus() and > > pci_for_each_device() to scan only context entries where devices are > > present. Then for each context entry, retrieve the DID, if it matches > > the invalidation domain_id, retrieve the VTDAddressSpace and perform a > > memory_region_notify_iommu() using VTDAddressSpace.iommu. Still > > horribly inefficient, but an improvement over walking all context > > entries and avoids gratuitous callbacks between unrelated drivers in > > QEMU. > > > > Thanks for the references on how I can do it. :) > > > > > Overall, I have very little faith that this will be the only change > > required to make this work though. For instance, if a device is added > > or removed from a domain, where is that accounted for? Ideally this > > should trigger the region_add/region_del listener callbacks, but I > > don't see how that works with how VT-d creates a fixed VTDAddressSpace > > per device, and in fact how our QEMU memory model doesn't allow the > > address space of a device to be dynamically aliased against other > > address spaces or really changed at all. > > > > > I still not sure if populating the MemoryRegion will suffice for hot plug > > > vfio > > > device but i'll try to look into it. > > > > > > As far as I understand the memory_region_iommu_replay function, it still > > > scans > > > the whole 64bit address space, and therefore may hang the VM for a long > > > time. 
> > > > Then we need to fix that problem, one option might be to make a replay > > callback on MemoryRegionIOMMUOps that walks the page tables for a given > > context entry rather than blindly traversing a 64bit address space. We > > can't simply ignore the issue by #ifdef'ing out the code. I suspect > > there's a lot more involved to make VT-d interact properly with a > > physical device than what's been proposed so far. At every > > invalidation, we need to figure out what's changed and update the host > > mappings. We also need better, more dynamic address space management > > to make the virtual hardware reflect physical hardware when we enable > > things like passthrough mode or have multiple devices sharing an iommu > > domain. I think we're just barely scratching the surface here. Thanks, > > > > Alex > > > > > I agree with you regarding hotplug, therefore I only ifdef this code out > and didn't > delete it. With the call to memory_region_iommu_replay QEMU hangs on startup > with a very long loop that prevent any device assignment with vIOMMU > enabled. > > I'm hoping not to enlarge the scope of this patch to include hotplug device > assignment > with iommu enabled. It's not just hotplug, any case where an existing domain can be applied to a device. The series is incomplete without such support and I won't accept any changes into vfio that disables code that's correct in other contexts. Thanks, Alex
Hi, As far as I tested, the disabled code (call to memory_region_iommu_replay) hangs QEMU on startup if IOMMU is enabled (scanning the 64 bit address space takes more than an hour on modern hardware), at least on x86 hardware. So the code is not 100% correct for any context. Maybe it just should be disabled for x86 architecture? By specification any such behavior of applying a domain to device should include cache invalidation if CM flag is present so I'm not thinking that my patch breaks this scenario. Thanks, Aviv. On Sat, May 28, 2016 at 8:39 PM Alex Williamson <alex.williamson@redhat.com> wrote: > On Sat, 28 May 2016 16:10:55 +0000 > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > On Sat, May 28, 2016 at 7:02 PM Alex Williamson < > alex.williamson@redhat.com> > > wrote: > > > > > On Sat, 28 May 2016 10:52:58 +0000 > > > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > > > > > Hi, > > > > Your idea to search the relevent VTDAddressSpace and call it's > notifier > > > > will > > > > probably work. Next week I'll try to implement it (for now with the > > > costly > > > > scan > > > > of each context). > > > > > > I think an optimization we can make is to use pci_for_each_bus() and > > > pci_for_each_device() to scan only context entries where devices are > > > present. Then for each context entry, retrieve the DID, if it matches > > > the invalidation domain_id, retrieve the VTDAddressSpace and perform a > > > memory_region_notify_iommu() using VTDAddressSpace.iommu. Still > > > horribly inefficient, but an improvement over walking all context > > > entries and avoids gratuitous callbacks between unrelated drivers in > > > QEMU. > > > > > > > Thanks for the references on how I can do it. :) > > > > > > > > Overall, I have very little faith that this will be the only change > > > required to make this work though. For instance, if a device is added > > > or removed from a domain, where is that accounted for? 
Ideally this > > > should trigger the region_add/region_del listener callbacks, but I > > > don't see how that works with how VT-d creates a fixed VTDAddressSpace > > > per device, and in fact how our QEMU memory model doesn't allow the > > > address space of a device to be dynamically aliased against other > > > address spaces or really changed at all. > > > > > > > I still not sure if populating the MemoryRegion will suffice for hot > plug > > > > vfio > > > > device but i'll try to look into it. > > > > > > > > As far as I understand the memory_region_iommu_replay function, it > still > > > > scans > > > > the whole 64bit address space, and therefore may hang the VM for a > long > > > > time. > > > > > > Then we need to fix that problem, one option might be to make a replay > > > callback on MemoryRegionIOMMUOps that walks the page tables for a given > > > context entry rather than blindly traversing a 64bit address space. We > > > can't simply ignore the issue by #ifdef'ing out the code. I suspect > > > there's a lot more involved to make VT-d interact properly with a > > > physical device than what's been proposed so far. At every > > > invalidation, we need to figure out what's changed and update the host > > > mappings. We also need better, more dynamic address space management > > > to make the virtual hardware reflect physical hardware when we enable > > > things like passthrough mode or have multiple devices sharing an iommu > > > domain. I think we're just barely scratching the surface here. > Thanks, > > > > > > Alex > > > > > > > > > I agree with you regarding hotplug, therefore I only ifdef this code out > > and didn't > > delete it. With the call to memory_region_iommu_replay QEMU hangs on > startup > > with a very long loop that prevent any device assignment with vIOMMU > > enabled. > > > > I'm hoping not to enlarge the scope of this patch to include hotplug > device > > assignment > > with iommu enabled. 
> > It's not just hotplug, any case where an existing domain can be applied > to a device. The series is incomplete without such support and I won't > accept any changes into vfio that disables code that's correct in other > contexts. Thanks, > > Alex >
On Sat, 28 May 2016 18:14:18 +0000 "Aviv B.D." <bd.aviv@gmail.com> wrote: > Hi, > As far as I tested the disabled code (call to memory_region_iommu_replay) > hangup > QEMU on startup if IOMMU is enabled (scaning 64 bit address space takes > more > than an hour on modern hardware) , at least on x86 hardware. So the code is > not 100% > correct for any context. Maybe it just should be disabled for x86 > architecture? > > By specification any such behavior of applying a domain to device should > include > cache invalidation if CM flag is present so I'm not thinking that my patch > break > this scenario. The functionality is completely necessary, imagine moving a device from an IOMMU API domain in the guest back to the passthrough domain, if there is no replay of the IOMMU context, the device cannot perform any DMA at all. The current replay mechanism is obviously not designed for iterating over every page of a 64bit address space, which is why I suggest a replay callback on MemoryRegionIOMMUOps so that VT-d can optimize the replay by walking the VT-d page tables and perhaps implementation of hardware passthrough mode and the ability to dynamically switch a device to address_space_memory. The current replay code is correct and functional in a context with a window based IOMMU where the IOMMU address space is much smaller. We cannot have correct operation without a mechanism to rebuild the host IOMMU context when a device is switched to a new domain. Please address it. Thanks, Alex
Hi, In the case of a hot-plugged vfio device there should not be any active mapping to this device prior to the device's addition. Also, before it is added to a guest, the guest should not attach the device to any domain, as the device is not present. With CM enabled, the guest must invalidate the domain or the individual mappings that belong to this new device before any use of those maps. I'm still not sure that this functionality is necessary on x86, and currently there is no scenario (for x86) that uses this functionality. Thanks, Aviv. On Sat, May 28, 2016 at 10:48 PM Alex Williamson <alex.williamson@redhat.com> wrote: > On Sat, 28 May 2016 18:14:18 +0000 > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > Hi, > > As far as I tested the disabled code (call to memory_region_iommu_replay) > > hangup > > QEMU on startup if IOMMU is enabled (scaning 64 bit address space takes > > more > > than an hour on modern hardware) , at least on x86 hardware. So the code > is > > not 100% > > correct for any context. Maybe it just should be disabled for x86 > > architecture? > > > > By specification any such behavior of applying a domain to device should > > include > > cache invalidation if CM flag is present so I'm not thinking that my > patch > > break > > this scenario. > > The functionality is completely necessary, imagine moving a device from > an IOMMU API domain in the guest back to the passthrough domain, if > there is no replay of the IOMMU context, the device cannot perform any > DMA at all. The current replay mechanism is obviously not designed for > iterating over every page of a 64bit address space, which is why I > suggest a replay callback on MemoryRegionIOMMUOps so that VT-d can > optimize the replay by walking the VT-d page tables and perhaps > implementation of hardware passthrough mode and the ability to > dynamically switch a device to address_space_memory. 
The current > replay code is correct and functional in a context with a window based > IOMMU where the IOMMU address space is much smaller. We cannot have > correct operation without a mechanism to rebuild the host IOMMU context > when a device is switched to a new domain. Please address it. Thanks, > > Alex >
On Thu, 02 Jun 2016 13:09:27 +0000 "Aviv B.D." <bd.aviv@gmail.com> wrote: > Hi, > > In case of hot plug vfio device there should not be any active mapping > to this device prior the device addition. Counter example - a device is hot added to a guest booted with iommu=pt. > Also before it added to a guest > the guest should not attach the device to any domain as the device is not > present. The static identity map domain (ie. passthrough domain) can precede the device existing. > With CM enabled the guest must invalidate the domain or individual mappings > that belong to this new device before any use of those maps. > > I'm still not sure that this functionality is necessary in x86 and > currently there > is a scenario (for x86) that use this functionality. Clearly I disagree, it is necessary. Thanks, Alex
Some questions not quite related to this patch content but vfio... On Mon, May 23, 2016 at 11:53:42AM -0600, Alex Williamson wrote: > On Sat, 21 May 2016 19:19:50 +0300 > "Aviv B.D" <bd.aviv@gmail.com> wrote: [...] > > +#if 0 > > static hwaddr vfio_container_granularity(VFIOContainer *container) > > { > > return (hwaddr)1 << ctz64(container->iova_pgsizes); > > } > > - > > +#endif Here we are fetching the smallest page size that host IOMMU support, so even if host IOMMU support large pages, it will not be used as long as guest enabled vIOMMU, right? > > > Clearly this is unacceptable, the code has a purpose. > > > static void vfio_listener_region_add(MemoryListener *listener, > > MemoryRegionSection *section) > > { > > @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener, > > giommu->n.notify = vfio_iommu_map_notify; > > QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); > > > > + vtd_register_giommu(giommu); > > vfio will not assume VT-d, this is why we register the notifier below. > > > memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); > > +#if 0 > > memory_region_iommu_replay(giommu->iommu, &giommu->n, > > vfio_container_granularity(container), > > false); For memory_region_iommu_replay(), we are using vfio_container_granularity() as the granularity, which is the host IOMMU page size. However inside it: void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, hwaddr granularity, bool is_write) { hwaddr addr; IOMMUTLBEntry iotlb; for (addr = 0; addr < memory_region_size(mr); addr += granularity) { iotlb = mr->iommu_ops->translate(mr, addr, is_write); if (iotlb.perm != IOMMU_NONE) { n->notify(n, &iotlb); } /* if (2^64 - MR size) < granularity, it's possible to get an * infinite loop here. This should catch such a wraparound */ if ((addr + granularity) < addr) { break; } } } Is it possible that iotlb mapped to a large page (or any page that is not the same as granularity)? 
Does the above code assume that the host and guest IOMMUs have the same page size, i.e. page size == granularity? Thanks, -- peterx
On Thu, Jun 02, 2016 at 07:34:17AM -0600, Alex Williamson wrote: > On Thu, 02 Jun 2016 13:09:27 +0000 > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > Hi, > > > > In case of hot plug vfio device there should not be any active mapping > > to this device prior the device addition. > > Counter example - a device is hot added to a guest booted with iommu=pt. I got the same question with Aviv... For hot-plug devices, even if it is using iommu=pt, shouldn't it still follow the steps that first init vfio device, then configure device context entry? Let me list the steps for device addition in case I got any mistake: 1. user add new VFIO device A 2. vfio_listener_region_add() called for device A on the IOMMU mr, here we should create the iommu notifier. However since the context entry still does not exist, memory_region_iommu_replay() will got all invalid IOTLB (IOMMU_NONE entries) 3. guest kernel found the device, enabled the device, filled in context entry for device A with "pass-through" (so the SLPTPTR is invalid) 4. guest sent context invalidation to QEMU vIOMMU since we have CM=1 set for guest vIOMMU 5. QEMU vIOMMU handle the invalidation, trigger VFIO notify to do correct VFIO mapping for device A Though here step 5 should still be missing (IIUC Aviv's patch 3 still not handled context invalidation). Just want to know whether we can avoid the replay operation for Intel vIOMMUs (for Intel only, because Intel has context invalidation and cache mode support, not sure about other platform)? Thanks, -- peterx
On Mon, 6 Jun 2016 15:38:25 +0800 Peter Xu <peterx@redhat.com> wrote: > Some questions not quite related to this patch content but vfio... > > On Mon, May 23, 2016 at 11:53:42AM -0600, Alex Williamson wrote: > > On Sat, 21 May 2016 19:19:50 +0300 > > "Aviv B.D" <bd.aviv@gmail.com> wrote: > > [...] > > > > +#if 0 > > > static hwaddr vfio_container_granularity(VFIOContainer *container) > > > { > > > return (hwaddr)1 << ctz64(container->iova_pgsizes); > > > } > > > - > > > +#endif > > Here we are fetching the smallest page size that host IOMMU support, > so even if host IOMMU support large pages, it will not be used as long > as guest enabled vIOMMU, right? Not using this replay mechanism, correct. AFAIK, this replay code has only been tested on POWER where the window is much, much smaller than the 64bit address space and hugepages are not supported. A replay callback into the iommu could could not only walk the address space more efficiently, but also attempt to map with hugepages. It would however need to be cautious not to coalesce separate mappings by the guest into a single mapping through vfio, or else we're going to have inconsistency for mapping vs unmapping that vfio does not expect or support. > > > > > > Clearly this is unacceptable, the code has a purpose. > > > > > static void vfio_listener_region_add(MemoryListener *listener, > > > MemoryRegionSection *section) > > > { > > > @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener, > > > giommu->n.notify = vfio_iommu_map_notify; > > > QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); > > > > > > + vtd_register_giommu(giommu); > > > > vfio will not assume VT-d, this is why we register the notifier below. 
> > > > > memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); > > > +#if 0 > > > memory_region_iommu_replay(giommu->iommu, &giommu->n, > > > vfio_container_granularity(container), > > > false); > > For memory_region_iommu_replay(), we are using > vfio_container_granularity() as the granularity, which is the host > IOMMU page size. However inside it: > > void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, > hwaddr granularity, bool is_write) > { > hwaddr addr; > IOMMUTLBEntry iotlb; > > for (addr = 0; addr < memory_region_size(mr); addr += granularity) { > iotlb = mr->iommu_ops->translate(mr, addr, is_write); > if (iotlb.perm != IOMMU_NONE) { > n->notify(n, &iotlb); > } > > /* if (2^64 - MR size) < granularity, it's possible to get an > * infinite loop here. This should catch such a wraparound */ > if ((addr + granularity) < addr) { > break; > } > } > } > > Is it possible that iotlb mapped to a large page (or any page that is > not the same as granularity)? The above code should have assumed that > host/guest IOMMU are having the same page size == granularity? I think this is answered above. This is not remotely efficient code for a real 64bit IOMMU (BTW, VT-d does not support the full 64bit address space either, I believe it's more like 48bits) and is not going to replay hugepages, but it will give us sufficiently correct IOMMU entries... eventually. Thanks, Alex
On Mon, 6 Jun 2016 16:09:11 +0800 Peter Xu <peterx@redhat.com> wrote: > On Thu, Jun 02, 2016 at 07:34:17AM -0600, Alex Williamson wrote: > > On Thu, 02 Jun 2016 13:09:27 +0000 > > "Aviv B.D." <bd.aviv@gmail.com> wrote: > > > > > Hi, > > > > > > In case of hot plug vfio device there should not be any active mapping > > > to this device prior the device addition. > > > > Counter example - a device is hot added to a guest booted with iommu=pt. > > I got the same question with Aviv... > > For hot-plug devices First, let's just remove "hot-plug" from this discussion because I'm afraid someone is going to say "let's just not support hotplug devices". The issue of moving a device to a pre-populated domain can occur any time a device attaches to a new domain. It might occur if a device is added to a vfio driver in the L1 guest where it's already managing a device. It might occur any time the device is released from an L1 vfio user and returned to the static identity map in the L1 guest kernel. >, even if it is using iommu=pt, shouldn't it still > follow the steps that first init vfio device, then configure device > context entry? Let me list the steps for device addition in case I got > any mistake: > > 1. user add new VFIO device A > > 2. vfio_listener_region_add() called for device A on the IOMMU mr, > here we should create the iommu notifier. However since the context > entry still does not exist, memory_region_iommu_replay() will got > all invalid IOTLB (IOMMU_NONE entries) > > 3. guest kernel found the device, enabled the device, filled in > context entry for device A with "pass-through" (so the SLPTPTR is > invalid) > > 4. guest sent context invalidation to QEMU vIOMMU since we have CM=1 > set for guest vIOMMU > > 5. QEMU vIOMMU handle the invalidation, trigger VFIO notify to do > correct VFIO mapping for device A > > Though here step 5 should still be missing (IIUC Aviv's patch 3 still > not handled context invalidation). 
Just want to know whether we can > avoid the replay operation for Intel vIOMMUs (for Intel only, because > Intel has context invalidation and cache mode support, not sure about > other platform)? I'm not sure why you're so eager to avoid implementing a replay callback for VT-d. What happens when VT-d is not enabled by the guest? vfio/pci.c calls pci_device_iommu_address_space() to get the address space for the device, which results in vtd_find_add_as() which gives us a unique VTDAddressSpace.as for that device. With VT-d not in use by the guest, when do steps 3-5 occur? I agree with your reasoning when VT-d is enabled, but the BIOS/UEFI does not enable VT-d, so are we going to exclude all use cases of an assigned device prior to the guest enabling VT-d? On that same topic, I'm really dubious that we have the flexibility in our memory API to really support an IOMMU like VT-d and the layout of having a VTDAddressSpace per device, rather than exposing shared address spaces, has implications on the efficiency and locked memory requirements for vfio. In the above case with VT-d disabled, the VTDAddressSpace should alias to the system memory AddressSpace and dynamically switch to a per device address space when VT-d is enabled. AFAICT, we don't have anything resembling that sort of feature, so we rely on the IOMMU driver to replay, perhaps even configuring its own MemoryListener to IOMMU notifier gateway, which is also something that doesn't currently exist. Additionally, if there are things unique to VT-d, for instance if VT-d is enabled and we can rely on the sequence of events you've set forth, then yes, the replay mechanism should do nothing. But only the VT-d code can decide that, which implies that vfio always needs to call the replay function and a new MemoryRegionIOMMUOps callback needs to decide what to do given the current state of the vIOMMU. Thanks, Alex
On Mon, Jun 06, 2016 at 12:21:34PM -0600, Alex Williamson wrote: [...] > I'm not sure why you're so eager to avoid implementing a replay > callback for VT-d. What happens when VT-d is not enabled by the > guest? vfio/pci.c calls pci_device_iommu_address_space() to get the > address space for the device, which results in vtd_find_add_as() which > gives us a unique VTDAddressSpace.as for that device. With VT-d not in > use by the guest, when do steps 3-5 occur? I agree with your reasoning > when VT-d is enabled, but the BIOS/UEFI does not enable VT-d, so are we > going to exclude all use cases of an assigned device prior to the guest > enabling VT-d? I think I got the point. I failed to consider the case that we can run IOMMU without enabling it, like BIOS (as you have mentioned), or we can disable IOMMU in kernel boot parameters (though, iommu=pt should still follow the case that IOMMU is enabled). Sounds like a replay callback is a good idea. For Intel version of the callback: when DMAR is enabled, we can just return directly. when DMAR is disabled, we just do whatever we need to do region_add() for the global address_space_memory. > > On that same topic, I'm really dubious that we have the flexibility in > our memory API to really support an IOMMU like VT-d and the layout of > having a VTDAddressSpace per device, rather than exposing shared > address spaces, has implications on the efficiency and locked memory > requirements for vfio. In the above case with VT-d disabled, the > VTDAddressSpace should alias to the system memory AddressSpace and > dynamically switch to a per device address space when VT-d is enabled. > AFAICT, we don't have anything resembling that sort of feature, so we > rely on the IOMMU driver to replay, perhaps even configuring its own > MemoryListener to IOMMU notifier gateway, which is also something that > doesn't currently exist. It sounds more like a notifier for "IOMMU enablement"? 
The notifier should be triggered when IOMMU switch between enabled <-> disabled? When this happens, we rebuild the mapping in some way. > > Additionally, if there are things unique to VT-d, for instance if VT-d > is enabled and we can rely on the sequence of events you've set forth, > then yes, the replay mechanism should do nothing. But only the VT-d > code can decide that, which implies that vfio always needs to call the > replay function and a new MemoryRegionIOMMUOps callback needs to decide > what to do given the current state of the vIOMMU. Thanks, Right. Thanks. -- peterx
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 410f810..128ec7c 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR); #define VTD_DPRINTF(what, fmt, ...) do {} while (0) #endif +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, + uint8_t devfn, VTDContextEntry *ce); + static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -126,6 +129,22 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, return new_val; } +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id) +{ + VTDContextEntry ce; + int ret_fr; + + assert(domain_id); + + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); + if (ret_fr){ + return -1; + } + + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi); + return 0; +} + static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, uint64_t clear, uint64_t mask) { @@ -724,9 +743,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, } if (!vtd_context_entry_present(ce)) { - VTD_DPRINTF(GENERAL, - "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " - "is not present", devfn, bus_num); return -VTD_FR_CONTEXT_ENTRY_P; } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { @@ -1033,18 +1049,58 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) &domain_id); } +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id, + hwaddr addr, uint8_t am) +{ + VFIOGuestIOMMU * giommu; + + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ + VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu); + uint16_t vfio_domain_id; + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id); + int i=0; + if (!ret && domain_id == vfio_domain_id){ + 
IOMMUTLBEntry entry; + + /* do vfio unmap */ + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am); + entry.target_as = NULL; + entry.iova = addr & VTD_PAGE_MASK_4K; + entry.translated_addr = 0; + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am); + entry.perm = IOMMU_NONE; + memory_region_notify_iommu(giommu->iommu, entry); + + /* do vfio map */ + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am); + /* call to vtd_iommu_translate */ + for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){ + IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); + if (entry.perm != IOMMU_NONE){ + memory_region_notify_iommu(giommu->iommu, entry); + } + } + } + } +} + static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { VTDIOTLBPageInvInfo info; assert(am <= VTD_MAMV); + info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + + vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am); } + /* Flush IOTLB * Returns the IOTLB Actual Invalidation Granularity. 
* @val: the content of the IOTLB_REG @@ -1912,6 +1968,13 @@ static Property vtd_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +void vtd_register_giommu(VFIOGuestIOMMU * giommu) +{ + VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + + QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next); +} VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) { diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index ae40f73..102e9a5 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -339,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; #define VTD_PAGE_SHIFT_1G 30 #define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1)) +#define VTD_PAGE_MASK(shift) (~((1ULL << (shift)) - 1)) + struct VTDRootEntry { uint64_t val; uint64_t rsvd; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 88154a1..54fc8bc 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -35,6 +35,9 @@ #endif #include "trace.h" +#include "hw/sysbus.h" +#include "hw/i386/intel_iommu.h" + struct vfio_group_head vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); struct vfio_as_head vfio_address_spaces = @@ -315,12 +318,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) out: rcu_read_unlock(); } - +#if 0 static hwaddr vfio_container_granularity(VFIOContainer *container) { return (hwaddr)1 << ctz64(container->iova_pgsizes); } - +#endif static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener, giommu->n.notify = vfio_iommu_map_notify; QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + vtd_register_giommu(giommu); memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); +#if 0 memory_region_iommu_replay(giommu->iommu, &giommu->n, vfio_container_granularity(container), false); 
- +#endif return; } diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index b024ffa..22f3f83 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -23,6 +23,7 @@ #define INTEL_IOMMU_H #include "hw/qdev.h" #include "sysemu/dma.h" +#include "hw/vfio/vfio-common.h" #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" #define INTEL_IOMMU_DEVICE(obj) \ @@ -123,6 +124,8 @@ struct IntelIOMMUState { MemoryRegionIOMMUOps iommu_ops; GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ + + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; }; /* Find the VTD Address space associated with the given bus pointer, @@ -130,4 +133,5 @@ struct IntelIOMMUState { */ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn); +void vtd_register_giommu(VFIOGuestIOMMU * giommu); #endif diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index eb0e1b0..bf56a1d 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -92,6 +92,7 @@ typedef struct VFIOGuestIOMMU { MemoryRegion *iommu; Notifier n; QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; + QLIST_ENTRY(VFIOGuestIOMMU) iommu_next; } VFIOGuestIOMMU; typedef struct VFIODeviceOps VFIODeviceOps;