diff mbox

[RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

Message ID CAM3WwMgvSW7c74An0zGk+DbVmPmDpb5Q0ghjLC09H6dXXxaRoQ@mail.gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Aviv B.D. March 12, 2016, 4:13 p.m. UTC
From: "Aviv B.D." <bd.aviv@gmail.com>

 * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
    device are present.
 * Advertise Cache Mode capability in iommu cap register.
 * Register every VFIO device with IOMMU state.
 * On page cache invalidation in vIOMMU, check if the domain belong to
   VFIO device and mirror the guest requests to host.

Not working (Yet!):
 * Tested only with network interface card (ixgbevf) and
    intel_iommu=strict in guest's kernel command line.
 * Lock up under high load.
 * Errors on guest poweroff.
 * High relative latency compare to VFIO without IOMMU.

Signed-off-by: Aviv B.D. <bd.aviv@gmail.com>
---
 hw/i386/intel_iommu.c          | 76
++++++++++++++++++++++++++++++++++++++----
 hw/i386/intel_iommu_internal.h |  1 +
 hw/vfio/common.c               | 12 +++++--
 include/hw/i386/intel_iommu.h  |  4 +++
 include/hw/vfio/vfio-common.h  |  1 +
 5 files changed, 85 insertions(+), 9 deletions(-)

Comments

Marcel Apfelbaum March 14, 2016, 6:52 p.m. UTC | #1
On 03/12/2016 06:13 PM, Aviv B.D. wrote:
> From: "Aviv B.D." <bd.aviv@gmail.com <mailto:bd.aviv@gmail.com>>
>
>   * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
>      device are present.
>   * Advertise Cache Mode capability in iommu cap register.
>   * Register every VFIO device with IOMMU state.
>   * On page cache invalidation in vIOMMU, check if the domain belong to
>     VFIO device and mirror the guest requests to host.
>
> Not working (Yet!):
>   * Tested only with network interface card (ixgbevf) and
>      intel_iommu=strict in guest's kernel command line.
>   * Lock up under high load.
>   * Errors on guest poweroff.
>   * High relative latency compare to VFIO without IOMMU.

Adding (possibly) interested developers to the thread.

Thanks,
Marcel

>
> Signed-off-by: Aviv B.D. <bd.aviv@gmail.com <mailto:bd.aviv@gmail.com>>
> ---
>   hw/i386/intel_iommu.c          | 76 ++++++++++++++++++++++++++++++++++++++----
>   hw/i386/intel_iommu_internal.h |  1 +
>   hw/vfio/common.c               | 12 +++++--
>   include/hw/i386/intel_iommu.h  |  4 +++
>   include/hw/vfio/vfio-common.h  |  1 +
>   5 files changed, 85 insertions(+), 9 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 347718f..046688f 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
>   #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>   #endif
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>   static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                               uint64_t wmask, uint64_t w1cmask)
>   {
> @@ -126,6 +130,19 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>       return new_val;
>   }
> +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr){
> +        return -1;
> +    }
> +
> +    return VTD_CONTEXT_ENTRY_DID(ce.hi);
> +}
> +
>   static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>                                           uint64_t clear, uint64_t mask)
>   {
> @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>       }
>       if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> +        /*VTD_DPRINTF(GENERAL,
>                       "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
> +                    "is not present", devfn, bus_num);*/
>           return -VTD_FR_CONTEXT_ENTRY_P;
>       } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                  (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -1020,14 +1037,53 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>                                         hwaddr addr, uint8_t am)
>   {
>       VTDIOTLBPageInvInfo info;
> +    VFIOGuestIOMMU * giommu;
> +    bool flag = false;
>       assert(am <= VTD_MAMV);
>       info.domain_id = domain_id;
>       info.addr = addr;
>       info.mask = ~((1 << am) - 1);
> +
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
> +        uint16_t vfio_source_id = vtd_make_source_id(pci_bus_num(vtd_as->bus), vtd_as->devfn);
> +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn);
> +        if (vfio_domain_id != (uint16_t)-1 &&
> +                domain_id == vfio_domain_id){
> +            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s, vfio_source_id, addr);
> +            if (iotlb_entry != NULL){
> +                IOMMUTLBEntry entry;
> +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> +                entry.iova = addr & VTD_PAGE_MASK_4K;
> +                entry.translated_addr = vtd_get_slpte_addr(iotlb_entry->slpte) & VTD_PAGE_MASK_4K;
> +                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> +                entry.perm = IOMMU_NONE;
> +                memory_region_notify_iommu(giommu->iommu, entry);
> +                flag = true;
> +
> +            }
> +        }
> +
> +    }
> +
>       g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> -}
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
> +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn);
> +        if (vfio_domain_id != (uint16_t)-1 &&
> +                domain_id == vfio_domain_id && !flag){
> +            /* do vfio map */
> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, 0);
> +            entry.perm = IOMMU_RW;
> +            memory_region_notify_iommu(giommu->iommu, entry);
> +            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> +        }
> +    }
> +}
>   /* Flush IOTLB
>    * Returns the IOTLB Actual Invalidation Granularity.
>    * @val: the content of the IOTLB_REG
> @@ -1895,6 +1951,13 @@ static Property vtd_properties[] = {
>       DEFINE_PROP_END_OF_LIST(),
>   };
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> +{
> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +
> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> +}
>   VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>   {
> @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s)
>       s->iq_last_desc_type = VTD_INV_DESC_NONE;
>       s->next_frcd_reg = 0;
>       s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
> +             VTD_CAP_CM;
>       s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>       vtd_reset_context_cache(s);
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index e5f514c..ae40f73 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -190,6 +190,7 @@
>   #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>   #define VTD_CAP_PSI                 (1ULL << 39)
>   #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> +#define VTD_CAP_CM                  (1ULL << 7)
>   /* Supported Adjusted Guest Address Widths */
>   #define VTD_CAP_SAGAW_SHIFT         8
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 607ec70..98c8d67 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -32,6 +32,9 @@
>   #include "sysemu/kvm.h"
>   #include "trace.h"
> +#include "hw/sysbus.h"
> +#include "hw/i386/intel_iommu.h"
> +
>   struct vfio_group_head vfio_group_list =
>       QLIST_HEAD_INITIALIZER(vfio_group_list);
>   struct vfio_as_head vfio_address_spaces =
> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
>   out:
>       rcu_read_unlock();
>   }
> -
> +#if 0
>   static hwaddr vfio_container_granularity(VFIOContainer *container)
>   {
>       return (hwaddr)1 << ctz64(container->iova_pgsizes);
>   }
> -
> +#endif
>   static void vfio_listener_region_add(MemoryListener *listener,
>                                        MemoryRegionSection *section)
>   {
> @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
>       iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>       llend = int128_make64(section->offset_within_address_space);
>       llend = int128_add(llend, section->size);
> +    llend = int128_add(llend, int128_exts64(-1));
>       llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>       if (int128_ge(int128_make64(iova), llend)) {
> @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListener *listener,
>           giommu->n.notify = vfio_iommu_map_notify;
>           QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> +        vtd_register_giommu(giommu);
>           memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>           memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                      vfio_container_granularity(container),
>                                      false);
> -
> +#endif
>           return;
>       }
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b024ffa..22f3f83 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -23,6 +23,7 @@
>   #define INTEL_IOMMU_H
>   #include "hw/qdev.h"
>   #include "sysemu/dma.h"
> +#include "hw/vfio/vfio-common.h"
>   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>   #define INTEL_IOMMU_DEVICE(obj) \
> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>       MemoryRegionIOMMUOps iommu_ops;
>       GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
>       VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
> +
> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>   };
>   /* Find the VTD Address space associated with the given bus pointer,
> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>    */
>   VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>   #endif
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f037f3c..9225ba3 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>       MemoryRegion *iommu;
>       Notifier n;
>       QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>   } VFIOGuestIOMMU;
>   typedef struct VFIODeviceOps VFIODeviceOps;
> --
> 1.9.1
Jan Kiszka March 14, 2016, 6:58 p.m. UTC | #2
On 2016-03-14 19:52, Marcel Apfelbaum wrote:
> On 03/12/2016 06:13 PM, Aviv B.D. wrote:
>> From: "Aviv B.D." <bd.aviv@gmail.com <mailto:bd.aviv@gmail.com>>
>>
>>   * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
>>      device are present.
>>   * Advertise Cache Mode capability in iommu cap register.

For the final version: Please keep that feature optional, for the sake
of emulation accuracy (no modern hw exposes it any more). Maybe turn it
one once a vfio device is in the scope of the IOMMU?

>>   * Register every VFIO device with IOMMU state.
>>   * On page cache invalidation in vIOMMU, check if the domain belong to
>>     VFIO device and mirror the guest requests to host.
>>
>> Not working (Yet!):
>>   * Tested only with network interface card (ixgbevf) and
>>      intel_iommu=strict in guest's kernel command line.
>>   * Lock up under high load.
>>   * Errors on guest poweroff.
>>   * High relative latency compare to VFIO without IOMMU.
> 
> Adding (possibly) interested developers to the thread.

Thanks,
Jan
Michael S. Tsirkin March 15, 2016, 7 a.m. UTC | #3
On Mon, Mar 14, 2016 at 07:58:23PM +0100, Jan Kiszka wrote:
> On 2016-03-14 19:52, Marcel Apfelbaum wrote:
> > On 03/12/2016 06:13 PM, Aviv B.D. wrote:
> >> From: "Aviv B.D." <bd.aviv@gmail.com <mailto:bd.aviv@gmail.com>>
> >>
> >>   * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
> >>      device are present.
> >>   * Advertise Cache Mode capability in iommu cap register.
> 
> For the final version: Please keep that feature optional, for the sake
> of emulation accuracy (no modern hw exposes it any more). Maybe turn it
> one once a vfio device is in the scope of the IOMMU?

That would be hard to implement: VFIO supports hotplug and there's no way to
change this on the fly.

I would say
	- make the feature an optional flag
	- deny adding a VFIO device if the flag is not set


> >>   * Register every VFIO device with IOMMU state.
> >>   * On page cache invalidation in vIOMMU, check if the domain belong to
> >>     VFIO device and mirror the guest requests to host.
> >>
> >> Not working (Yet!):
> >>   * Tested only with network interface card (ixgbevf) and
> >>      intel_iommu=strict in guest's kernel command line.
> >>   * Lock up under high load.
> >>   * Errors on guest poweroff.
> >>   * High relative latency compare to VFIO without IOMMU.
> > 
> > Adding (possibly) interested developers to the thread.
> 
> Thanks,
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT RDA ITP SES-DE
> Corporate Competence Center Embedded Linux
Peter Xu March 15, 2016, 8:52 a.m. UTC | #4
On Mon, Mar 14, 2016 at 08:52:33PM +0200, Marcel Apfelbaum wrote:
> On 03/12/2016 06:13 PM, Aviv B.D. wrote:
> Adding (possibly) interested developers to the thread.

Thanks CC.

Hi, Aviv, several questions inline.

[...]

> >@@ -1020,14 +1037,53 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >                                        hwaddr addr, uint8_t am)
> >  {
> >      VTDIOTLBPageInvInfo info;
> >+    VFIOGuestIOMMU * giommu;
> >+    bool flag = false;
> >      assert(am <= VTD_MAMV);
> >      info.domain_id = domain_id;
> >      info.addr = addr;
> >      info.mask = ~((1 << am) - 1);
> >+
> >+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> >+        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
> >+        uint16_t vfio_source_id = vtd_make_source_id(pci_bus_num(vtd_as->bus), vtd_as->devfn);
> >+        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn);
> >+        if (vfio_domain_id != (uint16_t)-1 &&

Could you (or anyone) help explain what does vfio_domain_id != -1
mean?

> >+                domain_id == vfio_domain_id){
> >+            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s, vfio_source_id, addr);
> >+            if (iotlb_entry != NULL){

Here, shall we notify VFIO even if the address is not cached in
IOTLB? Anyway, we need to do the unmap() of the address, am I
correct?

> >+                IOMMUTLBEntry entry;
> >+                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> >+                entry.iova = addr & VTD_PAGE_MASK_4K;
> >+                entry.translated_addr = vtd_get_slpte_addr(iotlb_entry->slpte) & VTD_PAGE_MASK_4K;
> >+                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> >+                entry.perm = IOMMU_NONE;
> >+                memory_region_notify_iommu(giommu->iommu, entry);
> >+                flag = true;
> >+
> >+            }
> >+        }
> >+
> >+    }
> >+
> >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> >-}
> >+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> >+        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
> >+        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn);
> >+        if (vfio_domain_id != (uint16_t)-1 &&
> >+                domain_id == vfio_domain_id && !flag){
> >+            /* do vfio map */
> >+            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> >+            /* call to vtd_iommu_translate */
> >+            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, 0);
> >+            entry.perm = IOMMU_RW;
> >+            memory_region_notify_iommu(giommu->iommu, entry);
> >+            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> >+        }
> >+    }
> >+}

I see that we did handled all the page invalidations. Would it
possible that guest kernel send domain/global invalidations? Should
we handle them too?

[...]

> >  static void vfio_listener_region_add(MemoryListener *listener,
> >                                       MemoryRegionSection *section)
> >  {
> >@@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
> >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> >      llend = int128_make64(section->offset_within_address_space);
> >      llend = int128_add(llend, section->size);
> >+    llend = int128_add(llend, int128_exts64(-1));

Here, -1 should fix the assertion core dump. However, shall we also
handle all the rest places that used "llend" (possibly with variable
"end") too? For example, at the end of current function, when we map
dma regions:

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);

To:

    ret = vfio_dma_map(container, iova, end + 1 - iova, vaddr, section->readonly);

Thanks.
Peter
Michael S. Tsirkin March 15, 2016, 10:53 a.m. UTC | #5
On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
> From: "Aviv B.D." <bd.aviv@gmail.com>
> 
>  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
>     device are present.
>  * Advertise Cache Mode capability in iommu cap register.
>  * Register every VFIO device with IOMMU state.
>  * On page cache invalidation in vIOMMU, check if the domain belong to
>    VFIO device and mirror the guest requests to host.
> 
> Not working (Yet!):
>  * Tested only with network interface card (ixgbevf) and
>     intel_iommu=strict in guest's kernel command line.
>  * Lock up under high load.
>  * Errors on guest poweroff.
>  * High relative latency compare to VFIO without IOMMU.
> 
> Signed-off-by: Aviv B.D. <bd.aviv@gmail.com>

Thanks, this is very interesting.
So this needs some cleanup, and some issues that will have to be addressed.
See below.
Thanks!


> ---
>  hw/i386/intel_iommu.c          | 76 ++++++++++++++++++++++++++++++++++++++----
>  hw/i386/intel_iommu_internal.h |  1 +
>  hw/vfio/common.c               | 12 +++++--
>  include/hw/i386/intel_iommu.h  |  4 +++
>  include/hw/vfio/vfio-common.h  |  1 +
>  5 files changed, 85 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 347718f..046688f 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT
> (CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
>  
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -126,6 +130,19 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState
> *s, hwaddr addr,
>      return new_val;
>  }
>  
> +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t
> devfn)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr){
> +        return -1;
> +    }
> +
> +    return VTD_CONTEXT_ENTRY_DID(ce.hi);
> +}
> +
>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>                                          uint64_t clear, uint64_t mask)
>  {
> @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> uint8_t bus_num,
>      }
>  
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> +        /*VTD_DPRINTF(GENERAL,
>                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
> +                    "is not present", devfn, bus_num);*/
>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -1020,14 +1037,53 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState
> *s, uint16_t domain_id,
>                                        hwaddr addr, uint8_t am)
>  {
>      VTDIOTLBPageInvInfo info;
> +    VFIOGuestIOMMU * giommu;
> +    bool flag = false;
>  
>      assert(am <= VTD_MAMV);
>      info.domain_id = domain_id;
>      info.addr = addr;
>      info.mask = ~((1 << am) - 1);
> +
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
> iommu);
> +        uint16_t vfio_source_id = vtd_make_source_id(pci_bus_num(vtd_as->bus),
> vtd_as->devfn);
> +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn);
> +        if (vfio_domain_id != (uint16_t)-1 && 
> +                domain_id == vfio_domain_id){
> +            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s, vfio_source_id,
> addr);
> +            if (iotlb_entry != NULL){
> +                IOMMUTLBEntry entry; 
> +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr,
> am);
> +                entry.iova = addr & VTD_PAGE_MASK_4K;
> +                entry.translated_addr = vtd_get_slpte_addr(iotlb_entry->slpte)
> & VTD_PAGE_MASK_4K;
> +                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> +                entry.perm = IOMMU_NONE;
> +                memory_region_notify_iommu(giommu->iommu, entry);
> +                flag = true;
> +
> +            }
> +        }
> +
> +    }
> +
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> -}
>  
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
> iommu);
> +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn);
> +        if (vfio_domain_id != (uint16_t)-1 && 
> +                domain_id == vfio_domain_id && !flag){
> +            /* do vfio map */
> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr,
> 0); 
> +            entry.perm = IOMMU_RW;
> +            memory_region_notify_iommu(giommu->iommu, entry);

So this just triggers the iommu notifiers in the regular way.
But what if two devices have conflicting entries?
We need to setup host IOMMU differently for them.
But see below

> +            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> +        }
> +    }
> +}
>  /* Flush IOTLB
>   * Returns the IOTLB Actual Invalidation Granularity.
>   * @val: the content of the IOTLB_REG
> @@ -1895,6 +1951,13 @@ static Property vtd_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> +{
> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
> iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    
> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> +}
>  
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>  {
> @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s)
>      s->iq_last_desc_type = VTD_INV_DESC_NONE;
>      s->next_frcd_reg = 0;
>      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
> +             VTD_CAP_CM;
>      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>  
>      vtd_reset_context_cache(s);
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index e5f514c..ae40f73 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -190,6 +190,7 @@
>  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>  #define VTD_CAP_PSI                 (1ULL << 39)
>  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> +#define VTD_CAP_CM                  (1ULL << 7)
>  
>  /* Supported Adjusted Guest Address Widths */
>  #define VTD_CAP_SAGAW_SHIFT         8
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 607ec70..98c8d67 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -32,6 +32,9 @@
>  #include "sysemu/kvm.h"
>  #include "trace.h"
>  
> +#include "hw/sysbus.h"
> +#include "hw/i386/intel_iommu.h"
> +
>  struct vfio_group_head vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>  struct vfio_as_head vfio_address_spaces =
> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void
> *data)
>  out:
>      rcu_read_unlock();
>  }
> -
> +#if 0
>  static hwaddr vfio_container_granularity(VFIOContainer *container)
>  {
>      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>  }
> -
> +#endif
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
> *listener,
>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>      llend = int128_make64(section->offset_within_address_space);
>      llend = int128_add(llend, section->size);
> +    llend = int128_add(llend, int128_exts64(-1));
>      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>  
>      if (int128_ge(int128_make64(iova), llend)) {
> @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListener
> *listener,
>          giommu->n.notify = vfio_iommu_map_notify;
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>  
> +        vtd_register_giommu(giommu);
>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                     vfio_container_granularity(container),
>                                     false);
> -
> +#endif
>          return;
>      }
> 

Looks like there is still a single global domain on the host.
Works up to a point as long as we have a single VFIO device.

 
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b024ffa..22f3f83 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -23,6 +23,7 @@
>  #define INTEL_IOMMU_H
>  #include "hw/qdev.h"
>  #include "sysemu/dma.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>      MemoryRegionIOMMUOps iommu_ops;
>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus*
> reference */
>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by
> bus number */
> +
> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>  };
>  
>  /* Find the VTD Address space associated with the given bus pointer,
> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>   */
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
>  
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>  #endif
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f037f3c..9225ba3 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>      MemoryRegion *iommu;
>      Notifier n;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>  } VFIOGuestIOMMU;
>  
>  typedef struct VFIODeviceOps VFIODeviceOps;
> -- 
> 1.9.1
>
Aviv B.D. March 17, 2016, 11:17 a.m. UTC | #6
On Tue, Mar 15, 2016 at 10:52 AM, Peter Xu <peterx@redhat.com> wrote:

> On Mon, Mar 14, 2016 at 08:52:33PM +0200, Marcel Apfelbaum wrote:
> > On 03/12/2016 06:13 PM, Aviv B.D. wrote:
> > Adding (possibly) interested developers to the thread.
>
> Thanks CC.
>
> Hi, Aviv, several questions inline.
>
> [...]
>
> > >@@ -1020,14 +1037,53 @@ static void
> vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> > >                                        hwaddr addr, uint8_t am)
> > >  {
> > >      VTDIOTLBPageInvInfo info;
> > >+    VFIOGuestIOMMU * giommu;
> > >+    bool flag = false;
> > >      assert(am <= VTD_MAMV);
> > >      info.domain_id = domain_id;
> > >      info.addr = addr;
> > >      info.mask = ~((1 << am) - 1);
> > >+
> > >+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > >+        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> > >+        uint16_t vfio_source_id =
> vtd_make_source_id(pci_bus_num(vtd_as->bus), vtd_as->devfn);
> > >+        uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus), vtd_as->devfn);
> > >+        if (vfio_domain_id != (uint16_t)-1 &&
>
> Could you (or anyone) help explain what does vfio_domain_id != -1
> mean?


vtd_get_did_dev returns -1 if the device is not mapped to any domain
(generally, the CE is not present).
probably a better interface is to return whether the device has a domain or
not and returns the domain_id via the pointer argument.


>
> > >+                domain_id == vfio_domain_id){
> > >+            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> vfio_source_id, addr);
> > >+            if (iotlb_entry != NULL){
>
> Here, shall we notify VFIO even if the address is not cached in
> IOTLB? Anyway, we need to do the unmap() of the address, am I
> correct?
>
With this code I do a unmap operation if the address was cached in the
IOTLB, if not I'm assuming that the current invalidation invalidate an
(previously) non present address and I should do a map operation (during
the map operation I'm calling s->iommu_ops.translate that caches the
address).

>
> > >+                IOMMUTLBEntry entry;
> > >+                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d", addr, am);
> > >+                entry.iova = addr & VTD_PAGE_MASK_4K;
> > >+                entry.translated_addr =
> vtd_get_slpte_addr(iotlb_entry->slpte) & VTD_PAGE_MASK_4K;
> > >+                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> > >+                entry.perm = IOMMU_NONE;
> > >+                memory_region_notify_iommu(giommu->iommu, entry);
> > >+                flag = true;
> > >+
> > >+            }
> > >+        }
> > >+
> > >+    }
> > >+
> > >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,
> &info);
> > >-}
> > >+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > >+        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> > >+        uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus), vtd_as->devfn);
> > >+        if (vfio_domain_id != (uint16_t)-1 &&
> > >+                domain_id == vfio_domain_id && !flag){
> > >+            /* do vfio map */
> > >+            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d",
> addr, am);
> > >+            /* call to vtd_iommu_translate */
> > >+            IOMMUTLBEntry entry =
> s->iommu_ops.translate(giommu->iommu, addr, 0);
> > >+            entry.perm = IOMMU_RW;
> > >+            memory_region_notify_iommu(giommu->iommu, entry);
> > >+            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> > >+        }
> > >+    }
> > >+}
>
> I see that we did handled all the page invalidations. Would it
> possible that guest kernel send domain/global invalidations? Should
> we handle them too?
>

You are correct, currently this code is pretty much at POC level, and i
support only the page invalidation because this is what linux is using. The
final version should support also those invalidation ops.

>
> [...]
>
> > >  static void vfio_listener_region_add(MemoryListener *listener,
> > >                                       MemoryRegionSection *section)
> > >  {
> > >@@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
> *listener,
> > >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> > >      llend = int128_make64(section->offset_within_address_space);
> > >      llend = int128_add(llend, section->size);
> > >+    llend = int128_add(llend, int128_exts64(-1));
>
> Here, -1 should fix the assertion core dump. However, shall we also
> handle all the rest places that used "llend" (possibly with variable
> "end") too? For example, at the end of current function, when we map
> dma regions:
>
>     ret = vfio_dma_map(container, iova, end - iova, vaddr,
> section->readonly);
>
> To:
>
>     ret = vfio_dma_map(container, iova, end + 1 - iova, vaddr,
> section->readonly);
>
> I will add this to the next version of the patch, thanks!

Thanks.
> Peter
>
Aviv B.D. March 17, 2016, 11:58 a.m. UTC | #7
On Tue, Mar 15, 2016 at 12:53 PM, Michael S. Tsirkin <mst@redhat.com> wrote:

> On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
> > From: "Aviv B.D." <bd.aviv@gmail.com>
> >
> >  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
> >     device are present.
> >  * Advertise Cache Mode capability in iommu cap register.
> >  * Register every VFIO device with IOMMU state.
> >  * On page cache invalidation in vIOMMU, check if the domain belong to
> >    VFIO device and mirror the guest requests to host.
> >
> > Not working (Yet!):
> >  * Tested only with network interface card (ixgbevf) and
> >     intel_iommu=strict in guest's kernel command line.
> >  * Lock up under high load.
> >  * Errors on guest poweroff.
> >  * High relative latency compare to VFIO without IOMMU.
> >
> > Signed-off-by: Aviv B.D. <bd.aviv@gmail.com>
>
> Thanks, this is very interesting.
> So this needs some cleanup, and some issues that will have to be addressed.
> See below.
> Thanks!
>
> Thanks!

>
> > ---
> >  hw/i386/intel_iommu.c          | 76
> ++++++++++++++++++++++++++++++++++++++----
> >  hw/i386/intel_iommu_internal.h |  1 +
> >  hw/vfio/common.c               | 12 +++++--
> >  include/hw/i386/intel_iommu.h  |  4 +++
> >  include/hw/vfio/vfio-common.h  |  1 +
> >  5 files changed, 85 insertions(+), 9 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 347718f..046688f 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT
> > (CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > +                                    uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> >                              uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -126,6 +130,19 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState
> > *s, hwaddr addr,
> >      return new_val;
> >  }
> >
> > +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t
> > devfn)
> > +{
> > +    VTDContextEntry ce;
> > +    int ret_fr;
> > +
> > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +    if (ret_fr){
> > +        return -1;
> > +    }
> > +
> > +    return VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +}
> > +
> >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> >                                          uint64_t clear, uint64_t mask)
> >  {
> > @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
> *s,
> > uint8_t bus_num,
> >      }
> >
> >      if (!vtd_context_entry_present(ce)) {
> > -        VTD_DPRINTF(GENERAL,
> > +        /*VTD_DPRINTF(GENERAL,
> >                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -                    "is not present", devfn, bus_num);
> > +                    "is not present", devfn, bus_num);*/
> >          return -VTD_FR_CONTEXT_ENTRY_P;
> >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1020,14 +1037,53 @@ static void
> vtd_iotlb_page_invalidate(IntelIOMMUState
> > *s, uint16_t domain_id,
> >                                        hwaddr addr, uint8_t am)
> >  {
> >      VTDIOTLBPageInvInfo info;
> > +    VFIOGuestIOMMU * giommu;
> > +    bool flag = false;
> >
> >      assert(am <= VTD_MAMV);
> >      info.domain_id = domain_id;
> >      info.addr = addr;
> >      info.mask = ~((1 << am) - 1);
> > +
> > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +        uint16_t vfio_source_id =
> vtd_make_source_id(pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +        uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +        if (vfio_domain_id != (uint16_t)-1 &&
> > +                domain_id == vfio_domain_id){
> > +            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> vfio_source_id,
> > addr);
> > +            if (iotlb_entry != NULL){
> > +                IOMMUTLBEntry entry;
> > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d", addr,
> > am);
> > +                entry.iova = addr & VTD_PAGE_MASK_4K;
> > +                entry.translated_addr =
> vtd_get_slpte_addr(iotlb_entry->slpte)
> > & VTD_PAGE_MASK_4K;
> > +                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> > +                entry.perm = IOMMU_NONE;
> > +                memory_region_notify_iommu(giommu->iommu, entry);
> > +                flag = true;
> > +
> > +            }
> > +        }
> > +
> > +    }
> > +
> >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,
> &info);
> > -}
> >
> > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +        uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +        if (vfio_domain_id != (uint16_t)-1 &&
> > +                domain_id == vfio_domain_id && !flag){
> > +            /* do vfio map */
> > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr,
> am);
> > +            /* call to vtd_iommu_translate */
> > +            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu,
> addr,
> > 0);
> > +            entry.perm = IOMMU_RW;
> > +            memory_region_notify_iommu(giommu->iommu, entry);
>
> So this just triggers the iommu notifiers in the regular way.
> But what if two devices have conflicting entries?
> We need to setup host IOMMU differently for them.
> But see below
>
> > +            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> > +        }
> > +    }
> > +}
> >  /* Flush IOTLB
> >   * Returns the IOTLB Actual Invalidation Granularity.
> >   * @val: the content of the IOTLB_REG
> > @@ -1895,6 +1951,13 @@ static Property vtd_properties[] = {
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >
> > +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> > +{
> > +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +    IntelIOMMUState *s = vtd_as->iommu_state;
> > +
> > +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> > +}
> >
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
> devfn)
> >  {
> > @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s)
> >      s->iq_last_desc_type = VTD_INV_DESC_NONE;
> >      s->next_frcd_reg = 0;
> >      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> > -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> > +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS
> |
> > +             VTD_CAP_CM;
> >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> >
> >      vtd_reset_context_cache(s);
> > diff --git a/hw/i386/intel_iommu_internal.h
> b/hw/i386/intel_iommu_internal.h
> > index e5f514c..ae40f73 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -190,6 +190,7 @@
> >  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
> >  #define VTD_CAP_PSI                 (1ULL << 39)
> >  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> > +#define VTD_CAP_CM                  (1ULL << 7)
> >
> >  /* Supported Adjusted Guest Address Widths */
> >  #define VTD_CAP_SAGAW_SHIFT         8
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index 607ec70..98c8d67 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -32,6 +32,9 @@
> >  #include "sysemu/kvm.h"
> >  #include "trace.h"
> >
> > +#include "hw/sysbus.h"
> > +#include "hw/i386/intel_iommu.h"
> > +
> >  struct vfio_group_head vfio_group_list =
> >      QLIST_HEAD_INITIALIZER(vfio_group_list);
> >  struct vfio_as_head vfio_address_spaces =
> > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void
> > *data)
> >  out:
> >      rcu_read_unlock();
> >  }
> > -
> > +#if 0
> >  static hwaddr vfio_container_granularity(VFIOContainer *container)
> >  {
> >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
> >  }
> > -
> > +#endif
> >  static void vfio_listener_region_add(MemoryListener *listener,
> >                                       MemoryRegionSection *section)
> >  {
> > @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
> > *listener,
> >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> >      llend = int128_make64(section->offset_within_address_space);
> >      llend = int128_add(llend, section->size);
> > +    llend = int128_add(llend, int128_exts64(-1));
> >      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> >
> >      if (int128_ge(int128_make64(iova), llend)) {
> > @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListener
> > *listener,
> >          giommu->n.notify = vfio_iommu_map_notify;
> >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> >
> > +        vtd_register_giommu(giommu);
> >          memory_region_register_iommu_notifier(giommu->iommu,
> &giommu->n);
> > +#if 0
> >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
> >
> vfio_container_granularity(container),
> >                                     false);
> > -
> > +#endif
> >          return;
> >      }
> >
>
> Looks like there is still a single global domain on the host.
> Works up to a point as long as we have a single VFIO device.
>

You are correct, I should call (from page_invalidation) only to the correct
vfio notifier that belong to this domain. This way (as long as no conflicts
mapping is present) several VFIO devices can works at the same time. I'll
add this feature to the next version of the patch.
currently I am not sure how the code should handle conflicting mapping
between two different domains.


>
> > diff --git a/include/hw/i386/intel_iommu.h
> b/include/hw/i386/intel_iommu.h
> > index b024ffa..22f3f83 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -23,6 +23,7 @@
> >  #define INTEL_IOMMU_H
> >  #include "hw/qdev.h"
> >  #include "sysemu/dma.h"
> > +#include "hw/vfio/vfio-common.h"
> >
> >  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >  #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -123,6 +124,8 @@ struct IntelIOMMUState {
> >      MemoryRegionIOMMUOps iommu_ops;
> >      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus*
> > reference */
> >      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
> indexed by
> > bus number */
> > +
> > +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
> >  };
> >
> >  /* Find the VTD Address space associated with the given bus pointer,
> > @@ -130,4 +133,5 @@ struct IntelIOMMUState {
> >   */
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
> devfn);
> >
> > +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
> >  #endif
> > diff --git a/include/hw/vfio/vfio-common.h
> b/include/hw/vfio/vfio-common.h
> > index f037f3c..9225ba3 100644
> > --- a/include/hw/vfio/vfio-common.h
> > +++ b/include/hw/vfio/vfio-common.h
> > @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
> >      MemoryRegion *iommu;
> >      Notifier n;
> >      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> > +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
> >  } VFIOGuestIOMMU;
> >
> >  typedef struct VFIODeviceOps VFIODeviceOps;
> > --
> > 1.9.1
> >
>
Peter Xu March 18, 2016, 3:06 a.m. UTC | #8
On Thu, Mar 17, 2016 at 01:17:30PM +0200, Aviv B.D. wrote:
[...]
> vtd_get_did_dev returns -1 if the device is not mapped to any domain
> (generally, the CE is not present).
> probably a better interface is to return whether the device has a domain or
> not and returns the domain_id via the pointer argument.

Possibly, as long as guest kernel might be using (uint16_t)-1 as
domain ID. ;)

> 
> 
> >
> > > >+                domain_id == vfio_domain_id){
> > > >+            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> > vfio_source_id, addr);
> > > >+            if (iotlb_entry != NULL){
> >
> > Here, shall we notify VFIO even if the address is not cached in
> > IOTLB? Anyway, we need to do the unmap() of the address, am I
> > correct?
> >
> With this code I do a unmap operation if the address was cached in the
> IOTLB, if not I'm assuming that the current invalidation invalidate an
> (previously) non present address and I should do a map operation (during
> the map operation I'm calling s->iommu_ops.translate that caches the
> address).

I am not 100% sure of this, but... is this related to IOTLB at all?
What I see is that, IOTLB is only a cache layer of IOMMU, and it is
possible that we mapped some areas which are not in the IOTLB at
all.

Or, let's make an assumption here: what if I turn IOTLB off (or say,
set hash size to zero)? IOMMU should still work, though slower,
right?  However, due to above checking, we'll never do ummap() in
this case (while IMHO we should).

Thanks.

-- peterx
Aviv B.D. March 19, 2016, 9:40 a.m. UTC | #9
On Fri, Mar 18, 2016 at 5:06 AM, Peter Xu <peterx@redhat.com> wrote:

> On Thu, Mar 17, 2016 at 01:17:30PM +0200, Aviv B.D. wrote:
> [...]
> > vtd_get_did_dev returns -1 if the device is not mapped to any domain
> > (generally, the CE is not present).
> > probably a better interface is to return whether the device has a domain
> or
> > not and returns the domain_id via the pointer argument.
>
> Possibly, as long as guest kernel might be using (uint16_t)-1 as
> domain ID. ;)
>
> >
> >
> > >
> > > > >+                domain_id == vfio_domain_id){
> > > > >+            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> > > vfio_source_id, addr);
> > > > >+            if (iotlb_entry != NULL){
> > >
> > > Here, shall we notify VFIO even if the address is not cached in
> > > IOTLB? Anyway, we need to do the unmap() of the address, am I
> > > correct?
> > >
> > With this code I do a unmap operation if the address was cached in the
> > IOTLB, if not I'm assuming that the current invalidation invalidate an
> > (previously) non present address and I should do a map operation (during
> > the map operation I'm calling s->iommu_ops.translate that caches the
> > address).
>
> I am not 100% sure of this, but... is this related to IOTLB at all?
> What I see is that, IOTLB is only a cache layer of IOMMU, and it is
> possible that we mapped some areas which are not in the IOTLB at
> all.
>
> Or, let's make an assumption here: what if I turn IOTLB off (or say,
> set hash size to zero)? IOMMU should still work, though slower,
> right?  However, due to above checking, we'll never do ummap() in
> this case (while IMHO we should).
>
Thanks.
>
> -- peterx
>


Hi,
As far as I understand the code, currently there is no way to turn off the
IOTLB.
Furthermore. the IOTLB is not implemented as LRU, and actually caches
(indefinitely)
any accessed address, without any size constrains. I use those assumptions
to know
whether the current invalidation is for unmap operation or map operation.

But, I need to check if it possible (for the guest kernel) to squeeze
together unmap and
map operations and issue for them only one invalidation (probably the
answer is yes,
and it may explain one of my bugs)

Aviv.
Peter Xu March 21, 2016, 2:30 a.m. UTC | #10
On Sat, Mar 19, 2016 at 11:40:04AM +0200, Aviv B.D. wrote:
[...]
> As far as I understand the code, currently there is no way to turn off the
> IOTLB.
> Furthermore. the IOTLB is not implemented as LRU, and actually caches
> (indefinitely)
> any accessed address, without any size constrains. I use those assumptions
> to know
> whether the current invalidation is for unmap operation or map operation.

Please have a look at VTD_IOTLB_MAX_SIZE. It seems to be the size of
the hash.

Btw, I guess it's a much bigger problem if IOTLB has unlimited cache
size...

Thanks.

-- peterx
Aviv B.D. March 22, 2016, 8:13 a.m. UTC | #11
On Mon, Mar 21, 2016 at 4:30 AM, Peter Xu <peterx@redhat.com> wrote:

> On Sat, Mar 19, 2016 at 11:40:04AM +0200, Aviv B.D. wrote:
> [...]
> > As far as I understand the code, currently there is no way to turn off
> the
> > IOTLB.
> > Furthermore. the IOTLB is not implemented as LRU, and actually caches
> > (indefinitely)
> > any accessed address, without any size constrains. I use those
> assumptions
> > to know
> > whether the current invalidation is for unmap operation or map operation.
>
> Please have a look at VTD_IOTLB_MAX_SIZE. It seems to be the size of
> the hash.
>
> Btw, I guess it's a much bigger problem if IOTLB has unlimited cache
> size...
>
> Thanks.

-- peterx
>

You are correct, VTD_IOTLB_MAX_SIZE limits the cache size (and reset the
whole cache
if this threshold exceeds...) I will think on another mechanism to identify
the correct
action for each invalidation.

Thanks,
Aviv.
Michael S. Tsirkin March 23, 2016, 2:33 p.m. UTC | #12
On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
> From: "Aviv B.D." <bd.aviv@gmail.com>
> 
>  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
>     device are present.
>  * Advertise Cache Mode capability in iommu cap register.
>  * Register every VFIO device with IOMMU state.
>  * On page cache invalidation in vIOMMU, check if the domain belong to
>    VFIO device and mirror the guest requests to host.
> 
> Not working (Yet!):
>  * Tested only with network interface card (ixgbevf) and
>     intel_iommu=strict in guest's kernel command line.
>  * Lock up under high load.
>  * Errors on guest poweroff.
>  * High relative latency compare to VFIO without IOMMU.
> 
> Signed-off-by: Aviv B.D. <bd.aviv@gmail.com>
> ---
>  hw/i386/intel_iommu.c          | 76 ++++++++++++++++++++++++++++++++++++++----
>  hw/i386/intel_iommu_internal.h |  1 +
>  hw/vfio/common.c               | 12 +++++--
>  include/hw/i386/intel_iommu.h  |  4 +++
>  include/hw/vfio/vfio-common.h  |  1 +
>  5 files changed, 85 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 347718f..046688f 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT
> (CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
>  
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -126,6 +130,19 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState
> *s, hwaddr addr,
>      return new_val;
>  }
>  
> +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t
> devfn)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr){
> +        return -1;
> +    }
> +
> +    return VTD_CONTEXT_ENTRY_DID(ce.hi);
> +}
> +
>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>                                          uint64_t clear, uint64_t mask)
>  {
> @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> uint8_t bus_num,
>      }
>  
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> +        /*VTD_DPRINTF(GENERAL,
>                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
> +                    "is not present", devfn, bus_num);*/
>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -1020,14 +1037,53 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState
> *s, uint16_t domain_id,
>                                        hwaddr addr, uint8_t am)
>  {
>      VTDIOTLBPageInvInfo info;
> +    VFIOGuestIOMMU * giommu;
> +    bool flag = false;
>  
>      assert(am <= VTD_MAMV);
>      info.domain_id = domain_id;
>      info.addr = addr;
>      info.mask = ~((1 << am) - 1);
> +
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
> iommu);
> +        uint16_t vfio_source_id = vtd_make_source_id(pci_bus_num(vtd_as->bus),
> vtd_as->devfn);
> +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn);
> +        if (vfio_domain_id != (uint16_t)-1 && 
> +                domain_id == vfio_domain_id){
> +            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s, vfio_source_id,
> addr);
> +            if (iotlb_entry != NULL){
> +                IOMMUTLBEntry entry; 
> +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr,
> am);
> +                entry.iova = addr & VTD_PAGE_MASK_4K;
> +                entry.translated_addr = vtd_get_slpte_addr(iotlb_entry->slpte)
> & VTD_PAGE_MASK_4K;
> +                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> +                entry.perm = IOMMU_NONE;
> +                memory_region_notify_iommu(giommu->iommu, entry);
> +                flag = true;
> +
> +            }
> +        }
> +
> +    }
> +
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> -}
>  
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
> iommu);
> +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn);
> +        if (vfio_domain_id != (uint16_t)-1 && 
> +                domain_id == vfio_domain_id && !flag){
> +            /* do vfio map */

So what happens here if the address is changed when entry stays valid?
I think vfio in kernel will fail the map address,
later in userspace the address is first unmapped, then new one mapped.
If a device accesses the entry during this process,
it will get a fault, and this looks like a bug to me.

I suspect we need to teach vfio in kernel to update
entries, this might not be easy to do.


> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr,
> 0); 
> +            entry.perm = IOMMU_RW;
> +            memory_region_notify_iommu(giommu->iommu, entry);
> +            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> +        }
> +    }
> +}
>  /* Flush IOTLB
>   * Returns the IOTLB Actual Invalidation Granularity.
>   * @val: the content of the IOTLB_REG
> @@ -1895,6 +1951,13 @@ static Property vtd_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> +{
> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
> iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    
> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> +}
>  
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>  {
> @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s)
>      s->iq_last_desc_type = VTD_INV_DESC_NONE;
>      s->next_frcd_reg = 0;
>      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
> +             VTD_CAP_CM;
>      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>  
>      vtd_reset_context_cache(s);
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index e5f514c..ae40f73 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -190,6 +190,7 @@
>  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>  #define VTD_CAP_PSI                 (1ULL << 39)
>  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> +#define VTD_CAP_CM                  (1ULL << 7)
>  
>  /* Supported Adjusted Guest Address Widths */
>  #define VTD_CAP_SAGAW_SHIFT         8
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 607ec70..98c8d67 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -32,6 +32,9 @@
>  #include "sysemu/kvm.h"
>  #include "trace.h"
>  
> +#include "hw/sysbus.h"
> +#include "hw/i386/intel_iommu.h"
> +
>  struct vfio_group_head vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>  struct vfio_as_head vfio_address_spaces =
> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void
> *data)
>  out:
>      rcu_read_unlock();
>  }
> -
> +#if 0
>  static hwaddr vfio_container_granularity(VFIOContainer *container)
>  {
>      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>  }
> -
> +#endif
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
> *listener,
>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>      llend = int128_make64(section->offset_within_address_space);
>      llend = int128_add(llend, section->size);
> +    llend = int128_add(llend, int128_exts64(-1));
>      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>  
>      if (int128_ge(int128_make64(iova), llend)) {
> @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListener
> *listener,
>          giommu->n.notify = vfio_iommu_map_notify;
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>  
> +        vtd_register_giommu(giommu);
>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                     vfio_container_granularity(container),
>                                     false);
> -
> +#endif
>          return;
>      }
>  
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b024ffa..22f3f83 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -23,6 +23,7 @@
>  #define INTEL_IOMMU_H
>  #include "hw/qdev.h"
>  #include "sysemu/dma.h"
> +#include "hw/vfio/vfio-common.h"
>  
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>      MemoryRegionIOMMUOps iommu_ops;
>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus*
> reference */
>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by
> bus number */
> +
> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>  };
>  
>  /* Find the VTD Address space associated with the given bus pointer,
> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>   */
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
>  
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>  #endif
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f037f3c..9225ba3 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>      MemoryRegion *iommu;
>      Notifier n;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>  } VFIOGuestIOMMU;
>  
>  typedef struct VFIODeviceOps VFIODeviceOps;
> -- 
> 1.9.1
>
Michael S. Tsirkin March 23, 2016, 2:34 p.m. UTC | #13
On Thu, Mar 17, 2016 at 01:58:13PM +0200, Aviv B.D. wrote:
> 
> 
> On Tue, Mar 15, 2016 at 12:53 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> 
>     On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
>     > From: "Aviv B.D." <bd.aviv@gmail.com>
>     >
>     >  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
>     >     device are present.
>     >  * Advertise Cache Mode capability in iommu cap register.
>     >  * Register every VFIO device with IOMMU state.
>     >  * On page cache invalidation in vIOMMU, check if the domain belong to
>     >    VFIO device and mirror the guest requests to host.
>     >
>     > Not working (Yet!):
>     >  * Tested only with network interface card (ixgbevf) and
>     >     intel_iommu=strict in guest's kernel command line.
>     >  * Lock up under high load.
>     >  * Errors on guest poweroff.
>     >  * High relative latency compare to VFIO without IOMMU.
>     >
>     > Signed-off-by: Aviv B.D. <bd.aviv@gmail.com>
> 
>     Thanks, this is very interesting.
>     So this needs some cleanup, and some issues that will have to be addressed.
>     See below.
>     Thanks!
> 
> 
> Thanks! 
> 
> 
>     > ---
>     >  hw/i386/intel_iommu.c          | 76
>     ++++++++++++++++++++++++++++++++++++++----
>     >  hw/i386/intel_iommu_internal.h |  1 +
>     >  hw/vfio/common.c               | 12 +++++--
>     >  include/hw/i386/intel_iommu.h  |  4 +++
>     >  include/hw/vfio/vfio-common.h  |  1 +
>     >  5 files changed, 85 insertions(+), 9 deletions(-)
>     >
>     > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>     > index 347718f..046688f 100644
>     > --- a/hw/i386/intel_iommu.c
>     > +++ b/hw/i386/intel_iommu.c
>     > @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
>     VTD_DBGBIT
>     > (CSR);
>     >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>     >  #endif
>     >  
>     > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>     > +                                    uint8_t devfn, VTDContextEntry *ce);
>     > +
>     >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
>     val,
>     >                              uint64_t wmask, uint64_t w1cmask)
>     >  {
>     > @@ -126,6 +130,19 @@ static uint32_t vtd_set_clear_mask_long
>     (IntelIOMMUState
>     > *s, hwaddr addr,
>     >      return new_val;
>     >  }
>     >  
>     > +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>     uint8_t
>     > devfn)
>     > +{
>     > +    VTDContextEntry ce;
>     > +    int ret_fr;
>     > +
>     > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
>     > +    if (ret_fr){
>     > +        return -1;
>     > +    }
>     > +
>     > +    return VTD_CONTEXT_ENTRY_DID(ce.hi);
>     > +}
>     > +
>     >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>     >                                          uint64_t clear, uint64_t mask)
>     >  {
>     > @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
>     *s,
>     > uint8_t bus_num,
>     >      }
>     >  
>     >      if (!vtd_context_entry_present(ce)) {
>     > -        VTD_DPRINTF(GENERAL,
>     > +        /*VTD_DPRINTF(GENERAL,
>     >                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
>     > -                    "is not present", devfn, bus_num);
>     > +                    "is not present", devfn, bus_num);*/
>     >          return -VTD_FR_CONTEXT_ENTRY_P;
>     >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>     >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
>     > @@ -1020,14 +1037,53 @@ static void vtd_iotlb_page_invalidate
>     (IntelIOMMUState
>     > *s, uint16_t domain_id,
>     >                                        hwaddr addr, uint8_t am)
>     >  {
>     >      VTDIOTLBPageInvInfo info;
>     > +    VFIOGuestIOMMU * giommu;
>     > +    bool flag = false;
>     >  
>     >      assert(am <= VTD_MAMV);
>     >      info.domain_id = domain_id;
>     >      info.addr = addr;
>     >      info.mask = ~((1 << am) - 1);
>     > +
>     > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
>     > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>     VTDAddressSpace,
>     > iommu);
>     > +        uint16_t vfio_source_id = vtd_make_source_id(pci_bus_num
>     (vtd_as->bus),
>     > vtd_as->devfn);
>     > +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num
>     (vtd_as->bus),
>     > vtd_as->devfn);
>     > +        if (vfio_domain_id != (uint16_t)-1 && 
>     > +                domain_id == vfio_domain_id){
>     > +            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
>     vfio_source_id,
>     > addr);
>     > +            if (iotlb_entry != NULL){
>     > +                IOMMUTLBEntry entry; 
>     > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
>     addr,
>     > am);
>     > +                entry.iova = addr & VTD_PAGE_MASK_4K;
>     > +                entry.translated_addr = vtd_get_slpte_addr(iotlb_entry->
>     slpte)
>     > & VTD_PAGE_MASK_4K;
>     > +                entry.addr_mask = ~VTD_PAGE_MASK_4K;
>     > +                entry.perm = IOMMU_NONE;
>     > +                memory_region_notify_iommu(giommu->iommu, entry);
>     > +                flag = true;
>     > +
>     > +            }
>     > +        }
>     > +
>     > +    }
>     > +
>     >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &
>     info);
>     > -}
>     >  
>     > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
>     > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>     VTDAddressSpace,
>     > iommu);
>     > +        uint16_t vfio_domain_id = vtd_get_did_dev(s, pci_bus_num
>     (vtd_as->bus),
>     > vtd_as->devfn);
>     > +        if (vfio_domain_id != (uint16_t)-1 && 
>     > +                domain_id == vfio_domain_id && !flag){
>     > +            /* do vfio map */
>     > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr,
>     am);
>     > +            /* call to vtd_iommu_translate */
>     > +            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu,
>     addr,
>     > 0); 
>     > +            entry.perm = IOMMU_RW;
>     > +            memory_region_notify_iommu(giommu->iommu, entry);
> 
>     So this just triggers the iommu notifiers in the regular way.
>     But what if two devices have conflicting entries?
>     We need to setup host IOMMU differently for them.
>     But see below
> 
>     > +            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
>     > +        }
>     > +    }
>     > +}
>     >  /* Flush IOTLB
>     >   * Returns the IOTLB Actual Invalidation Granularity.
>     >   * @val: the content of the IOTLB_REG
>     > @@ -1895,6 +1951,13 @@ static Property vtd_properties[] = {
>     >      DEFINE_PROP_END_OF_LIST(),
>     >  };
>     >  
>     > +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
>     > +{
>     > +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>     VTDAddressSpace,
>     > iommu);
>     > +    IntelIOMMUState *s = vtd_as->iommu_state;
>     > +    
>     > +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
>     > +}
>     >  
>     >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
>     devfn)
>     >  {
>     > @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s)
>     >      s->iq_last_desc_type = VTD_INV_DESC_NONE;
>     >      s->next_frcd_reg = 0;
>     >      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
>     > -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
>     > +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS
>     |
>     > +             VTD_CAP_CM;
>     >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>     >  
>     >      vtd_reset_context_cache(s);
>     > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/
>     intel_iommu_internal.h
>     > index e5f514c..ae40f73 100644
>     > --- a/hw/i386/intel_iommu_internal.h
>     > +++ b/hw/i386/intel_iommu_internal.h
>     > @@ -190,6 +190,7 @@
>     >  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>     >  #define VTD_CAP_PSI                 (1ULL << 39)
>     >  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
>     > +#define VTD_CAP_CM                  (1ULL << 7)
>     >  
>     >  /* Supported Adjusted Guest Address Widths */
>     >  #define VTD_CAP_SAGAW_SHIFT         8
>     > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>     > index 607ec70..98c8d67 100644
>     > --- a/hw/vfio/common.c
>     > +++ b/hw/vfio/common.c
>     > @@ -32,6 +32,9 @@
>     >  #include "sysemu/kvm.h"
>     >  #include "trace.h"
>     >  
>     > +#include "hw/sysbus.h"
>     > +#include "hw/i386/intel_iommu.h"
>     > +
>     >  struct vfio_group_head vfio_group_list =
>     >      QLIST_HEAD_INITIALIZER(vfio_group_list);
>     >  struct vfio_as_head vfio_address_spaces =
>     > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void
>     > *data)
>     >  out:
>     >      rcu_read_unlock();
>     >  }
>     > -
>     > +#if 0
>     >  static hwaddr vfio_container_granularity(VFIOContainer *container)
>     >  {
>     >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>     >  }
>     > -
>     > +#endif
>     >  static void vfio_listener_region_add(MemoryListener *listener,
>     >                                       MemoryRegionSection *section)
>     >  {
>     > @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
>     > *listener,
>     >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>     >      llend = int128_make64(section->offset_within_address_space);
>     >      llend = int128_add(llend, section->size);
>     > +    llend = int128_add(llend, int128_exts64(-1));
>     >      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>     >  
>     >      if (int128_ge(int128_make64(iova), llend)) {
>     > @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListener
>     > *listener,
>     >          giommu->n.notify = vfio_iommu_map_notify;
>     >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>     >  
>     > +        vtd_register_giommu(giommu);
>     >          memory_region_register_iommu_notifier(giommu->iommu, &giommu->
>     n);
>     > +#if 0
>     >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>     >                                     vfio_container_granularity
>     (container),
>     >                                     false);
>     > -
>     > +#endif
>     >          return;
>     >      }
>     >
> 
>     Looks like there is still a single global domain on the host.
>     Works up to a point as long as we have a single VFIO device.
> 
> 
> You are correct, I should call (from page_invalidation) only to the correct
> vfio notifier that belong to this domain. This way (as long as no conflicts
> mapping is present) several VFIO devices can works at the same time. I'll add
> this feature to the next version of the patch.
> currently I am not sure how the code should handle conflicting mapping between
> two different domains. 

If you have different vfio domains, then what is the
issue with fixing it?

> 
>      
>     > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/
>     intel_iommu.h
>     > index b024ffa..22f3f83 100644
>     > --- a/include/hw/i386/intel_iommu.h
>     > +++ b/include/hw/i386/intel_iommu.h
>     > @@ -23,6 +23,7 @@
>     >  #define INTEL_IOMMU_H
>     >  #include "hw/qdev.h"
>     >  #include "sysemu/dma.h"
>     > +#include "hw/vfio/vfio-common.h"
>     >  
>     >  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>     >  #define INTEL_IOMMU_DEVICE(obj) \
>     > @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>     >      MemoryRegionIOMMUOps iommu_ops;
>     >      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus*
>     > reference */
>     >      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
>     indexed by
>     > bus number */
>     > +
>     > +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>     >  };
>     >  
>     >  /* Find the VTD Address space associated with the given bus pointer,
>     > @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>     >   */
>     >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
>     devfn);
>     >  
>     > +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>     >  #endif
>     > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/
>     vfio-common.h
>     > index f037f3c..9225ba3 100644
>     > --- a/include/hw/vfio/vfio-common.h
>     > +++ b/include/hw/vfio/vfio-common.h
>     > @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>     >      MemoryRegion *iommu;
>     >      Notifier n;
>     >      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>     > +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>     >  } VFIOGuestIOMMU;
>     >  
>     >  typedef struct VFIODeviceOps VFIODeviceOps;
>     > -- 
>     > 1.9.1
>     >  
> 
>
Aviv B.D. March 26, 2016, 2:47 p.m. UTC | #14
On Wed, Mar 23, 2016 at 5:33 PM, Michael S. Tsirkin <mst@redhat.com> wrote:

> On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
> > From: "Aviv B.D." <bd.aviv@gmail.com>
> >
> >  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
> >     device are present.
> >  * Advertise Cache Mode capability in iommu cap register.
> >  * Register every VFIO device with IOMMU state.
> >  * On page cache invalidation in vIOMMU, check if the domain belong to
> >    VFIO device and mirror the guest requests to host.
> >
> > Not working (Yet!):
> >  * Tested only with network interface card (ixgbevf) and
> >     intel_iommu=strict in guest's kernel command line.
> >  * Lock up under high load.
> >  * Errors on guest poweroff.
> >  * High relative latency compare to VFIO without IOMMU.
> >
> > Signed-off-by: Aviv B.D. <bd.aviv@gmail.com>
> > ---
> >  hw/i386/intel_iommu.c          | 76
> ++++++++++++++++++++++++++++++++++++++----
> >  hw/i386/intel_iommu_internal.h |  1 +
> >  hw/vfio/common.c               | 12 +++++--
> >  include/hw/i386/intel_iommu.h  |  4 +++
> >  include/hw/vfio/vfio-common.h  |  1 +
> >  5 files changed, 85 insertions(+), 9 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 347718f..046688f 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT
> > (CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > +                                    uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> >                              uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -126,6 +130,19 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState
> > *s, hwaddr addr,
> >      return new_val;
> >  }
> >
> > +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t
> > devfn)
> > +{
> > +    VTDContextEntry ce;
> > +    int ret_fr;
> > +
> > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +    if (ret_fr){
> > +        return -1;
> > +    }
> > +
> > +    return VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +}
> > +
> >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> >                                          uint64_t clear, uint64_t mask)
> >  {
> > @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
> *s,
> > uint8_t bus_num,
> >      }
> >
> >      if (!vtd_context_entry_present(ce)) {
> > -        VTD_DPRINTF(GENERAL,
> > +        /*VTD_DPRINTF(GENERAL,
> >                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -                    "is not present", devfn, bus_num);
> > +                    "is not present", devfn, bus_num);*/
> >          return -VTD_FR_CONTEXT_ENTRY_P;
> >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1020,14 +1037,53 @@ static void
> vtd_iotlb_page_invalidate(IntelIOMMUState
> > *s, uint16_t domain_id,
> >                                        hwaddr addr, uint8_t am)
> >  {
> >      VTDIOTLBPageInvInfo info;
> > +    VFIOGuestIOMMU * giommu;
> > +    bool flag = false;
> >
> >      assert(am <= VTD_MAMV);
> >      info.domain_id = domain_id;
> >      info.addr = addr;
> >      info.mask = ~((1 << am) - 1);
> > +
> > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +        uint16_t vfio_source_id =
> vtd_make_source_id(pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +        uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +        if (vfio_domain_id != (uint16_t)-1 &&
> > +                domain_id == vfio_domain_id){
> > +            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> vfio_source_id,
> > addr);
> > +            if (iotlb_entry != NULL){
> > +                IOMMUTLBEntry entry;
> > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d", addr,
> > am);
> > +                entry.iova = addr & VTD_PAGE_MASK_4K;
> > +                entry.translated_addr =
> vtd_get_slpte_addr(iotlb_entry->slpte)
> > & VTD_PAGE_MASK_4K;
> > +                entry.addr_mask = ~VTD_PAGE_MASK_4K;
> > +                entry.perm = IOMMU_NONE;
> > +                memory_region_notify_iommu(giommu->iommu, entry);
> > +                flag = true;
> > +
> > +            }
> > +        }
> > +
> > +    }
> > +
> >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,
> &info);
> > -}
> >
> > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +        uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +        if (vfio_domain_id != (uint16_t)-1 &&
> > +                domain_id == vfio_domain_id && !flag){
> > +            /* do vfio map */
>
> So what happens here if the address is changed when entry stays valid?
> I think vfio in kernel will fail the map address,
> later in userspace the address is first unmapped, then new one mapped.
> If a device accesses the entry during this process,
> it will get a fault, and this looks like a bug to me.
>
> I suspect we need to teach vfio in kernel to update
> entries, this might not be easy to do.
>
>
You are correct (and I already came across this behavior).
The correct(er) solution will be to remove each map (call unmap of vfio)
and try to
map again if the address is present in the  translation tables (regardless
of the cache status).

I hope to release a new version of this patch next weekend with most of the
remarks fixed.

Aviv.


>
> > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr,
> am);
> > +            /* call to vtd_iommu_translate */
> > +            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu,
> addr,
> > 0);
> > +            entry.perm = IOMMU_RW;
> > +            memory_region_notify_iommu(giommu->iommu, entry);
> > +            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> > +        }
> > +    }
> > +}
> >  /* Flush IOTLB
> >   * Returns the IOTLB Actual Invalidation Granularity.
> >   * @val: the content of the IOTLB_REG
> > @@ -1895,6 +1951,13 @@ static Property vtd_properties[] = {
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >
> > +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> > +{
> > +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +    IntelIOMMUState *s = vtd_as->iommu_state;
> > +
> > +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> > +}
> >
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
> devfn)
> >  {
> > @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s)
> >      s->iq_last_desc_type = VTD_INV_DESC_NONE;
> >      s->next_frcd_reg = 0;
> >      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> > -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> > +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS
> |
> > +             VTD_CAP_CM;
> >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> >
> >      vtd_reset_context_cache(s);
> > diff --git a/hw/i386/intel_iommu_internal.h
> b/hw/i386/intel_iommu_internal.h
> > index e5f514c..ae40f73 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -190,6 +190,7 @@
> >  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
> >  #define VTD_CAP_PSI                 (1ULL << 39)
> >  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> > +#define VTD_CAP_CM                  (1ULL << 7)
> >
> >  /* Supported Adjusted Guest Address Widths */
> >  #define VTD_CAP_SAGAW_SHIFT         8
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index 607ec70..98c8d67 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -32,6 +32,9 @@
> >  #include "sysemu/kvm.h"
> >  #include "trace.h"
> >
> > +#include "hw/sysbus.h"
> > +#include "hw/i386/intel_iommu.h"
> > +
> >  struct vfio_group_head vfio_group_list =
> >      QLIST_HEAD_INITIALIZER(vfio_group_list);
> >  struct vfio_as_head vfio_address_spaces =
> > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void
> > *data)
> >  out:
> >      rcu_read_unlock();
> >  }
> > -
> > +#if 0
> >  static hwaddr vfio_container_granularity(VFIOContainer *container)
> >  {
> >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
> >  }
> > -
> > +#endif
> >  static void vfio_listener_region_add(MemoryListener *listener,
> >                                       MemoryRegionSection *section)
> >  {
> > @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
> > *listener,
> >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> >      llend = int128_make64(section->offset_within_address_space);
> >      llend = int128_add(llend, section->size);
> > +    llend = int128_add(llend, int128_exts64(-1));
> >      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> >
> >      if (int128_ge(int128_make64(iova), llend)) {
> > @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListener
> > *listener,
> >          giommu->n.notify = vfio_iommu_map_notify;
> >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> >
> > +        vtd_register_giommu(giommu);
> >          memory_region_register_iommu_notifier(giommu->iommu,
> &giommu->n);
> > +#if 0
> >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
> >
> vfio_container_granularity(container),
> >                                     false);
> > -
> > +#endif
> >          return;
> >      }
> >
> > diff --git a/include/hw/i386/intel_iommu.h
> b/include/hw/i386/intel_iommu.h
> > index b024ffa..22f3f83 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -23,6 +23,7 @@
> >  #define INTEL_IOMMU_H
> >  #include "hw/qdev.h"
> >  #include "sysemu/dma.h"
> > +#include "hw/vfio/vfio-common.h"
> >
> >  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >  #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -123,6 +124,8 @@ struct IntelIOMMUState {
> >      MemoryRegionIOMMUOps iommu_ops;
> >      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus*
> > reference */
> >      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
> indexed by
> > bus number */
> > +
> > +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
> >  };
> >
> >  /* Find the VTD Address space associated with the given bus pointer,
> > @@ -130,4 +133,5 @@ struct IntelIOMMUState {
> >   */
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
> devfn);
> >
> > +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
> >  #endif
> > diff --git a/include/hw/vfio/vfio-common.h
> b/include/hw/vfio/vfio-common.h
> > index f037f3c..9225ba3 100644
> > --- a/include/hw/vfio/vfio-common.h
> > +++ b/include/hw/vfio/vfio-common.h
> > @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
> >      MemoryRegion *iommu;
> >      Notifier n;
> >      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> > +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
> >  } VFIOGuestIOMMU;
> >
> >  typedef struct VFIODeviceOps VFIODeviceOps;
> > --
> > 1.9.1
> >
>
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 347718f..046688f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +44,9 @@  static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif

+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+                                    uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                             uint64_t wmask, uint64_t w1cmask)
 {
@@ -126,6 +130,19 @@  static uint32_t
vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
     return new_val;
 }

+static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
uint8_t devfn)
+{
+    VTDContextEntry ce;
+    int ret_fr;
+
+    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (ret_fr){
+        return -1;
+    }
+
+    return VTD_CONTEXT_ENTRY_DID(ce.hi);
+}
+
 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
                                         uint64_t clear, uint64_t mask)
 {
@@ -711,9 +728,9 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s,
uint8_t bus_num,
     }

     if (!vtd_context_entry_present(ce)) {
-        VTD_DPRINTF(GENERAL,
+        /*VTD_DPRINTF(GENERAL,
                     "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-                    "is not present", devfn, bus_num);
+                    "is not present", devfn, bus_num);*/
         return -VTD_FR_CONTEXT_ENTRY_P;
     } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
                (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1020,14 +1037,53 @@  static void
vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
                                       hwaddr addr, uint8_t am)
 {
     VTDIOTLBPageInvInfo info;
+    VFIOGuestIOMMU * giommu;
+    bool flag = false;

     assert(am <= VTD_MAMV);
     info.domain_id = domain_id;
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
+
+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
VTDAddressSpace, iommu);
+        uint16_t vfio_source_id =
vtd_make_source_id(pci_bus_num(vtd_as->bus), vtd_as->devfn);
+        uint16_t vfio_domain_id = vtd_get_did_dev(s,
pci_bus_num(vtd_as->bus), vtd_as->devfn);
+        if (vfio_domain_id != (uint16_t)-1 &&
+                domain_id == vfio_domain_id){
+            VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
vfio_source_id, addr);
+            if (iotlb_entry != NULL){
+                IOMMUTLBEntry entry;
+                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
addr, am);
+                entry.iova = addr & VTD_PAGE_MASK_4K;
+                entry.translated_addr =
vtd_get_slpte_addr(iotlb_entry->slpte) & VTD_PAGE_MASK_4K;
+                entry.addr_mask = ~VTD_PAGE_MASK_4K;
+                entry.perm = IOMMU_NONE;
+                memory_region_notify_iommu(giommu->iommu, entry);
+                flag = true;
+
+            }
+        }
+
+    }
+
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
-}

+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
VTDAddressSpace, iommu);
+        uint16_t vfio_domain_id = vtd_get_did_dev(s,
pci_bus_num(vtd_as->bus), vtd_as->devfn);
+        if (vfio_domain_id != (uint16_t)-1 &&
+                domain_id == vfio_domain_id && !flag){
+            /* do vfio map */
+            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr,
am);
+            /* call to vtd_iommu_translate */
+            IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu,
addr, 0);
+            entry.perm = IOMMU_RW;
+            memory_region_notify_iommu(giommu->iommu, entry);
+            //g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
+        }
+    }
+}
 /* Flush IOTLB
  * Returns the IOTLB Actual Invalidation Granularity.
  * @val: the content of the IOTLB_REG
@@ -1895,6 +1951,13 @@  static Property vtd_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };

+void vtd_register_giommu(VFIOGuestIOMMU * giommu)
+{
+    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace,
iommu);
+    IntelIOMMUState *s = vtd_as->iommu_state;
+
+    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
+}

 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
devfn)
 {
@@ -1949,7 +2012,8 @@  static void vtd_init(IntelIOMMUState *s)
     s->iq_last_desc_type = VTD_INV_DESC_NONE;
     s->next_frcd_reg = 0;
     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
-             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
+             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
+             VTD_CAP_CM;
     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;

     vtd_reset_context_cache(s);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index e5f514c..ae40f73 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,6 +190,7 @@ 
 #define VTD_CAP_MAMV                (VTD_MAMV << 48)
 #define VTD_CAP_PSI                 (1ULL << 39)
 #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM                  (1ULL << 7)

 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT         8
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 607ec70..98c8d67 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -32,6 +32,9 @@ 
 #include "sysemu/kvm.h"
 #include "trace.h"

+#include "hw/sysbus.h"
+#include "hw/i386/intel_iommu.h"
+
 struct vfio_group_head vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
 struct vfio_as_head vfio_address_spaces =
@@ -312,12 +315,12 @@  static void vfio_iommu_map_notify(Notifier *n, void
*data)
 out:
     rcu_read_unlock();
 }
-
+#if 0
 static hwaddr vfio_container_granularity(VFIOContainer *container)
 {
     return (hwaddr)1 << ctz64(container->iova_pgsizes);
 }
-
+#endif
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -344,6 +347,7 @@  static void vfio_listener_region_add(MemoryListener
*listener,
     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
     llend = int128_make64(section->offset_within_address_space);
     llend = int128_add(llend, section->size);
+    llend = int128_add(llend, int128_exts64(-1));
     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

     if (int128_ge(int128_make64(iova), llend)) {
@@ -381,11 +385,13 @@  static void vfio_listener_region_add(MemoryListener
*listener,
         giommu->n.notify = vfio_iommu_map_notify;
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);

+        vtd_register_giommu(giommu);
         memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+#if 0
         memory_region_iommu_replay(giommu->iommu, &giommu->n,
                                    vfio_container_granularity(container),
                                    false);
-
+#endif
         return;
     }

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b024ffa..22f3f83 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -23,6 +23,7 @@ 
 #define INTEL_IOMMU_H
 #include "hw/qdev.h"
 #include "sysemu/dma.h"
+#include "hw/vfio/vfio-common.h"

 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -123,6 +124,8 @@  struct IntelIOMMUState {
     MemoryRegionIOMMUOps iommu_ops;
     GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus*
reference */
     VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed
by bus number */
+
+    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
 };

 /* Find the VTD Address space associated with the given bus pointer,
@@ -130,4 +133,5 @@  struct IntelIOMMUState {
  */
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
devfn);

+void vtd_register_giommu(VFIOGuestIOMMU * giommu);
 #endif
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f037f3c..9225ba3 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -82,6 +82,7 @@  typedef struct VFIOGuestIOMMU {
     MemoryRegion *iommu;
     Notifier n;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
+    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
 } VFIOGuestIOMMU;

 typedef struct VFIODeviceOps VFIODeviceOps;