diff mbox

[RFC,v2] IOMMU: Add Support to VFIO devices with vIOMMU present

Message ID CAM3WwMg2i5z56JB-wWXgztaCAgKA=eSQFZyXPejxUQ_HjLqwDQ@mail.gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Aviv B.D. April 9, 2016, 6:03 p.m. UTC
From: "Aviv Ben-David" <bd.aviv@gmail.com>
Date: Tue, 23 Feb 2016 00:24:54 +0200
Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present

* Fix a bug that prevented qemu from starting up when vIOMMU and a VFIO
  device are both present.
* Advertise Cache Mode capability in iommu cap register.
* Register every VFIO device with IOMMU state.
* On page cache invalidation in vIOMMU, check if the domain belongs to
  a VFIO device and mirror the guest requests to the host.

Changes from previous versions:
* remove the assumption that the cache does not clear
* fix lock up on high load.
* refactor vtd_get_did_dev to return a success code, and return the actual
domain_id via an argument.

Tested only on network cards (also with multiple cards at once).

Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
---
 hw/i386/intel_iommu.c          | 113 +++++++++++++++++++++++++++++++++++------
 hw/i386/intel_iommu_internal.h |   3 ++
 hw/vfio/common.c               |  12 +++--
 include/exec/memory.h          |   8 ++-
 include/hw/i386/intel_iommu.h  |   4 ++
 include/hw/vfio/vfio-common.h  |   1 +
 6 files changed, 121 insertions(+), 20 deletions(-)

Comments

Peter Xu April 11, 2016, 3:40 a.m. UTC | #1
Hi, Aviv,

On Sat, Apr 09, 2016 at 09:03:38PM +0300, Aviv B.D. wrote:

[...]

> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn, uint16_t * domain_id)

It seems that there are two lines above, however what I feel is that
this is a long line split by the email client or something
else... are you sending the patch using git-send-email? Not sure
whether this would be a problem.

I see the same thing in previous patches too.

[...]

> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>   */
>  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> -                                   uint8_t devfn, hwaddr addr, bool is_write,
> +                                   uint8_t devfn, hwaddr addr,
> IOMMUAccessPermissions is_write,

First of all, if we are to change this "is_write" into a permission,
I would prefer to change its name too, from "is_write" to "perm" or
similar, so that people would know this is not a boolean any more.

Secondly, I assume you have not handled all the is_write uses below,
right? So the code seems not consistent. Many places are still using
it as boolean (I suppose).

[...]

> uint16_t domain_id,
> +                                      hwaddr addr, uint8_t am)
> +{
> +    VFIOGuestIOMMU * giommu;
> +
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> +        uint16_t vfio_domain_id;
> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn, &vfio_domain_id);
> +        int i=0;
> +        if (!ret && domain_id == vfio_domain_id){
> +            IOMMUTLBEntry entry;
> +
> +            /* do vfio unmap */
> +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> +            entry.target_as = NULL;
> +            entry.iova = addr & VTD_PAGE_MASK_4K;
> +            entry.translated_addr = 0;
> +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> +            entry.perm = IOMMU_NONE;
> +            memory_region_notify_iommu(giommu->iommu, entry);
> +
> +            /* do vfio map */
> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> +                IOMMUTLBEntry entry =
> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
> +                if (entry.perm != IOMMU_NONE){
> +                    memory_region_notify_iommu(giommu->iommu, entry);
> +                }

Questions (that I am curious about only, not to mean that there is
something wrong):

- What should we do if entry.perm == IOMMU_NONE? Is it possible? If
  not, I'd prefer assert to if.

- Here, we do the remap for every 4K, guess even with huge
  pages. Better way to do? A stupid one from me: take special care
  for am == 9, 18 cases?

- Is it possible that multiple devices use same domain ID? Not
  sure. If so, we will always map/unmap the same address twice?

[...]

>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -344,6 +347,7 @@ static void
> vfio_listener_region_add(MemoryListener *listener,
>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>      llend = int128_make64(section->offset_within_address_space);
>      llend = int128_add(llend, section->size);
> +    llend = int128_add(llend, int128_exts64(-1));

It seems that Bandan has fixed this. Please try pull latest master
branch and check commit 55efcc537d330. If so, maybe we need to
rebase?

>      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> 
>      if (int128_ge(int128_make64(iova), llend)) {
> @@ -381,11 +385,13 @@ static void
> vfio_listener_region_add(MemoryListener *listener,
>          giommu->n.notify = vfio_iommu_map_notify;
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> 
> +        vtd_register_giommu(giommu);
>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                     vfio_container_granularity(container),
>                                     false);
> -
> +#endif
>          return;
>      }
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 2de7898..0e814ab 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -146,10 +146,14 @@ struct MemoryRegionOps {
>  };
> 
>  typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
> -
> +typedef enum IOMMUAccessPermissions{
> +    IOMMU_READ = 0,
> +    IOMMU_WRITE = 1,
> +    IOMMU_ANY = 2
> +} IOMMUAccessPermissions;

I see this:

typedef enum {
    IOMMU_NONE = 0,
    IOMMU_RO   = 1,
    IOMMU_WO   = 2,
    IOMMU_RW   = 3,
} IOMMUAccessFlags;

in include/exec/memory.h. Shall we leverage it rather than define
another one?

Thanks.

-- peterx
Alex Williamson April 11, 2016, 3:35 p.m. UTC | #2
On Sat, 9 Apr 2016 21:03:38 +0300
"Aviv B.D." <bd.aviv@gmail.com> wrote:

> From: "Aviv Ben-David" <bd.aviv@gmail.com>
> Date: Tue, 23 Feb 2016 00:24:54 +0200
> Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
> 
> * Fix bug that prevent qemu from starting up with vIOMMU and VFIO
>   device are present.
> * Advertize Cache Mode capability in iommu cap register.
> * Register every VFIO device with IOMMU state.
> * On page cache invalidation in vIOMMU, check if the domain belong to
>   VFIO device and mirror the guest requests to host.
> 
> Changes from previous versions:
> * remove assumption that the cache do not clears
> * fix lock up on high load.
> * refactor vtd_get_did_dev to return success return code, and actual
> domain_id via argument.
> 
> Tested only on network cards (also with multiple cards at once).
> 
> Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> ---
>  hw/i386/intel_iommu.c          | 113 +++++++++++++++++++++++++++++++++++------
>  hw/i386/intel_iommu_internal.h |   3 ++
>  hw/vfio/common.c               |  12 +++--
>  include/exec/memory.h          |   8 ++-
>  include/hw/i386/intel_iommu.h  |   4 ++
>  include/hw/vfio/vfio-common.h  |   1 +
>  6 files changed, 121 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 347718f..a568181 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT(CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
> 
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -126,6 +129,22 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>      return new_val;
>  }
> 
> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn, uint16_t * domain_id)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    assert(domain_id);
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr){
> +        return -1;
> +    }
> +
> +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> +    return 0;
> +}
> +
>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>                                          uint64_t clear, uint64_t mask)
>  {
> @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
> uint32_t level)
>  /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
>   * of the translation, can be used for deciding the size of large page.
>   */
> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
> IOMMUAccessPermissions is_write,
>                              uint64_t *slptep, uint32_t *slpte_level,
>                              bool *reads, bool *writes)
>  {
> @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> uint64_t gpa, bool is_write,
>      }
> 
>      /* FIXME: what is the Atomics request here? */
> -    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
> +    switch(is_write){
> +        case IOMMU_WRITE:
> +            access_right_check = VTD_SL_W;
> +            break;
> +        case IOMMU_READ:
> +            access_right_check = VTD_SL_R;
> +            break;
> +        case IOMMU_ANY:
> +            access_right_check = VTD_SL_R | VTD_SL_W;
> +            break;
> +        default:
> +            assert(0);
> +    }
> 
>      while (true) {
>          offset = vtd_gpa_level_offset(gpa, level);
> @@ -711,9 +742,9 @@ static int
> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>      }
> 
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> +        /*VTD_DPRINTF(GENERAL,
>                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
> +                    "is not present", devfn, bus_num);*/
>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>   */
>  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> -                                   uint8_t devfn, hwaddr addr, bool is_write,
> +                                   uint8_t devfn, hwaddr addr,
> IOMMUAccessPermissions is_write,
>                                     IOMMUTLBEntry *entry)
>  {
>      IntelIOMMUState *s = vtd_as->iommu_state;
> @@ -848,12 +879,14 @@ static void
> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>          is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
>          if (ret_fr) {
>              ret_fr = -ret_fr;
> -            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> -                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
> +            if (is_write != IOMMU_ANY){
> +                if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> +                    VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
>                              "requests through this context-entry "
>                              "(with FPD Set)");
> -            } else {
> -                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +                } else {
> +                    vtd_report_dmar_fault(s, source_id, addr, ret_fr,
> is_write);
> +                }
>              }
>              return;
>          }
> @@ -870,11 +903,13 @@ static void
> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>                                &reads, &writes);
>      if (ret_fr) {
>          ret_fr = -ret_fr;
> -        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> -            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
> +        if (is_write != IOMMU_ANY){
> +            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> +                VTD_DPRINTF(FLOG, "fault processing is disabled for
> DMA requests "
>                          "through this context-entry (with FPD Set)");
> -        } else {
> -            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +            } else {
> +                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +            }
>          }
>          return;
>      }
> @@ -1016,18 +1051,58 @@ static void
> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>                                  &domain_id);
>  }
> 
> +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
> uint16_t domain_id,
> +                                      hwaddr addr, uint8_t am)
> +{
> +    VFIOGuestIOMMU * giommu;
> +
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> +        uint16_t vfio_domain_id;
> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn, &vfio_domain_id);
> +        int i=0;
> +        if (!ret && domain_id == vfio_domain_id){
> +            IOMMUTLBEntry entry;
> +
> +            /* do vfio unmap */
> +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> +            entry.target_as = NULL;
> +            entry.iova = addr & VTD_PAGE_MASK_4K;
> +            entry.translated_addr = 0;
> +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> +            entry.perm = IOMMU_NONE;
> +            memory_region_notify_iommu(giommu->iommu, entry);
> +
> +            /* do vfio map */
> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> +                IOMMUTLBEntry entry =
> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
> +                if (entry.perm != IOMMU_NONE){
> +                    memory_region_notify_iommu(giommu->iommu, entry);
> +                }
> +            }
> +        }
> +    }
> +}
> +
>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>                                        hwaddr addr, uint8_t am)
>  {
>      VTDIOTLBPageInvInfo info;
> 
>      assert(am <= VTD_MAMV);
> +
>      info.domain_id = domain_id;
>      info.addr = addr;
>      info.mask = ~((1 << am) - 1);
> +
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> +
> +    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
>  }
> 
> +
>  /* Flush IOTLB
>   * Returns the IOTLB Actual Invalidation Granularity.
>   * @val: the content of the IOTLB_REG
> @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>  }
> 
>  static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
> -                                         bool is_write)
> +                                         IOMMUAccessPermissions is_write)
>  {
>      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
>      IntelIOMMUState *s = vtd_as->iommu_state;
> @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
> 
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> +{
> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +
> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> +}
> 
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>  {
> @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s)
>      s->iq_last_desc_type = VTD_INV_DESC_NONE;
>      s->next_frcd_reg = 0;
>      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
> +             VTD_CAP_CM;
>      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> 
>      vtd_reset_context_cache(s);
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index e5f514c..102e9a5 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -190,6 +190,7 @@
>  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>  #define VTD_CAP_PSI                 (1ULL << 39)
>  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> +#define VTD_CAP_CM                  (1ULL << 7)
> 
>  /* Supported Adjusted Guest Address Widths */
>  #define VTD_CAP_SAGAW_SHIFT         8
> @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
>  #define VTD_PAGE_SHIFT_1G           30
>  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
> 
> +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
> +
>  struct VTDRootEntry {
>      uint64_t val;
>      uint64_t rsvd;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 607ec70..98c8d67 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -32,6 +32,9 @@
>  #include "sysemu/kvm.h"
>  #include "trace.h"
> 
> +#include "hw/sysbus.h"
> +#include "hw/i386/intel_iommu.h"
> +
>  struct vfio_group_head vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>  struct vfio_as_head vfio_address_spaces =
> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
>  out:
>      rcu_read_unlock();
>  }
> -
> +#if 0
>  static hwaddr vfio_container_granularity(VFIOContainer *container)
>  {
>      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>  }
> -
> +#endif
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -344,6 +347,7 @@ static void
> vfio_listener_region_add(MemoryListener *listener,
>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>      llend = int128_make64(section->offset_within_address_space);
>      llend = int128_add(llend, section->size);
> +    llend = int128_add(llend, int128_exts64(-1));
>      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> 
>      if (int128_ge(int128_make64(iova), llend)) {
> @@ -381,11 +385,13 @@ static void
> vfio_listener_region_add(MemoryListener *listener,
>          giommu->n.notify = vfio_iommu_map_notify;
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> 
> +        vtd_register_giommu(giommu);
>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                     vfio_container_granularity(container),
>                                     false);
> -
> +#endif

Clearly replay has a purpose here and we can't assume the vIOMMU is
VT-d, we can't really move beyond an RFC without resolving those.
However we're still ignoring the issue that the existing vfio code
attempts to put all devices into a single IOMMU domain on the host,
something that is generally successful on x86.  So it seems like mostly
wishful thinking that's preventing conflicts between devices.  Wouldn't
the right thing to do be to create a separate container and thus IOMMU
domain for each vIOMMU managed device?  We could make a simplifying
assertion that to support vIOMMU, there must be a single device per
IOMMU group.  The vfio type1 interfaces is probably going to quickly
show limitations for this usage mode, not only in mapping performance,
but also in locked page accounting.  Type1 is really meant for largely
static mappings, it's going to be a bottleneck for dynamic mappings,
and if any containers are in "passthrough" mode, mapping all VM memory,
the vIOMMU domains are going to start hitting locked memory limits.  We
can probably move forward with those latter issues, but we absolutely
need multiple containers for correctness.  Thanks,

Alex

>          return;
>      }
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 2de7898..0e814ab 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -146,10 +146,14 @@ struct MemoryRegionOps {
>  };
> 
>  typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
> -
> +typedef enum IOMMUAccessPermissions{
> +    IOMMU_READ = 0,
> +    IOMMU_WRITE = 1,
> +    IOMMU_ANY = 2
> +} IOMMUAccessPermissions;
>  struct MemoryRegionIOMMUOps {
>      /* Return a TLB entry that contains a given address. */
> -    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool
> is_write);
> +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr,
> IOMMUAccessPermissions is_write);
>  };
> 
>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b024ffa..22f3f83 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -23,6 +23,7 @@
>  #define INTEL_IOMMU_H
>  #include "hw/qdev.h"
>  #include "sysemu/dma.h"
> +#include "hw/vfio/vfio-common.h"
> 
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>      MemoryRegionIOMMUOps iommu_ops;
>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by
> PCIBus* reference */
>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
> indexed by bus number */
> +
> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>  };
> 
>  /* Find the VTD Address space associated with the given bus pointer,
> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>   */
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
> 
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>  #endif
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f037f3c..9225ba3 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>      MemoryRegion *iommu;
>      Notifier n;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>  } VFIOGuestIOMMU;
> 
>  typedef struct VFIODeviceOps VFIODeviceOps;
Michael S. Tsirkin April 11, 2016, 4:38 p.m. UTC | #3
On Mon, Apr 11, 2016 at 09:35:15AM -0600, Alex Williamson wrote:
> On Sat, 9 Apr 2016 21:03:38 +0300
> "Aviv B.D." <bd.aviv@gmail.com> wrote:
> 
> > From: "Aviv Ben-David" <bd.aviv@gmail.com>
> > Date: Tue, 23 Feb 2016 00:24:54 +0200
> > Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
> > 
> > * Fix bug that prevent qemu from starting up with vIOMMU and VFIO
> >   device are present.
> > * Advertize Cache Mode capability in iommu cap register.
> > * Register every VFIO device with IOMMU state.
> > * On page cache invalidation in vIOMMU, check if the domain belong to
> >   VFIO device and mirror the guest requests to host.
> > 
> > Changes from previous versions:
> > * remove assumption that the cache do not clears
> > * fix lock up on high load.
> > * refactor vtd_get_did_dev to return success return code, and actual
> > domain_id via argument.
> > 
> > Tested only on network cards (also with multiple cards at once).
> > 
> > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> > ---
> >  hw/i386/intel_iommu.c          | 113 +++++++++++++++++++++++++++++++++++------
> >  hw/i386/intel_iommu_internal.h |   3 ++
> >  hw/vfio/common.c               |  12 +++--
> >  include/exec/memory.h          |   8 ++-
> >  include/hw/i386/intel_iommu.h  |   4 ++
> >  include/hw/vfio/vfio-common.h  |   1 +
> >  6 files changed, 121 insertions(+), 20 deletions(-)
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 347718f..a568181 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> > VTD_DBGBIT(CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> > 
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > +                                    uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
> >                              uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -126,6 +129,22 @@ static uint32_t
> > vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
> >      return new_val;
> >  }
> > 
> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> > uint8_t devfn, uint16_t * domain_id)
> > +{
> > +    VTDContextEntry ce;
> > +    int ret_fr;
> > +
> > +    assert(domain_id);
> > +
> > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +    if (ret_fr){
> > +        return -1;
> > +    }
> > +
> > +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +    return 0;
> > +}
> > +
> >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> >                                          uint64_t clear, uint64_t mask)
> >  {
> > @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
> > uint32_t level)
> >  /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
> >   * of the translation, can be used for deciding the size of large page.
> >   */
> > -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
> > +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
> > IOMMUAccessPermissions is_write,
> >                              uint64_t *slptep, uint32_t *slpte_level,
> >                              bool *reads, bool *writes)
> >  {
> > @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> > uint64_t gpa, bool is_write,
> >      }
> > 
> >      /* FIXME: what is the Atomics request here? */
> > -    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
> > +    switch(is_write){
> > +        case IOMMU_WRITE:
> > +            access_right_check = VTD_SL_W;
> > +            break;
> > +        case IOMMU_READ:
> > +            access_right_check = VTD_SL_R;
> > +            break;
> > +        case IOMMU_ANY:
> > +            access_right_check = VTD_SL_R | VTD_SL_W;
> > +            break;
> > +        default:
> > +            assert(0);
> > +    }
> > 
> >      while (true) {
> >          offset = vtd_gpa_level_offset(gpa, level);
> > @@ -711,9 +742,9 @@ static int
> > vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> >      }
> > 
> >      if (!vtd_context_entry_present(ce)) {
> > -        VTD_DPRINTF(GENERAL,
> > +        /*VTD_DPRINTF(GENERAL,
> >                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -                    "is not present", devfn, bus_num);
> > +                    "is not present", devfn, bus_num);*/
> >          return -VTD_FR_CONTEXT_ENTRY_P;
> >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
> >   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
> >   */
> >  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> > -                                   uint8_t devfn, hwaddr addr, bool is_write,
> > +                                   uint8_t devfn, hwaddr addr,
> > IOMMUAccessPermissions is_write,
> >                                     IOMMUTLBEntry *entry)
> >  {
> >      IntelIOMMUState *s = vtd_as->iommu_state;
> > @@ -848,12 +879,14 @@ static void
> > vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> >          is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
> >          if (ret_fr) {
> >              ret_fr = -ret_fr;
> > -            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > -                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
> > +            if (is_write != IOMMU_ANY){
> > +                if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > +                    VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
> >                              "requests through this context-entry "
> >                              "(with FPD Set)");
> > -            } else {
> > -                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> > +                } else {
> > +                    vtd_report_dmar_fault(s, source_id, addr, ret_fr,
> > is_write);
> > +                }
> >              }
> >              return;
> >          }
> > @@ -870,11 +903,13 @@ static void
> > vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> >                                &reads, &writes);
> >      if (ret_fr) {
> >          ret_fr = -ret_fr;
> > -        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > -            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
> > +        if (is_write != IOMMU_ANY){
> > +            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > +                VTD_DPRINTF(FLOG, "fault processing is disabled for
> > DMA requests "
> >                          "through this context-entry (with FPD Set)");
> > -        } else {
> > -            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> > +            } else {
> > +                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> > +            }
> >          }
> >          return;
> >      }
> > @@ -1016,18 +1051,58 @@ static void
> > vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> >                                  &domain_id);
> >  }
> > 
> > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
> > uint16_t domain_id,
> > +                                      hwaddr addr, uint8_t am)
> > +{
> > +    VFIOGuestIOMMU * giommu;
> > +
> > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> > VTDAddressSpace, iommu);
> > +        uint16_t vfio_domain_id;
> > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> > vtd_as->devfn, &vfio_domain_id);
> > +        int i=0;
> > +        if (!ret && domain_id == vfio_domain_id){
> > +            IOMMUTLBEntry entry;
> > +
> > +            /* do vfio unmap */
> > +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> > +            entry.target_as = NULL;
> > +            entry.iova = addr & VTD_PAGE_MASK_4K;
> > +            entry.translated_addr = 0;
> > +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> > +            entry.perm = IOMMU_NONE;
> > +            memory_region_notify_iommu(giommu->iommu, entry);
> > +
> > +            /* do vfio map */
> > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> > +            /* call to vtd_iommu_translate */
> > +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> > +                IOMMUTLBEntry entry =
> > s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
> > +                if (entry.perm != IOMMU_NONE){
> > +                    memory_region_notify_iommu(giommu->iommu, entry);
> > +                }
> > +            }
> > +        }
> > +    }
> > +}
> > +
> >  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >                                        hwaddr addr, uint8_t am)
> >  {
> >      VTDIOTLBPageInvInfo info;
> > 
> >      assert(am <= VTD_MAMV);
> > +
> >      info.domain_id = domain_id;
> >      info.addr = addr;
> >      info.mask = ~((1 << am) - 1);
> > +
> >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> > +
> > +    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
> >  }
> > 
> > +
> >  /* Flush IOTLB
> >   * Returns the IOTLB Actual Invalidation Granularity.
> >   * @val: the content of the IOTLB_REG
> > @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
> >  }
> > 
> >  static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
> > -                                         bool is_write)
> > +                                         IOMMUAccessPermissions is_write)
> >  {
> >      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> >      IntelIOMMUState *s = vtd_as->iommu_state;
> > @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = {
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> > 
> > +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> > +{
> > +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> > VTDAddressSpace, iommu);
> > +    IntelIOMMUState *s = vtd_as->iommu_state;
> > +
> > +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> > +}
> > 
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
> >  {
> > @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s)
> >      s->iq_last_desc_type = VTD_INV_DESC_NONE;
> >      s->next_frcd_reg = 0;
> >      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> > -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> > +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
> > +             VTD_CAP_CM;
> >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> > 
> >      vtd_reset_context_cache(s);
> > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> > index e5f514c..102e9a5 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -190,6 +190,7 @@
> >  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
> >  #define VTD_CAP_PSI                 (1ULL << 39)
> >  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> > +#define VTD_CAP_CM                  (1ULL << 7)
> > 
> >  /* Supported Adjusted Guest Address Widths */
> >  #define VTD_CAP_SAGAW_SHIFT         8
> > @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
> >  #define VTD_PAGE_SHIFT_1G           30
> >  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
> > 
> > +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
> > +
> >  struct VTDRootEntry {
> >      uint64_t val;
> >      uint64_t rsvd;
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index 607ec70..98c8d67 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -32,6 +32,9 @@
> >  #include "sysemu/kvm.h"
> >  #include "trace.h"
> > 
> > +#include "hw/sysbus.h"
> > +#include "hw/i386/intel_iommu.h"
> > +
> >  struct vfio_group_head vfio_group_list =
> >      QLIST_HEAD_INITIALIZER(vfio_group_list);
> >  struct vfio_as_head vfio_address_spaces =
> > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
> >  out:
> >      rcu_read_unlock();
> >  }
> > -
> > +#if 0
> >  static hwaddr vfio_container_granularity(VFIOContainer *container)
> >  {
> >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
> >  }
> > -
> > +#endif
> >  static void vfio_listener_region_add(MemoryListener *listener,
> >                                       MemoryRegionSection *section)
> >  {
> > @@ -344,6 +347,7 @@ static void
> > vfio_listener_region_add(MemoryListener *listener,
> >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> >      llend = int128_make64(section->offset_within_address_space);
> >      llend = int128_add(llend, section->size);
> > +    llend = int128_add(llend, int128_exts64(-1));
> >      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> > 
> >      if (int128_ge(int128_make64(iova), llend)) {
> > @@ -381,11 +385,13 @@ static void
> > vfio_listener_region_add(MemoryListener *listener,
> >          giommu->n.notify = vfio_iommu_map_notify;
> >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> > 
> > +        vtd_register_giommu(giommu);
> >          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> > +#if 0
> >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
> >                                     vfio_container_granularity(container),
> >                                     false);
> > -
> > +#endif
> 
> Clearly replay has a purpose here and we can't assume the vIOMMU is
> VT-d, we can't really move beyond an RFC without resolving those.
> However we're still ignoring the issue that the existing vfio code
> attempts to put all devices into a single IOMMU domain on the host,
> something that is generally successful on x86.  So it seems like mostly
> wishful thinking that's preventing conflicts between devices.  Wouldn't
> the right thing to do be to create a separate container and thus IOMMU
> domain for each vIOMMU managed device?  We could make a simplifying
> assertion that to support vIOMMU, there must be a single device per
> IOMMU group.  The vfio type1 interface is probably going to quickly
> show limitations for this usage mode, not only in mapping performance,
> but also in locked page accounting.  Type1 is really meant for largely
> static mappings, it's going to be a bottleneck for dynamic mappings,
> and if any containers are in "passthrough" mode, mapping all VM memory,
> the vIOMMU domains are going to start hitting locked memory limits.  We
> can probably move forward with those latter issues, but we absolutely
> need multiple containers for correctness.  Thanks,
> 
> Alex

I guess we could limit this to a single VFIO device, fail attempts to
add more.  This might be an easier intermediate step than
full multi-domain support.

> >          return;
> >      }
> > 
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 2de7898..0e814ab 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -146,10 +146,14 @@ struct MemoryRegionOps {
> >  };
> > 
> >  typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
> > -
> > +typedef enum IOMMUAccessPermissions{
> > +    IOMMU_READ = 0,
> > +    IOMMU_WRITE = 1,
> > +    IOMMU_ANY = 2
> > +} IOMMUAccessPermissions;
> >  struct MemoryRegionIOMMUOps {
> >      /* Return a TLB entry that contains a given address. */
> > -    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool
> > is_write);
> > +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr,
> > IOMMUAccessPermissions is_write);
> >  };
> > 
> >  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> > index b024ffa..22f3f83 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -23,6 +23,7 @@
> >  #define INTEL_IOMMU_H
> >  #include "hw/qdev.h"
> >  #include "sysemu/dma.h"
> > +#include "hw/vfio/vfio-common.h"
> > 
> >  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >  #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -123,6 +124,8 @@ struct IntelIOMMUState {
> >      MemoryRegionIOMMUOps iommu_ops;
> >      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by
> > PCIBus* reference */
> >      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
> > indexed by bus number */
> > +
> > +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
> >  };
> > 
> >  /* Find the VTD Address space associated with the given bus pointer,
> > @@ -130,4 +133,5 @@ struct IntelIOMMUState {
> >   */
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
> > 
> > +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
> >  #endif
> > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> > index f037f3c..9225ba3 100644
> > --- a/include/hw/vfio/vfio-common.h
> > +++ b/include/hw/vfio/vfio-common.h
> > @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
> >      MemoryRegion *iommu;
> >      Notifier n;
> >      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> > +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
> >  } VFIOGuestIOMMU;
> > 
> >  typedef struct VFIODeviceOps VFIODeviceOps;
Alex Williamson April 11, 2016, 7:41 p.m. UTC | #4
On Mon, 11 Apr 2016 19:38:08 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Apr 11, 2016 at 09:35:15AM -0600, Alex Williamson wrote:
> > On Sat, 9 Apr 2016 21:03:38 +0300
> > "Aviv B.D." <bd.aviv@gmail.com> wrote:
> >   
> > > From: "Aviv Ben-David" <bd.aviv@gmail.com>
> > > Date: Tue, 23 Feb 2016 00:24:54 +0200
> > > Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
> > > 
> > > * Fix bug that prevent qemu from starting up with vIOMMU and VFIO
> > >   device are present.
> > > * Advertize Cache Mode capability in iommu cap register.
> > > * Register every VFIO device with IOMMU state.
> > > * On page cache invalidation in vIOMMU, check if the domain belong to
> > >   VFIO device and mirror the guest requests to host.
> > > 
> > > Changes from previous versions:
> > > * remove assumption that the cache do not clears
> > > * fix lock up on high load.
> > > * refactor vtd_get_did_dev to return success return code, and actual
> > > domain_id via argument.
> > > 
> > > Tested only on network cards (also with multiple cards at once).
> > > 
> > > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> > > ---
> > >  hw/i386/intel_iommu.c          | 113 +++++++++++++++++++++++++++++++++++------
> > >  hw/i386/intel_iommu_internal.h |   3 ++
> > >  hw/vfio/common.c               |  12 +++--
> > >  include/exec/memory.h          |   8 ++-
> > >  include/hw/i386/intel_iommu.h  |   4 ++
> > >  include/hw/vfio/vfio-common.h  |   1 +
> > >  6 files changed, 121 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index 347718f..a568181 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> > > VTD_DBGBIT(CSR);
> > >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> > >  #endif
> > > 
> > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > > +                                    uint8_t devfn, VTDContextEntry *ce);
> > > +
> > >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
> > >                              uint64_t wmask, uint64_t w1cmask)
> > >  {
> > > @@ -126,6 +129,22 @@ static uint32_t
> > > vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
> > >      return new_val;
> > >  }
> > > 
> > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> > > uint8_t devfn, uint16_t * domain_id)
> > > +{
> > > +    VTDContextEntry ce;
> > > +    int ret_fr;
> > > +
> > > +    assert(domain_id);
> > > +
> > > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > > +    if (ret_fr){
> > > +        return -1;
> > > +    }
> > > +
> > > +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> > > +    return 0;
> > > +}
> > > +
> > >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> > >                                          uint64_t clear, uint64_t mask)
> > >  {
> > > @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
> > > uint32_t level)
> > >  /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
> > >   * of the translation, can be used for deciding the size of large page.
> > >   */
> > > -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
> > > +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
> > > IOMMUAccessPermissions is_write,
> > >                              uint64_t *slptep, uint32_t *slpte_level,
> > >                              bool *reads, bool *writes)
> > >  {
> > > @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> > > uint64_t gpa, bool is_write,
> > >      }
> > > 
> > >      /* FIXME: what is the Atomics request here? */
> > > -    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
> > > +    switch(is_write){
> > > +        case IOMMU_WRITE:
> > > +            access_right_check = VTD_SL_W;
> > > +            break;
> > > +        case IOMMU_READ:
> > > +            access_right_check = VTD_SL_R;
> > > +            break;
> > > +        case IOMMU_ANY:
> > > +            access_right_check = VTD_SL_R | VTD_SL_W;
> > > +            break;
> > > +        default:
> > > +            assert(0);
> > > +    }
> > > 
> > >      while (true) {
> > >          offset = vtd_gpa_level_offset(gpa, level);
> > > @@ -711,9 +742,9 @@ static int
> > > vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > >      }
> > > 
> > >      if (!vtd_context_entry_present(ce)) {
> > > -        VTD_DPRINTF(GENERAL,
> > > +        /*VTD_DPRINTF(GENERAL,
> > >                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > > -                    "is not present", devfn, bus_num);
> > > +                    "is not present", devfn, bus_num);*/
> > >          return -VTD_FR_CONTEXT_ENTRY_P;
> > >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> > >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > > @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
> > >   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
> > >   */
> > >  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> > > -                                   uint8_t devfn, hwaddr addr, bool is_write,
> > > +                                   uint8_t devfn, hwaddr addr,
> > > IOMMUAccessPermissions is_write,
> > >                                     IOMMUTLBEntry *entry)
> > >  {
> > >      IntelIOMMUState *s = vtd_as->iommu_state;
> > > @@ -848,12 +879,14 @@ static void
> > > vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> > >          is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
> > >          if (ret_fr) {
> > >              ret_fr = -ret_fr;
> > > -            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > > -                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
> > > +            if (is_write != IOMMU_ANY){
> > > +                if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > > +                    VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
> > >                              "requests through this context-entry "
> > >                              "(with FPD Set)");
> > > -            } else {
> > > -                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> > > +                } else {
> > > +                    vtd_report_dmar_fault(s, source_id, addr, ret_fr,
> > > is_write);
> > > +                }
> > >              }
> > >              return;
> > >          }
> > > @@ -870,11 +903,13 @@ static void
> > > vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> > >                                &reads, &writes);
> > >      if (ret_fr) {
> > >          ret_fr = -ret_fr;
> > > -        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > > -            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
> > > +        if (is_write != IOMMU_ANY){
> > > +            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> > > +                VTD_DPRINTF(FLOG, "fault processing is disabled for
> > > DMA requests "
> > >                          "through this context-entry (with FPD Set)");
> > > -        } else {
> > > -            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> > > +            } else {
> > > +                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> > > +            }
> > >          }
> > >          return;
> > >      }
> > > @@ -1016,18 +1051,58 @@ static void
> > > vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> > >                                  &domain_id);
> > >  }
> > > 
> > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
> > > uint16_t domain_id,
> > > +                                      hwaddr addr, uint8_t am)
> > > +{
> > > +    VFIOGuestIOMMU * giommu;
> > > +
> > > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> > > VTDAddressSpace, iommu);
> > > +        uint16_t vfio_domain_id;
> > > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> > > vtd_as->devfn, &vfio_domain_id);
> > > +        int i=0;
> > > +        if (!ret && domain_id == vfio_domain_id){
> > > +            IOMMUTLBEntry entry;
> > > +
> > > +            /* do vfio unmap */
> > > +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> > > +            entry.target_as = NULL;
> > > +            entry.iova = addr & VTD_PAGE_MASK_4K;
> > > +            entry.translated_addr = 0;
> > > +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> > > +            entry.perm = IOMMU_NONE;
> > > +            memory_region_notify_iommu(giommu->iommu, entry);
> > > +
> > > +            /* do vfio map */
> > > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> > > +            /* call to vtd_iommu_translate */
> > > +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> > > +                IOMMUTLBEntry entry =
> > > s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
> > > +                if (entry.perm != IOMMU_NONE){
> > > +                    memory_region_notify_iommu(giommu->iommu, entry);
> > > +                }
> > > +            }
> > > +        }
> > > +    }
> > > +}
> > > +
> > >  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> > >                                        hwaddr addr, uint8_t am)
> > >  {
> > >      VTDIOTLBPageInvInfo info;
> > > 
> > >      assert(am <= VTD_MAMV);
> > > +
> > >      info.domain_id = domain_id;
> > >      info.addr = addr;
> > >      info.mask = ~((1 << am) - 1);
> > > +
> > >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> > > +
> > > +    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
> > >  }
> > > 
> > > +
> > >  /* Flush IOTLB
> > >   * Returns the IOTLB Actual Invalidation Granularity.
> > >   * @val: the content of the IOTLB_REG
> > > @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
> > >  }
> > > 
> > >  static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
> > > -                                         bool is_write)
> > > +                                         IOMMUAccessPermissions is_write)
> > >  {
> > >      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> > >      IntelIOMMUState *s = vtd_as->iommu_state;
> > > @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = {
> > >      DEFINE_PROP_END_OF_LIST(),
> > >  };
> > > 
> > > +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> > > +{
> > > +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> > > VTDAddressSpace, iommu);
> > > +    IntelIOMMUState *s = vtd_as->iommu_state;
> > > +
> > > +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> > > +}
> > > 
> > >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
> > >  {
> > > @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s)
> > >      s->iq_last_desc_type = VTD_INV_DESC_NONE;
> > >      s->next_frcd_reg = 0;
> > >      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> > > -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> > > +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
> > > +             VTD_CAP_CM;
> > >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> > > 
> > >      vtd_reset_context_cache(s);
> > > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> > > index e5f514c..102e9a5 100644
> > > --- a/hw/i386/intel_iommu_internal.h
> > > +++ b/hw/i386/intel_iommu_internal.h
> > > @@ -190,6 +190,7 @@
> > >  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
> > >  #define VTD_CAP_PSI                 (1ULL << 39)
> > >  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> > > +#define VTD_CAP_CM                  (1ULL << 7)
> > > 
> > >  /* Supported Adjusted Guest Address Widths */
> > >  #define VTD_CAP_SAGAW_SHIFT         8
> > > @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
> > >  #define VTD_PAGE_SHIFT_1G           30
> > >  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
> > > 
> > > +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
> > > +
> > >  struct VTDRootEntry {
> > >      uint64_t val;
> > >      uint64_t rsvd;
> > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > > index 607ec70..98c8d67 100644
> > > --- a/hw/vfio/common.c
> > > +++ b/hw/vfio/common.c
> > > @@ -32,6 +32,9 @@
> > >  #include "sysemu/kvm.h"
> > >  #include "trace.h"
> > > 
> > > +#include "hw/sysbus.h"
> > > +#include "hw/i386/intel_iommu.h"
> > > +
> > >  struct vfio_group_head vfio_group_list =
> > >      QLIST_HEAD_INITIALIZER(vfio_group_list);
> > >  struct vfio_as_head vfio_address_spaces =
> > > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
> > >  out:
> > >      rcu_read_unlock();
> > >  }
> > > -
> > > +#if 0
> > >  static hwaddr vfio_container_granularity(VFIOContainer *container)
> > >  {
> > >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
> > >  }
> > > -
> > > +#endif
> > >  static void vfio_listener_region_add(MemoryListener *listener,
> > >                                       MemoryRegionSection *section)
> > >  {
> > > @@ -344,6 +347,7 @@ static void
> > > vfio_listener_region_add(MemoryListener *listener,
> > >      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> > >      llend = int128_make64(section->offset_within_address_space);
> > >      llend = int128_add(llend, section->size);
> > > +    llend = int128_add(llend, int128_exts64(-1));
> > >      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> > > 
> > >      if (int128_ge(int128_make64(iova), llend)) {
> > > @@ -381,11 +385,13 @@ static void
> > > vfio_listener_region_add(MemoryListener *listener,
> > >          giommu->n.notify = vfio_iommu_map_notify;
> > >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> > > 
> > > +        vtd_register_giommu(giommu);
> > >          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> > > +#if 0
> > >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
> > >                                     vfio_container_granularity(container),
> > >                                     false);
> > > -
> > > +#endif  
> > 
> > Clearly replay has a purpose here and we can't assume the vIOMMU is
> > VT-d, we can't really move beyond an RFC without resolving those.
> > However we're still ignoring the issue that the existing vfio code
> > attempts to put all devices into a single IOMMU domain on the host,
> > something that is generally successful on x86.  So it seems like mostly
> > wishful thinking that's preventing conflicts between devices.  Wouldn't
> > the right thing to do be to create a separate container and thus IOMMU
> > domain for each vIOMMU managed device?  We could make a simplifying
> > assertion that to support vIOMMU, there must be a single device per
> > IOMMU group.  The vfio type1 interface is probably going to quickly
> > show limitations for this usage mode, not only in mapping performance,
> > but also in locked page accounting.  Type1 is really meant for largely
> > static mappings, it's going to be a bottleneck for dynamic mappings,
> > and if any containers are in "passthrough" mode, mapping all VM memory,
> > the vIOMMU domains are going to start hitting locked memory limits.  We
> > can probably move forward with those latter issues, but we absolutely
> > need multiple containers for correctness.  Thanks,
> > 
> > Alex  
> 
> I guess we could limit this to a single VFIO device, fail attempts to
> add more.  This might be an easier intermediate step than
> full multi-domain support.

Hmm, does this already work?  vfio_initfn() calls vfio_get_group() with
an AddressSpace found from pci_device_iommu_address_space().  This
will return a VTDAddressSpace when VT-d is enabled.  That will be an
address space unique to the {bus, devfn}.  vfio_get_group() then goes
on to get a container (IOMMU domain) for the group and AddressSpace.
So it looks like we have both the IOMMU domain per device as well as
the rejection of multiple devices within a group trying to share a
container with different address spaces.

I'll need to look more closely at how VT-d disabled mode works; I
suspect accounting will require a locked memory limit equal to VM
memory size * number of vfio devices initially, then drop as VT-d is
enabled (so long as passthrough mode is not enabled).  So for the 1
assigned device case, libvirt shouldn't know the difference.  Thanks,

Alex
Alex Williamson April 11, 2016, 8:25 p.m. UTC | #5
Some more detailed comments now that I have some faith that the host
IOMMU domain is working correctly...

On Sat, 9 Apr 2016 21:03:38 +0300
"Aviv B.D." <bd.aviv@gmail.com> wrote:

> From: "Aviv Ben-David" <bd.aviv@gmail.com>
> Date: Tue, 23 Feb 2016 00:24:54 +0200
> Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
> 
> * Fix bug that prevent qemu from starting up with vIOMMU and VFIO
>   device are present.
> * Advertize Cache Mode capability in iommu cap register.
> * Register every VFIO device with IOMMU state.
> * On page cache invalidation in vIOMMU, check if the domain belong to
>   VFIO device and mirror the guest requests to host.
> 
> Changes from previous versions:
> * remove assumption that the cache do not clears
> * fix lock up on high load.
> * refactor vtd_get_did_dev to return success return code, and actual
> domain_id via argument.
> 
> Tested only on network cards (also with multiple cards at once).
> 
> Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> ---
>  hw/i386/intel_iommu.c          | 113 +++++++++++++++++++++++++++++++++++------
>  hw/i386/intel_iommu_internal.h |   3 ++
>  hw/vfio/common.c               |  12 +++--
>  include/exec/memory.h          |   8 ++-
>  include/hw/i386/intel_iommu.h  |   4 ++
>  include/hw/vfio/vfio-common.h  |   1 +
>  6 files changed, 121 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 347718f..a568181 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT(CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
> 
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -126,6 +129,22 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>      return new_val;
>  }
> 
> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn, uint16_t * domain_id)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    assert(domain_id);
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr){
> +        return -1;
> +    }
> +
> +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> +    return 0;
> +}
> +
>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>                                          uint64_t clear, uint64_t mask)
>  {
> @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
> uint32_t level)
>  /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
>   * of the translation, can be used for deciding the size of large page.
>   */
> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
> IOMMUAccessPermissions is_write,

"is_write" is binary, yes/no, IOMMUAccessPermissions clearly has more
states.  This should change to "flags" or something and should use
existing IOMMUAccessFlags rather than defining something new.  This
should be done in a separate patch that doesn't introduce new
functionality otherwise.

>                              uint64_t *slptep, uint32_t *slpte_level,
>                              bool *reads, bool *writes)
>  {
> @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> uint64_t gpa, bool is_write,
>      }
> 
>      /* FIXME: what is the Atomics request here? */
> -    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
> +    switch(is_write){
> +        case IOMMU_WRITE:
> +            access_right_check = VTD_SL_W;
> +            break;
> +        case IOMMU_READ:
> +            access_right_check = VTD_SL_R;
> +            break;
> +        case IOMMU_ANY:
> +            access_right_check = VTD_SL_R | VTD_SL_W;
> +            break;
> +        default:
> +            assert(0);
> +    }
> 
>      while (true) {
>          offset = vtd_gpa_level_offset(gpa, level);
> @@ -711,9 +742,9 @@ static int
> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>      }
> 
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> +        /*VTD_DPRINTF(GENERAL,
>                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
> +                    "is not present", devfn, bus_num);*/


Leftover debug?

>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>   */
>  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> -                                   uint8_t devfn, hwaddr addr, bool is_write,
> +                                   uint8_t devfn, hwaddr addr,
> IOMMUAccessPermissions is_write,
>                                     IOMMUTLBEntry *entry)
>  {
>      IntelIOMMUState *s = vtd_as->iommu_state;
> @@ -848,12 +879,14 @@ static void
> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>          is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
>          if (ret_fr) {
>              ret_fr = -ret_fr;
> -            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> -                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
> +            if (is_write != IOMMU_ANY){

Is this debugging as well?  Seems like this hides the majority of
faults that might occur.

> +                if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> +                    VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
>                              "requests through this context-entry "
>                              "(with FPD Set)");
> -            } else {
> -                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +                } else {
> +                    vtd_report_dmar_fault(s, source_id, addr, ret_fr,
> is_write);
> +                }
>              }
>              return;
>          }
> @@ -870,11 +903,13 @@ static void
> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>                                &reads, &writes);
>      if (ret_fr) {
>          ret_fr = -ret_fr;
> -        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> -            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
> +        if (is_write != IOMMU_ANY){

Here as well, why only fault non-RW entries?

> +            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
> +                VTD_DPRINTF(FLOG, "fault processing is disabled for
> DMA requests "
>                          "through this context-entry (with FPD Set)");
> -        } else {
> -            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +            } else {
> +                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
> +            }
>          }
>          return;
>      }
> @@ -1016,18 +1051,58 @@ static void
> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>                                  &domain_id);
>  }
> 
> +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
> uint16_t domain_id,
> +                                      hwaddr addr, uint8_t am)
> +{
> +    VFIOGuestIOMMU * giommu;
> +
> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> +        uint16_t vfio_domain_id;
> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn, &vfio_domain_id);
> +        int i=0;
> +        if (!ret && domain_id == vfio_domain_id){
> +            IOMMUTLBEntry entry;
> +
> +            /* do vfio unmap */
> +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> +            entry.target_as = NULL;
> +            entry.iova = addr & VTD_PAGE_MASK_4K;
> +            entry.translated_addr = 0;
> +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> +            entry.perm = IOMMU_NONE;
> +            memory_region_notify_iommu(giommu->iommu, entry);
> +
> +            /* do vfio map */
> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> +                IOMMUTLBEntry entry =
> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
> +                if (entry.perm != IOMMU_NONE){
> +                    memory_region_notify_iommu(giommu->iommu, entry);
> +                }
> +            }
> +        }
> +    }
> +}
> +
>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>                                        hwaddr addr, uint8_t am)
>  {
>      VTDIOTLBPageInvInfo info;
> 
>      assert(am <= VTD_MAMV);
> +
>      info.domain_id = domain_id;
>      info.addr = addr;
>      info.mask = ~((1 << am) - 1);
> +
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> +
> +    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);

Why is this vfio related and why does it need to know about giommus?
That's vfio private data.  Notifies need to happen regardless of
whether there's a vfio device attached or not.  It seems like this is
just filling a gap that current VT-d code doesn't notify everywhere it
needs to, but it shouldn't know about vfio.

>  }
> 
> +
>  /* Flush IOTLB
>   * Returns the IOTLB Actual Invalidation Granularity.
>   * @val: the content of the IOTLB_REG
> @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>  }
> 
>  static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
> -                                         bool is_write)
> +                                         IOMMUAccessPermissions is_write)
>  {
>      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
>      IntelIOMMUState *s = vtd_as->iommu_state;
> @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
> 
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> +{
> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +
> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> +}

This function shouldn't be needed.

> 
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>  {
> @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s)
>      s->iq_last_desc_type = VTD_INV_DESC_NONE;
>      s->next_frcd_reg = 0;
>      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
> +             VTD_CAP_CM;

This should be a separate patch as well.

>      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> 
>      vtd_reset_context_cache(s);
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index e5f514c..102e9a5 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -190,6 +190,7 @@
>  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>  #define VTD_CAP_PSI                 (1ULL << 39)
>  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
> +#define VTD_CAP_CM                  (1ULL << 7)
> 
>  /* Supported Adjusted Guest Address Widths */
>  #define VTD_CAP_SAGAW_SHIFT         8
> @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
>  #define VTD_PAGE_SHIFT_1G           30
>  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
> 
> +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
> +
>  struct VTDRootEntry {
>      uint64_t val;
>      uint64_t rsvd;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 607ec70..98c8d67 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -32,6 +32,9 @@
>  #include "sysemu/kvm.h"
>  #include "trace.h"
> 
> +#include "hw/sysbus.h"
> +#include "hw/i386/intel_iommu.h"
> +
>  struct vfio_group_head vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>  struct vfio_as_head vfio_address_spaces =
> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
>  out:
>      rcu_read_unlock();
>  }
> -
> +#if 0
>  static hwaddr vfio_container_granularity(VFIOContainer *container)
>  {
>      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>  }
> -
> +#endif
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -344,6 +347,7 @@ static void
> vfio_listener_region_add(MemoryListener *listener,
>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>      llend = int128_make64(section->offset_within_address_space);
>      llend = int128_add(llend, section->size);
> +    llend = int128_add(llend, int128_exts64(-1));
>      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> 
>      if (int128_ge(int128_make64(iova), llend)) {
> @@ -381,11 +385,13 @@ static void
> vfio_listener_region_add(MemoryListener *listener,
>          giommu->n.notify = vfio_iommu_map_notify;
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> 
> +        vtd_register_giommu(giommu);
>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                     vfio_container_granularity(container),
>                                     false);
> -
> +#endif

AFAICT, none of the above vfio changes should be required.  The
overflow is already fixed in qemu.git, the giommu registration
shouldn't be necessary, the replay is probably not used, but shouldn't
be a problem either.  Not that there aren't vfio issues, but I think
they're internal, like how pages are accounted and map/unmap efficiency.

>          return;
>      }
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 2de7898..0e814ab 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -146,10 +146,14 @@ struct MemoryRegionOps {
>  };
> 
>  typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
> -
> +typedef enum IOMMUAccessPermissions{
> +    IOMMU_READ = 0,
> +    IOMMU_WRITE = 1,
> +    IOMMU_ANY = 2
> +} IOMMUAccessPermissions;
>  struct MemoryRegionIOMMUOps {
>      /* Return a TLB entry that contains a given address. */
> -    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool
> is_write);
> +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr,
> IOMMUAccessPermissions is_write);
>  };
> 
>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b024ffa..22f3f83 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -23,6 +23,7 @@
>  #define INTEL_IOMMU_H
>  #include "hw/qdev.h"
>  #include "sysemu/dma.h"
> +#include "hw/vfio/vfio-common.h"
> 
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>      MemoryRegionIOMMUOps iommu_ops;
>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by
> PCIBus* reference */
>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
> indexed by bus number */
> +
> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>  };
> 
>  /* Find the VTD Address space associated with the given bus pointer,
> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>   */
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
> 
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>  #endif

Needing to know anything about vfio is an indication that this
shouldn't be necessary.

> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f037f3c..9225ba3 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>      MemoryRegion *iommu;
>      Notifier n;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>  } VFIOGuestIOMMU;

This is clearly a layering violation, vt-d should not be managing a
list on a vfio data structure, especially one that it shouldn't even
have access to.  Thanks,

Alex
Aviv B.D. April 16, 2016, 3:08 p.m. UTC | #6
see comments below.

Thanks for your inputs,
Aviv

On Mon, Apr 11, 2016 at 6:40 AM, Peter Xu <peterx@redhat.com> wrote:
> Hi, Aviv,
>
> On Sat, Apr 09, 2016 at 09:03:38PM +0300, Aviv B.D. wrote:
>
> [...]
>
>> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn, uint16_t * domain_id)
>
> It seems that there are two lines above, however what I feel is that
> this is a long line splitted by the email client or something
> else... are you sending the patch using git-send-email? Not sure
> whether this would be a problem.
>
> I see the same thing in previous patches too.

You are correct, next time I'll use git-send-email.

>
> [...]
>
>> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>>   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>>   */
>>  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>> -                                   uint8_t devfn, hwaddr addr, bool is_write,
>> +                                   uint8_t devfn, hwaddr addr,
>> IOMMUAccessPermissions is_write,
>
> First of all, if we are to change this "is_write" into a permission,
> I would prefer change it's name too from "is_write" to "perm" or
> else, so that people would know this is not a boolean any more.
>
> Secondly, I assume you have not handled all the is_write uses below,
> right? So the code seems not consistent. Many places are still using
> it as boolean (I suppose).
>
You are correct, I'll fix those locations to use the correct enum.
The translate function returns entry.perm == IOMMU_NONE if no mapping
exists, and this is a valid case. Actually, the kernel may signal
that the hardware should invalidate a chunk of 4 consecutive pages
with one page in the middle that is in an invalid state. I want to skip
those pages (in the mapping stage of the function).

>
> - Here, we do the remap for every 4K, guess even with huge
>   pages. Better way to do? A stupid one from me: take special care
>   for am == 9, 18 cases?

You are correct; I want the code first of all to work correctly (even
if a bit slower than optimal). I'll try to include optimizations for
this in the next version (but currently I can't promise that I'll have
the time).

>
> - Is it possible that multiple devices use same domain ID? Not
>   sure. If so, we will always map/unmap the same address twice?
>

Yes. Domains can be shared by more than one device, but if QEMU assigns
those devices to the same domain on the host, the map and unmap will
happen only once. As far as I understand the VFIO code in QEMU, it
always tries to assign all of those devices to the same domain on the
host. Therefore this problem doesn't exist.

> [...]
>
>>  static void vfio_listener_region_add(MemoryListener *listener,
>>                                       MemoryRegionSection *section)
>>  {
>> @@ -344,6 +347,7 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>>      llend = int128_make64(section->offset_within_address_space);
>>      llend = int128_add(llend, section->size);
>> +    llend = int128_add(llend, int128_exts64(-1));
>
> It seems that Bandan has fixed this. Please try pull latest master
> branch and check commit 55efcc537d330. If so, maybe we need to
> rebase?

Will do.
IOMMU_ANY is a flag to suppress error reports to the guest kernel in
the case of a non-existing mapping, so I'm not sure that this enum
name correctly represents that intent.

>
> Thanks.
>
> -- peterx
Aviv B.D. April 16, 2016, 3:40 p.m. UTC | #7
See my comments below,
Thanks,
Aviv.

On Mon, Apr 11, 2016 at 11:25 PM, Alex Williamson
<alex.williamson@redhat.com> wrote:
>
> Some more detailed comments now that I have some faith that the host
> IOMMU domain is working correctly...
>
> On Sat, 9 Apr 2016 21:03:38 +0300
> "Aviv B.D." <bd.aviv@gmail.com> wrote:
>
>> From: "Aviv Ben-David" <bd.aviv@gmail.com>
>> Date: Tue, 23 Feb 2016 00:24:54 +0200
>> Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
>>
>> * Fix bug that prevent qemu from starting up with vIOMMU and VFIO
>>   device are present.
>> * Advertize Cache Mode capability in iommu cap register.
>> * Register every VFIO device with IOMMU state.
>> * On page cache invalidation in vIOMMU, check if the domain belong to
>>   VFIO device and mirror the guest requests to host.
>>
>> Changes from previous versions:
>> * remove assumption that the cache do not clears
>> * fix lock up on high load.
>> * refactor vtd_get_did_dev to return success return code, and actual
>> domain_id via argument.
>>
>> Tested only on network cards (also with multiple cards at once).
>>
>> Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
>> ---
>>  hw/i386/intel_iommu.c          | 113 +++++++++++++++++++++++++++++++++++------
>>  hw/i386/intel_iommu_internal.h |   3 ++
>>  hw/vfio/common.c               |  12 +++--
>>  include/exec/memory.h          |   8 ++-
>>  include/hw/i386/intel_iommu.h  |   4 ++
>>  include/hw/vfio/vfio-common.h  |   1 +
>>  6 files changed, 121 insertions(+), 20 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index 347718f..a568181 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
>> VTD_DBGBIT(CSR);
>>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>>  #endif
>>
>> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>> +                                    uint8_t devfn, VTDContextEntry *ce);
>> +
>>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>>                              uint64_t wmask, uint64_t w1cmask)
>>  {
>> @@ -126,6 +129,22 @@ static uint32_t
>> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>>      return new_val;
>>  }
>>
>> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn, uint16_t * domain_id)
>> +{
>> +    VTDContextEntry ce;
>> +    int ret_fr;
>> +
>> +    assert(domain_id);
>> +
>> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
>> +    if (ret_fr){
>> +        return -1;
>> +    }
>> +
>> +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
>> +    return 0;
>> +}
>> +
>>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>>                                          uint64_t clear, uint64_t mask)
>>  {
>> @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
>> uint32_t level)
>>  /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
>>   * of the translation, can be used for deciding the size of large page.
>>   */
>> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
>> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
>> IOMMUAccessPermissions is_write,
>
> "is_write" is binary, yes/no, IOMMUAccessPermissions clearly has more
> states.  This should change to "flags" or something and should use
> existing IOMMUAccessFlags rather than defining something new.  This
> should be done in a separate patch that doesn't introduce new
> functionality otherwise.

OK, I will do it.

>
>>                              uint64_t *slptep, uint32_t *slpte_level,
>>                              bool *reads, bool *writes)
>>  {
>> @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
>> uint64_t gpa, bool is_write,
>>      }
>>
>>      /* FIXME: what is the Atomics request here? */
>> -    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
>> +    switch(is_write){
>> +        case IOMMU_WRITE:
>> +            access_right_check = VTD_SL_W;
>> +            break;
>> +        case IOMMU_READ:
>> +            access_right_check = VTD_SL_R;
>> +            break;
>> +        case IOMMU_ANY:
>> +            access_right_check = VTD_SL_R | VTD_SL_W;
>> +            break;
>> +        default:
>> +            assert(0);
>> +    }
>>
>>      while (true) {
>>          offset = vtd_gpa_level_offset(gpa, level);
>> @@ -711,9 +742,9 @@ static int
>> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>>      }
>>
>>      if (!vtd_context_entry_present(ce)) {
>> -        VTD_DPRINTF(GENERAL,
>> +        /*VTD_DPRINTF(GENERAL,
>>                      "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
>> -                    "is not present", devfn, bus_num);
>> +                    "is not present", devfn, bus_num);*/
>
>
> Leftover debug?

Yes :/
I'll clear them...

>
>>          return -VTD_FR_CONTEXT_ENTRY_P;
>>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
>> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>>   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>>   */
>>  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>> -                                   uint8_t devfn, hwaddr addr, bool is_write,
>> +                                   uint8_t devfn, hwaddr addr,
>> IOMMUAccessPermissions is_write,
>>                                     IOMMUTLBEntry *entry)
>>  {
>>      IntelIOMMUState *s = vtd_as->iommu_state;
>> @@ -848,12 +879,14 @@ static void
>> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>>          is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
>>          if (ret_fr) {
>>              ret_fr = -ret_fr;
>> -            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> -                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
>> +            if (is_write != IOMMU_ANY){
>
> Is this debugging as well?  Seems like this hides the majority of
> faults that might occur.

No, this is actually the purpose of IOMMU_ANY - to suppress
translate's error reporting.
The guest kernel may issue an invalidation of several consecutive
pages, some of which may not be present.

>
>> +                if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> +                    VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
>>                              "requests through this context-entry "
>>                              "(with FPD Set)");
>> -            } else {
>> -                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
>> +                } else {
>> +                    vtd_report_dmar_fault(s, source_id, addr, ret_fr,
>> is_write);
>> +                }
>>              }
>>              return;
>>          }
>> @@ -870,11 +903,13 @@ static void
>> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>>                                &reads, &writes);
>>      if (ret_fr) {
>>          ret_fr = -ret_fr;
>> -        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> -            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
>> +        if (is_write != IOMMU_ANY){
>
> Here as well, why only fault non-RW entries?

Same as above; maybe the name IOMMU_ANY is misleading and should be
something like IOMMU_NO_FAIL...

>
>> +            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> +                VTD_DPRINTF(FLOG, "fault processing is disabled for
>> DMA requests "
>>                          "through this context-entry (with FPD Set)");
>> -        } else {
>> -            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
>> +            } else {
>> +                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
>> +            }
>>          }
>>          return;
>>      }
>> @@ -1016,18 +1051,58 @@ static void
>> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>>                                  &domain_id);
>>  }
>>
>> +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
>> uint16_t domain_id,
>> +                                      hwaddr addr, uint8_t am)
>> +{
>> +    VFIOGuestIOMMU * giommu;
>> +
>> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
>> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>> VTDAddressSpace, iommu);
>> +        uint16_t vfio_domain_id;
>> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
>> vtd_as->devfn, &vfio_domain_id);
>> +        int i=0;
>> +        if (!ret && domain_id == vfio_domain_id){
>> +            IOMMUTLBEntry entry;
>> +
>> +            /* do vfio unmap */
>> +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
>> +            entry.target_as = NULL;
>> +            entry.iova = addr & VTD_PAGE_MASK_4K;
>> +            entry.translated_addr = 0;
>> +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
>> +            entry.perm = IOMMU_NONE;
>> +            memory_region_notify_iommu(giommu->iommu, entry);
>> +
>> +            /* do vfio map */
>> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
>> +            /* call to vtd_iommu_translate */
>> +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
>> +                IOMMUTLBEntry entry =
>> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
>> +                if (entry.perm != IOMMU_NONE){
>> +                    memory_region_notify_iommu(giommu->iommu, entry);
>> +                }
>> +            }
>> +        }
>> +    }
>> +}
>> +
>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>                                        hwaddr addr, uint8_t am)
>>  {
>>      VTDIOTLBPageInvInfo info;
>>
>>      assert(am <= VTD_MAMV);
>> +
>>      info.domain_id = domain_id;
>>      info.addr = addr;
>>      info.mask = ~((1 << am) - 1);
>> +
>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
>> +
>> +    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
>
> Why is this vfio related and why does it need to know about giommus?
> That's vfio private data.  Notifies need to happen regardless of
> whether there's a vfio device attached or not.  It seems like this is
> just filling a gap that current VT-d code doesn't notify everywhere it
> needs to, but it shouldn't know about vfio.

Noted, I'll try to change them.

>
>>  }
>>
>> +
>>  /* Flush IOTLB
>>   * Returns the IOTLB Actual Invalidation Granularity.
>>   * @val: the content of the IOTLB_REG
>> @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>>  }
>>
>>  static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
>> -                                         bool is_write)
>> +                                         IOMMUAccessPermissions is_write)
>>  {
>>      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
>>      IntelIOMMUState *s = vtd_as->iommu_state;
>> @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = {
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>
>> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
>> +{
>> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>> VTDAddressSpace, iommu);
>> +    IntelIOMMUState *s = vtd_as->iommu_state;
>> +
>> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
>> +}
>
> This function shouldn't be needed.
>
>>
>>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>>  {
>> @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s)
>>      s->iq_last_desc_type = VTD_INV_DESC_NONE;
>>      s->next_frcd_reg = 0;
>>      s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
>> -             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
>> +             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
>> +             VTD_CAP_CM;
>
> This should be a separate patch as well.
Noted.
>
>>      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>>
>>      vtd_reset_context_cache(s);
>> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
>> index e5f514c..102e9a5 100644
>> --- a/hw/i386/intel_iommu_internal.h
>> +++ b/hw/i386/intel_iommu_internal.h
>> @@ -190,6 +190,7 @@
>>  #define VTD_CAP_MAMV                (VTD_MAMV << 48)
>>  #define VTD_CAP_PSI                 (1ULL << 39)
>>  #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
>> +#define VTD_CAP_CM                  (1ULL << 7)
>>
>>  /* Supported Adjusted Guest Address Widths */
>>  #define VTD_CAP_SAGAW_SHIFT         8
>> @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
>>  #define VTD_PAGE_SHIFT_1G           30
>>  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
>>
>> +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
>> +
>>  struct VTDRootEntry {
>>      uint64_t val;
>>      uint64_t rsvd;
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 607ec70..98c8d67 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -32,6 +32,9 @@
>>  #include "sysemu/kvm.h"
>>  #include "trace.h"
>>
>> +#include "hw/sysbus.h"
>> +#include "hw/i386/intel_iommu.h"
>> +
>>  struct vfio_group_head vfio_group_list =
>>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>>  struct vfio_as_head vfio_address_spaces =
>> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
>>  out:
>>      rcu_read_unlock();
>>  }
>> -
>> +#if 0
>>  static hwaddr vfio_container_granularity(VFIOContainer *container)
>>  {
>>      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>>  }
>> -
>> +#endif
>>  static void vfio_listener_region_add(MemoryListener *listener,
>>                                       MemoryRegionSection *section)
>>  {
>> @@ -344,6 +347,7 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>>      llend = int128_make64(section->offset_within_address_space);
>>      llend = int128_add(llend, section->size);
>> +    llend = int128_add(llend, int128_exts64(-1));
>>      llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>>
>>      if (int128_ge(int128_make64(iova), llend)) {
>> @@ -381,11 +385,13 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>          giommu->n.notify = vfio_iommu_map_notify;
>>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>>
>> +        vtd_register_giommu(giommu);
>>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
>> +#if 0
>>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>>                                     vfio_container_granularity(container),
>>                                     false);
>> -
>> +#endif
>
> AFAICT, none of the above vfio changes should be required.  The
> overflow is already fixed in qemu.git, the giommu registration
> shouldn't be necessary, the replay is probably not used, but shouldn't
> be a problem either.  Not that there aren't vfio issues, but I think
> they're internal, like how pages are accounted and map/unmap efficiency.
>
>>          return;
>>      }
>>
>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>> index 2de7898..0e814ab 100644
>> --- a/include/exec/memory.h
>> +++ b/include/exec/memory.h
>> @@ -146,10 +146,14 @@ struct MemoryRegionOps {
>>  };
>>
>>  typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
>> -
>> +typedef enum IOMMUAccessPermissions{
>> +    IOMMU_READ = 0,
>> +    IOMMU_WRITE = 1,
>> +    IOMMU_ANY = 2
>> +} IOMMUAccessPermissions;
>>  struct MemoryRegionIOMMUOps {
>>      /* Return a TLB entry that contains a given address. */
>> -    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool
>> is_write);
>> +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr,
>> IOMMUAccessPermissions is_write);
>>  };
>>
>>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
>> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
>> index b024ffa..22f3f83 100644
>> --- a/include/hw/i386/intel_iommu.h
>> +++ b/include/hw/i386/intel_iommu.h
>> @@ -23,6 +23,7 @@
>>  #define INTEL_IOMMU_H
>>  #include "hw/qdev.h"
>>  #include "sysemu/dma.h"
>> +#include "hw/vfio/vfio-common.h"
>>
>>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>>  #define INTEL_IOMMU_DEVICE(obj) \
>> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>>      MemoryRegionIOMMUOps iommu_ops;
>>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by
>> PCIBus* reference */
>>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
>> indexed by bus number */
>> +
>> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>>  };
>>
>>  /* Find the VTD Address space associated with the given bus pointer,
>> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>>   */
>>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
>>
>> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>>  #endif
>
> Needing to know anything about vfio is an indication that this
> shouldn't be necessary.
>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index f037f3c..9225ba3 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>>      MemoryRegion *iommu;
>>      Notifier n;
>>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>>  } VFIOGuestIOMMU;
>
> This is clearly a layering violation, vt-d should not be managing a
> list on a vfio data structure, especially one that it shouldn't even
> have access to.  Thanks,

As above, I'll try to separate them.

>
> Alex
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 347718f..a568181 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +43,9 @@  static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif

+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+                                    uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                             uint64_t wmask, uint64_t w1cmask)
 {
@@ -126,6 +129,22 @@  static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
     return new_val;
 }

+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id)
+{
+    VTDContextEntry ce;
+    int ret_fr;
+
+    assert(domain_id);
+
+    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (ret_fr){
+        return -1;
+    }
+
+    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+    return 0;
+}
+
 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
                                         uint64_t clear, uint64_t mask)
 {
@@ -621,7 +640,7 @@  static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
IOMMUAccessPermissions is_write,
                             uint64_t *slptep, uint32_t *slpte_level,
                             bool *reads, bool *writes)
 {
@@ -641,7 +660,19 @@  static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
     }

     /* FIXME: what is the Atomics request here? */
-    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+    switch(is_write){
+        case IOMMU_WRITE:
+            access_right_check = VTD_SL_W;
+            break;
+        case IOMMU_READ:
+            access_right_check = VTD_SL_R;
+            break;
+        case IOMMU_ANY:
+            access_right_check = VTD_SL_R | VTD_SL_W;
+            break;
+        default:
+            assert(0);
+    }

     while (true) {
         offset = vtd_gpa_level_offset(gpa, level);
@@ -711,9 +742,9 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     }

     if (!vtd_context_entry_present(ce)) {
-        VTD_DPRINTF(GENERAL,
+        /*VTD_DPRINTF(GENERAL,
                     "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-                    "is not present", devfn, bus_num);
+                    "is not present", devfn, bus_num);*/
         return -VTD_FR_CONTEXT_ENTRY_P;
     } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
                (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -785,7 +816,7 @@  static inline bool vtd_is_interrupt_addr(hwaddr addr)
  * @entry: IOMMUTLBEntry that contain the addr to be translated and result
  */
 static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
-                                   uint8_t devfn, hwaddr addr, bool is_write,
+                                   uint8_t devfn, hwaddr addr, IOMMUAccessPermissions is_write,
                                    IOMMUTLBEntry *entry)
 {
     IntelIOMMUState *s = vtd_as->iommu_state;
@@ -848,12 +879,14 @@  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
         if (ret_fr) {
             ret_fr = -ret_fr;
-            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
-                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
+            if (is_write != IOMMU_ANY){
+                if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
+                    VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
                             "requests through this context-entry "
                             "(with FPD Set)");
-            } else {
-                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
+                } else {
+                    vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
+                }
             }
             return;
         }
@@ -870,11 +903,13 @@  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                               &reads, &writes);
     if (ret_fr) {
         ret_fr = -ret_fr;
-        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
-            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
+        if (is_write != IOMMU_ANY){
+            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
+                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
                         "through this context-entry (with FPD Set)");
-        } else {
-            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
+            } else {
+                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
+            }
         }
         return;
     }
@@ -1016,18 +1051,58 @@  static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
                                 &domain_id);
 }

+static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id,
+                                      hwaddr addr, uint8_t am)
+{
+    VFIOGuestIOMMU * giommu;
+
+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
+        uint16_t vfio_domain_id;
+        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id);
+        int i=0;
+        if (!ret && domain_id == vfio_domain_id){
+            IOMMUTLBEntry entry;
+
+            /* do vfio unmap */
+            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
+            entry.target_as = NULL;
+            entry.iova = addr & VTD_PAGE_MASK_4K;
+            entry.translated_addr = 0;
+            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+            entry.perm = IOMMU_NONE;
+            memory_region_notify_iommu(giommu->iommu, entry);
+
+            /* do vfio map */
+            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
+            /* call to vtd_iommu_translate */
+            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
+                IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
+                if (entry.perm != IOMMU_NONE){
+                    memory_region_notify_iommu(giommu->iommu, entry);
+                }
+            }
+        }
+    }
+}
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
                                       hwaddr addr, uint8_t am)
 {
     VTDIOTLBPageInvInfo info;

     assert(am <= VTD_MAMV);
+
     info.domain_id = domain_id;
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
+
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+
+    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
 }

+
 /* Flush IOTLB
  * Returns the IOTLB Actual Invalidation Granularity.
  * @val: the content of the IOTLB_REG
@@ -1840,7 +1915,7 @@  static void vtd_mem_write(void *opaque, hwaddr addr,
 }

 static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
-                                         bool is_write)
+                                         IOMMUAccessPermissions is_write)
 {
     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
     IntelIOMMUState *s = vtd_as->iommu_state;
@@ -1895,6 +1970,13 @@  static Property vtd_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };

+void vtd_register_giommu(VFIOGuestIOMMU * giommu)
+{
+    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
+    IntelIOMMUState *s = vtd_as->iommu_state;
+
+    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
+}

 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
 {
@@ -1949,7 +2031,8 @@  static void vtd_init(IntelIOMMUState *s)
     s->iq_last_desc_type = VTD_INV_DESC_NONE;
     s->next_frcd_reg = 0;
     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
-             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
+             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
+             VTD_CAP_CM;
     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;

     vtd_reset_context_cache(s);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index e5f514c..102e9a5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,6 +190,7 @@ 
 #define VTD_CAP_MAMV                (VTD_MAMV << 48)
 #define VTD_CAP_PSI                 (1ULL << 39)
 #define VTD_CAP_SLLPS               ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM                  (1ULL << 7)

 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT         8
@@ -338,6 +339,8 @@  typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
 #define VTD_PAGE_SHIFT_1G           30
 #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))

+#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
+
 struct VTDRootEntry {
     uint64_t val;
     uint64_t rsvd;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 607ec70..98c8d67 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -32,6 +32,9 @@ 
 #include "sysemu/kvm.h"
 #include "trace.h"

+#include "hw/sysbus.h"
+#include "hw/i386/intel_iommu.h"
+
 struct vfio_group_head vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
 struct vfio_as_head vfio_address_spaces =
@@ -312,12 +315,12 @@  static void vfio_iommu_map_notify(Notifier *n, void *data)
 out:
     rcu_read_unlock();
 }
-
+#if 0
 static hwaddr vfio_container_granularity(VFIOContainer *container)
 {
     return (hwaddr)1 << ctz64(container->iova_pgsizes);
 }
-
+#endif
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -344,6 +347,7 @@  static void vfio_listener_region_add(MemoryListener *listener,
     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
     llend = int128_make64(section->offset_within_address_space);
     llend = int128_add(llend, section->size);
+    llend = int128_add(llend, int128_exts64(-1));
     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

     if (int128_ge(int128_make64(iova), llend)) {
@@ -381,11 +385,13 @@  static void vfio_listener_region_add(MemoryListener *listener,
         giommu->n.notify = vfio_iommu_map_notify;
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);

+        vtd_register_giommu(giommu);
         memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+#if 0
         memory_region_iommu_replay(giommu->iommu, &giommu->n,
                                    vfio_container_granularity(container),
                                    false);
-
+#endif
         return;
     }

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 2de7898..0e814ab 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -146,10 +146,14 @@  struct MemoryRegionOps {
 };

 typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
-
+typedef enum IOMMUAccessPermissions{
+    IOMMU_READ = 0,
+    IOMMU_WRITE = 1,
+    IOMMU_ANY = 2
+} IOMMUAccessPermissions;
 struct MemoryRegionIOMMUOps {
     /* Return a TLB entry that contains a given address. */
-    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write);
+    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, IOMMUAccessPermissions is_write);
 };

 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b024ffa..22f3f83 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -23,6 +23,7 @@ 
 #define INTEL_IOMMU_H
 #include "hw/qdev.h"
 #include "sysemu/dma.h"
+#include "hw/vfio/vfio-common.h"

 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -123,6 +124,8 @@  struct IntelIOMMUState {
     MemoryRegionIOMMUOps iommu_ops;
     GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
     VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
+
+    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
 };

 /* Find the VTD Address space associated with the given bus pointer,
@@ -130,4 +133,5 @@  struct IntelIOMMUState {
  */
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);

+void vtd_register_giommu(VFIOGuestIOMMU * giommu);
 #endif
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f037f3c..9225ba3 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -82,6 +82,7 @@  typedef struct VFIOGuestIOMMU {
     MemoryRegion *iommu;
     Notifier n;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
+    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
 } VFIOGuestIOMMU;

 typedef struct VFIODeviceOps VFIODeviceOps;