diff mbox

[v4,RESEND,3/3] IOMMU: enable intel_iommu map and unmap notifiers

Message ID 1476719064-9242-4-git-send-email-bd.aviv@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Aviv B.D. Oct. 17, 2016, 3:44 p.m. UTC
From: "Aviv Ben-David" <bd.aviv@gmail.com>

Adds a list of registered vtd_as's to intel iommu state to save
iteration over each PCI device in a search of the corrosponding domain.

Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
---
 hw/i386/intel_iommu.c          | 102 ++++++++++++++++++++++++++++++++++++++---
 hw/i386/intel_iommu_internal.h |   2 +
 include/hw/i386/intel_iommu.h  |   9 ++++
 3 files changed, 106 insertions(+), 7 deletions(-)

Comments

David Gibson Oct. 18, 2016, 4:04 a.m. UTC | #1
On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:
> From: "Aviv Ben-David" <bd.aviv@gmail.com>
> 
> Adds a list of registered vtd_as's to intel iommu state to save
> iteration over each PCI device in a search of the corrosponding domain.
> 
> Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> ---
>  hw/i386/intel_iommu.c          | 102 ++++++++++++++++++++++++++++++++++++++---
>  hw/i386/intel_iommu_internal.h |   2 +
>  include/hw/i386/intel_iommu.h  |   9 ++++
>  3 files changed, 106 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index dcf45f0..34fc1e8 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
>  
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>      return new_val;
>  }
>  
> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
> +                           uint16_t *domain_id)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    assert(domain_id);
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr) {
> +        return -1;
> +    }
> +
> +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> +    return 0;
> +}
> +
>  /* GHashTable functions */
>  static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
>  {
> @@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
>          *reads = (*reads) && (slpte & VTD_SL_R);
>          *writes = (*writes) && (slpte & VTD_SL_W);
>          if (!(slpte & access_right_check)) {
> -            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
> -                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
> -                        (flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
>              return (flags == IOMMU_RW || flags == IOMMU_WO) ?
>                     -VTD_FR_WRITE : -VTD_FR_READ;
>          }
> @@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>      }
>  
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -1065,6 +1079,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>                                  &domain_id);
>  }
>  
> +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> +                                           uint16_t domain_id, hwaddr addr,
> +                                           uint8_t am)
> +{
> +    IntelIOMMUNotifierNode *node;
> +
> +    QLIST_FOREACH(node, &(s->notifiers_list), next) {

It's not really obvious to me why you need this additional list of
IntelIOMMUNotifierNode structures, rather than just the notifier list
already built into each MemoryRegion.

> +        VTDAddressSpace *vtd_as = node->vtd_as;
> +        uint16_t vfio_domain_id;
> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
> +                                  &vfio_domain_id);
> +        if (!ret && domain_id == vfio_domain_id) {
> +            IOMMUTLBEntry entry;
> +
> +            /* notify unmap */
> +            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
> +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
> +                            addr, am);
> +                entry.target_as = &address_space_memory;
> +                entry.iova = addr & VTD_PAGE_MASK_4K;
> +                entry.translated_addr = 0;
> +                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> +                entry.perm = IOMMU_NONE;
> +                memory_region_notify_iommu(&node->vtd_as->iommu, entry);
> +            }
> +
> +            /* notify map */
> +            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> +                hwaddr original_addr = addr;
> +                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +                while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
> +                    /* call to vtd_iommu_translate */
> +                    IOMMUTLBEntry entry = s->iommu_ops.translate(
> +                                                         &node->vtd_as->iommu,
> +                                                         addr,
> +                                                         IOMMU_NO_FAIL);
> +                    if (entry.perm != IOMMU_NONE) {
> +                        addr += entry.addr_mask + 1;
> +                        memory_region_notify_iommu(&node->vtd_as->iommu, entry);
> +                    } else {
> +                        addr += VTD_PAGE_SIZE;
> +                    }
> +                }
> +            }
> +        }
> +    }
> +}
> +
> +
>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>                                        hwaddr addr, uint8_t am)
>  {
> @@ -1075,6 +1138,8 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>      info.addr = addr;
>      info.mask = ~((1 << am) - 1);
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> +
> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
>  }
>  
>  /* Flush IOTLB
> @@ -2000,8 +2065,10 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
>                                            IOMMUNotifierFlag new)
>  {
>      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    IntelIOMMUNotifierNode *node = NULL;
>  
> -    if (new & IOMMU_NOTIFIER_MAP) {
> +    if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
>          error_report("Device at bus %s addr %02x.%d requires iommu "
>                       "notifier which is currently not supported by "
>                       "intel-iommu emulation",
> @@ -2009,6 +2076,27 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
>                       PCI_FUNC(vtd_as->devfn));
>          exit(1);
>      }
> +
> +    /* Add new ndoe if no mapping was exising before this call */
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        node = g_malloc0(sizeof(*node));
> +        node->vtd_as = vtd_as;
> +        node->notifier_flag = new;
> +        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
> +        return;
> +    }
> +
> +    /* update notifier node with new flags */
> +    QLIST_FOREACH(node, &s->notifiers_list, next) {
> +        if (node->vtd_as == vtd_as) {
> +            if (new == IOMMU_NOTIFIER_NONE) {
> +                QLIST_REMOVE(node, next);
> +            } else {
> +                node->notifier_flag = new;
> +            }
> +            return;
> +        }
> +    }
>  }
>  
>  static const VMStateDescription vtd_vmstate = {
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 35d9f3a..96df4ae 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -381,6 +381,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
>  #define VTD_PAGE_SHIFT_1G           30
>  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
>  
> +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
> +
>  struct VTDRootEntry {
>      uint64_t val;
>      uint64_t rsvd;
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 7a94f16..c160706 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -63,6 +63,7 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
>  typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
>  typedef struct VTDIrq VTDIrq;
>  typedef struct VTD_MSIMessage VTD_MSIMessage;
> +typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode;
>  
>  /* Context-Entry */
>  struct VTDContextEntry {
> @@ -248,6 +249,12 @@ struct VTD_MSIMessage {
>  /* When IR is enabled, all MSI/MSI-X data bits should be zero */
>  #define VTD_IR_MSI_DATA          (0)
>  
> +struct IntelIOMMUNotifierNode {
> +    VTDAddressSpace *vtd_as;
> +    IOMMUNotifierFlag notifier_flag;
> +    QLIST_ENTRY(IntelIOMMUNotifierNode) next;
> +};
> +
>  /* The iommu (DMAR) device state struct */
>  struct IntelIOMMUState {
>      X86IOMMUState x86_iommu;
> @@ -285,6 +292,8 @@ struct IntelIOMMUState {
>      MemoryRegionIOMMUOps iommu_ops;
>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
> +    /* list of registered notifiers */
> +    QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list;
>  
>      /* interrupt remapping */
>      bool intr_enabled;              /* Whether guest enabled IR */
Peter Xu Oct. 19, 2016, 9:33 a.m. UTC | #2
On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:
> From: "Aviv Ben-David" <bd.aviv@gmail.com>
> 
> Adds a list of registered vtd_as's to intel iommu state to save
> iteration over each PCI device in a search of the corrosponding domain.
> 
> Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> ---
>  hw/i386/intel_iommu.c          | 102 ++++++++++++++++++++++++++++++++++++++---
>  hw/i386/intel_iommu_internal.h |   2 +
>  include/hw/i386/intel_iommu.h  |   9 ++++
>  3 files changed, 106 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index dcf45f0..34fc1e8 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
>  
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>      return new_val;
>  }
>  
> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
> +                           uint16_t *domain_id)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    assert(domain_id);
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr) {
> +        return -1;
> +    }
> +
> +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
                    ^ one more space

> +    return 0;
> +}
> +
>  /* GHashTable functions */
>  static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
>  {
> @@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
>          *reads = (*reads) && (slpte & VTD_SL_R);
>          *writes = (*writes) && (slpte & VTD_SL_W);
>          if (!(slpte & access_right_check)) {
> -            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
> -                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
> -                        (flags == IOMMU_WO ? "write" : "read"), gpa, slpte);

Could I ask why we are removing these lines? It can be useful if we
have permission issues.

>              return (flags == IOMMU_RW || flags == IOMMU_WO) ?
>                     -VTD_FR_WRITE : -VTD_FR_READ;
>          }
> @@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>      }
>  
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);

Here as well. Any reason to remove it?

>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -1065,6 +1079,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>                                  &domain_id);
>  }
>  
> +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> +                                           uint16_t domain_id, hwaddr addr,
> +                                           uint8_t am)
> +{

The logic of this function looks strange to me.

> +    IntelIOMMUNotifierNode *node;
> +
> +    QLIST_FOREACH(node, &(s->notifiers_list), next) {
> +        VTDAddressSpace *vtd_as = node->vtd_as;
> +        uint16_t vfio_domain_id;
> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
> +                                  &vfio_domain_id);
> +        if (!ret && domain_id == vfio_domain_id) {
> +            IOMMUTLBEntry entry;
> +
> +            /* notify unmap */
> +            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {

First of all, if we are talking about VFIO, notifier_flag should
always be MAP|UNMAP. So in that case, for newly mapped entries, looks
like we will first send an UNMAP, then a MAP?

> +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
> +                            addr, am);
> +                entry.target_as = &address_space_memory;
> +                entry.iova = addr & VTD_PAGE_MASK_4K;
> +                entry.translated_addr = 0;
> +                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> +                entry.perm = IOMMU_NONE;
> +                memory_region_notify_iommu(&node->vtd_as->iommu, entry);
> +            }
> +
> +            /* notify map */
> +            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> +                hwaddr original_addr = addr;
> +                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +                while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
> +                    /* call to vtd_iommu_translate */
> +                    IOMMUTLBEntry entry = s->iommu_ops.translate(
> +                                                         &node->vtd_as->iommu,
> +                                                         addr,
> +                                                         IOMMU_NO_FAIL);
> +                    if (entry.perm != IOMMU_NONE) {
> +                        addr += entry.addr_mask + 1;
> +                        memory_region_notify_iommu(&node->vtd_as->iommu, entry);
> +                    } else {
> +                        addr += VTD_PAGE_SIZE;

IIUC, here is the point that we found "the page is gone" (so this is
an UNMAP invalidation), and we should do memory_region_iommu_notify()
for the whole area with IOMMU_NONE. Then we just quit the loop since
continuous translate()s should fail as well if the first page is
missing.

Please correct if I am wrong.

Thanks,

-- peterx
Peter Xu Oct. 20, 2016, 7:28 a.m. UTC | #3
On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:

[...]

> @@ -2000,8 +2065,10 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
>                                            IOMMUNotifierFlag new)
>  {
>      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    IntelIOMMUNotifierNode *node = NULL;
>  
> -    if (new & IOMMU_NOTIFIER_MAP) {
> +    if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
>          error_report("Device at bus %s addr %02x.%d requires iommu "
>                       "notifier which is currently not supported by "
>                       "intel-iommu emulation",

Here after the patch works, we can modify the warning message into
something like:

"We need to set cache_mode=1 for intel-iommu to enable device
assignment with IOMMU protection."

> @@ -2009,6 +2076,27 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
>                       PCI_FUNC(vtd_as->devfn));
>          exit(1);
>      }
> +
> +    /* Add new ndoe if no mapping was exising before this call */
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        node = g_malloc0(sizeof(*node));
> +        node->vtd_as = vtd_as;
> +        node->notifier_flag = new;
> +        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
> +        return;
> +    }
> +
> +    /* update notifier node with new flags */
> +    QLIST_FOREACH(node, &s->notifiers_list, next) {

Though in this case it is safe, I would still suggest we use
QLIST_FOREACH_SAFE here.

> +        if (node->vtd_as == vtd_as) {
> +            if (new == IOMMU_NOTIFIER_NONE) {
> +                QLIST_REMOVE(node, next);

Memory leak here?

Thanks,

-- peterx
Aviv B.D. Oct. 20, 2016, 7:11 p.m. UTC | #4
On Wed, Oct 19, 2016 at 12:33 PM, Peter Xu <peterx@redhat.com> wrote:

> On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:
> > From: "Aviv Ben-David" <bd.aviv@gmail.com>
> >
> > Adds a list of registered vtd_as's to intel iommu state to save
> > iteration over each PCI device in a search of the corrosponding domain.
> >
> > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> > ---
> >  hw/i386/intel_iommu.c          | 102 ++++++++++++++++++++++++++++++
> ++++++++---
> >  hw/i386/intel_iommu_internal.h |   2 +
> >  include/hw/i386/intel_iommu.h  |   9 ++++
> >  3 files changed, 106 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index dcf45f0..34fc1e8 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT(CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t
> bus_num,
> > +                                    uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> >                              uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState
> *s, hwaddr addr,
> >      return new_val;
> >  }
> >
> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn,
> > +                           uint16_t *domain_id)
> > +{
> > +    VTDContextEntry ce;
> > +    int ret_fr;
> > +
> > +    assert(domain_id);
> > +
> > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +    if (ret_fr) {
> > +        return -1;
> > +    }
> > +
> > +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
>                     ^ one more space
>
> > +    return 0;
> > +}
> > +
> >  /* GHashTable functions */
> >  static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
> >  {
> > @@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> uint64_t gpa,
> >          *reads = (*reads) && (slpte & VTD_SL_R);
> >          *writes = (*writes) && (slpte & VTD_SL_W);
> >          if (!(slpte & access_right_check)) {
> > -            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
> > -                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
> > -                        (flags == IOMMU_WO ? "write" : "read"), gpa,
> slpte);
>
> Could I ask why we are removing these lines? It can be useful if we
> have permission issues.
>

I will return Those lines if flags & NO_FAIL == 0

>
> >              return (flags == IOMMU_RW || flags == IOMMU_WO) ?
> >                     -VTD_FR_WRITE : -VTD_FR_READ;
> >          }
> > @@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
> *s, uint8_t bus_num,
> >      }
> >
> >      if (!vtd_context_entry_present(ce)) {
> > -        VTD_DPRINTF(GENERAL,
> > -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -                    "is not present", devfn, bus_num);
>
> Here as well. Any reason to remove it?
>
>
Here as well...


> >          return -VTD_FR_CONTEXT_ENTRY_P;
> >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1065,6 +1079,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState
> *s, uint16_t domain_id)
> >                                  &domain_id);
> >  }
> >
> > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> > +                                           uint16_t domain_id, hwaddr
> addr,
> > +                                           uint8_t am)
> > +{
>
> The logic of this function looks strange to me.
>
> > +    IntelIOMMUNotifierNode *node;
> > +
> > +    QLIST_FOREACH(node, &(s->notifiers_list), next) {
> > +        VTDAddressSpace *vtd_as = node->vtd_as;
> > +        uint16_t vfio_domain_id;
> > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn,
> > +                                  &vfio_domain_id);
> > +        if (!ret && domain_id == vfio_domain_id) {
> > +            IOMMUTLBEntry entry;
> > +
> > +            /* notify unmap */
> > +            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
>
> First of all, if we are talking about VFIO, notifier_flag should
> always be MAP|UNMAP. So in that case, for newly mapped entries, looks
> like we will first send an UNMAP, then a MAP?
>

You are correct, there is no valid reason to have notifier_flag other than
MAP|UNMAP
at least for VFIO.
I'm not sure if in the feature there won't be good reason to do otherwise,
so my
code support this scenario...


> > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
> > +                            addr, am);
> > +                entry.target_as = &address_space_memory;
> > +                entry.iova = addr & VTD_PAGE_MASK_4K;
> > +                entry.translated_addr = 0;
> > +                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K +
> am);
> > +                entry.perm = IOMMU_NONE;
> > +                memory_region_notify_iommu(&node->vtd_as->iommu,
> entry);
> > +            }
> > +
> > +            /* notify map */
> > +            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> > +                hwaddr original_addr = addr;
> > +                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d",
> addr, am);
> > +                while (addr < original_addr + (1 << am) *
> VTD_PAGE_SIZE) {
> > +                    /* call to vtd_iommu_translate */
> > +                    IOMMUTLBEntry entry = s->iommu_ops.translate(
> > +
>  &node->vtd_as->iommu,
> > +                                                         addr,
> > +                                                         IOMMU_NO_FAIL);
> > +                    if (entry.perm != IOMMU_NONE) {
> > +                        addr += entry.addr_mask + 1;
> > +                        memory_region_notify_iommu(&node->vtd_as->iommu,
> entry);
> > +                    } else {
> > +                        addr += VTD_PAGE_SIZE;
>
> IIUC, here is the point that we found "the page is gone" (so this is
> an UNMAP invalidation), and we should do memory_region_iommu_notify()
> for the whole area with IOMMU_NONE. Then we just quit the loop since
> continuous translate()s should fail as well if the first page is
> missing.
>
> Please correct if I am wrong.
>

If I remember correctly I encounter a few cases where there was hole of
unmaped
memory in the middle of otherwise mapped pages. If I remember correctly it
was
with linux kernel 4.4, but I'm not sure.


> Thanks,
>
> -- peterx
>
Aviv B.D. Oct. 20, 2016, 7:11 p.m. UTC | #5
I will fix all those issues.
Thanks,
Aviv.

On Thu, Oct 20, 2016 at 10:11 PM, Aviv B.D. <bd.aviv@gmail.com> wrote:

>
>
> On Wed, Oct 19, 2016 at 12:33 PM, Peter Xu <peterx@redhat.com> wrote:
>
>> On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:
>> > From: "Aviv Ben-David" <bd.aviv@gmail.com>
>> >
>> > Adds a list of registered vtd_as's to intel iommu state to save
>> > iteration over each PCI device in a search of the corrosponding domain.
>> >
>> > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
>> > ---
>> >  hw/i386/intel_iommu.c          | 102 ++++++++++++++++++++++++++++++
>> ++++++++---
>> >  hw/i386/intel_iommu_internal.h |   2 +
>> >  include/hw/i386/intel_iommu.h  |   9 ++++
>> >  3 files changed, 106 insertions(+), 7 deletions(-)
>> >
>> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> > index dcf45f0..34fc1e8 100644
>> > --- a/hw/i386/intel_iommu.c
>> > +++ b/hw/i386/intel_iommu.c
>> > @@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
>> VTD_DBGBIT(CSR);
>> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>> >  #endif
>> >
>> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t
>> bus_num,
>> > +                                    uint8_t devfn, VTDContextEntry
>> *ce);
>> > +
>> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
>> val,
>> >                              uint64_t wmask, uint64_t w1cmask)
>> >  {
>> > @@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState
>> *s, hwaddr addr,
>> >      return new_val;
>> >  }
>> >
>> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn,
>> > +                           uint16_t *domain_id)
>> > +{
>> > +    VTDContextEntry ce;
>> > +    int ret_fr;
>> > +
>> > +    assert(domain_id);
>> > +
>> > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
>> > +    if (ret_fr) {
>> > +        return -1;
>> > +    }
>> > +
>> > +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
>>                     ^ one more space
>>
>> > +    return 0;
>> > +}
>> > +
>> >  /* GHashTable functions */
>> >  static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
>> >  {
>> > @@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
>> uint64_t gpa,
>> >          *reads = (*reads) && (slpte & VTD_SL_R);
>> >          *writes = (*writes) && (slpte & VTD_SL_W);
>> >          if (!(slpte & access_right_check)) {
>> > -            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
>> > -                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
>> > -                        (flags == IOMMU_WO ? "write" : "read"), gpa,
>> slpte);
>>
>> Could I ask why we are removing these lines? It can be useful if we
>> have permission issues.
>>
>
> I will return Those lines if flags & NO_FAIL == 0
>
>>
>> >              return (flags == IOMMU_RW || flags == IOMMU_WO) ?
>> >                     -VTD_FR_WRITE : -VTD_FR_READ;
>> >          }
>> > @@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
>> *s, uint8_t bus_num,
>> >      }
>> >
>> >      if (!vtd_context_entry_present(ce)) {
>> > -        VTD_DPRINTF(GENERAL,
>> > -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
>> > -                    "is not present", devfn, bus_num);
>>
>> Here as well. Any reason to remove it?
>>
>>
> Here as well...
>
>
>> >          return -VTD_FR_CONTEXT_ENTRY_P;
>> >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>> >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
>> > @@ -1065,6 +1079,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState
>> *s, uint16_t domain_id)
>> >                                  &domain_id);
>> >  }
>> >
>> > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
>> > +                                           uint16_t domain_id, hwaddr
>> addr,
>> > +                                           uint8_t am)
>> > +{
>>
>> The logic of this function looks strange to me.
>>
>> > +    IntelIOMMUNotifierNode *node;
>> > +
>> > +    QLIST_FOREACH(node, &(s->notifiers_list), next) {
>> > +        VTDAddressSpace *vtd_as = node->vtd_as;
>> > +        uint16_t vfio_domain_id;
>> > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
>> vtd_as->devfn,
>> > +                                  &vfio_domain_id);
>> > +        if (!ret && domain_id == vfio_domain_id) {
>> > +            IOMMUTLBEntry entry;
>> > +
>> > +            /* notify unmap */
>> > +            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
>>
>> First of all, if we are talking about VFIO, notifier_flag should
>> always be MAP|UNMAP. So in that case, for newly mapped entries, looks
>> like we will first send an UNMAP, then a MAP?
>>
>
> You are correct, there is no valid reason to have notifier_flag other than
> MAP|UNMAP
> at least for VFIO.
> I'm not sure if in the feature there won't be good reason to do otherwise,
> so my
> code support this scenario...
>
>
>> > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
>> %d",
>> > +                            addr, am);
>> > +                entry.target_as = &address_space_memory;
>> > +                entry.iova = addr & VTD_PAGE_MASK_4K;
>> > +                entry.translated_addr = 0;
>> > +                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K +
>> am);
>> > +                entry.perm = IOMMU_NONE;
>> > +                memory_region_notify_iommu(&node->vtd_as->iommu,
>> entry);
>> > +            }
>> > +
>> > +            /* notify map */
>> > +            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
>> > +                hwaddr original_addr = addr;
>> > +                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d",
>> addr, am);
>> > +                while (addr < original_addr + (1 << am) *
>> VTD_PAGE_SIZE) {
>> > +                    /* call to vtd_iommu_translate */
>> > +                    IOMMUTLBEntry entry = s->iommu_ops.translate(
>> > +
>>  &node->vtd_as->iommu,
>> > +                                                         addr,
>> > +
>>  IOMMU_NO_FAIL);
>> > +                    if (entry.perm != IOMMU_NONE) {
>> > +                        addr += entry.addr_mask + 1;
>> > +                        memory_region_notify_iommu(&node->vtd_as->iommu,
>> entry);
>> > +                    } else {
>> > +                        addr += VTD_PAGE_SIZE;
>>
>> IIUC, here is the point that we found "the page is gone" (so this is
>> an UNMAP invalidation), and we should do memory_region_iommu_notify()
>> for the whole area with IOMMU_NONE. Then we just quit the loop since
>> continuous translate()s should fail as well if the first page is
>> missing.
>>
>> Please correct if I am wrong.
>>
>
> If I remember correctly I encounter a few cases where there was hole of
> unmaped
> memory in the middle of otherwise mapped pages. If I remember correctly it
> was
> with linux kernel 4.4, but I'm not sure.
>
>
>> Thanks,
>>
>> -- peterx
>>
>
>
Peter Xu Oct. 21, 2016, 3:57 a.m. UTC | #6
On Thu, Oct 20, 2016 at 10:11:15PM +0300, Aviv B.D. wrote:

[...]

> > > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> > > +                                           uint16_t domain_id, hwaddr
> > addr,
> > > +                                           uint8_t am)
> > > +{
> >
> > The logic of this function looks strange to me.
> >
> > > +    IntelIOMMUNotifierNode *node;
> > > +
> > > +    QLIST_FOREACH(node, &(s->notifiers_list), next) {
> > > +        VTDAddressSpace *vtd_as = node->vtd_as;
> > > +        uint16_t vfio_domain_id;
> > > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> > vtd_as->devfn,
> > > +                                  &vfio_domain_id);
> > > +        if (!ret && domain_id == vfio_domain_id) {
> > > +            IOMMUTLBEntry entry;
> > > +
> > > +            /* notify unmap */
> > > +            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
> >
> > First of all, if we are talking about VFIO, notifier_flag should
> > always be MAP|UNMAP. So in that case, for newly mapped entries, looks
> > like we will first send an UNMAP, then a MAP?
> >
> 
> You are correct, there is no valid reason to have notifier_flag other than
> MAP|UNMAP
> at least for VFIO.

Not sure whether you know about upstream vhost work, vhost is going to
support Intel IOMMU, in that case, it will only register to UNMAP
notifier, not MAP.

> I'm not sure if in the feature there won't be good reason to do otherwise,
> so my
> code support this scenario...

My point is, I think you should not send this notify unconditionally.
IMO the logic should be simpler here, like:

    foreach (page in invalidation range) {
        IOTLBEntry entry = m->iommu_ops.translate();
        if (entry.perm != IOMMU_NONE && notifier_flag & NOTIIFER_MAP) {
            /* this is map, user requested MAP flag */
            memory_region_iommu_notify(entry);
        } else if (entry.perm == IOMMU_NONE && notifier_flag &
                NOTIFIER_UNMAP) {
            /* this is unmap, user requested UNMAP */
            entry = ... /* setup the entry */
            memory_region_iommu_notify(entry);
        }
    }

This is to follow your logic. I don't know whether this is efficient
enough, maybe good for the first version. The problem is, when you
call translate(), you will need to go over the page every time from
root dir. A faster way may be: provide a function to walk specific
address range. If you are going to implement the replay logic that
Alex/David has mentioned, maybe that will help too (walk over the
whole 64bit range).

> 
> 
> > > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
> > > +                            addr, am);
> > > +                entry.target_as = &address_space_memory;
> > > +                entry.iova = addr & VTD_PAGE_MASK_4K;
> > > +                entry.translated_addr = 0;
> > > +                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K +
> > am);
> > > +                entry.perm = IOMMU_NONE;
> > > +                memory_region_notify_iommu(&node->vtd_as->iommu,
> > entry);
> > > +            }
> > > +
> > > +            /* notify map */
> > > +            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> > > +                hwaddr original_addr = addr;
> > > +                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d",
> > addr, am);
> > > +                while (addr < original_addr + (1 << am) *
> > VTD_PAGE_SIZE) {
> > > +                    /* call to vtd_iommu_translate */
> > > +                    IOMMUTLBEntry entry = s->iommu_ops.translate(
> > > +
> >  &node->vtd_as->iommu,
> > > +                                                         addr,
> > > +                                                         IOMMU_NO_FAIL);
> > > +                    if (entry.perm != IOMMU_NONE) {
> > > +                        addr += entry.addr_mask + 1;
> > > +                        memory_region_notify_iommu(&node->vtd_as->iommu,
> > entry);
> > > +                    } else {
> > > +                        addr += VTD_PAGE_SIZE;
> >
> > IIUC, here is the point that we found "the page is gone" (so this is
> > an UNMAP invalidation), and we should do memory_region_iommu_notify()
> > for the whole area with IOMMU_NONE. Then we just quit the loop since
> > continuous translate()s should fail as well if the first page is
> > missing.
> >
> > Please correct if I am wrong.
> >
> 
> If I remember correctly I encounter a few cases where there was hole of
> unmaped
> memory in the middle of otherwise mapped pages. If I remember correctly it
> was
> with linux kernel 4.4, but I'm not sure.

I see. Thanks.

-- peterx
Aviv B.D. Oct. 24, 2016, 7:53 a.m. UTC | #7
On Fri, Oct 21, 2016 at 6:57 AM, Peter Xu <peterx@redhat.com> wrote:

> On Thu, Oct 20, 2016 at 10:11:15PM +0300, Aviv B.D. wrote:
>
> [...]
>
> > > > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> > > > +                                           uint16_t domain_id,
> hwaddr
> > > addr,
> > > > +                                           uint8_t am)
> > > > +{
> > >
> > > The logic of this function looks strange to me.
> > >
> > > > +    IntelIOMMUNotifierNode *node;
> > > > +
> > > > +    QLIST_FOREACH(node, &(s->notifiers_list), next) {
> > > > +        VTDAddressSpace *vtd_as = node->vtd_as;
> > > > +        uint16_t vfio_domain_id;
> > > > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> > > vtd_as->devfn,
> > > > +                                  &vfio_domain_id);
> > > > +        if (!ret && domain_id == vfio_domain_id) {
> > > > +            IOMMUTLBEntry entry;
> > > > +
> > > > +            /* notify unmap */
> > > > +            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
> > >
> > > First of all, if we are talking about VFIO, notifier_flag should
> > > always be MAP|UNMAP. So in that case, for newly mapped entries, looks
> > > like we will first send an UNMAP, then a MAP?
> > >
> >
> > You are correct, there is no valid reason to have notifier_flag other
> than
> > MAP|UNMAP
> > at least for VFIO.
>
> Not sure whether you know about upstream vhost work, vhost is going to
> support Intel IOMMU, in that case, it will only register to UNMAP
> notifier, not MAP.
>
> > I'm not sure if in the feature there won't be good reason to do
> otherwise,
> > so my
> > code support this scenario...
>
> My point is, I think you should not send this notify unconditionally.
> IMO the logic should be simpler here, like:
>
>     foreach (page in invalidation range) {
>         IOTLBEntry entry = m->iommu_ops.translate();
>         if (entry.perm != IOMMU_NONE && notifier_flag & NOTIIFER_MAP) {
>             /* this is map, user requested MAP flag */
>             memory_region_iommu_notify(entry);
>         } else if (entry.perm == IOMMU_NONE && notifier_flag &
>                 NOTIFIER_UNMAP) {
>             /* this is unmap, user requested UNMAP */
>             entry = ... /* setup the entry */
>             memory_region_iommu_notify(entry);
>         }
>     }
>

This was my first algorithm, but VFIO do not support remapping of mapped
page.
Before each MAP operation in VFIO one must do unmap, and therefore I'm
sending
the unmap notifications blindly before.
I can rearrange my code closer to your suggestion.


>
> This is to follow your logic. I don't know whether this is efficient
> enough, maybe good for the first version. The problem is, when you
> call translate(), you will need to go over the page every time from
> root dir. A faster way may be: provide a function to walk specific
> address range. If you are going to implement the replay logic that
> Alex/David has mentioned, maybe that will help too (walk over the
> whole 64bit range).
>
> Interesting idea, but I prefer to add it in separate patch set after this
one committed, if it's OK.


> >
> >
> > > > +                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d",
> > > > +                            addr, am);
> > > > +                entry.target_as = &address_space_memory;
> > > > +                entry.iova = addr & VTD_PAGE_MASK_4K;
> > > > +                entry.translated_addr = 0;
> > > > +                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K
> +
> > > am);
> > > > +                entry.perm = IOMMU_NONE;
> > > > +                memory_region_notify_iommu(&node->vtd_as->iommu,
> > > entry);
> > > > +            }
> > > > +
> > > > +            /* notify map */
> > > > +            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> > > > +                hwaddr original_addr = addr;
> > > > +                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask
> %d",
> > > addr, am);
> > > > +                while (addr < original_addr + (1 << am) *
> > > VTD_PAGE_SIZE) {
> > > > +                    /* call to vtd_iommu_translate */
> > > > +                    IOMMUTLBEntry entry = s->iommu_ops.translate(
> > > > +
> > >  &node->vtd_as->iommu,
> > > > +                                                         addr,
> > > > +
>  IOMMU_NO_FAIL);
> > > > +                    if (entry.perm != IOMMU_NONE) {
> > > > +                        addr += entry.addr_mask + 1;
> > > > +                        memory_region_notify_iommu(&
> node->vtd_as->iommu,
> > > entry);
> > > > +                    } else {
> > > > +                        addr += VTD_PAGE_SIZE;
> > >
> > > IIUC, here is the point that we found "the page is gone" (so this is
> > > an UNMAP invalidation), and we should do memory_region_iommu_notify()
> > > for the whole area with IOMMU_NONE. Then we just quit the loop since
> > > continuous translate()s should fail as well if the first page is
> > > missing.
> > >
> > > Please correct if I am wrong.
> > >
> >
> > If I remember correctly I encounter a few cases where there was hole of
> > unmaped
> > memory in the middle of otherwise mapped pages. If I remember correctly
> it
> > was
> > with linux kernel 4.4, but I'm not sure.
>
> I see. Thanks.
>
> -- peterx
>

Best regards,
Aviv.
Peter Xu Oct. 24, 2016, 8:02 a.m. UTC | #8
On Mon, Oct 24, 2016 at 10:53:01AM +0300, Aviv B.D. wrote:

[...]

> This was my first algorithm, but VFIO do not support remapping of mapped
> page.
> Before each MAP operation in VFIO one must do unmap, and therefore I'm
> sending
> the unmap notifications blindly before.
> I can rearrange my code closer to your suggestion.

If so, I would suggest we solve the real problem first: we should not
notify VFIO twice on map(), but only once. IMO either Alex's or
David's suggestion (in the other mail) is a good start.

> 
> 
> >
> > This is to follow your logic. I don't know whether this is efficient
> > enough, maybe good for the first version. The problem is, when you
> > call translate(), you will need to go over the page every time from
> > root dir. A faster way may be: provide a function to walk specific
> > address range. If you are going to implement the replay logic that
> > Alex/David has mentioned, maybe that will help too (walk over the
> > whole 64bit range).
> >
> > Interesting idea, but I prefer to add it in separate patch set after this
> one committed, if it's OK.

Sure.

-- peterx
Aviv B.D. Oct. 25, 2016, 10:07 a.m. UTC | #9
On Mon, Oct 24, 2016 at 11:02 AM, Peter Xu <peterx@redhat.com> wrote:

> On Mon, Oct 24, 2016 at 10:53:01AM +0300, Aviv B.D. wrote:
>
> [...]
>
> > This was my first algorithm, but VFIO do not support remapping of mapped
> > page.
> > Before each MAP operation in VFIO one must do unmap, and therefore I'm
> > sending
> > the unmap notifications blindly before.
> > I can rearrange my code closer to your suggestion.
>
> If so, I would suggest we solve the real problem first: we should not
> notify VFIO twice on map(), but only once. IMO either Alex's or
> David's suggestion (in the other mail) is a good start.
>
> OK. I will publish a new patch set that notify only once per page.
I prefer David's suggestion - adding the range information to the notifier
struct
and check it from inside the VFIO notification function.


> >
> >
> > >
> > > This is to follow your logic. I don't know whether this is efficient
> > > enough, maybe good for the first version. The problem is, when you
> > > call translate(), you will need to go over the page every time from
> > > root dir. A faster way may be: provide a function to walk specific
> > > address range. If you are going to implement the replay logic that
> > > Alex/David has mentioned, maybe that will help too (walk over the
> > > whole 64bit range).
> > >
> > > Interesting idea, but I prefer to add it in separate patch set after
> this
> > one committed, if it's OK.
>
> Sure.
>
> -- peterx
>

Thanks,
Aviv.
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index dcf45f0..34fc1e8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -51,6 +51,9 @@  static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+                                    uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                             uint64_t wmask, uint64_t w1cmask)
 {
@@ -142,6 +145,23 @@  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
     return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
+                           uint16_t *domain_id)
+{
+    VTDContextEntry ce;
+    int ret_fr;
+
+    assert(domain_id);
+
+    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (ret_fr) {
+        return -1;
+    }
+
+    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+    return 0;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -683,9 +703,6 @@  static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
         *reads = (*reads) && (slpte & VTD_SL_R);
         *writes = (*writes) && (slpte & VTD_SL_W);
         if (!(slpte & access_right_check)) {
-            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
-                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-                        (flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
             return (flags == IOMMU_RW || flags == IOMMU_WO) ?
                    -VTD_FR_WRITE : -VTD_FR_READ;
         }
@@ -734,9 +751,6 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     }
 
     if (!vtd_context_entry_present(ce)) {
-        VTD_DPRINTF(GENERAL,
-                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-                    "is not present", devfn, bus_num);
         return -VTD_FR_CONTEXT_ENTRY_P;
     } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
                (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1065,6 +1079,55 @@  static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
                                 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+                                           uint16_t domain_id, hwaddr addr,
+                                           uint8_t am)
+{
+    IntelIOMMUNotifierNode *node;
+
+    QLIST_FOREACH(node, &(s->notifiers_list), next) {
+        VTDAddressSpace *vtd_as = node->vtd_as;
+        uint16_t vfio_domain_id;
+        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
+                                  &vfio_domain_id);
+        if (!ret && domain_id == vfio_domain_id) {
+            IOMMUTLBEntry entry;
+
+            /* notify unmap */
+            if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
+                VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
+                            addr, am);
+                entry.target_as = &address_space_memory;
+                entry.iova = addr & VTD_PAGE_MASK_4K;
+                entry.translated_addr = 0;
+                entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+                entry.perm = IOMMU_NONE;
+                memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+            }
+
+            /* notify map */
+            if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
+                hwaddr original_addr = addr;
+                VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
+                while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
+                    /* call to vtd_iommu_translate */
+                    IOMMUTLBEntry entry = s->iommu_ops.translate(
+                                                         &node->vtd_as->iommu,
+                                                         addr,
+                                                         IOMMU_NO_FAIL);
+                    if (entry.perm != IOMMU_NONE) {
+                        addr += entry.addr_mask + 1;
+                        memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+                    } else {
+                        addr += VTD_PAGE_SIZE;
+                    }
+                }
+            }
+        }
+    }
+}
+
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
                                       hwaddr addr, uint8_t am)
 {
@@ -1075,6 +1138,8 @@  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+
+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
 /* Flush IOTLB
@@ -2000,8 +2065,10 @@  static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
                                           IOMMUNotifierFlag new)
 {
     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    IntelIOMMUNotifierNode *node = NULL;
 
-    if (new & IOMMU_NOTIFIER_MAP) {
+    if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
         error_report("Device at bus %s addr %02x.%d requires iommu "
                      "notifier which is currently not supported by "
                      "intel-iommu emulation",
@@ -2009,6 +2076,27 @@  static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
                      PCI_FUNC(vtd_as->devfn));
         exit(1);
     }
+
+    /* Add new ndoe if no mapping was exising before this call */
+    if (old == IOMMU_NOTIFIER_NONE) {
+        node = g_malloc0(sizeof(*node));
+        node->vtd_as = vtd_as;
+        node->notifier_flag = new;
+        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
+        return;
+    }
+
+    /* update notifier node with new flags */
+    QLIST_FOREACH(node, &s->notifiers_list, next) {
+        if (node->vtd_as == vtd_as) {
+            if (new == IOMMU_NOTIFIER_NONE) {
+                QLIST_REMOVE(node, next);
+            } else {
+                node->notifier_flag = new;
+            }
+            return;
+        }
+    }
 }
 
 static const VMStateDescription vtd_vmstate = {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 35d9f3a..96df4ae 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -381,6 +381,8 @@  typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
 #define VTD_PAGE_SHIFT_1G           30
 #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
 
+#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
+
 struct VTDRootEntry {
     uint64_t val;
     uint64_t rsvd;
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7a94f16..c160706 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -63,6 +63,7 @@  typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDIrq VTDIrq;
 typedef struct VTD_MSIMessage VTD_MSIMessage;
+typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -248,6 +249,12 @@  struct VTD_MSIMessage {
 /* When IR is enabled, all MSI/MSI-X data bits should be zero */
 #define VTD_IR_MSI_DATA          (0)
 
+struct IntelIOMMUNotifierNode {
+    VTDAddressSpace *vtd_as;
+    IOMMUNotifierFlag notifier_flag;
+    QLIST_ENTRY(IntelIOMMUNotifierNode) next;
+};
+
 /* The iommu (DMAR) device state struct */
 struct IntelIOMMUState {
     X86IOMMUState x86_iommu;
@@ -285,6 +292,8 @@  struct IntelIOMMUState {
     MemoryRegionIOMMUOps iommu_ops;
     GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
     VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
+    /* list of registered notifiers */
+    QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list;
 
     /* interrupt remapping */
     bool intr_enabled;              /* Whether guest enabled IR */