diff mbox

[RFC,v3,14/14] intel_iommu: enable vfio devices

Message ID 1484276800-26814-15-git-send-email-peterx@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Peter Xu Jan. 13, 2017, 3:06 a.m. UTC
This patch is based on Aviv Ben-David (<bd.aviv@gmail.com>)'s patch
upstream:

  "IOMMU: enable intel_iommu map and unmap notifiers"
  https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01453.html

However, I removed/fixed some content and added my own code.

Instead of translate() every page for iotlb invalidations (which is
slower), we walk the pages when needed and notify in a hook function.

This patch enables vfio devices for VT-d emulation.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c         | 68 +++++++++++++++++++++++++++++++++++++------
 include/hw/i386/intel_iommu.h |  8 +++++
 2 files changed, 67 insertions(+), 9 deletions(-)

Comments

Jason Wang Jan. 16, 2017, 6:30 a.m. UTC | #1
On 2017年01月13日 11:06, Peter Xu wrote:
> This patch is based on Aviv Ben-David (<bd.aviv@gmail.com>)'s patch
> upstream:
>
>    "IOMMU: enable intel_iommu map and unmap notifiers"
>    https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01453.html
>
> However I removed/fixed some content, and added my own codes.
>
> Instead of translate() every page for iotlb invalidations (which is
> slower), we walk the pages when needed and notify in a hook function.
>
> This patch enables vfio devices for VT-d emulation.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   hw/i386/intel_iommu.c         | 68 +++++++++++++++++++++++++++++++++++++------
>   include/hw/i386/intel_iommu.h |  8 +++++
>   2 files changed, 67 insertions(+), 9 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 2596f11..104200b 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -839,7 +839,8 @@ next:
>    * @private: private data for the hook function
>    */
>   static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
> -                         vtd_page_walk_hook hook_fn, void *private)
> +                         vtd_page_walk_hook hook_fn, void *private,
> +                         bool notify_unmap)
>   {
>       dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
>       uint32_t level = vtd_get_level_from_context_entry(ce);
> @@ -858,7 +859,7 @@ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
>       trace_vtd_page_walk(ce->hi, ce->lo, start, end);
>   
>       return vtd_page_walk_level(addr, start, end, hook_fn, private,
> -                               level, true, true, NULL, false);
> +                               level, true, true, NULL, notify_unmap);
>   }
>   
>   /* Map a device to its corresponding domain (context-entry) */
> @@ -1212,6 +1213,34 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>                                   &domain_id);
>   }
>   
> +static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
> +                                           void *private)
> +{
> +    memory_region_notify_iommu((MemoryRegion *)private, *entry);
> +    return 0;
> +}
> +
> +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> +                                           uint16_t domain_id, hwaddr addr,
> +                                           uint8_t am)
> +{
> +    IntelIOMMUNotifierNode *node;
> +    VTDContextEntry ce;
> +    int ret;
> +
> +    QLIST_FOREACH(node, &(s->notifiers_list), next) {
> +        VTDAddressSpace *vtd_as = node->vtd_as;
> +        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
> +                                       vtd_as->devfn, &ce);
> +        if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
> +            vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
> +                          vtd_page_invalidate_notify_hook,
> +                          (void *)&vtd_as->iommu, true);
> +        }
> +    }
> +}
> +
> +
>   static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>                                         hwaddr addr, uint8_t am)
>   {
> @@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>       info.addr = addr;
>       info.mask = ~((1 << am) - 1);
>       g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);

Is the case of GLOBAL or DSI flush missed, or we don't care it at all?

Thanks

>   }
>   
>   /* Flush IOTLB
> @@ -2244,15 +2274,34 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
>                                             IOMMUNotifierFlag new)
>   {
>       VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    IntelIOMMUNotifierNode *node = NULL;
> +    IntelIOMMUNotifierNode *next_node = NULL;
>   
> -    if (new & IOMMU_NOTIFIER_MAP) {
> -        error_report("Device at bus %s addr %02x.%d requires iommu "
> -                     "notifier which is currently not supported by "
> -                     "intel-iommu emulation",
> -                     vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
> -                     PCI_FUNC(vtd_as->devfn));
> +    if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
> +        error_report("We need to set cache_mode=1 for intel-iommu to enable "
> +                     "device assignment with IOMMU protection.");
>           exit(1);
>       }
> +
> +    /* Add new ndoe if no mapping was exising before this call */

"node"?

> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        node = g_malloc0(sizeof(*node));
> +        node->vtd_as = vtd_as;
> +        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
> +        return;
> +    }
> +
> +    /* update notifier node with new flags */
> +    QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
> +        if (node->vtd_as == vtd_as) {
> +            if (new == IOMMU_NOTIFIER_NONE) {
> +                QLIST_REMOVE(node, next);
> +                g_free(node);
> +            }
> +            return;
> +        }
> +    }
>   }
>   
>   static const VMStateDescription vtd_vmstate = {
> @@ -2689,7 +2738,7 @@ static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
>            */
>           trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
>                                     PCI_FUNC(vtd_as->devfn), ce.hi, ce.lo);
> -        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n);
> +        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n, false);
>       } else {
>           trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
>                                       PCI_FUNC(vtd_as->devfn));
> @@ -2871,6 +2920,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
>           return;
>       }
>   
> +    QLIST_INIT(&s->notifiers_list);
>       memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
>       memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
>                             "intel_iommu", DMAR_REG_SIZE);
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 9c3f6c0..832cfc9 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -63,6 +63,7 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
>   typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
>   typedef struct VTDIrq VTDIrq;
>   typedef struct VTD_MSIMessage VTD_MSIMessage;
> +typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode;
>   
>   /* Context-Entry */
>   struct VTDContextEntry {
> @@ -249,6 +250,11 @@ struct VTD_MSIMessage {
>   /* When IR is enabled, all MSI/MSI-X data bits should be zero */
>   #define VTD_IR_MSI_DATA          (0)
>   
> +struct IntelIOMMUNotifierNode {
> +    VTDAddressSpace *vtd_as;
> +    QLIST_ENTRY(IntelIOMMUNotifierNode) next;
> +};
> +
>   /* The iommu (DMAR) device state struct */
>   struct IntelIOMMUState {
>       X86IOMMUState x86_iommu;
> @@ -286,6 +292,8 @@ struct IntelIOMMUState {
>       MemoryRegionIOMMUOps iommu_ops;
>       GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
>       VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
> +    /* list of registered notifiers */
> +    QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list;
>   
>       /* interrupt remapping */
>       bool intr_enabled;              /* Whether guest enabled IR */
Peter Xu Jan. 16, 2017, 9:18 a.m. UTC | #2
On Mon, Jan 16, 2017 at 02:30:20PM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月13日 11:06, Peter Xu wrote:
> >This patch is based on Aviv Ben-David (<bd.aviv@gmail.com>)'s patch
> >upstream:
> >
> >   "IOMMU: enable intel_iommu map and unmap notifiers"
> >   https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01453.html
> >
> >However I removed/fixed some content, and added my own codes.
> >
> >Instead of translate() every page for iotlb invalidations (which is
> >slower), we walk the pages when needed and notify in a hook function.
> >
> >This patch enables vfio devices for VT-d emulation.
> >
> >Signed-off-by: Peter Xu <peterx@redhat.com>
> >---
> >  hw/i386/intel_iommu.c         | 68 +++++++++++++++++++++++++++++++++++++------
> >  include/hw/i386/intel_iommu.h |  8 +++++
> >  2 files changed, 67 insertions(+), 9 deletions(-)
> >
> >diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >index 2596f11..104200b 100644
> >--- a/hw/i386/intel_iommu.c
> >+++ b/hw/i386/intel_iommu.c
> >@@ -839,7 +839,8 @@ next:
> >   * @private: private data for the hook function
> >   */
> >  static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
> >-                         vtd_page_walk_hook hook_fn, void *private)
> >+                         vtd_page_walk_hook hook_fn, void *private,
> >+                         bool notify_unmap)
> >  {
> >      dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
> >      uint32_t level = vtd_get_level_from_context_entry(ce);
> >@@ -858,7 +859,7 @@ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
> >      trace_vtd_page_walk(ce->hi, ce->lo, start, end);
> >      return vtd_page_walk_level(addr, start, end, hook_fn, private,
> >-                               level, true, true, NULL, false);
> >+                               level, true, true, NULL, notify_unmap);
> >  }
> >  /* Map a device to its corresponding domain (context-entry) */
> >@@ -1212,6 +1213,34 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> >                                  &domain_id);
> >  }
> >+static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
> >+                                           void *private)
> >+{
> >+    memory_region_notify_iommu((MemoryRegion *)private, *entry);
> >+    return 0;
> >+}
> >+
> >+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> >+                                           uint16_t domain_id, hwaddr addr,
> >+                                           uint8_t am)
> >+{
> >+    IntelIOMMUNotifierNode *node;
> >+    VTDContextEntry ce;
> >+    int ret;
> >+
> >+    QLIST_FOREACH(node, &(s->notifiers_list), next) {
> >+        VTDAddressSpace *vtd_as = node->vtd_as;
> >+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
> >+                                       vtd_as->devfn, &ce);
> >+        if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
> >+            vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
> >+                          vtd_page_invalidate_notify_hook,
> >+                          (void *)&vtd_as->iommu, true);
> >+        }
> >+    }
> >+}
> >+
> >+
> >  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >                                        hwaddr addr, uint8_t am)
> >  {
> >@@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >      info.addr = addr;
> >      info.mask = ~((1 << am) - 1);
> >      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> >+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
> 
> Is the case of GLOBAL or DSI flush missed, or we don't care it at all?

IMHO we don't. For device assignment, since we are having CM=1 here,
we should have explicit page invalidations even if guest sends
global/domain invalidations.

Thanks,

-- peterx
Peter Xu Jan. 16, 2017, 9:20 a.m. UTC | #3
On Mon, Jan 16, 2017 at 02:30:20PM +0800, Jason Wang wrote:

[...]

> >  }
> >  /* Flush IOTLB
> >@@ -2244,15 +2274,34 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
> >                                            IOMMUNotifierFlag new)
> >  {
> >      VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
> >+    IntelIOMMUState *s = vtd_as->iommu_state;
> >+    IntelIOMMUNotifierNode *node = NULL;
> >+    IntelIOMMUNotifierNode *next_node = NULL;
> >-    if (new & IOMMU_NOTIFIER_MAP) {
> >-        error_report("Device at bus %s addr %02x.%d requires iommu "
> >-                     "notifier which is currently not supported by "
> >-                     "intel-iommu emulation",
> >-                     vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
> >-                     PCI_FUNC(vtd_as->devfn));
> >+    if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
> >+        error_report("We need to set cache_mode=1 for intel-iommu to enable "
> >+                     "device assignment with IOMMU protection.");
> >          exit(1);
> >      }
> >+
> >+    /* Add new ndoe if no mapping was exising before this call */
> 
> "node"?

Sorry I missed this one - let me just remove the above comment since it
just describes what the code does below.

Thanks,

> 
> >+    if (old == IOMMU_NOTIFIER_NONE) {
> >+        node = g_malloc0(sizeof(*node));
> >+        node->vtd_as = vtd_as;
> >+        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
> >+        return;
> >+    }

-- peterx
Jason Wang Jan. 16, 2017, 9:54 a.m. UTC | #4
On 2017年01月16日 17:18, Peter Xu wrote:
>>>   static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>                                         hwaddr addr, uint8_t am)
>>>   {
>>> @@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>       info.addr = addr;
>>>       info.mask = ~((1 << am) - 1);
>>>       g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
>>> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
>> Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
> IMHO we don't. For device assignment, since we are having CM=1 here,
> we should have explicit page invalidations even if guest sends
> global/domain invalidations.
>
> Thanks,
>
> -- peterx

Is this spec required? Btw, it looks to me that both DSI and GLOBAL are 
indeed explicit flush.

Just have a quick go through on driver codes and find this something 
interesting in intel_iommu_flush_iotlb_psi():

...
     /*
      * Fallback to domain selective flush if no PSI support or the size is
      * too big.
      * PSI requires page size to be 2 ^ x, and the base address is 
naturally
      * aligned to the size
      */
     if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
         iommu->flush.flush_iotlb(iommu, did, 0, 0,
                         DMA_TLB_DSI_FLUSH);
     else
         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                         DMA_TLB_PSI_FLUSH);
...

It looks like DSI_FLUSH is possible even for CM on.

And in flush_unmaps():

...
         /* In caching mode, global flushes turn emulation expensive */
         if (!cap_caching_mode(iommu->cap))
             iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                      DMA_TLB_GLOBAL_FLUSH);
...

If I understand the comments correctly, GLOBAL is ok for CM too (though 
the code did not do it for performance reason).

Thanks
Peter Xu Jan. 17, 2017, 2:45 p.m. UTC | #5
On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月16日 17:18, Peter Xu wrote:
> >>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >>>                                        hwaddr addr, uint8_t am)
> >>>  {
> >>>@@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >>>      info.addr = addr;
> >>>      info.mask = ~((1 << am) - 1);
> >>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> >>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
> >>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
> >IMHO we don't. For device assignment, since we are having CM=1 here,
> >we should have explicit page invalidations even if guest sends
> >global/domain invalidations.
> >
> >Thanks,
> >
> >-- peterx
> 
> Is this spec required?

I think not. IMO the spec is very coarse grained on describing cache
mode...

> Btw, it looks to me that both DSI and GLOBAL are
> indeed explicit flush.

Actually when cache mode is on, it is unclear to me on how we should
treat domain/global invalidations, at least from the spec (as
mentioned earlier). My understanding is that they are not "explicit
flushes", which IMHO should only mean page selective IOTLB
invalidations.

> 
> Just have a quick go through on driver codes and find this something
> interesting in intel_iommu_flush_iotlb_psi():
> 
> ...
>     /*
>      * Fallback to domain selective flush if no PSI support or the size is
>      * too big.
>      * PSI requires page size to be 2 ^ x, and the base address is naturally
>      * aligned to the size
>      */
>     if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
>         iommu->flush.flush_iotlb(iommu, did, 0, 0,
>                         DMA_TLB_DSI_FLUSH);
>     else
>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
>                         DMA_TLB_PSI_FLUSH);
> ...

I think this is interesting... and I doubt its correctness while with
cache mode enabled.

If so (sending domain invalidation instead of a big range of page
invalidations), how should we capture which pages are unmapped in
emulated IOMMU?

> 
> It looks like DSI_FLUSH is possible even for CM on.
> 
> And in flush_unmaps():
> 
> ...
>         /* In caching mode, global flushes turn emulation expensive */
>         if (!cap_caching_mode(iommu->cap))
>             iommu->flush.flush_iotlb(iommu, 0, 0, 0,
>                      DMA_TLB_GLOBAL_FLUSH);
> ...
> 
> If I understand the comments correctly, GLOBAL is ok for CM too (though the
> code did not do it for performance reason).

I think it should be okay to send global flush for CM, but not sure
whether we should notify anything when we receive it. Hmm, anyway, I
think I need some more reading to make sure I understand the whole
thing correctly. :)

For example, when I see this commit:

commit 78d5f0f500e6ba8f6cfd0673475ff4d941d705a2
Author: Nadav Amit <nadav.amit@gmail.com>
Date:   Thu Apr 8 23:00:41 2010 +0300

    intel-iommu: Avoid global flushes with caching mode.
    
    While it may be efficient on real hardware, emulation of global
    invalidations is very expensive as all shadow entries must be examined.
    This patch changes the behaviour when caching mode is enabled (which is
    the case when IOMMU emulation takes place). In this case, page specific
    invalidation is used instead.

Before I ask someone else besides qemu-devel, I am curious about
whether there is existing VT-d emulation code (outside QEMU, of
course) so that I can have a reference? Quick search didn't answer me.

Thanks,

-- peterx
Jason Wang Jan. 18, 2017, 3:10 a.m. UTC | #6
On 2017年01月17日 22:45, Peter Xu wrote:
> On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
>>
>> On 2017年01月16日 17:18, Peter Xu wrote:
>>>>>   static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>>>                                         hwaddr addr, uint8_t am)
>>>>>   {
>>>>> @@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>>>       info.addr = addr;
>>>>>       info.mask = ~((1 << am) - 1);
>>>>>       g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
>>>>> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
>>>> Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
>>> IMHO we don't. For device assignment, since we are having CM=1 here,
>>> we should have explicit page invalidations even if guest sends
>>> global/domain invalidations.
>>>
>>> Thanks,
>>>
>>> -- peterx
>> Is this spec required?
> I think not. IMO the spec is very coarse grained on describing cache
> mode...
>
>> Btw, it looks to me that both DSI and GLOBAL are
>> indeed explicit flush.
> Actually when cache mode is on, it is unclear to me on how we should
> treat domain/global invalidations, at least from the spec (as
> mentioned earlier). My understanding is that they are not "explicit
> flushes", which IMHO should only mean page selective IOTLB
> invalidations.

Probably not, at least from the view of performance. DSI and global 
should be more efficient in some cases.

>
>> Just have a quick go through on driver codes and find this something
>> interesting in intel_iommu_flush_iotlb_psi():
>>
>> ...
>>      /*
>>       * Fallback to domain selective flush if no PSI support or the size is
>>       * too big.
>>       * PSI requires page size to be 2 ^ x, and the base address is naturally
>>       * aligned to the size
>>       */
>>      if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
>>          iommu->flush.flush_iotlb(iommu, did, 0, 0,
>>                          DMA_TLB_DSI_FLUSH);
>>      else
>>          iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
>>                          DMA_TLB_PSI_FLUSH);
>> ...
> I think this is interesting... and I doubt its correctness while with
> cache mode enabled.
>
> If so (sending domain invalidation instead of a big range of page
> invalidations), how should we capture which pages are unmapped in
> emulated IOMMU?

We don't need to track individual pages here, since all pages for a 
specific domain were unmapped I believe?

>
>> It looks like DSI_FLUSH is possible even for CM on.
>>
>> And in flush_unmaps():
>>
>> ...
>>          /* In caching mode, global flushes turn emulation expensive */
>>          if (!cap_caching_mode(iommu->cap))
>>              iommu->flush.flush_iotlb(iommu, 0, 0, 0,
>>                       DMA_TLB_GLOBAL_FLUSH);
>> ...
>>
>> If I understand the comments correctly, GLOBAL is ok for CM too (though the
>> code did not do it for performance reason).
> I think it should be okay to send global flush for CM, but not sure
> whether we should notify anything when we receive it. Hmm, anyway, I
> think I need some more reading to make sure I understand the whole
> thing correctly. :)
>
> For example, when I see this commit:
>
> commit 78d5f0f500e6ba8f6cfd0673475ff4d941d705a2
> Author: Nadav Amit <nadav.amit@gmail.com>
> Date:   Thu Apr 8 23:00:41 2010 +0300
>
>      intel-iommu: Avoid global flushes with caching mode.
>      
>      While it may be efficient on real hardware, emulation of global
>      invalidations is very expensive as all shadow entries must be examined.
>      This patch changes the behaviour when caching mode is enabled (which is
>      the case when IOMMU emulation takes place). In this case, page specific
>      invalidation is used instead.
>
> Before I ask someone else besides qemu-devel, I am curious about
> whether there is existing VT-d emulation code (outside QEMU, of
> course) so that I can have a reference?

Yes, it has. The author of this patch - Nadav has done lots of research 
on emulated IOMMU. See following papers:

https://hal.inria.fr/inria-00493752/document
http://www.cse.iitd.ac.in/~sbansal/csl862-virt/readings/vIOMMU.pdf

Thanks

> Quick search didn't answer me.
>
> Thanks,
>
> -- peterx
Peter Xu Jan. 18, 2017, 8:11 a.m. UTC | #7
On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月17日 22:45, Peter Xu wrote:
> >On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
> >>
> >>On 2017年01月16日 17:18, Peter Xu wrote:
> >>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >>>>>                                        hwaddr addr, uint8_t am)
> >>>>>  {
> >>>>>@@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >>>>>      info.addr = addr;
> >>>>>      info.mask = ~((1 << am) - 1);
> >>>>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> >>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
> >>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
> >>>IMHO we don't. For device assignment, since we are having CM=1 here,
> >>>we should have explicit page invalidations even if guest sends
> >>>global/domain invalidations.
> >>>
> >>>Thanks,
> >>>
> >>>-- peterx
> >>Is this spec required?
> >I think not. IMO the spec is very coarse grained on describing cache
> >mode...
> >
> >>Btw, it looks to me that both DSI and GLOBAL are
> >>indeed explicit flush.
> >Actually when cache mode is on, it is unclear to me on how we should
> >treat domain/global invalidations, at least from the spec (as
> >mentioned earlier). My understanding is that they are not "explicit
> >flushes", which IMHO should only mean page selective IOTLB
> >invalidations.
> 
> Probably not, at least from the view of performance. DSI and global should
> be more efficient in some cases.

I agree with you that DSI/GLOBAL flushes are more efficient in some
way. But IMHO that does not mean these invalidations are "explicit
invalidations", and I suspect whether cache mode has to coop with it.

But here I should add one more thing besides PSI - context entry
invalidation should be one of "the explicit invalidations" as well,
which we need to handle just like PSI when cache mode is on.

> 
> >
> >>Just have a quick go through on driver codes and find this something
> >>interesting in intel_iommu_flush_iotlb_psi():
> >>
> >>...
> >>     /*
> >>      * Fallback to domain selective flush if no PSI support or the size is
> >>      * too big.
> >>      * PSI requires page size to be 2 ^ x, and the base address is naturally
> >>      * aligned to the size
> >>      */
> >>     if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
> >>         iommu->flush.flush_iotlb(iommu, did, 0, 0,
> >>                         DMA_TLB_DSI_FLUSH);
> >>     else
> >>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
> >>                         DMA_TLB_PSI_FLUSH);
> >>...
> >I think this is interesting... and I doubt its correctness while with
> >cache mode enabled.
> >
> >If so (sending domain invalidation instead of a big range of page
> >invalidations), how should we capture which pages are unmapped in
> >emulated IOMMU?
> 
> We don't need to track individual pages here, since all pages for a specific
> domain were unmapped I believe?

IMHO this might not be the correct behavior.

If we receive one domain specific invalidation, I agree that we should
invalidate the IOTLB cache for all the devices inside the domain.
However, when cache mode is on, we should be depending on the PSIs to
unmap each page (unless we want to unmap the whole address space, in
this case it's very possible that the guest is just unmapping a range,
not the entire space). If we convert several PSIs into one big DSI,
IMHO we will leave those pages mapped/unmapped while we should
unmap/map them.

> 
> >
> >>It looks like DSI_FLUSH is possible even for CM on.
> >>
> >>And in flush_unmaps():
> >>
> >>...
> >>         /* In caching mode, global flushes turn emulation expensive */
> >>         if (!cap_caching_mode(iommu->cap))
> >>             iommu->flush.flush_iotlb(iommu, 0, 0, 0,
> >>                      DMA_TLB_GLOBAL_FLUSH);
> >>...
> >>
> >>If I understand the comments correctly, GLOBAL is ok for CM too (though the
> >>code did not do it for performance reason).
> >I think it should be okay to send global flush for CM, but not sure
> >whether we should notify anything when we receive it. Hmm, anyway, I
> >think I need some more reading to make sure I understand the whole
> >thing correctly. :)
> >
> >For example, when I see this commit:
> >
> >commit 78d5f0f500e6ba8f6cfd0673475ff4d941d705a2
> >Author: Nadav Amit <nadav.amit@gmail.com>
> >Date:   Thu Apr 8 23:00:41 2010 +0300
> >
> >     intel-iommu: Avoid global flushes with caching mode.
> >     While it may be efficient on real hardware, emulation of global
> >     invalidations is very expensive as all shadow entries must be examined.
> >     This patch changes the behaviour when caching mode is enabled (which is
> >     the case when IOMMU emulation takes place). In this case, page specific
> >     invalidation is used instead.
> >
> >Before I ask someone else besides qemu-devel, I am curious about
> >whether there is existing VT-d emulation code (outside QEMU, of
> >course) so that I can have a reference?
> 
> Yes, it has. The author of this patch - Nadav has done lots of research on
> emulated IOMMU. See following papers:
> 
> https://hal.inria.fr/inria-00493752/document
> http://www.cse.iitd.ac.in/~sbansal/csl862-virt/readings/vIOMMU.pdf

Thanks for these good materials. I will google the author for sure
next time. :)

-- peterx
Jason Wang Jan. 18, 2017, 8:36 a.m. UTC | #8
On 2017年01月18日 16:11, Peter Xu wrote:
> On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
>>
>> On 2017年01月17日 22:45, Peter Xu wrote:
>>> On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
>>>> On 2017年01月16日 17:18, Peter Xu wrote:
>>>>>>>   static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>>>>>                                         hwaddr addr, uint8_t am)
>>>>>>>   {
>>>>>>> @@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>>>>>       info.addr = addr;
>>>>>>>       info.mask = ~((1 << am) - 1);
>>>>>>>       g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
>>>>>>> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
>>>>>> Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
>>>>> IMHO we don't. For device assignment, since we are having CM=1 here,
>>>>> we should have explicit page invalidations even if guest sends
>>>>> global/domain invalidations.
>>>>>
>>>>> Thanks,
>>>>>
>>>>> -- peterx
>>>> Is this spec required?
>>> I think not. IMO the spec is very coarse grained on describing cache
>>> mode...
>>>
>>>> Btw, it looks to me that both DSI and GLOBAL are
>>>> indeed explicit flush.
>>> Actually when cache mode is on, it is unclear to me on how we should
>>> treat domain/global invalidations, at least from the spec (as
>>> mentioned earlier). My understanding is that they are not "explicit
>>> flushes", which IMHO should only mean page selective IOTLB
>>> invalidations.
>> Probably not, at least from the view of performance. DSI and global should
>> be more efficient in some cases.
> I agree with you that DSI/GLOBAL flushes are more efficient in some
> way. But IMHO that does not mean these invalidations are "explicit
> invalidations", and I suspect whether cache mode has to coop with it.

Well, the spec does not forbid DSI/GLOBAL with CM, and the driver code 
has used them for almost ten years. I can hardly believe it's wrong.

>
> But here I should add one more thing besides PSI - context entry
> invalidation should be one of "the explicit invalidations" as well,
> which we need to handle just like PSI when cache mode is on.
>
>>>> Just have a quick go through on driver codes and find this something
>>>> interesting in intel_iommu_flush_iotlb_psi():
>>>>
>>>> ...
>>>>      /*
>>>>       * Fallback to domain selective flush if no PSI support or the size is
>>>>       * too big.
>>>>       * PSI requires page size to be 2 ^ x, and the base address is naturally
>>>>       * aligned to the size
>>>>       */
>>>>      if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
>>>>          iommu->flush.flush_iotlb(iommu, did, 0, 0,
>>>>                          DMA_TLB_DSI_FLUSH);
>>>>      else
>>>>          iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
>>>>                          DMA_TLB_PSI_FLUSH);
>>>> ...
>>> I think this is interesting... and I doubt its correctness while with
>>> cache mode enabled.
>>>
>>> If so (sending domain invalidation instead of a big range of page
>>> invalidations), how should we capture which pages are unmapped in
>>> emulated IOMMU?
>> We don't need to track individual pages here, since all pages for a specific
>> domain were unmapped I believe?
> IMHO this might not be the correct behavior.
>
> If we receive one domain specific invalidation, I agree that we should
> invalidate the IOTLB cache for all the devices inside the domain.
> However, when cache mode is on, we should be depending on the PSIs to
> unmap each page (unless we want to unmap the whole address space, in
> this case it's very possible that the guest is just unmapping a range,
> not the entire space). If we convert several PSIs into one big DSI,
> IMHO we will leave those pages mapped/unmapped while we should
> unmap/map them.

Confused, do you have an example for this? (I fail to understand why DSI 
can't work, at least implementation can convert DSI to several PSIs 
internally).

Thanks

>
>>>> It looks like DSI_FLUSH is possible even for CM on.
>>>>
>>>> And in flush_unmaps():
>>>>
>>>> ...
>>>>          /* In caching mode, global flushes turn emulation expensive */
>>>>          if (!cap_caching_mode(iommu->cap))
>>>>              iommu->flush.flush_iotlb(iommu, 0, 0, 0,
>>>>                       DMA_TLB_GLOBAL_FLUSH);
>>>> ...
>>>>
>>>> If I understand the comments correctly, GLOBAL is ok for CM too (though the
>>>> code did not do it for performance reason).
>>> I think it should be okay to send global flush for CM, but not sure
>>> whether we should notify anything when we receive it. Hmm, anyway, I
>>> think I need some more reading to make sure I understand the whole
>>> thing correctly. :)
>>>
>>> For example, when I see this commit:
>>>
>>> commit 78d5f0f500e6ba8f6cfd0673475ff4d941d705a2
>>> Author: Nadav Amit <nadav.amit@gmail.com>
>>> Date:   Thu Apr 8 23:00:41 2010 +0300
>>>
>>>      intel-iommu: Avoid global flushes with caching mode.
>>>      While it may be efficient on real hardware, emulation of global
>>>      invalidations is very expensive as all shadow entries must be examined.
>>>      This patch changes the behaviour when caching mode is enabled (which is
>>>      the case when IOMMU emulation takes place). In this case, page specific
>>>      invalidation is used instead.
>>>
>>> Before I ask someone else besides qemu-devel, I am curious about
>>> whether there is existing VT-d emulation code (outside QEMU, of
>>> course) so that I can have a reference?
>> Yes, it has. The author of this patch - Nadav has done lots of research on
>> emulated IOMMU. See following papers:
>>
>> https://hal.inria.fr/inria-00493752/document
>> http://www.cse.iitd.ac.in/~sbansal/csl862-virt/readings/vIOMMU.pdf
> Thanks for these good materials. I will google the author for sure
> next time. :)
>
> -- peterx
Peter Xu Jan. 18, 2017, 8:46 a.m. UTC | #9
On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月18日 16:11, Peter Xu wrote:
> >On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
> >>
> >>On 2017年01月17日 22:45, Peter Xu wrote:
> >>>On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
> >>>>On 2017年01月16日 17:18, Peter Xu wrote:
> >>>>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >>>>>>>                                        hwaddr addr, uint8_t am)
> >>>>>>>  {
> >>>>>>>@@ -1222,6 +1251,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> >>>>>>>      info.addr = addr;
> >>>>>>>      info.mask = ~((1 << am) - 1);
> >>>>>>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> >>>>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
> >>>>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
> >>>>>IMHO we don't. For device assignment, since we are having CM=1 here,
> >>>>>we should have explicit page invalidations even if guest sends
> >>>>>global/domain invalidations.
> >>>>>
> >>>>>Thanks,
> >>>>>
> >>>>>-- peterx
> >>>>Is this spec required?
> >>>I think not. IMO the spec is very coarse grained on describing cache
> >>>mode...
> >>>
> >>>>Btw, it looks to me that both DSI and GLOBAL are
> >>>>indeed explicit flush.
> >>>Actually when cache mode is on, it is unclear to me on how we should
> >>>treat domain/global invalidations, at least from the spec (as
> >>>mentioned earlier). My understanding is that they are not "explicit
> >>>flushes", which IMHO should only mean page selective IOTLB
> >>>invalidations.
> >>Probably not, at least from the view of performance. DSI and global should
> >>be more efficient in some cases.
> >I agree with you that DSI/GLOBAL flushes are more efficient in some
> >way. But IMHO that does not mean these invalidations are "explicit
> >invalidations", and I suspect whether cache mode has to coop with it.
> 
> Well, the spec does not forbid DSI/GLOBAL with CM and the driver codes had
> used them for almost ten years. I can hardly believe it's wrong.

I think we have a misunderstanding here. :)

I never thought we should not send DSI/GLOBAL invalidations with cache
mode. I just think we should not do anything special even if we have
cache mode on when we receive these signals.

In the spec, "explicit invalidation" is mentioned in the cache mode
chapter:

    The Caching Mode (CM) field in Capability Register indicates if
    the hardware implementation caches not-present or erroneous
    translation-structure entries. When the CM field is reported as
    Set, any software updates to any remapping structures (including
    updates to not-present entries or present entries whose
    programming resulted in translation faults) requires explicit
    invalidation of the caches.

And I thought we were discussing "what is explicit invalidation"
as mentioned above.

> 
> >
> >But here I should add one more thing besides PSI - context entry
> >invalidation should be one of "the explicit invalidations" as well,
> >which we need to handle just like PSI when cache mode is on.
> >
> >>>>Just have a quick go through on driver codes and find this something
> >>>>interesting in intel_iommu_flush_iotlb_psi():
> >>>>
> >>>>...
> >>>>     /*
> >>>>      * Fallback to domain selective flush if no PSI support or the size is
> >>>>      * too big.
> >>>>      * PSI requires page size to be 2 ^ x, and the base address is naturally
> >>>>      * aligned to the size
> >>>>      */
> >>>>     if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
> >>>>         iommu->flush.flush_iotlb(iommu, did, 0, 0,
> >>>>                         DMA_TLB_DSI_FLUSH);
> >>>>     else
> >>>>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
> >>>>                         DMA_TLB_PSI_FLUSH);
> >>>>...
> >>>I think this is interesting... and I doubt its correctness while with
> >>>cache mode enabled.
> >>>
> >>>If so (sending domain invalidation instead of a big range of page
> >>>invalidations), how should we capture which pages are unmapped in
> >>>emulated IOMMU?
> >>We don't need to track individual pages here, since all pages for a specific
> >>domain were unmapped I believe?
> >IMHO this might not be the correct behavior.
> >
> >If we receive one domain specific invalidation, I agree that we should
> >invalidate the IOTLB cache for all the devices inside the domain.
> >However, when cache mode is on, we should be depending on the PSIs to
> >unmap each page (unless we want to unmap the whole address space, in
> >this case it's very possible that the guest is just unmapping a range,
> >not the entire space). If we convert several PSIs into one big DSI,
> >IMHO we will leave those pages mapped/unmapped while we should
> >unmap/map them.
> 
> Confused, do you have an example for this? (I fail to understand why DSI
> can't work, at least implementation can convert DSI to several PSIs
> internally).

That's how I understand it. It might be wrong. Btw, could you
elaborate a bit on how can we convert a DSI into several PSIs?

Thanks,

-- peterx
Tian, Kevin Jan. 18, 2017, 9:38 a.m. UTC | #10
> From: Peter Xu [mailto:peterx@redhat.com]

> Sent: Wednesday, January 18, 2017 4:46 PM

> 

> On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:

> >

> >

> > On 2017年01月18日 16:11, Peter Xu wrote:

> > >On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:

> > >>

> > >>On 2017年01月17日 22:45, Peter Xu wrote:

> > >>>On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:

> > >>>>On 2017年01月16日 17:18, Peter Xu wrote:

> > >>>>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t

> domain_id,

> > >>>>>>>                                        hwaddr addr, uint8_t am)

> > >>>>>>>  {

> > >>>>>>>@@ -1222,6 +1251,7 @@ static void

> vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,

> > >>>>>>>      info.addr = addr;

> > >>>>>>>      info.mask = ~((1 << am) - 1);

> > >>>>>>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,

> &info);

> > >>>>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);

> > >>>>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?

> > >>>>>IMHO we don't. For device assignment, since we are having CM=1 here,

> > >>>>>we should have explicit page invalidations even if guest sends

> > >>>>>global/domain invalidations.

> > >>>>>

> > >>>>>Thanks,

> > >>>>>

> > >>>>>-- peterx

> > >>>>Is this spec required?

> > >>>I think not. IMO the spec is very coarse grained on describing cache

> > >>>mode...

> > >>>

> > >>>>Btw, it looks to me that both DSI and GLOBAL are

> > >>>>indeed explicit flush.

> > >>>Actually when cache mode is on, it is unclear to me on how we should

> > >>>treat domain/global invalidations, at least from the spec (as

> > >>>mentioned earlier). My understanding is that they are not "explicit

> > >>>flushes", which IMHO should only mean page selective IOTLB

> > >>>invalidations.

> > >>Probably not, at least from the view of performance. DSI and global should

> > >>be more efficient in some cases.

> > >I agree with you that DSI/GLOBAL flushes are more efficient in some

> > >way. But IMHO that does not mean these invalidations are "explicit

> > >invalidations", and I suspect whether cache mode has to coop with it.

> >

> > Well, the spec does not forbid DSI/GLOBAL with CM and the driver codes had

> > used them for almost ten years. I can hardly believe it's wrong.

> 

> I think we have misunderstanding here. :)

> 

> I never thought we should not send DSI/GLOBAL invalidations with cache

> mode. I just think we should not do anything special even if we have

> cache mode on when we receive these signals.

> 

> In the spec, "explicit invalidation" is mentioned in the cache mode

> chapter:

> 

>     The Caching Mode (CM) field in Capability Register indicates if

>     the hardware implementation caches not-present or erroneous

>     translation-structure entries. When the CM field is reported as

>     Set, any software updates to any remapping structures (including

>     updates to not-present entries or present entries whose

>     programming resulted in translation faults) requires explicit

>     invalidation of the caches.

> 

> And I thought we were discussion about "what is explicit invalidation"

> mentioned above.


Check 6.5.3.1 Implicit Invalidation on Page Requests

	In addition to the explicit invalidation through invalidation commands 
	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation 
	messages (see Section 6.5.4), identified above, Page Requests from 
	endpoint devices invalidate entries in the IOTLBs and paging-structure 
	caches.

My impression is that the above indirectly defines the invalidation commands
(PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly
issued by the driver. Then section 6.5.3.1 further describes implicit
invalidations caused by other VT-d operations.

I will check with VT-d spec owner to clarify.

> 

> >

> > >

> > >But here I should add one more thing besides PSI - context entry

> > >invalidation should be one of "the explicit invalidations" as well,

> > >which we need to handle just like PSI when cache mode is on.

> > >

> > >>>>Just have a quick go through on driver codes and find this something

> > >>>>interesting in intel_iommu_flush_iotlb_psi():

> > >>>>

> > >>>>...

> > >>>>     /*

> > >>>>      * Fallback to domain selective flush if no PSI support or the size is

> > >>>>      * too big.

> > >>>>      * PSI requires page size to be 2 ^ x, and the base address is naturally

> > >>>>      * aligned to the size

> > >>>>      */

> > >>>>     if (!cap_pgsel_inv(iommu->cap) || mask >

> cap_max_amask_val(iommu->cap))

> > >>>>         iommu->flush.flush_iotlb(iommu, did, 0, 0,

> > >>>>                         DMA_TLB_DSI_FLUSH);

> > >>>>     else

> > >>>>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,

> > >>>>                         DMA_TLB_PSI_FLUSH);

> > >>>>...

> > >>>I think this is interesting... and I doubt its correctness while with

> > >>>cache mode enabled.

> > >>>

> > >>>If so (sending domain invalidation instead of a big range of page

> > >>>invalidations), how should we capture which pages are unmapped in

> > >>>emulated IOMMU?

> > >>We don't need to track individual pages here, since all pages for a specific

> > >>domain were unmapped I believe?

> > >IMHO this might not be the correct behavior.

> > >

> > >If we receive one domain specific invalidation, I agree that we should

> > >invalidate the IOTLB cache for all the devices inside the domain.

> > >However, when cache mode is on, we should be depending on the PSIs to

> > >unmap each page (unless we want to unmap the whole address space, in

> > >this case it's very possible that the guest is just unmapping a range,

> > >not the entire space). If we convert several PSIs into one big DSI,

> > >IMHO we will leave those pages mapped/unmapped while we should

> > >unmap/map them.

> >

> > Confused, do you have an example for this? (I fail to understand why DSI

> > can't work, at least implementation can convert DSI to several PSIs

> > internally).

> 

> That's how I understand it. It might be wrong. Btw, could you

> elaborate a bit on how can we convert a DSI into several PSIs?

> 

> Thanks,


If my understanding above is correct, there is nothing wrong with 
the above IOMMU driver code - actually it makes sense on bare metal
when CM is disabled.

But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
We rely on cache invalidations to indirectly capture remapping structure 
change. PSI provides accurate info, while DSI/GLOBAL doesn't. To 
emulate correct behavior of DSI/GLOBAL, we have to pretend that
the whole address space (iova=0, size=agaw) needs to be unmapped
(for GLOBAL it further means multiple address spaces)

Though not efficient, it doesn't mean it's wrong, since the guest driver
follows the spec. We can ask for a Linux IOMMU driver change (CC Ashok)
to not use the above optimization when cache mode is enabled, but 
anyway we need to emulate correct DSI/GLOBAL behavior to follow the
spec, because:

- even when driver fix is in place, old version still has this logic;

- there is still scenario where guest IOMMU driver does want to
invalidate the whole address space, e.g. when changing context
entry. Asking guest driver to use PSI for such purpose is another
bad thing.

Thanks
Kevin
Jason Wang Jan. 18, 2017, 10:06 a.m. UTC | #11
On 2017年01月18日 17:38, Tian, Kevin wrote:
>> From: Peter Xu [mailto:peterx@redhat.com]
>> Sent: Wednesday, January 18, 2017 4:46 PM
>>
>> On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:
>>>
>>> On 2017年01月18日 16:11, Peter Xu wrote:
>>>> On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
>>>>> On 2017年01月17日 22:45, Peter Xu wrote:
>>>>>> On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
>>>>>>> On 2017年01月16日 17:18, Peter Xu wrote:
>>>>>>>>>>   static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t
>> domain_id,
>>>>>>>>>>                                         hwaddr addr, uint8_t am)
>>>>>>>>>>   {
>>>>>>>>>> @@ -1222,6 +1251,7 @@ static void
>> vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>>>>>>>>       info.addr = addr;
>>>>>>>>>>       info.mask = ~((1 << am) - 1);
>>>>>>>>>>       g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,
>> &info);
>>>>>>>>>> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
>>>>>>>>> Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
>>>>>>>> IMHO we don't. For device assignment, since we are having CM=1 here,
>>>>>>>> we should have explicit page invalidations even if guest sends
>>>>>>>> global/domain invalidations.
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>>
>>>>>>>> -- peterx
>>>>>>> Is this spec required?
>>>>>> I think not. IMO the spec is very coarse grained on describing cache
>>>>>> mode...
>>>>>>
>>>>>>> Btw, it looks to me that both DSI and GLOBAL are
>>>>>>> indeed explicit flush.
>>>>>> Actually when cache mode is on, it is unclear to me on how we should
>>>>>> treat domain/global invalidations, at least from the spec (as
>>>>>> mentioned earlier). My understanding is that they are not "explicit
>>>>>> flushes", which IMHO should only mean page selective IOTLB
>>>>>> invalidations.
>>>>> Probably not, at least from the view of performance. DSI and global should
>>>>> be more efficient in some cases.
>>>> I agree with you that DSI/GLOBAL flushes are more efficient in some
>>>> way. But IMHO that does not mean these invalidations are "explicit
>>>> invalidations", and I suspect whether cache mode has to coop with it.
>>> Well, the spec does not forbid DSI/GLOBAL with CM and the driver codes had
>>> used them for almost ten years. I can hardly believe it's wrong.
>> I think we have misunderstanding here. :)
>>
>> I never thought we should not send DSI/GLOBAL invalidations with cache
>> mode. I just think we should not do anything special even if we have
>> cache mode on when we receive these signals.
>>
>> In the spec, "explicit invalidation" is mentioned in the cache mode
>> chapter:
>>
>>      The Caching Mode (CM) field in Capability Register indicates if
>>      the hardware implementation caches not-present or erroneous
>>      translation-structure entries. When the CM field is reported as
>>      Set, any software updates to any remapping structures (including
>>      updates to not-present entries or present entries whose
>>      programming resulted in translation faults) requires explicit
>>      invalidation of the caches.
>>
>> And I thought we were discussion about "what is explicit invalidation"
>> mentioned above.
> Check 6.5.3.1 Implicit Invalidation on Page Requests
>
> 	In addition to the explicit invalidation through invalidation commands
> 	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation
> 	messages (see Section 6.5.4), identified above, Page Requests from
> 	endpoint devices invalidate entries in the IOTLBs and paging-structure
> 	caches.
>
> My impression is that above indirectly defines invalidation commands (
> PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly
> issued by driver. Then section 6.5.3.1 further describes implicit
> invalidations caused by other VT-d operations.
>
> I will check with VT-d spec owner to clarify.

Good to hear from you.

So I think we should implement DSI and GLOBAL for vfio in this case. We 
can first try to implement it through the current VFIO API, which can accept 
a range of iova. If that is not possible, let's discuss other possible 
solutions.

>
>>>> But here I should add one more thing besides PSI - context entry
>>>> invalidation should be one of "the explicit invalidations" as well,
>>>> which we need to handle just like PSI when cache mode is on.
>>>>
>>>>>>> Just have a quick go through on driver codes and find this something
>>>>>>> interesting in intel_iommu_flush_iotlb_psi():
>>>>>>>
>>>>>>> ...
>>>>>>>      /*
>>>>>>>       * Fallback to domain selective flush if no PSI support or the size is
>>>>>>>       * too big.
>>>>>>>       * PSI requires page size to be 2 ^ x, and the base address is naturally
>>>>>>>       * aligned to the size
>>>>>>>       */
>>>>>>>      if (!cap_pgsel_inv(iommu->cap) || mask >
>> cap_max_amask_val(iommu->cap))
>>>>>>>          iommu->flush.flush_iotlb(iommu, did, 0, 0,
>>>>>>>                          DMA_TLB_DSI_FLUSH);
>>>>>>>      else
>>>>>>>          iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
>>>>>>>                          DMA_TLB_PSI_FLUSH);
>>>>>>> ...
>>>>>> I think this is interesting... and I doubt its correctness while with
>>>>>> cache mode enabled.
>>>>>>
>>>>>> If so (sending domain invalidation instead of a big range of page
>>>>>> invalidations), how should we capture which pages are unmapped in
>>>>>> emulated IOMMU?
>>>>> We don't need to track individual pages here, since all pages for a specific
>>>>> domain were unmapped I believe?
>>>> IMHO this might not be the correct behavior.
>>>>
>>>> If we receive one domain specific invalidation, I agree that we should
>>>> invalidate the IOTLB cache for all the devices inside the domain.
>>>> However, when cache mode is on, we should be depending on the PSIs to
>>>> unmap each page (unless we want to unmap the whole address space, in
>>>> this case it's very possible that the guest is just unmapping a range,
>>>> not the entire space). If we convert several PSIs into one big DSI,
>>>> IMHO we will leave those pages mapped/unmapped while we should
>>>> unmap/map them.
>>> Confused, do you have an example for this? (I fail to understand why DSI
>>> can't work, at least implementation can convert DSI to several PSIs
>>> internally).
>> That's how I understand it. It might be wrong. Btw, could you
>> elaborate a bit on how can we convert a DSI into several PSIs?
>>
>> Thanks,
> If my understanding above is correct, there is nothing wrong with
> above IOMMU driver code - actually it makes sense on bare metal
> when CM is disabled.
>
> But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
> We rely on cache invalidations to indirectly capture remapping structure
> change. PSI provides accurate info, while DSI/GLOBAL doesn't. To
> emulate correct behavior of DSI/GLOBAL, we have to pretend that
> the whole address space (iova=0, size=agaw) needs to be unmapped
> (for GLOBAL it further means multiple address spaces)

Maybe a trick to have accurate info is virtual Device IOTLB.

>
> Though not efficient, it doesn't mean it's wrong since guest driver
> follows spec. We can ask for linux IOMMU driver change (CC Ashok)
> to not use above optimization when cache mode is enabled, but
> anyway we need emulate correct DSI/GLOBAL behavior to follow
> spec, because:
>
> - even when driver fix is in place, old version still has this logic;
>
> - there is still scenario where guest IOMMU driver does want to
> invalidate the whole address space, e.g. when changing context
> entry. Asking guest driver to use PSI for such purpose is another
> bad thing.
>
> Thanks
> Kevin

Agree.

Thanks
Peter Xu Jan. 19, 2017, 3:16 a.m. UTC | #12
On Wed, Jan 18, 2017 at 09:38:55AM +0000, Tian, Kevin wrote:
> > From: Peter Xu [mailto:peterx@redhat.com]
> > Sent: Wednesday, January 18, 2017 4:46 PM
> > 
> > On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:
> > >
> > >
> > > On 2017年01月18日 16:11, Peter Xu wrote:
> > > >On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
> > > >>
> > > >>On 2017年01月17日 22:45, Peter Xu wrote:
> > > >>>On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
> > > >>>>On 2017年01月16日 17:18, Peter Xu wrote:
> > > >>>>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t
> > domain_id,
> > > >>>>>>>                                        hwaddr addr, uint8_t am)
> > > >>>>>>>  {
> > > >>>>>>>@@ -1222,6 +1251,7 @@ static void
> > vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> > > >>>>>>>      info.addr = addr;
> > > >>>>>>>      info.mask = ~((1 << am) - 1);
> > > >>>>>>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,
> > &info);
> > > >>>>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
> > > >>>>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
> > > >>>>>IMHO we don't. For device assignment, since we are having CM=1 here,
> > > >>>>>we should have explicit page invalidations even if guest sends
> > > >>>>>global/domain invalidations.
> > > >>>>>
> > > >>>>>Thanks,
> > > >>>>>
> > > >>>>>-- peterx
> > > >>>>Is this spec required?
> > > >>>I think not. IMO the spec is very coarse grained on describing cache
> > > >>>mode...
> > > >>>
> > > >>>>Btw, it looks to me that both DSI and GLOBAL are
> > > >>>>indeed explicit flush.
> > > >>>Actually when cache mode is on, it is unclear to me on how we should
> > > >>>treat domain/global invalidations, at least from the spec (as
> > > >>>mentioned earlier). My understanding is that they are not "explicit
> > > >>>flushes", which IMHO should only mean page selective IOTLB
> > > >>>invalidations.
> > > >>Probably not, at least from the view of performance. DSI and global should
> > > >>be more efficient in some cases.
> > > >I agree with you that DSI/GLOBAL flushes are more efficient in some
> > > >way. But IMHO that does not mean these invalidations are "explicit
> > > >invalidations", and I suspect whether cache mode has to coop with it.
> > >
> > > Well, the spec does not forbid DSI/GLOBAL with CM and the driver codes had
> > > used them for almost ten years. I can hardly believe it's wrong.
> > 
> > I think we have misunderstanding here. :)
> > 
> > I never thought we should not send DSI/GLOBAL invalidations with cache
> > mode. I just think we should not do anything special even if we have
> > cache mode on when we receive these signals.
> > 
> > In the spec, "explicit invalidation" is mentioned in the cache mode
> > chapter:
> > 
> >     The Caching Mode (CM) field in Capability Register indicates if
> >     the hardware implementation caches not-present or erroneous
> >     translation-structure entries. When the CM field is reported as
> >     Set, any software updates to any remapping structures (including
> >     updates to not-present entries or present entries whose
> >     programming resulted in translation faults) requires explicit
> >     invalidation of the caches.
> > 
> > And I thought we were discussion about "what is explicit invalidation"
> > mentioned above.
> 
> Check 6.5.3.1 Implicit Invalidation on Page Requests
> 
> 	In addition to the explicit invalidation through invalidation commands 
> 	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation 
> 	messages (see Section 6.5.4), identified above, Page Requests from 
> 	endpoint devices invalidate entries in the IOTLBs and paging-structure 
> 	caches.
> 
> My impression is that above indirectly defines invalidation commands (
> PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly
> issued by driver. Then section 6.5.3.1 further describes implicit
> invalidations caused by other VT-d operations.
> 
> I will check with VT-d spec owner to clarify.

Above spec is clear to me. So now I agree that both DSI/GLOBAL iotlb
invalidations are explicit invalidations.

> 
> > 
> > >
> > > >
> > > >But here I should add one more thing besides PSI - context entry
> > > >invalidation should be one of "the explicit invalidations" as well,
> > > >which we need to handle just like PSI when cache mode is on.
> > > >
> > > >>>>Just have a quick go through on driver codes and find this something
> > > >>>>interesting in intel_iommu_flush_iotlb_psi():
> > > >>>>
> > > >>>>...
> > > >>>>     /*
> > > >>>>      * Fallback to domain selective flush if no PSI support or the size is
> > > >>>>      * too big.
> > > >>>>      * PSI requires page size to be 2 ^ x, and the base address is naturally
> > > >>>>      * aligned to the size
> > > >>>>      */
> > > >>>>     if (!cap_pgsel_inv(iommu->cap) || mask >
> > cap_max_amask_val(iommu->cap))
> > > >>>>         iommu->flush.flush_iotlb(iommu, did, 0, 0,
> > > >>>>                         DMA_TLB_DSI_FLUSH);
> > > >>>>     else
> > > >>>>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
> > > >>>>                         DMA_TLB_PSI_FLUSH);
> > > >>>>...
> > > >>>I think this is interesting... and I doubt its correctness while with
> > > >>>cache mode enabled.
> > > >>>
> > > >>>If so (sending domain invalidation instead of a big range of page
> > > >>>invalidations), how should we capture which pages are unmapped in
> > > >>>emulated IOMMU?
> > > >>We don't need to track individual pages here, since all pages for a specific
> > > >>domain were unmapped I believe?
> > > >IMHO this might not be the correct behavior.
> > > >
> > > >If we receive one domain specific invalidation, I agree that we should
> > > >invalidate the IOTLB cache for all the devices inside the domain.
> > > >However, when cache mode is on, we should be depending on the PSIs to
> > > >unmap each page (unless we want to unmap the whole address space, in
> > > >this case it's very possible that the guest is just unmapping a range,
> > > >not the entire space). If we convert several PSIs into one big DSI,
> > > >IMHO we will leave those pages mapped/unmapped while we should
> > > >unmap/map them.
> > >
> > > Confused, do you have an example for this? (I fail to understand why DSI
> > > can't work, at least implementation can convert DSI to several PSIs
> > > internally).
> > 
> > That's how I understand it. It might be wrong. Btw, could you
> > elaborate a bit on how can we convert a DSI into several PSIs?
> > 
> > Thanks,
> 
> If my understanding above is correct, there is nothing wrong with 
> above IOMMU driver code - actually it makes sense on bare metal
> when CM is disabled.
> 
> But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
> We rely on cache invalidations to indirectly capture remapping structure 
> change. PSI provides accurate info, while DSI/GLOBAL doesn't. To 
> emulate correct behavior of DSI/GLOBAL, we have to pretend that
> the whole address space (iova=0, size=agaw) needs to be unmapped
> (for GLOBAL it further means multiple address spaces)
> 
> Though not efficient, it doesn't mean it's wrong since guest driver
> follows spec. We can ask for linux IOMMU driver change (CC Ashok)
> to not use above optimization when cache mode is enabled, but 
> anyway we need emulate correct DSI/GLOBAL behavior to follow
> spec, because:
> 
> - even when driver fix is in place, old version still has this logic;
> 
> - there is still scenario where guest IOMMU driver does want to
> invalidate the whole address space, e.g. when changing context
> entry. Asking guest driver to use PSI for such purpose is another
> bad thing.

Thanks for the thorough explanation. It did answer my question
above.

Btw, I never meant to ask guest IOMMU driver to send PSIs instead of
context entry invalidations, considering that the series is using
context entry invalidations to replay the region. But I admit I may
have misunderstood the spec a bit. :-)

I'll consider this issue in the next post, and handle domain/global
invalidations properly (though it might be slower).

Thanks,

-- peterx
Peter Xu Jan. 19, 2017, 3:32 a.m. UTC | #13
On Wed, Jan 18, 2017 at 06:06:57PM +0800, Jason Wang wrote:

[...]

> So I think we should implement DSI and GLOBAL for vfio in this case. We can
> first try to implement it through current VFIO API which can accepts a range
> of iova. If not possible, let's discuss for other possible solutions.

Do you mean VFIO_IOMMU_UNMAP_DMA here?

[...]

> >If my understanding above is correct, there is nothing wrong with
> >above IOMMU driver code - actually it makes sense on bare metal
> >when CM is disabled.
> >
> >But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
> >We rely on cache invalidations to indirectly capture remapping structure
> >change. PSI provides accurate info, while DSI/GLOBAL doesn't. To
> >emulate correct behavior of DSI/GLOBAL, we have to pretend that
> >the whole address space (iova=0, size=agaw) needs to be unmapped
> >(for GLOBAL it further means multiple address spaces)
> 
> Maybe a trick to have accurate info is virtual Device IOTLB.

Could you elaborate a bit on this?

Thanks,

-- peterx
Jason Wang Jan. 19, 2017, 3:36 a.m. UTC | #14
On 2017年01月19日 11:32, Peter Xu wrote:
> On Wed, Jan 18, 2017 at 06:06:57PM +0800, Jason Wang wrote:
>
> [...]
>
>> So I think we should implement DSI and GLOBAL for vfio in this case. We can
>> first try to implement it through current VFIO API which can accepts a range
>> of iova. If not possible, let's discuss for other possible solutions.
> Do you mean VFIO_IOMMU_UNMAP_DMA here?
>
> [...]

Yes.

>
>>> If my understanding above is correct, there is nothing wrong with
>>> above IOMMU driver code - actually it makes sense on bare metal
>>> when CM is disabled.
>>>
>>> But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
>>> We rely on cache invalidations to indirectly capture remapping structure
>>> change. PSI provides accurate info, while DSI/GLOBAL doesn't. To
>>> emulate correct behavior of DSI/GLOBAL, we have to pretend that
>>> the whole address space (iova=0, size=agaw) needs to be unmapped
>>> (for GLOBAL it further means multiple address spaces)
>> Maybe a trick to have accurate info is virtual Device IOTLB.
> Could you elaborate a bit on this?
>
> Thanks,
>
> -- peterx

I think the trick is that if the guest knows the device has a device IOTLB,
it will explicitly flush with an accurate iova range.

Thanks
Tian, Kevin Jan. 19, 2017, 6:22 a.m. UTC | #15
> From: Peter Xu [mailto:peterx@redhat.com]

> Sent: Thursday, January 19, 2017 11:17 AM

> 

> On Wed, Jan 18, 2017 at 09:38:55AM +0000, Tian, Kevin wrote:

> > > From: Peter Xu [mailto:peterx@redhat.com]

> > > Sent: Wednesday, January 18, 2017 4:46 PM

> > >

> > > On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:

> > > >

> > > >

> > > > On 2017年01月18日 16:11, Peter Xu wrote:

> > > > >On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:

> > > > >>

> > > > >>On 2017年01月17日 22:45, Peter Xu wrote:

> > > > >>>On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:

> > > > >>>>On 2017年01月16日 17:18, Peter Xu wrote:

> > > > >>>>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t

> > > domain_id,

> > > > >>>>>>>                                        hwaddr addr, uint8_t am)

> > > > >>>>>>>  {

> > > > >>>>>>>@@ -1222,6 +1251,7 @@ static void

> > > vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,

> > > > >>>>>>>      info.addr = addr;

> > > > >>>>>>>      info.mask = ~((1 << am) - 1);

> > > > >>>>>>>      g_hash_table_foreach_remove(s->iotlb,

> vtd_hash_remove_by_page,

> > > &info);

> > > > >>>>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);

> > > > >>>>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?

> > > > >>>>>IMHO we don't. For device assignment, since we are having CM=1 here,

> > > > >>>>>we should have explicit page invalidations even if guest sends

> > > > >>>>>global/domain invalidations.

> > > > >>>>>

> > > > >>>>>Thanks,

> > > > >>>>>

> > > > >>>>>-- peterx

> > > > >>>>Is this spec required?

> > > > >>>I think not. IMO the spec is very coarse grained on describing cache

> > > > >>>mode...

> > > > >>>

> > > > >>>>Btw, it looks to me that both DSI and GLOBAL are

> > > > >>>>indeed explicit flush.

> > > > >>>Actually when cache mode is on, it is unclear to me on how we should

> > > > >>>treat domain/global invalidations, at least from the spec (as

> > > > >>>mentioned earlier). My understanding is that they are not "explicit

> > > > >>>flushes", which IMHO should only mean page selective IOTLB

> > > > >>>invalidations.

> > > > >>Probably not, at least from the view of performance. DSI and global should

> > > > >>be more efficient in some cases.

> > > > >I agree with you that DSI/GLOBAL flushes are more efficient in some

> > > > >way. But IMHO that does not mean these invalidations are "explicit

> > > > >invalidations", and I suspect whether cache mode has to coop with it.

> > > >

> > > > Well, the spec does not forbid DSI/GLOBAL with CM and the driver codes had

> > > > used them for almost ten years. I can hardly believe it's wrong.

> > >

> > > I think we have misunderstanding here. :)

> > >

> > > I never thought we should not send DSI/GLOBAL invalidations with cache

> > > mode. I just think we should not do anything special even if we have

> > > cache mode on when we receive these signals.

> > >

> > > In the spec, "explicit invalidation" is mentioned in the cache mode

> > > chapter:

> > >

> > >     The Caching Mode (CM) field in Capability Register indicates if

> > >     the hardware implementation caches not-present or erroneous

> > >     translation-structure entries. When the CM field is reported as

> > >     Set, any software updates to any remapping structures (including

> > >     updates to not-present entries or present entries whose

> > >     programming resulted in translation faults) requires explicit

> > >     invalidation of the caches.

> > >

> > > And I thought we were discussion about "what is explicit invalidation"

> > > mentioned above.

> >

> > Check 6.5.3.1 Implicit Invalidation on Page Requests

> >

> > 	In addition to the explicit invalidation through invalidation commands

> > 	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation

> > 	messages (see Section 6.5.4), identified above, Page Requests from

> > 	endpoint devices invalidate entries in the IOTLBs and paging-structure

> > 	caches.

> >

> > My impression is that above indirectly defines invalidation commands (

> > PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly

> > issued by driver. Then section 6.5.3.1 further describes implicit

> > invalidations caused by other VT-d operations.

> >

> > I will check with VT-d spec owner to clarify.

> 

> Above spec is clear to me. So now I agree that both DSI/GLOBAL iotlb

> invalidations are explicit invalidations.

> 


still copy response from spec owner here:-)

	Explicit invalidation is anytime software is explicitly invalidating something (
	through any descriptor) as opposed to something hardware does implicitly.  
	The only time hardware does implicit invalidation is during the handling of a page 
	request (recoverable page-fault) from an endpoint device.

Thanks
Kevin
Yi Liu Jan. 19, 2017, 6:44 a.m. UTC | #16
> -----Original Message-----

> From: Qemu-devel [mailto:qemu-devel-bounces+yi.l.liu=intel.com@nongnu.org]

> On Behalf Of Tian, Kevin

> Sent: Wednesday, January 18, 2017 5:39 PM

> To: Peter Xu <peterx@redhat.com>; Jason Wang <jasowang@redhat.com>

> Cc: Lan, Tianyu <tianyu.lan@intel.com>; Raj, Ashok <ashok.raj@intel.com>;

> mst@redhat.com; jan.kiszka@siemens.com; bd.aviv@gmail.com; qemu-

> devel@nongnu.org; alex.williamson@redhat.com

> Subject: Re: [Qemu-devel] [PATCH RFC v3 14/14] intel_iommu: enable vfio

> devices

> 

> > From: Peter Xu [mailto:peterx@redhat.com]

> > Sent: Wednesday, January 18, 2017 4:46 PM

> >

> > On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:

> > >

> > >

> > > On 2017年01月18日 16:11, Peter Xu wrote:

> > > >On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:

> > > >>

> > > >>On 2017年01月17日 22:45, Peter Xu wrote:

> > > >>>On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:

> > > >>>>On 2017年01月16日 17:18, Peter Xu wrote:

> > > >>>>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s,

> > > >>>>>>> uint16_t

> > domain_id,

> > > >>>>>>>                                        hwaddr addr, uint8_t

> > > >>>>>>>am)

> > > >>>>>>>  {

> > > >>>>>>>@@ -1222,6 +1251,7 @@ static void

> > vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,

> > > >>>>>>>      info.addr = addr;

> > > >>>>>>>      info.mask = ~((1 << am) - 1);

> > > >>>>>>>      g_hash_table_foreach_remove(s->iotlb,

> > > >>>>>>> vtd_hash_remove_by_page,

> > &info);

> > > >>>>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr,

> > > >>>>>>>+ am);

> > > >>>>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?

> > > >>>>>IMHO we don't. For device assignment, since we are having CM=1

> > > >>>>>here, we should have explicit page invalidations even if guest

> > > >>>>>sends global/domain invalidations.

> > > >>>>>

> > > >>>>>Thanks,

> > > >>>>>

> > > >>>>>-- peterx

> > > >>>>Is this spec required?

> > > >>>I think not. IMO the spec is very coarse grained on describing

> > > >>>cache mode...

> > > >>>

> > > >>>>Btw, it looks to me that both DSI and GLOBAL are indeed explicit

> > > >>>>flush.

> > > >>>Actually when cache mode is on, it is unclear to me on how we

> > > >>>should treat domain/global invalidations, at least from the spec

> > > >>>(as mentioned earlier). My understanding is that they are not

> > > >>>"explicit flushes", which IMHO should only mean page selective

> > > >>>IOTLB invalidations.

> > > >>Probably not, at least from the view of performance. DSI and

> > > >>global should be more efficient in some cases.

> > > >I agree with you that DSI/GLOBAL flushes are more efficient in some

> > > >way. But IMHO that does not mean these invalidations are "explicit

> > > >invalidations", and I suspect whether cache mode has to coop with it.

> > >

> > > Well, the spec does not forbid DSI/GLOBAL with CM and the driver

> > > codes had used them for almost ten years. I can hardly believe it's wrong.

> >

> > I think we have misunderstanding here. :)

> >

> > I never thought we should not send DSI/GLOBAL invalidations with cache

> > mode. I just think we should not do anything special even if we have

> > cache mode on when we receive these signals.

> >

> > In the spec, "explicit invalidation" is mentioned in the cache mode

> > chapter:

> >

> >     The Caching Mode (CM) field in Capability Register indicates if

> >     the hardware implementation caches not-present or erroneous

> >     translation-structure entries. When the CM field is reported as

> >     Set, any software updates to any remapping structures (including

> >     updates to not-present entries or present entries whose

> >     programming resulted in translation faults) requires explicit

> >     invalidation of the caches.

> >

> > And I thought we were discussion about "what is explicit invalidation"

> > mentioned above.

> 

> Check 6.5.3.1 Implicit Invalidation on Page Requests

> 

> 	In addition to the explicit invalidation through invalidation commands

> 	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation

> 	messages (see Section 6.5.4), identified above, Page Requests from

> 	endpoint devices invalidate entries in the IOTLBs and paging-structure

> 	caches.

> 

> My impression is that above indirectly defines invalidation commands (

> PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly issued by

> driver. Then section 6.5.3.1 further describes implicit invalidations caused by

> other VT-d operations.

> 

> I will check with VT-d spec owner to clarify.

> 

> >

> > >

> > > >

> > > >But here I should add one more thing besides PSI - context entry

> > > >invalidation should be one of "the explicit invalidations" as well,

> > > >which we need to handle just like PSI when cache mode is on.

> > > >

> > > >>>>Just have a quick go through on driver codes and find this

> > > >>>>something interesting in intel_iommu_flush_iotlb_psi():

> > > >>>>

> > > >>>>...

> > > >>>>     /*

> > > >>>>      * Fallback to domain selective flush if no PSI support or the size is

> > > >>>>      * too big.

> > > >>>>      * PSI requires page size to be 2 ^ x, and the base address is

> naturally

> > > >>>>      * aligned to the size

> > > >>>>      */

> > > >>>>     if (!cap_pgsel_inv(iommu->cap) || mask >

> > cap_max_amask_val(iommu->cap))

> > > >>>>         iommu->flush.flush_iotlb(iommu, did, 0, 0,

> > > >>>>                         DMA_TLB_DSI_FLUSH);

> > > >>>>     else

> > > >>>>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,

> > > >>>>                         DMA_TLB_PSI_FLUSH); ...

> > > >>>I think this is interesting... and I doubt its correctness while

> > > >>>with cache mode enabled.

> > > >>>

> > > >>>If so (sending domain invalidation instead of a big range of page

> > > >>>invalidations), how should we capture which pages are unmapped in

> > > >>>emulated IOMMU?

> > > >>We don't need to track individual pages here, since all pages for

> > > >>a specific domain were unmapped I believe?

> > > >IMHO this might not be the correct behavior.

> > > >

> > > >If we receive one domain specific invalidation, I agree that we

> > > >should invalidate the IOTLB cache for all the devices inside the domain.

> > > >However, when cache mode is on, we should be depending on the PSIs

> > > >to unmap each page (unless we want to unmap the whole address

> > > >space, in this case it's very possible that the guest is just

> > > >unmapping a range, not the entire space). If we convert several

> > > >PSIs into one big DSI, IMHO we will leave those pages

> > > >mapped/unmapped while we should unmap/map them.

> > >

> > > Confused, do you have an example for this? (I fail to understand why

> > > DSI can't work, at least implementation can convert DSI to several

> > > PSIs internally).

> >

> > That's how I understand it. It might be wrong. Btw, could you

> > elaborate a bit on how can we convert a DSI into several PSIs?

> >

> > Thanks,

> 

> If my understanding above is correct, there is nothing wrong with above

> IOMMU driver code - actually it makes sense on bare metal when CM is

> disabled.

> 

> But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.

> We rely on cache invalidations to indirectly capture remapping structure change.

> PSI provides accurate info, while DSI/GLOBAL doesn't. To emulate correct

> behavior of DSI/GLOBAL, we have to pretend that the whole address space

> (iova=0, size=agaw) needs to be unmapped (for GLOBAL it further means

> multiple address spaces)

> 

> Though not efficient, it doesn't mean it's wrong since guest driver follows spec.

> We can ask for linux IOMMU driver change (CC Ashok) to not use above

> optimization when cache mode is enabled, but anyway we need emulate correct

> DSI/GLOBAL behavior to follow spec, because:

> 

> - even when driver fix is in place, old version still has this logic;

> 

> - there is still scenario where guest IOMMU driver does want to invalidate the

> whole address space, e.g. when changing context entry. Asking guest driver to

> use PSI for such purpose is another bad thing.


Hi Kevin/Peter/Jason,

I agree we should think about DSI/GLOBAL. Hereby, I guess there may be a chance to ignore
DSI/GLOBAL flush if the following assumption is correct.

It seems that all DSI/GLOBAL flushes would always come after a context entry invalidation. 

With this assumption, I remember Peter added memory_replay in context invalidation.
This memory_replay would walk the guest second-level page table and do the map. So the
second-level page table in the host should be able to get the latest mapping info. The guest
IOMMU driver would issue a DSI/GLOBAL flush after changing the context. Since the
mapping info has been updated in the host, there is no need to deal with this DSI/GLOBAL flush.

So gentlemen, please help judge if the assumption is correct. If it is correct, then Peter's patch
may just work without special processing of DSI/GLOBAL flushes.
 
Regards,
Yi L
Jason Wang Jan. 19, 2017, 7:02 a.m. UTC | #17
On 2017年01月19日 14:44, Liu, Yi L wrote:
>> -----Original Message-----
>> From: Qemu-devel [mailto:qemu-devel-bounces+yi.l.liu=intel.com@nongnu.org]
>> On Behalf Of Tian, Kevin
>> Sent: Wednesday, January 18, 2017 5:39 PM
>> To: Peter Xu <peterx@redhat.com>; Jason Wang <jasowang@redhat.com>
>> Cc: Lan, Tianyu <tianyu.lan@intel.com>; Raj, Ashok <ashok.raj@intel.com>;
>> mst@redhat.com; jan.kiszka@siemens.com; bd.aviv@gmail.com; qemu-
>> devel@nongnu.org; alex.williamson@redhat.com
>> Subject: Re: [Qemu-devel] [PATCH RFC v3 14/14] intel_iommu: enable vfio
>> devices
>>
>>> From: Peter Xu [mailto:peterx@redhat.com]
>>> Sent: Wednesday, January 18, 2017 4:46 PM
>>>
>>> On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:
>>>>
>>>> On 2017年01月18日 16:11, Peter Xu wrote:
>>>>> On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
>>>>>> On 2017年01月17日 22:45, Peter Xu wrote:
>>>>>>> On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
>>>>>>>> On 2017年01月16日 17:18, Peter Xu wrote:
>>>>>>>>>>>   static void vtd_iotlb_page_invalidate(IntelIOMMUState *s,
>>>>>>>>>>> uint16_t
>>> domain_id,
>>>>>>>>>>>                                         hwaddr addr, uint8_t
>>>>>>>>>>> am)
>>>>>>>>>>>   {
>>>>>>>>>>> @@ -1222,6 +1251,7 @@ static void
>>> vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>>>>>>>>>>       info.addr = addr;
>>>>>>>>>>>       info.mask = ~((1 << am) - 1);
>>>>>>>>>>>       g_hash_table_foreach_remove(s->iotlb,
>>>>>>>>>>> vtd_hash_remove_by_page,
>>> &info);
>>>>>>>>>>> +    vtd_iotlb_page_invalidate_notify(s, domain_id, addr,
>>>>>>>>>>> + am);
>>>>>>>>>> Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
>>>>>>>>> IMHO we don't. For device assignment, since we are having CM=1
>>>>>>>>> here, we should have explicit page invalidations even if guest
>>>>>>>>> sends global/domain invalidations.
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>>
>>>>>>>>> -- peterx
>>>>>>>> Is this spec required?
>>>>>>> I think not. IMO the spec is very coarse grained on describing
>>>>>>> cache mode...
>>>>>>>
>>>>>>>> Btw, it looks to me that both DSI and GLOBAL are indeed explicit
>>>>>>>> flush.
>>>>>>> Actually when cache mode is on, it is unclear to me on how we
>>>>>>> should treat domain/global invalidations, at least from the spec
>>>>>>> (as mentioned earlier). My understanding is that they are not
>>>>>>> "explicit flushes", which IMHO should only mean page selective
>>>>>>> IOTLB invalidations.
>>>>>> Probably not, at least from the view of performance. DSI and
>>>>>> global should be more efficient in some cases.
>>>>> I agree with you that DSI/GLOBAL flushes are more efficient in some
>>>>> way. But IMHO that does not mean these invalidations are "explicit
>>>>> invalidations", and I suspect whether cache mode has to coop with it.
>>>> Well, the spec does not forbid DSI/GLOBAL with CM and the driver
>>>> codes had used them for almost ten years. I can hardly believe it's wrong.
>>> I think we have misunderstanding here. :)
>>>
>>> I never thought we should not send DSI/GLOBAL invalidations with cache
>>> mode. I just think we should not do anything special even if we have
>>> cache mode on when we receive these signals.
>>>
>>> In the spec, "explicit invalidation" is mentioned in the cache mode
>>> chapter:
>>>
>>>      The Caching Mode (CM) field in Capability Register indicates if
>>>      the hardware implementation caches not-present or erroneous
>>>      translation-structure entries. When the CM field is reported as
>>>      Set, any software updates to any remapping structures (including
>>>      updates to not-present entries or present entries whose
>>>      programming resulted in translation faults) requires explicit
>>>      invalidation of the caches.
>>>
>>> And I thought we were discussion about "what is explicit invalidation"
>>> mentioned above.
>> Check 6.5.3.1 Implicit Invalidation on Page Requests
>>
>> 	In addition to the explicit invalidation through invalidation commands
>> 	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation
>> 	messages (see Section 6.5.4), identified above, Page Requests from
>> 	endpoint devices invalidate entries in the IOTLBs and paging-structure
>> 	caches.
>>
>> My impression is that above indirectly defines invalidation commands (
>> PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly issued by
>> driver. Then section 6.5.3.1 further describes implicit invalidations caused by
>> other VT-d operations.
>>
>> I will check with VT-d spec owner to clarify.
>>
>>>>> But here I should add one more thing besides PSI - context entry
>>>>> invalidation should be one of "the explicit invalidations" as well,
>>>>> which we need to handle just like PSI when cache mode is on.
>>>>>
>>>>>>>> Just have a quick go through on driver codes and find this
>>>>>>>> something interesting in intel_iommu_flush_iotlb_psi():
>>>>>>>>
>>>>>>>> ...
>>>>>>>>      /*
>>>>>>>>       * Fallback to domain selective flush if no PSI support or the size is
>>>>>>>>       * too big.
>>>>>>>>       * PSI requires page size to be 2 ^ x, and the base address is
>> naturally
>>>>>>>>       * aligned to the size
>>>>>>>>       */
>>>>>>>>      if (!cap_pgsel_inv(iommu->cap) || mask >
>>> cap_max_amask_val(iommu->cap))
>>>>>>>>          iommu->flush.flush_iotlb(iommu, did, 0, 0,
>>>>>>>>                          DMA_TLB_DSI_FLUSH);
>>>>>>>>      else
>>>>>>>>          iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
>>>>>>>>                          DMA_TLB_PSI_FLUSH); ...
>>>>>>> I think this is interesting... and I doubt its correctness while
>>>>>>> with cache mode enabled.
>>>>>>>
>>>>>>> If so (sending domain invalidation instead of a big range of page
>>>>>>> invalidations), how should we capture which pages are unmapped in
>>>>>>> emulated IOMMU?
>>>>>> We don't need to track individual pages here, since all pages for
>>>>>> a specific domain were unmapped I believe?
>>>>> IMHO this might not be the correct behavior.
>>>>>
>>>>> If we receive one domain specific invalidation, I agree that we
>>>>> should invalidate the IOTLB cache for all the devices inside the domain.
>>>>> However, when cache mode is on, we should be depending on the PSIs
>>>>> to unmap each page (unless we want to unmap the whole address
>>>>> space, in this case it's very possible that the guest is just
>>>>> unmapping a range, not the entire space). If we convert several
>>>>> PSIs into one big DSI, IMHO we will leave those pages
>>>>> mapped/unmapped while we should unmap/map them.
>>>> Confused, do you have an example for this? (I fail to understand why
>>>> DSI can't work, at least implementation can convert DSI to several
>>>> PSIs internally).
>>> That's how I understand it. It might be wrong. Btw, could you
>>> elaborate a bit on how can we convert a DSI into several PSIs?
>>>
>>> Thanks,
>> If my understanding above is correct, there is nothing wrong with above
>> IOMMU driver code - actually it makes sense on bare metal when CM is
>> disabled.
>>
>> But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
>> We rely on cache invalidations to indirectly capture remapping structure change.
>> PSI provides accurate info, while DSI/GLOBAL doesn't. To emulate correct
>> behavior of DSI/GLOBAL, we have to pretend that the whole address space
>> (iova=0, size=agaw) needs to be unmapped (for GLOBAL it further means
>> multiple address spaces)
>>
>> Though not efficient, it doesn't mean it's wrong since guest driver follows spec.
>> We can ask for linux IOMMU driver change (CC Ashok) to not use above
>> optimization when cache mode is enabled, but anyway we need emulate correct
>> DSI/GLOBAL behavior to follow spec, because:
>>
>> - even when driver fix is in place, old version still has this logic;
>>
>> - there is still scenario where guest IOMMU driver does want to invalidate the
>> whole address space, e.g. when changing context entry. Asking guest driver to
>> use PSI for such purpose is another bad thing.
> Hi Kevin/Peter/Jason,
>
> I agree we should think DSI/GLOBAL. Herby, I guess there may be a chance to ignore
> DSI/GLOBAL flush if the following assumption is correct.
>
> It seems like that all DSI/GLOBAL flush would always be after a context entry invalidation.

Well it looks like at least for DSI, flush could happen when the size is 
too big?

>
> With this assumption, I remember Peter added memory_replay in context invalidation.
> This memory_replay would walk guest second-level page table and do map. So the
> second-level page table in host should be able to get the latest mapping info. Guest
> IOMMU driver would issue an DSI/GLOBAL flush after changing context. Since the
> mapping info has updated in host, then there is no need to deal this DSI/GLOBAL flush.
>
> So gentlemen, pls help judge if the assumption is correct. If it is correct, then Peter's patch
> may just work without special process against DSI/GLOBAL flush.
>   
> Regards,
> Yi L

Even if this may be the usual case, I think we'd better not make the 
code depend on (usual) guest behaviors.

Thanks
Peter Xu Jan. 19, 2017, 7:02 a.m. UTC | #18
On Thu, Jan 19, 2017 at 06:44:06AM +0000, Liu, Yi L wrote:
> > -----Original Message-----
> > From: Qemu-devel [mailto:qemu-devel-bounces+yi.l.liu=intel.com@nongnu.org]
> > On Behalf Of Tian, Kevin
> > Sent: Wednesday, January 18, 2017 5:39 PM
> > To: Peter Xu <peterx@redhat.com>; Jason Wang <jasowang@redhat.com>
> > Cc: Lan, Tianyu <tianyu.lan@intel.com>; Raj, Ashok <ashok.raj@intel.com>;
> > mst@redhat.com; jan.kiszka@siemens.com; bd.aviv@gmail.com; qemu-
> > devel@nongnu.org; alex.williamson@redhat.com
> > Subject: Re: [Qemu-devel] [PATCH RFC v3 14/14] intel_iommu: enable vfio
> > devices
> > 
> > > From: Peter Xu [mailto:peterx@redhat.com]
> > > Sent: Wednesday, January 18, 2017 4:46 PM
> > >
> > > On Wed, Jan 18, 2017 at 04:36:05PM +0800, Jason Wang wrote:
> > > >
> > > >
> > > > On 2017年01月18日 16:11, Peter Xu wrote:
> > > > >On Wed, Jan 18, 2017 at 11:10:53AM +0800, Jason Wang wrote:
> > > > >>
> > > > >>On 2017年01月17日 22:45, Peter Xu wrote:
> > > > >>>On Mon, Jan 16, 2017 at 05:54:55PM +0800, Jason Wang wrote:
> > > > >>>>On 2017年01月16日 17:18, Peter Xu wrote:
> > > > >>>>>>>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s,
> > > > >>>>>>> uint16_t
> > > domain_id,
> > > > >>>>>>>                                        hwaddr addr, uint8_t
> > > > >>>>>>>am)
> > > > >>>>>>>  {
> > > > >>>>>>>@@ -1222,6 +1251,7 @@ static void
> > > vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> > > > >>>>>>>      info.addr = addr;
> > > > >>>>>>>      info.mask = ~((1 << am) - 1);
> > > > >>>>>>>      g_hash_table_foreach_remove(s->iotlb,
> > > > >>>>>>> vtd_hash_remove_by_page,
> > > &info);
> > > > >>>>>>>+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr,
> > > > >>>>>>>+ am);
> > > > >>>>>>Is the case of GLOBAL or DSI flush missed, or we don't care it at all?
> > > > >>>>>IMHO we don't. For device assignment, since we are having CM=1
> > > > >>>>>here, we should have explicit page invalidations even if guest
> > > > >>>>>sends global/domain invalidations.
> > > > >>>>>
> > > > >>>>>Thanks,
> > > > >>>>>
> > > > >>>>>-- peterx
> > > > >>>>Is this spec required?
> > > > >>>I think not. IMO the spec is very coarse grained on describing
> > > > >>>cache mode...
> > > > >>>
> > > > >>>>Btw, it looks to me that both DSI and GLOBAL are indeed explicit
> > > > >>>>flush.
> > > > >>>Actually when cache mode is on, it is unclear to me on how we
> > > > >>>should treat domain/global invalidations, at least from the spec
> > > > >>>(as mentioned earlier). My understanding is that they are not
> > > > >>>"explicit flushes", which IMHO should only mean page selective
> > > > >>>IOTLB invalidations.
> > > > >>Probably not, at least from the view of performance. DSI and
> > > > >>global should be more efficient in some cases.
> > > > >I agree with you that DSI/GLOBAL flushes are more efficient in some
> > > > >way. But IMHO that does not mean these invalidations are "explicit
> > > > >invalidations", and I suspect whether cache mode has to coop with it.
> > > >
> > > > Well, the spec does not forbid DSI/GLOBAL with CM and the driver
> > > > codes had used them for almost ten years. I can hardly believe it's wrong.
> > >
> > > I think we have misunderstanding here. :)
> > >
> > > I never thought we should not send DSI/GLOBAL invalidations with cache
> > > mode. I just think we should not do anything special even if we have
> > > cache mode on when we receive these signals.
> > >
> > > In the spec, "explicit invalidation" is mentioned in the cache mode
> > > chapter:
> > >
> > >     The Caching Mode (CM) field in Capability Register indicates if
> > >     the hardware implementation caches not-present or erroneous
> > >     translation-structure entries. When the CM field is reported as
> > >     Set, any software updates to any remapping structures (including
> > >     updates to not-present entries or present entries whose
> > >     programming resulted in translation faults) requires explicit
> > >     invalidation of the caches.
> > >
> > > And I thought we were discussion about "what is explicit invalidation"
> > > mentioned above.
> > 
> > Check 6.5.3.1 Implicit Invalidation on Page Requests
> > 
> > 	In addition to the explicit invalidation through invalidation commands
> > 	(see Section 6.5.1 and Section 6.5.2) or through deferred invalidation
> > 	messages (see Section 6.5.4), identified above, Page Requests from
> > 	endpoint devices invalidate entries in the IOTLBs and paging-structure
> > 	caches.
> > 
> > My impression is that above indirectly defines invalidation commands (
> > PSI/DSI/GLOBAL) as explicit invalidation, because they are explicitly issued by
> > driver. Then section 6.5.3.1 further describes implicit invalidations caused by
> > other VT-d operations.
> > 
> > I will check with VT-d spec owner to clarify.
> > 
> > >
> > > >
> > > > >
> > > > >But here I should add one more thing besides PSI - context entry
> > > > >invalidation should be one of "the explicit invalidations" as well,
> > > > >which we need to handle just like PSI when cache mode is on.
> > > > >
> > > > >>>>Just have a quick go through on driver codes and find this
> > > > >>>>something interesting in intel_iommu_flush_iotlb_psi():
> > > > >>>>
> > > > >>>>...
> > > > >>>>     /*
> > > > >>>>      * Fallback to domain selective flush if no PSI support or the size is
> > > > >>>>      * too big.
> > > > >>>>      * PSI requires page size to be 2 ^ x, and the base address is
> > naturally
> > > > >>>>      * aligned to the size
> > > > >>>>      */
> > > > >>>>     if (!cap_pgsel_inv(iommu->cap) || mask >
> > > cap_max_amask_val(iommu->cap))
> > > > >>>>         iommu->flush.flush_iotlb(iommu, did, 0, 0,
> > > > >>>>                         DMA_TLB_DSI_FLUSH);
> > > > >>>>     else
> > > > >>>>         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
> > > > >>>>                         DMA_TLB_PSI_FLUSH); ...
> > > > >>>I think this is interesting... and I doubt its correctness while
> > > > >>>with cache mode enabled.
> > > > >>>
> > > > >>>If so (sending domain invalidation instead of a big range of page
> > > > >>>invalidations), how should we capture which pages are unmapped in
> > > > >>>emulated IOMMU?
> > > > >>We don't need to track individual pages here, since all pages for
> > > > >>a specific domain were unmapped I believe?
> > > > >IMHO this might not be the correct behavior.
> > > > >
> > > > >If we receive one domain specific invalidation, I agree that we
> > > > >should invalidate the IOTLB cache for all the devices inside the domain.
> > > > >However, when cache mode is on, we should be depending on the PSIs
> > > > >to unmap each page (unless we want to unmap the whole address
> > > > >space, in this case it's very possible that the guest is just
> > > > >unmapping a range, not the entire space). If we convert several
> > > > >PSIs into one big DSI, IMHO we will leave those pages
> > > > >mapped/unmapped while we should unmap/map them.
> > > >
> > > > Confused, do you have an example for this? (I fail to understand why
> > > > DSI can't work, at least implementation can convert DSI to several
> > > > PSIs internally).
> > >
> > > That's how I understand it. It might be wrong. Btw, could you
> > > elaborate a bit on how can we convert a DSI into several PSIs?
> > >
> > > Thanks,
> > 
> > If my understanding above is correct, there is nothing wrong with above
> > IOMMU driver code - actually it makes sense on bare metal when CM is
> > disabled.
> > 
> > But yes, DSI/GLOBAL is far less efficient than PSI when CM is enabled.
> > We rely on cache invalidations to indirectly capture remapping structure change.
> > PSI provides accurate info, while DSI/GLOBAL doesn't. To emulate correct
> > behavior of DSI/GLOBAL, we have to pretend that the whole address space
> > (iova=0, size=agaw) needs to be unmapped (for GLOBAL it further means
> > multiple address spaces)
> > 
> > Though not efficient, it doesn't mean it's wrong since guest driver follows spec.
> > We can ask for linux IOMMU driver change (CC Ashok) to not use above
> > optimization when cache mode is enabled, but anyway we need emulate correct
> > DSI/GLOBAL behavior to follow spec, because:
> > 
> > - even when driver fix is in place, old version still has this logic;
> > 
> > - there is still scenario where guest IOMMU driver does want to invalidate the
> > whole address space, e.g. when changing context entry. Asking guest driver to
> > use PSI for such purpose is another bad thing.
> 
> Hi Kevin/Peter/Jason,
> 
> I agree we should think about DSI/GLOBAL. Here, I guess there may be a chance to ignore
> DSI/GLOBAL flush if the following assumption is correct.
> 
> It seems that all DSI/GLOBAL flushes would always come after a context entry invalidation. 
> 
> With this assumption, I remember Peter added memory_replay in context invalidation.
> This memory_replay would walk guest second-level page table and do map. So the
> second-level page table in host should be able to get the latest mapping info. Guest
> IOMMU driver would issue a DSI/GLOBAL flush after changing the context. Since the
> mapping info has been updated in the host, there is no need to handle this DSI/GLOBAL flush.
> 
> So gentlemen, please help judge whether the assumption is correct. If it is correct, then Peter's patch
> may just work without special process against DSI/GLOBAL flush.

Actually the above is exactly what I thought before (I think I may not
have explained it clearly though :).

But I won't disagree on strictly following the spec, as Jason/Kevin
has suggested. The problem is, whether the spec is "strict enough to
be strictly followed", especially on caching mode part... :(

For example, logically, it is legal for a guest to send multiple PSIs
for a single page. When without caching mode, it never hurts, since
these PSIs will just let IOMMU flush cache for multiple times, that's
fine. However, if with caching mode, multiple PSIs means multiple
UNMAPs. That's a problem.

If to solve it, we need a per-device tree in QEMU to maintain the IOVA
address space, just like what vfio has done per-domain inside kernel.
Then when we see the 2nd and Nth unmap, we ignore it. But I guess
that's an overkill. A better way maybe we just restrict the guest to
send the invalidation once for each entry update. But I guess we don't
have such a requirement in spec now.

Thanks,

-- peterx
Peter Xu Jan. 19, 2017, 9:38 a.m. UTC | #19
On Thu, Jan 19, 2017 at 06:22:48AM +0000, Tian, Kevin wrote:

[...]

> still copy response from spec owner here:-)
> 
> 	Explicit invalidation is anytime software is explicitly invalidating something (
> 	through any descriptor) as opposed to something hardware does implicitly.  
> 	The only time hardware does implicit invalidation is during the handling of a page 
> 	request (recoverable page-fault) from an endpoint device.

Thanks for the confirmation!

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2596f11..104200b 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -839,7 +839,8 @@  next:
  * @private: private data for the hook function
  */
 static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
-                         vtd_page_walk_hook hook_fn, void *private)
+                         vtd_page_walk_hook hook_fn, void *private,
+                         bool notify_unmap)
 {
     dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
     uint32_t level = vtd_get_level_from_context_entry(ce);
@@ -858,7 +859,7 @@  static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
     trace_vtd_page_walk(ce->hi, ce->lo, start, end);
 
     return vtd_page_walk_level(addr, start, end, hook_fn, private,
-                               level, true, true, NULL, false);
+                               level, true, true, NULL, notify_unmap);
 }
 
 /* Map a device to its corresponding domain (context-entry) */
@@ -1212,6 +1213,34 @@  static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
                                 &domain_id);
 }
 
+static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
+                                           void *private)
+{
+    memory_region_notify_iommu((MemoryRegion *)private, *entry);
+    return 0;
+}
+
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+                                           uint16_t domain_id, hwaddr addr,
+                                           uint8_t am)
+{
+    IntelIOMMUNotifierNode *node;
+    VTDContextEntry ce;
+    int ret;
+
+    QLIST_FOREACH(node, &(s->notifiers_list), next) {
+        VTDAddressSpace *vtd_as = node->vtd_as;
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                       vtd_as->devfn, &ce);
+        if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
+            vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
+                          vtd_page_invalidate_notify_hook,
+                          (void *)&vtd_as->iommu, true);
+        }
+    }
+}
+
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
                                       hwaddr addr, uint8_t am)
 {
@@ -1222,6 +1251,7 @@  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
 /* Flush IOTLB
@@ -2244,15 +2274,34 @@  static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
                                           IOMMUNotifierFlag new)
 {
     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    IntelIOMMUNotifierNode *node = NULL;
+    IntelIOMMUNotifierNode *next_node = NULL;
 
-    if (new & IOMMU_NOTIFIER_MAP) {
-        error_report("Device at bus %s addr %02x.%d requires iommu "
-                     "notifier which is currently not supported by "
-                     "intel-iommu emulation",
-                     vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
-                     PCI_FUNC(vtd_as->devfn));
+    if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
+        error_report("We need to set cache_mode=1 for intel-iommu to enable "
+                     "device assignment with IOMMU protection.");
         exit(1);
     }
+
+    /* Add new node if no mapping was existing before this call */
+    if (old == IOMMU_NOTIFIER_NONE) {
+        node = g_malloc0(sizeof(*node));
+        node->vtd_as = vtd_as;
+        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
+        return;
+    }
+
+    /* update notifier node with new flags */
+    QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
+        if (node->vtd_as == vtd_as) {
+            if (new == IOMMU_NOTIFIER_NONE) {
+                QLIST_REMOVE(node, next);
+                g_free(node);
+            }
+            return;
+        }
+    }
 }
 
 static const VMStateDescription vtd_vmstate = {
@@ -2689,7 +2738,7 @@  static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
          */
         trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
                                   PCI_FUNC(vtd_as->devfn), ce.hi, ce.lo);
-        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n);
+        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n, false);
     } else {
         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
                                     PCI_FUNC(vtd_as->devfn));
@@ -2871,6 +2920,7 @@  static void vtd_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    QLIST_INIT(&s->notifiers_list);
     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
                           "intel_iommu", DMAR_REG_SIZE);
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 9c3f6c0..832cfc9 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -63,6 +63,7 @@  typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDIrq VTDIrq;
 typedef struct VTD_MSIMessage VTD_MSIMessage;
+typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -249,6 +250,11 @@  struct VTD_MSIMessage {
 /* When IR is enabled, all MSI/MSI-X data bits should be zero */
 #define VTD_IR_MSI_DATA          (0)
 
+struct IntelIOMMUNotifierNode {
+    VTDAddressSpace *vtd_as;
+    QLIST_ENTRY(IntelIOMMUNotifierNode) next;
+};
+
 /* The iommu (DMAR) device state struct */
 struct IntelIOMMUState {
     X86IOMMUState x86_iommu;
@@ -286,6 +292,8 @@  struct IntelIOMMUState {
     MemoryRegionIOMMUOps iommu_ops;
     GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
     VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
+    /* list of registered notifiers */
+    QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list;
 
     /* interrupt remapping */
     bool intr_enabled;              /* Whether guest enabled IR */