diff mbox

[08/10] intel-iommu: maintain per-device iova ranges

Message ID 20180425045129.17449-9-peterx@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Peter Xu April 25, 2018, 4:51 a.m. UTC
For each VTDAddressSpace, now we maintain what IOVA ranges we have
mapped and what we have not.  With that information, now we only send
MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we know
we have already mapped the range, meanwhile we don't send UNMAP notifies
if we know we never mapped the range at all.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/hw/i386/intel_iommu.h |  2 ++
 hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
 hw/i386/trace-events          |  2 ++
 3 files changed, 32 insertions(+)

Comments

Jason Wang April 27, 2018, 6:07 a.m. UTC | #1
On 2018年04月25日 12:51, Peter Xu wrote:
> For each VTDAddressSpace, now we maintain what IOVA ranges we have
> mapped and what we have not.  With that information, now we only send
> MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we know
> we have already mapped the range, meanwhile we don't send UNMAP notifies
> if we know we never mapped the range at all.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   include/hw/i386/intel_iommu.h |  2 ++
>   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
>   hw/i386/trace-events          |  2 ++
>   3 files changed, 32 insertions(+)
>
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 486e205e79..09a2e94404 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -27,6 +27,7 @@
>   #include "hw/i386/ioapic.h"
>   #include "hw/pci/msi.h"
>   #include "hw/sysbus.h"
> +#include "qemu/interval-tree.h"
>   
>   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>   #define INTEL_IOMMU_DEVICE(obj) \
> @@ -95,6 +96,7 @@ struct VTDAddressSpace {
>       QLIST_ENTRY(VTDAddressSpace) next;
>       /* Superset of notifier flags that this address space has */
>       IOMMUNotifierFlag notifier_flags;
> +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
>   };
>   
>   struct VTDBus {
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index a19c18b8d4..8f396a5d13 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -768,12 +768,37 @@ typedef struct {
>   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
>                                vtd_page_walk_info *info)
>   {
> +    VTDAddressSpace *as = info->as;
>       vtd_page_walk_hook hook_fn = info->hook_fn;
>       void *private = info->private;
> +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> +                                   entry->iova + entry->addr_mask);
>   
>       assert(hook_fn);
> +
> +    /* Update local IOVA mapped ranges */
> +    if (entry->perm) {
> +        if (mapped) {
> +            /* Skip since we have already mapped this range */
> +            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
> +                                             mapped->start, mapped->end);
> +            return 0;
> +        }
> +        it_tree_insert(as->iova_tree, entry->iova,
> +                       entry->iova + entry->addr_mask);

I was considering a case such as:

1) map A (iova) to B (pa)
2) invalidate A
3) map A (iova) to C (pa)
4) invalidate A

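In this case, we will probably miss a walk here (the second invalidation
finds A already recorded as mapped, so the notify is skipped). But I'm not
sure whether this sequence is allowed by the spec (though I think it is).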

Thanks

> +    } else {
> +        if (!mapped) {
> +            /* Skip since we didn't map this range at all */
> +            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
> +            return 0;
> +        }
> +        it_tree_remove(as->iova_tree, entry->iova,
> +                       entry->iova + entry->addr_mask);
> +    }
> +
>       trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
>                               entry->addr_mask, entry->perm);
> +
>       return hook_fn(entry, private);
>   }
>   
> @@ -2798,6 +2823,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>           vtd_dev_as->devfn = (uint8_t)devfn;
>           vtd_dev_as->iommu_state = s;
>           vtd_dev_as->context_cache_entry.context_cache_gen = 0;
> +        vtd_dev_as->iova_tree = it_tree_new();
>   
>           /*
>            * Memory region relationships looks like (Address range shows
> @@ -2894,6 +2920,8 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
>                                VTD_PCI_FUNC(as->devfn),
>                                entry.iova, size);
>   
> +    it_tree_remove(as->iova_tree, entry.iova, entry.iova + entry.addr_mask);
> +
>       memory_region_notify_one(n, &entry);
>   }
>   
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 22d44648af..677f83420d 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -40,6 +40,8 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
>   vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
>   vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
>   vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
> +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t start, uint64_t end) "iova 0x%"PRIx64" mask 0x%"PRIx64" start 0x%"PRIx64" end 0x%"PRIx64
> +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
>   vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
>   vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
>   vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
Peter Xu April 27, 2018, 6:34 a.m. UTC | #2
On Fri, Apr 27, 2018 at 02:07:46PM +0800, Jason Wang wrote:
> 
> 
> On 2018年04月25日 12:51, Peter Xu wrote:
> > For each VTDAddressSpace, now we maintain what IOVA ranges we have
> > mapped and what we have not.  With that information, now we only send
> > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we know
> > we have already mapped the range, meanwhile we don't send UNMAP notifies
> > if we know we never mapped the range at all.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   include/hw/i386/intel_iommu.h |  2 ++
> >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> >   hw/i386/trace-events          |  2 ++
> >   3 files changed, 32 insertions(+)
> > 
> > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> > index 486e205e79..09a2e94404 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -27,6 +27,7 @@
> >   #include "hw/i386/ioapic.h"
> >   #include "hw/pci/msi.h"
> >   #include "hw/sysbus.h"
> > +#include "qemu/interval-tree.h"
> >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >   #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> >       QLIST_ENTRY(VTDAddressSpace) next;
> >       /* Superset of notifier flags that this address space has */
> >       IOMMUNotifierFlag notifier_flags;
> > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> >   };
> >   struct VTDBus {
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index a19c18b8d4..8f396a5d13 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -768,12 +768,37 @@ typedef struct {
> >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> >                                vtd_page_walk_info *info)
> >   {
> > +    VTDAddressSpace *as = info->as;
> >       vtd_page_walk_hook hook_fn = info->hook_fn;
> >       void *private = info->private;
> > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > +                                   entry->iova + entry->addr_mask);
> >       assert(hook_fn);
> > +
> > +    /* Update local IOVA mapped ranges */
> > +    if (entry->perm) {
> > +        if (mapped) {
> > +            /* Skip since we have already mapped this range */
> > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
> > +                                             mapped->start, mapped->end);
> > +            return 0;
> > +        }
> > +        it_tree_insert(as->iova_tree, entry->iova,
> > +                       entry->iova + entry->addr_mask);
> 
> I was consider a case e.g:
> 
> 1) map A (iova) to B (pa)
> 2) invalidate A

Here, to be more explicit, you mean that the guest sends a PSI
(page-selective invalidation), not that the mapping is really invalidated.

> 3) map A (iova) to C (pa)
> 4) invalidate A

Here too.

> 
> In this case, we will probably miss a walk here. But I'm not sure it was
> allowed by the spec (though I think so).

IMHO IOMMU page tables should not be modified by the guest directly.  A
range can be mapped or unmapped, but an existing mapping should not be
changed in place.  I suppose that's why the Linux IOMMU API doesn't
provide iommu_modify() but only iommu_[un]map(), etc.  I don't know
whether there is anything about this in the spec, but AFAIU it can cause
coherence issues on the device side: after step (1) the device should
already be able to know the mapping, so modifying that mapping without
first unmapping it on the device side will cause undefined behavior.

Thanks,
Tian, Kevin April 27, 2018, 7:02 a.m. UTC | #3
> From: Jason Wang [mailto:jasowang@redhat.com]

> Sent: Friday, April 27, 2018 2:08 PM

> 

> On 2018年04月25日 12:51, Peter Xu wrote:

> > For each VTDAddressSpace, now we maintain what IOVA ranges we have

> > mapped and what we have not.  With that information, now we only

> send

> > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we

> know

> > we have already mapped the range, meanwhile we don't send UNMAP

> notifies

> > if we know we never mapped the range at all.

> >

> > Signed-off-by: Peter Xu <peterx@redhat.com>

> > ---

> >   include/hw/i386/intel_iommu.h |  2 ++

> >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++

> >   hw/i386/trace-events          |  2 ++

> >   3 files changed, 32 insertions(+)

> >

> > diff --git a/include/hw/i386/intel_iommu.h

> b/include/hw/i386/intel_iommu.h

> > index 486e205e79..09a2e94404 100644

> > --- a/include/hw/i386/intel_iommu.h

> > +++ b/include/hw/i386/intel_iommu.h

> > @@ -27,6 +27,7 @@

> >   #include "hw/i386/ioapic.h"

> >   #include "hw/pci/msi.h"

> >   #include "hw/sysbus.h"

> > +#include "qemu/interval-tree.h"

> >

> >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"

> >   #define INTEL_IOMMU_DEVICE(obj) \

> > @@ -95,6 +96,7 @@ struct VTDAddressSpace {

> >       QLIST_ENTRY(VTDAddressSpace) next;

> >       /* Superset of notifier flags that this address space has */

> >       IOMMUNotifierFlag notifier_flags;

> > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */

> >   };

> >

> >   struct VTDBus {

> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c

> > index a19c18b8d4..8f396a5d13 100644

> > --- a/hw/i386/intel_iommu.c

> > +++ b/hw/i386/intel_iommu.c

> > @@ -768,12 +768,37 @@ typedef struct {

> >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,

> >                                vtd_page_walk_info *info)

> >   {

> > +    VTDAddressSpace *as = info->as;

> >       vtd_page_walk_hook hook_fn = info->hook_fn;

> >       void *private = info->private;

> > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,

> > +                                   entry->iova + entry->addr_mask);

> >

> >       assert(hook_fn);

> > +

> > +    /* Update local IOVA mapped ranges */

> > +    if (entry->perm) {

> > +        if (mapped) {

> > +            /* Skip since we have already mapped this range */

> > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-

> >addr_mask,

> > +                                             mapped->start, mapped->end);

> > +            return 0;

> > +        }

> > +        it_tree_insert(as->iova_tree, entry->iova,

> > +                       entry->iova + entry->addr_mask);

> 

> I was consider a case e.g:

> 

> 1) map A (iova) to B (pa)

> 2) invalidate A

> 3) map A (iova) to C (pa)

> 4) invalidate A

> 

> In this case, we will probably miss a walk here. But I'm not sure it was

> allowed by the spec (though I think so).

> 


At first glance I thought this was wrong, but I changed my mind after
thinking about it again. As long as the device driver can quiesce the
device so that it does not access A (iova) within the above window, the
above sequence has no problem, since any stale mapping (A->B) added
before step 4) won't be used and will then get flushed after step 4).
In that respect, the 1st invalidation can actually be skipped:

1) map A (iova) to B (pa)
2) driver programs device to use A
3) driver programs device to not use A
4) map A (iova) to C (pa)
	A->B may still be valid in the IOTLB
5) invalidate A
6) driver programs device to use A

Of course, the above doesn't make for a sane IOMMU API framework,
just as Peter pointed out. But from the hardware point of view there
seems to be no problem.

Thanks
Kevin
diff mbox

Patch

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 486e205e79..09a2e94404 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -27,6 +27,7 @@ 
 #include "hw/i386/ioapic.h"
 #include "hw/pci/msi.h"
 #include "hw/sysbus.h"
+#include "qemu/interval-tree.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -95,6 +96,7 @@  struct VTDAddressSpace {
     QLIST_ENTRY(VTDAddressSpace) next;
     /* Superset of notifier flags that this address space has */
     IOMMUNotifierFlag notifier_flags;
+    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
 };
 
 struct VTDBus {
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a19c18b8d4..8f396a5d13 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -768,12 +768,37 @@  typedef struct {
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
                              vtd_page_walk_info *info)
 {
+    VTDAddressSpace *as = info->as;
     vtd_page_walk_hook hook_fn = info->hook_fn;
     void *private = info->private;
+    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
+                                   entry->iova + entry->addr_mask);
 
     assert(hook_fn);
+
+    /* Update local IOVA mapped ranges */
+    if (entry->perm) {
+        if (mapped) {
+            /* Skip since we have already mapped this range */
+            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+                                             mapped->start, mapped->end);
+            return 0;
+        }
+        it_tree_insert(as->iova_tree, entry->iova,
+                       entry->iova + entry->addr_mask);
+    } else {
+        if (!mapped) {
+            /* Skip since we didn't map this range at all */
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+            return 0;
+        }
+        it_tree_remove(as->iova_tree, entry->iova,
+                       entry->iova + entry->addr_mask);
+    }
+
     trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
                             entry->addr_mask, entry->perm);
+
     return hook_fn(entry, private);
 }
 
@@ -2798,6 +2823,7 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
         vtd_dev_as->devfn = (uint8_t)devfn;
         vtd_dev_as->iommu_state = s;
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+        vtd_dev_as->iova_tree = it_tree_new();
 
         /*
          * Memory region relationships looks like (Address range shows
@@ -2894,6 +2920,8 @@  static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
                              VTD_PCI_FUNC(as->devfn),
                              entry.iova, size);
 
+    it_tree_remove(as->iova_tree, entry.iova, entry.iova + entry.addr_mask);
+
     memory_region_notify_one(n, &entry);
 }
 
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 22d44648af..677f83420d 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -40,6 +40,8 @@  vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
 vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t start, uint64_t end) "iova 0x%"PRIx64" mask 0x%"PRIx64" start 0x%"PRIx64" end 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
 vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"