diff mbox

[RFC,v4,15/20] intel_iommu: provide its own replay() callback

Message ID 1484917736-32056-16-git-send-email-peterx@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Peter Xu Jan. 20, 2017, 1:08 p.m. UTC
The default replay() don't work for VT-d since vt-d will have a huge
default memory region which covers address range 0-(2^64-1). This will
normally consumes a lot of time (which looks like a dead loop).

The solution is simple - we don't walk over all the regions. Instead, we
jump over the regions when we found that the page directories are empty.
It'll greatly reduce the time to walk the whole region.

To achieve this, we provided a page walk helper to do that, invoking
corresponding hook function when we found an page we are interested in.
vtd_page_walk_level() is the core logic for the page walking. It's
interface is designed to suite further use case, e.g., to invalidate a
range of addresses.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++--
 hw/i386/trace-events  |   7 ++
 include/exec/memory.h |   2 +
 3 files changed, 220 insertions(+), 5 deletions(-)

Comments

Jason Wang Jan. 22, 2017, 7:56 a.m. UTC | #1
On 2017年01月20日 21:08, Peter Xu wrote:
> The default replay() don't work for VT-d since vt-d will have a huge
> default memory region which covers address range 0-(2^64-1). This will
> normally consumes a lot of time (which looks like a dead loop).
>
> The solution is simple - we don't walk over all the regions. Instead, we
> jump over the regions when we found that the page directories are empty.
> It'll greatly reduce the time to walk the whole region.
>
> To achieve this, we provided a page walk helper to do that, invoking
> corresponding hook function when we found an page we are interested in.
> vtd_page_walk_level() is the core logic for the page walking. It's
> interface is designed to suite further use case, e.g., to invalidate a
> range of addresses.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   hw/i386/intel_iommu.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++--
>   hw/i386/trace-events  |   7 ++
>   include/exec/memory.h |   2 +
>   3 files changed, 220 insertions(+), 5 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 6f5f68a..f9c5142 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -598,6 +598,22 @@ static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce)
>       return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
>   }
>   
> +static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
> +{
> +    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
> +    return 1ULL << MIN(ce_agaw, VTD_MGAW);
> +}
> +
> +/* Return true if IOVA passes range check, otherwise false. */
> +static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
> +{
> +    /*
> +     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
> +     * in CAP_REG and AW in context-entry.
> +     */
> +    return !(iova & ~(vtd_iova_limit(ce) - 1));
> +}
> +
>   static const uint64_t vtd_paging_entry_rsvd_field[] = {
>       [0] = ~0ULL,
>       /* For not large page */
> @@ -633,13 +649,9 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
>       uint32_t level = vtd_get_level_from_context_entry(ce);
>       uint32_t offset;
>       uint64_t slpte;
> -    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
>       uint64_t access_right_check;
>   
> -    /* Check if @iova is above 2^X-1, where X is the minimum of MGAW
> -     * in CAP_REG and AW in context-entry.
> -     */
> -    if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
> +    if (!vtd_iova_range_check(iova, ce)) {
>           trace_vtd_err("IOVA exceeds limits");
>           return -VTD_FR_ADDR_BEYOND_MGAW;
>       }
> @@ -681,6 +693,168 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
>       }
>   }
>   
> +typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
> +
> +/**
> + * vtd_page_walk_level - walk over specific level for IOVA range
> + *
> + * @addr: base GPA addr to start the walk
> + * @start: IOVA range start address
> + * @end: IOVA range end address (start <= addr < end)
> + * @hook_fn: hook func to be called when detected page
> + * @private: private data to be passed into hook func
> + * @read: whether parent level has read permission
> + * @write: whether parent level has write permission
> + * @skipped: accumulated skipped ranges

What's the usage for this parameter? Looks like it was never used in 
this series.

> + * @notify_unmap: whether we should notify invalid entries
> + */
> +static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
> +                               uint64_t end, vtd_page_walk_hook hook_fn,
> +                               void *private, uint32_t level,
> +                               bool read, bool write, uint64_t *skipped,
> +                               bool notify_unmap)
> +{
> +    bool read_cur, write_cur, entry_valid;
> +    uint32_t offset;
> +    uint64_t slpte;
> +    uint64_t subpage_size, subpage_mask;
> +    IOMMUTLBEntry entry;
> +    uint64_t iova = start;
> +    uint64_t iova_next;
> +    uint64_t skipped_local = 0;
> +    int ret = 0;
> +
> +    trace_vtd_page_walk_level(addr, level, start, end);
> +
> +    subpage_size = 1ULL << vtd_slpt_level_shift(level);
> +    subpage_mask = vtd_slpt_level_page_mask(level);
> +
> +    while (iova < end) {
> +        iova_next = (iova & subpage_mask) + subpage_size;
> +
> +        offset = vtd_iova_level_offset(iova, level);
> +        slpte = vtd_get_slpte(addr, offset);
> +
> +        /*
> +         * When one of the following case happens, we assume the whole
> +         * range is invalid:
> +         *
> +         * 1. read block failed

Don't get the meaning (and don't see any code relate to this comment).

> +         * 3. reserved area non-zero
> +         * 2. both read & write flag are not set

Should be 1,2,3? And the above comment is conflict with the code at 
least when notify_unmap is true.

> +         */
> +
> +        if (slpte == (uint64_t)-1) {

If this is true, vtd_slpte_nonzero_rsvd(slpte) should be true too I think?

> +            trace_vtd_page_walk_skip_read(iova, iova_next);
> +            skipped_local++;
> +            goto next;
> +        }
> +
> +        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
> +            trace_vtd_page_walk_skip_reserve(iova, iova_next);
> +            skipped_local++;
> +            goto next;
> +        }
> +
> +        /* Permissions are stacked with parents' */
> +        read_cur = read && (slpte & VTD_SL_R);
> +        write_cur = write && (slpte & VTD_SL_W);
> +
> +        /*
> +         * As long as we have either read/write permission, this is
> +         * a valid entry. The rule works for both page or page tables.
> +         */
> +        entry_valid = read_cur | write_cur;
> +
> +        if (vtd_is_last_slpte(slpte, level)) {
> +            entry.target_as = &address_space_memory;
> +            entry.iova = iova & subpage_mask;
> +            /*
> +             * This might be meaningless addr if (!read_cur &&
> +             * !write_cur), but after all this field will be
> +             * meaningless in that case, so let's share the code to
> +             * generate the IOTLBs no matter it's an MAP or UNMAP
> +             */
> +            entry.translated_addr = vtd_get_slpte_addr(slpte);
> +            entry.addr_mask = ~subpage_mask;
> +            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> +            if (!entry_valid && !notify_unmap) {
> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
> +                skipped_local++;
> +                goto next;
> +            }

Under which case should we care about unmap here (better with a comment 
I think)?

> +            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
> +                                    entry.addr_mask, entry.perm);
> +            if (hook_fn) {
> +                ret = hook_fn(&entry, private);

For better performance, we could try to merge adjacent mappings here. I 
think both vfio and vhost support this and it can save a lot of ioctls.

> +                if (ret < 0) {
> +                    error_report("Detected error in page walk hook "
> +                                 "function, stop walk.");
> +                    return ret;
> +                }
> +            }
> +        } else {
> +            if (!entry_valid) {
> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
> +                skipped_local++;
> +                goto next;
> +            }
> +            trace_vtd_page_walk_level(vtd_get_slpte_addr(slpte), level - 1,
> +                                      iova, MIN(iova_next, end));

This looks duplicated?

> +            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
> +                                      MIN(iova_next, end), hook_fn, private,
> +                                      level - 1, read_cur, write_cur,
> +                                      &skipped_local, notify_unmap);
> +            if (ret < 0) {
> +                error_report("Detected page walk error on addr 0x%"PRIx64
> +                             " level %"PRIu32", stop walk.", addr, level - 1);

Guest triggered, so better use debug macro or tracepoint.

> +                return ret;
> +            }
> +        }
> +
> +next:
> +        iova = iova_next;
> +    }
> +
> +    if (skipped) {
> +        *skipped += skipped_local;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * vtd_page_walk - walk specific IOVA range, and call the hook
> + *
> + * @ce: context entry to walk upon
> + * @start: IOVA address to start the walk
> + * @end: IOVA range end address (start <= addr < end)
> + * @hook_fn: the hook that to be called for each detected area
> + * @private: private data for the hook function
> + */
> +static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
> +                         vtd_page_walk_hook hook_fn, void *private)
> +{
> +    dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
> +    uint32_t level = vtd_get_level_from_context_entry(ce);
> +
> +    if (!vtd_iova_range_check(start, ce)) {
> +        error_report("IOVA start 0x%"PRIx64 " end 0x%"PRIx64" exceeds limits",
> +                     start, end);

Guest triggered, better use debug macro or tracepoint.

> +        return -VTD_FR_ADDR_BEYOND_MGAW;
> +    }
> +
> +    if (!vtd_iova_range_check(end, ce)) {
> +        /* Fix end so that it reaches the maximum */
> +        end = vtd_iova_limit(ce);
> +    }
> +
> +    trace_vtd_page_walk_level(addr, level, start, end);

Duplicated with the tracepoint in vtd_page_walk_level() too?

> +
> +    return vtd_page_walk_level(addr, start, end, hook_fn, private,
> +                               level, true, true, NULL, false);
> +}
> +
>   /* Map a device to its corresponding domain (context-entry) */
>   static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>                                       uint8_t devfn, VTDContextEntry *ce)
> @@ -2395,6 +2569,37 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>       return vtd_dev_as;
>   }
>   
> +static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
> +{
> +    memory_region_notify_one((IOMMUNotifier *)private, entry);
> +    return 0;
> +}
> +
> +static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
> +{
> +    VTDAddressSpace *vtd_as = container_of(mr, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    uint8_t bus_n = pci_bus_num(vtd_as->bus);
> +    VTDContextEntry ce;
> +
> +    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
> +        /*
> +         * Scanned a valid context entry, walk over the pages and
> +         * notify when needed.
> +         */
> +        trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
> +                                  PCI_FUNC(vtd_as->devfn),
> +                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
> +                                  ce.hi, ce.lo);
> +        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n);

~0ULL?

> +    } else {
> +        trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
> +                                    PCI_FUNC(vtd_as->devfn));
> +    }
> +
> +    return;
> +}
> +
>   /* Do the initialization. It will also be called when reset, so pay
>    * attention when adding new initialization stuff.
>    */
> @@ -2409,6 +2614,7 @@ static void vtd_init(IntelIOMMUState *s)
>   
>       s->iommu_ops.translate = vtd_iommu_translate;
>       s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
> +    s->iommu_ops.replay = vtd_iommu_replay;
>       s->root = 0;
>       s->root_extended = false;
>       s->dmar_enabled = false;
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index a273980..a3e1a9d 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -31,6 +31,13 @@ vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t doma
>   vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
>   vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
>   vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
> +vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
> +vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
> +vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
> +vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
> +vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
> +vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
> +vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
>   
>   # hw/i386/amd_iommu.c
>   amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index bb4e654..9fd3232 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -59,6 +59,8 @@ typedef enum {
>       IOMMU_RW   = 3,
>   } IOMMUAccessFlags;
>   
> +#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))
> +
>   struct IOMMUTLBEntry {
>       AddressSpace    *target_as;
>       hwaddr           iova;
Peter Xu Jan. 22, 2017, 8:51 a.m. UTC | #2
On Sun, Jan 22, 2017 at 03:56:10PM +0800, Jason Wang wrote:

[...]

> >+/**
> >+ * vtd_page_walk_level - walk over specific level for IOVA range
> >+ *
> >+ * @addr: base GPA addr to start the walk
> >+ * @start: IOVA range start address
> >+ * @end: IOVA range end address (start <= addr < end)
> >+ * @hook_fn: hook func to be called when detected page
> >+ * @private: private data to be passed into hook func
> >+ * @read: whether parent level has read permission
> >+ * @write: whether parent level has write permission
> >+ * @skipped: accumulated skipped ranges
> 
> What's the usage for this parameter? Looks like it was never used in this
> series.

This was for debugging purpose before, and I kept it in case one day
it can be used again, considering that will not affect much on the
overall performance.

> 
> >+ * @notify_unmap: whether we should notify invalid entries
> >+ */
> >+static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
> >+                               uint64_t end, vtd_page_walk_hook hook_fn,
> >+                               void *private, uint32_t level,
> >+                               bool read, bool write, uint64_t *skipped,
> >+                               bool notify_unmap)
> >+{
> >+    bool read_cur, write_cur, entry_valid;
> >+    uint32_t offset;
> >+    uint64_t slpte;
> >+    uint64_t subpage_size, subpage_mask;
> >+    IOMMUTLBEntry entry;
> >+    uint64_t iova = start;
> >+    uint64_t iova_next;
> >+    uint64_t skipped_local = 0;
> >+    int ret = 0;
> >+
> >+    trace_vtd_page_walk_level(addr, level, start, end);
> >+
> >+    subpage_size = 1ULL << vtd_slpt_level_shift(level);
> >+    subpage_mask = vtd_slpt_level_page_mask(level);
> >+
> >+    while (iova < end) {
> >+        iova_next = (iova & subpage_mask) + subpage_size;
> >+
> >+        offset = vtd_iova_level_offset(iova, level);
> >+        slpte = vtd_get_slpte(addr, offset);
> >+
> >+        /*
> >+         * When one of the following case happens, we assume the whole
> >+         * range is invalid:
> >+         *
> >+         * 1. read block failed
> 
> Don't get the meaning (and don't see any code relate to this comment).

I took above vtd_get_slpte() a "read", so I was trying to mean that we
failed to read the SLPTE due to some reason, we assume the range is
invalid.

> 
> >+         * 3. reserved area non-zero
> >+         * 2. both read & write flag are not set
> 
> Should be 1,2,3? And the above comment is conflict with the code at least
> when notify_unmap is true.

Yes, okay I don't know why 3 jumped. :(

And yes, I should mention that "both read & write flag not set" only
suites for page tables here.

> 
> >+         */
> >+
> >+        if (slpte == (uint64_t)-1) {
> 
> If this is true, vtd_slpte_nonzero_rsvd(slpte) should be true too I think?

Yes, but we are doing two checks here:

- checking against -1 to make sure slpte read success
- checking against nonzero reserved fields to make sure it follows spec

Imho we should not skip the first check here, only if one day removing
this may really matter (e.g., for performance reason? I cannot think
of one yet).

> 
> >+            trace_vtd_page_walk_skip_read(iova, iova_next);
> >+            skipped_local++;
> >+            goto next;
> >+        }
> >+
> >+        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
> >+            trace_vtd_page_walk_skip_reserve(iova, iova_next);
> >+            skipped_local++;
> >+            goto next;
> >+        }
> >+
> >+        /* Permissions are stacked with parents' */
> >+        read_cur = read && (slpte & VTD_SL_R);
> >+        write_cur = write && (slpte & VTD_SL_W);
> >+
> >+        /*
> >+         * As long as we have either read/write permission, this is
> >+         * a valid entry. The rule works for both page or page tables.
> >+         */
> >+        entry_valid = read_cur | write_cur;
> >+
> >+        if (vtd_is_last_slpte(slpte, level)) {
> >+            entry.target_as = &address_space_memory;
> >+            entry.iova = iova & subpage_mask;
> >+            /*
> >+             * This might be meaningless addr if (!read_cur &&
> >+             * !write_cur), but after all this field will be
> >+             * meaningless in that case, so let's share the code to
> >+             * generate the IOTLBs no matter it's an MAP or UNMAP
> >+             */
> >+            entry.translated_addr = vtd_get_slpte_addr(slpte);
> >+            entry.addr_mask = ~subpage_mask;
> >+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> >+            if (!entry_valid && !notify_unmap) {
> >+                trace_vtd_page_walk_skip_perm(iova, iova_next);
> >+                skipped_local++;
> >+                goto next;
> >+            }
> 
> Under which case should we care about unmap here (better with a comment I
> think)?

When PSIs are for invalidation, rather than newly mapped entries. In
that case, notify_unmap will be true, and here we need to notify
IOMMU_NONE to do the cache flush or unmap.

(this page walk is not only for replaying, it is for cache flushing as
 well)

Do you have suggestion on the comments?

> 
> >+            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
> >+                                    entry.addr_mask, entry.perm);
> >+            if (hook_fn) {
> >+                ret = hook_fn(&entry, private);
> 
> For better performance, we could try to merge adjacent mappings here. I
> think both vfio and vhost support this and it can save a lot of ioctls.

Looks so, and this is in my todo list.

Do you mind I do it later after this series merged? I would really
appreciate if we can have this codes settled down first (considering
that this series has been dangling for half a year, or more, startint
from Aviv's series), and I am just afraid this will led to
unconvergence of this series (and I believe there are other places
that can be enhanced in the future as well).

> 
> >+                if (ret < 0) {
> >+                    error_report("Detected error in page walk hook "
> >+                                 "function, stop walk.");
> >+                    return ret;
> >+                }
> >+            }
> >+        } else {
> >+            if (!entry_valid) {
> >+                trace_vtd_page_walk_skip_perm(iova, iova_next);
> >+                skipped_local++;
> >+                goto next;
> >+            }
> >+            trace_vtd_page_walk_level(vtd_get_slpte_addr(slpte), level - 1,
> >+                                      iova, MIN(iova_next, end));
> 
> This looks duplicated?

I suppose not. The level is different.

> 
> >+            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
> >+                                      MIN(iova_next, end), hook_fn, private,
> >+                                      level - 1, read_cur, write_cur,
> >+                                      &skipped_local, notify_unmap);
> >+            if (ret < 0) {
> >+                error_report("Detected page walk error on addr 0x%"PRIx64
> >+                             " level %"PRIu32", stop walk.", addr, level - 1);
> 
> Guest triggered, so better use debug macro or tracepoint.

Sorry. Will replace all the error_report() in the whole series.

> 
> >+                return ret;
> >+            }
> >+        }
> >+
> >+next:
> >+        iova = iova_next;
> >+    }
> >+
> >+    if (skipped) {
> >+        *skipped += skipped_local;
> >+    }
> >+
> >+    return 0;
> >+}
> >+
> >+/**
> >+ * vtd_page_walk - walk specific IOVA range, and call the hook
> >+ *
> >+ * @ce: context entry to walk upon
> >+ * @start: IOVA address to start the walk
> >+ * @end: IOVA range end address (start <= addr < end)
> >+ * @hook_fn: the hook that to be called for each detected area
> >+ * @private: private data for the hook function
> >+ */
> >+static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
> >+                         vtd_page_walk_hook hook_fn, void *private)
> >+{
> >+    dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
> >+    uint32_t level = vtd_get_level_from_context_entry(ce);
> >+
> >+    if (!vtd_iova_range_check(start, ce)) {
> >+        error_report("IOVA start 0x%"PRIx64 " end 0x%"PRIx64" exceeds limits",
> >+                     start, end);
> 
> Guest triggered, better use debug macro or tracepoint.

Same.

> 
> >+        return -VTD_FR_ADDR_BEYOND_MGAW;
> >+    }
> >+
> >+    if (!vtd_iova_range_check(end, ce)) {
> >+        /* Fix end so that it reaches the maximum */
> >+        end = vtd_iova_limit(ce);
> >+    }
> >+
> >+    trace_vtd_page_walk_level(addr, level, start, end);
> 
> Duplicated with the tracepoint in vtd_page_walk_level() too?

Nop? This should be the top level.

> 
> >+
> >+    return vtd_page_walk_level(addr, start, end, hook_fn, private,
> >+                               level, true, true, NULL, false);
> >+}
> >+
> >  /* Map a device to its corresponding domain (context-entry) */
> >  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> >                                      uint8_t devfn, VTDContextEntry *ce)
> >@@ -2395,6 +2569,37 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
> >      return vtd_dev_as;
> >  }
> >+static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
> >+{
> >+    memory_region_notify_one((IOMMUNotifier *)private, entry);
> >+    return 0;
> >+}
> >+
> >+static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
> >+{
> >+    VTDAddressSpace *vtd_as = container_of(mr, VTDAddressSpace, iommu);
> >+    IntelIOMMUState *s = vtd_as->iommu_state;
> >+    uint8_t bus_n = pci_bus_num(vtd_as->bus);
> >+    VTDContextEntry ce;
> >+
> >+    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
> >+        /*
> >+         * Scanned a valid context entry, walk over the pages and
> >+         * notify when needed.
> >+         */
> >+        trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
> >+                                  PCI_FUNC(vtd_as->devfn),
> >+                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
> >+                                  ce.hi, ce.lo);
> >+        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n);
> 
> ~0ULL?

Fixing up.

Thanks,

-- peterx
Jason Wang Jan. 23, 2017, 1:48 a.m. UTC | #3
On 2017年01月22日 16:51, Peter Xu wrote:
> On Sun, Jan 22, 2017 at 03:56:10PM +0800, Jason Wang wrote:
>
> [...]
>
>>> +/**
>>> + * vtd_page_walk_level - walk over specific level for IOVA range
>>> + *
>>> + * @addr: base GPA addr to start the walk
>>> + * @start: IOVA range start address
>>> + * @end: IOVA range end address (start <= addr < end)
>>> + * @hook_fn: hook func to be called when detected page
>>> + * @private: private data to be passed into hook func
>>> + * @read: whether parent level has read permission
>>> + * @write: whether parent level has write permission
>>> + * @skipped: accumulated skipped ranges
>> What's the usage for this parameter? Looks like it was never used in this
>> series.
> This was for debugging purpose before, and I kept it in case one day
> it can be used again, considering that will not affect much on the
> overall performance.

I think we usually do not keep debugging codes outside debug macros.

>
>>> + * @notify_unmap: whether we should notify invalid entries
>>> + */
>>> +static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
>>> +                               uint64_t end, vtd_page_walk_hook hook_fn,
>>> +                               void *private, uint32_t level,
>>> +                               bool read, bool write, uint64_t *skipped,
>>> +                               bool notify_unmap)
>>> +{
>>> +    bool read_cur, write_cur, entry_valid;
>>> +    uint32_t offset;
>>> +    uint64_t slpte;
>>> +    uint64_t subpage_size, subpage_mask;
>>> +    IOMMUTLBEntry entry;
>>> +    uint64_t iova = start;
>>> +    uint64_t iova_next;
>>> +    uint64_t skipped_local = 0;
>>> +    int ret = 0;
>>> +
>>> +    trace_vtd_page_walk_level(addr, level, start, end);
>>> +
>>> +    subpage_size = 1ULL << vtd_slpt_level_shift(level);
>>> +    subpage_mask = vtd_slpt_level_page_mask(level);
>>> +
>>> +    while (iova < end) {
>>> +        iova_next = (iova & subpage_mask) + subpage_size;
>>> +
>>> +        offset = vtd_iova_level_offset(iova, level);
>>> +        slpte = vtd_get_slpte(addr, offset);
>>> +
>>> +        /*
>>> +         * When one of the following case happens, we assume the whole
>>> +         * range is invalid:
>>> +         *
>>> +         * 1. read block failed
>> Don't get the meaning (and don't see any code relate to this comment).
> I took above vtd_get_slpte() a "read", so I was trying to mean that we
> failed to read the SLPTE due to some reason, we assume the range is
> invalid.

I see, so we'd better move the comment above of vtd_get_slpte().

>
>>> +         * 3. reserved area non-zero
>>> +         * 2. both read & write flag are not set
>> Should be 1,2,3? And the above comment is conflict with the code at least
>> when notify_unmap is true.
> Yes, okay I don't know why 3 jumped. :(
>
> And yes, I should mention that "both read & write flag not set" only
> suites for page tables here.
>
>>> +         */
>>> +
>>> +        if (slpte == (uint64_t)-1) {
>> If this is true, vtd_slpte_nonzero_rsvd(slpte) should be true too I think?
> Yes, but we are doing two checks here:
>
> - checking against -1 to make sure slpte read success
> - checking against nonzero reserved fields to make sure it follows spec
>
> Imho we should not skip the first check here, only if one day removing
> this may really matter (e.g., for performance reason? I cannot think
> of one yet).

Ok. (return -1 seems not good, but we can address this in the future).

>
>>> +            trace_vtd_page_walk_skip_read(iova, iova_next);
>>> +            skipped_local++;
>>> +            goto next;
>>> +        }
>>> +
>>> +        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
>>> +            trace_vtd_page_walk_skip_reserve(iova, iova_next);
>>> +            skipped_local++;
>>> +            goto next;
>>> +        }
>>> +
>>> +        /* Permissions are stacked with parents' */
>>> +        read_cur = read && (slpte & VTD_SL_R);
>>> +        write_cur = write && (slpte & VTD_SL_W);
>>> +
>>> +        /*
>>> +         * As long as we have either read/write permission, this is
>>> +         * a valid entry. The rule works for both page or page tables.
>>> +         */
>>> +        entry_valid = read_cur | write_cur;
>>> +
>>> +        if (vtd_is_last_slpte(slpte, level)) {
>>> +            entry.target_as = &address_space_memory;
>>> +            entry.iova = iova & subpage_mask;
>>> +            /*
>>> +             * This might be meaningless addr if (!read_cur &&
>>> +             * !write_cur), but after all this field will be
>>> +             * meaningless in that case, so let's share the code to
>>> +             * generate the IOTLBs no matter it's an MAP or UNMAP
>>> +             */
>>> +            entry.translated_addr = vtd_get_slpte_addr(slpte);
>>> +            entry.addr_mask = ~subpage_mask;
>>> +            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
>>> +            if (!entry_valid && !notify_unmap) {
>>> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
>>> +                skipped_local++;
>>> +                goto next;
>>> +            }
>> Under which case should we care about unmap here (better with a comment I
>> think)?
> When PSIs are for invalidation, rather than newly mapped entries. In
> that case, notify_unmap will be true, and here we need to notify
> IOMMU_NONE to do the cache flush or unmap.
>
> (this page walk is not only for replaying, it is for cache flushing as
>   well)
>
> Do you have suggestion on the comments?

I think then we'd better move this to patch 18 which will use notify_unmap.

>
>>> +            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
>>> +                                    entry.addr_mask, entry.perm);
>>> +            if (hook_fn) {
>>> +                ret = hook_fn(&entry, private);
>> For better performance, we could try to merge adjacent mappings here. I
>> think both vfio and vhost support this and it can save a lot of ioctls.
> Looks so, and this is in my todo list.
>
> Do you mind I do it later after this series merged? I would really
> appreciate if we can have this codes settled down first (considering
> that this series has been dangling for half a year, or more, startint
> from Aviv's series), and I am just afraid this will led to
> unconvergence of this series (and I believe there are other places
> that can be enhanced in the future as well).

Yes, let's do it on top.

>
>>> +                if (ret < 0) {
>>> +                    error_report("Detected error in page walk hook "
>>> +                                 "function, stop walk.");
>>> +                    return ret;
>>> +                }
>>> +            }
>>> +        } else {
>>> +            if (!entry_valid) {
>>> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
>>> +                skipped_local++;
>>> +                goto next;
>>> +            }
>>> +            trace_vtd_page_walk_level(vtd_get_slpte_addr(slpte), level - 1,
>>> +                                      iova, MIN(iova_next, end));
>> This looks duplicated?
> I suppose not. The level is different.

Seem not? level - 1 was passed to vtd_page_walk_level() as level which did:

trace_vtd_page_walk_level(addr, level, start, end);


>
>>> +            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
>>> +                                      MIN(iova_next, end), hook_fn, private,
>>> +                                      level - 1, read_cur, write_cur,
>>> +                                      &skipped_local, notify_unmap);
>>> +            if (ret < 0) {
>>> +                error_report("Detected page walk error on addr 0x%"PRIx64
>>> +                             " level %"PRIu32", stop walk.", addr, level - 1);
>> Guest triggered, so better use debug macro or tracepoint.
> Sorry. Will replace all the error_report() in the whole series.
>
>>> +                return ret;
>>> +            }
>>> +        }
>>> +
>>> +next:
>>> +        iova = iova_next;
>>> +    }
>>> +
>>> +    if (skipped) {
>>> +        *skipped += skipped_local;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +/**
>>> + * vtd_page_walk - walk specific IOVA range, and call the hook
>>> + *
>>> + * @ce: context entry to walk upon
>>> + * @start: IOVA address to start the walk
>>> + * @end: IOVA range end address (start <= addr < end)
>>> + * @hook_fn: the hook that to be called for each detected area
>>> + * @private: private data for the hook function
>>> + */
>>> +static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
>>> +                         vtd_page_walk_hook hook_fn, void *private)
>>> +{
>>> +    dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
>>> +    uint32_t level = vtd_get_level_from_context_entry(ce);
>>> +
>>> +    if (!vtd_iova_range_check(start, ce)) {
>>> +        error_report("IOVA start 0x%"PRIx64 " end 0x%"PRIx64" exceeds limits",
>>> +                     start, end);
>> Guest triggered, better use debug macro or tracepoint.
> Same.
>
>>> +        return -VTD_FR_ADDR_BEYOND_MGAW;
>>> +    }
>>> +
>>> +    if (!vtd_iova_range_check(end, ce)) {
>>> +        /* Fix end so that it reaches the maximum */
>>> +        end = vtd_iova_limit(ce);
>>> +    }
>>> +
>>> +    trace_vtd_page_walk_level(addr, level, start, end);
>> Duplicated with the tracepoint in vtd_page_walk_level() too?
> Nop? This should be the top level.
>
>>> +
>>> +    return vtd_page_walk_level(addr, start, end, hook_fn, private,
>>> +                               level, true, true, NULL, false);
>>> +}
>>> +
>>>   /* Map a device to its corresponding domain (context-entry) */
>>>   static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>>>                                       uint8_t devfn, VTDContextEntry *ce)
>>> @@ -2395,6 +2569,37 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>>>       return vtd_dev_as;
>>>   }
>>> +static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
>>> +{
>>> +    memory_region_notify_one((IOMMUNotifier *)private, entry);
>>> +    return 0;
>>> +}
>>> +
>>> +static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
>>> +{
>>> +    VTDAddressSpace *vtd_as = container_of(mr, VTDAddressSpace, iommu);
>>> +    IntelIOMMUState *s = vtd_as->iommu_state;
>>> +    uint8_t bus_n = pci_bus_num(vtd_as->bus);
>>> +    VTDContextEntry ce;
>>> +
>>> +    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
>>> +        /*
>>> +         * Scanned a valid context entry, walk over the pages and
>>> +         * notify when needed.
>>> +         */
>>> +        trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
>>> +                                  PCI_FUNC(vtd_as->devfn),
>>> +                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
>>> +                                  ce.hi, ce.lo);
>>> +        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n);
>> ~0ULL?
> Fixing up.
>
> Thanks,
>
> -- peterx
>
Peter Xu Jan. 23, 2017, 2:54 a.m. UTC | #4
On Mon, Jan 23, 2017 at 09:48:48AM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月22日 16:51, Peter Xu wrote:
> >On Sun, Jan 22, 2017 at 03:56:10PM +0800, Jason Wang wrote:
> >
> >[...]
> >
> >>>+/**
> >>>+ * vtd_page_walk_level - walk over specific level for IOVA range
> >>>+ *
> >>>+ * @addr: base GPA addr to start the walk
> >>>+ * @start: IOVA range start address
> >>>+ * @end: IOVA range end address (start <= addr < end)
> >>>+ * @hook_fn: hook func to be called when detected page
> >>>+ * @private: private data to be passed into hook func
> >>>+ * @read: whether parent level has read permission
> >>>+ * @write: whether parent level has write permission
> >>>+ * @skipped: accumulated skipped ranges
> >>What's the usage for this parameter? Looks like it was never used in this
> >>series.
> >This was for debugging purpose before, and I kept it in case one day
> >it can be used again, considering that will not affect much on the
> >overall performance.
> 
> I think we usually do not keep debugging codes outside debug macros.

I'll remove it.

> 
> >
> >>>+ * @notify_unmap: whether we should notify invalid entries
> >>>+ */
> >>>+static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
> >>>+                               uint64_t end, vtd_page_walk_hook hook_fn,
> >>>+                               void *private, uint32_t level,
> >>>+                               bool read, bool write, uint64_t *skipped,
> >>>+                               bool notify_unmap)
> >>>+{
> >>>+    bool read_cur, write_cur, entry_valid;
> >>>+    uint32_t offset;
> >>>+    uint64_t slpte;
> >>>+    uint64_t subpage_size, subpage_mask;
> >>>+    IOMMUTLBEntry entry;
> >>>+    uint64_t iova = start;
> >>>+    uint64_t iova_next;
> >>>+    uint64_t skipped_local = 0;
> >>>+    int ret = 0;
> >>>+
> >>>+    trace_vtd_page_walk_level(addr, level, start, end);
> >>>+
> >>>+    subpage_size = 1ULL << vtd_slpt_level_shift(level);
> >>>+    subpage_mask = vtd_slpt_level_page_mask(level);
> >>>+
> >>>+    while (iova < end) {
> >>>+        iova_next = (iova & subpage_mask) + subpage_size;
> >>>+
> >>>+        offset = vtd_iova_level_offset(iova, level);
> >>>+        slpte = vtd_get_slpte(addr, offset);
> >>>+
> >>>+        /*
> >>>+         * When one of the following case happens, we assume the whole
> >>>+         * range is invalid:
> >>>+         *
> >>>+         * 1. read block failed
> >>Don't get the meaning (and don't see any code relate to this comment).
> >I took above vtd_get_slpte() a "read", so I was trying to mean that we
> >failed to read the SLPTE due to some reason, we assume the range is
> >invalid.
> 
> I see, so we'd better move the comment above of vtd_get_slpte().

Let me remove the whole comment block... I think the codes explained
it clearly even without any comment. (when people see the code check
slpte against -1, it'll think about above function naturally)

> 
> >
> >>>+         * 3. reserved area non-zero
> >>>+         * 2. both read & write flag are not set
> >>Should be 1,2,3? And the above comment is conflict with the code at least
> >>when notify_unmap is true.
> >Yes, okay I don't know why 3 jumped. :(
> >
> >And yes, I should mention that "both read & write flag not set" only
> >suites for page tables here.
> >
> >>>+         */
> >>>+
> >>>+        if (slpte == (uint64_t)-1) {
> >>If this is true, vtd_slpte_nonzero_rsvd(slpte) should be true too I think?
> >Yes, but we are doing two checks here:
> >
> >- checking against -1 to make sure slpte read success
> >- checking against nonzero reserved fields to make sure it follows spec
> >
> >Imho we should not skip the first check here, only if one day removing
> >this may really matter (e.g., for performance reason? I cannot think
> >of one yet).
> 
> Ok. (return -1 seems not good, but we can address this in the future).

Thanks.

> 
> >
> >>>+            trace_vtd_page_walk_skip_read(iova, iova_next);
> >>>+            skipped_local++;
> >>>+            goto next;
> >>>+        }
> >>>+
> >>>+        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
> >>>+            trace_vtd_page_walk_skip_reserve(iova, iova_next);
> >>>+            skipped_local++;
> >>>+            goto next;
> >>>+        }
> >>>+
> >>>+        /* Permissions are stacked with parents' */
> >>>+        read_cur = read && (slpte & VTD_SL_R);
> >>>+        write_cur = write && (slpte & VTD_SL_W);
> >>>+
> >>>+        /*
> >>>+         * As long as we have either read/write permission, this is
> >>>+         * a valid entry. The rule works for both page or page tables.
> >>>+         */
> >>>+        entry_valid = read_cur | write_cur;
> >>>+
> >>>+        if (vtd_is_last_slpte(slpte, level)) {
> >>>+            entry.target_as = &address_space_memory;
> >>>+            entry.iova = iova & subpage_mask;
> >>>+            /*
> >>>+             * This might be meaningless addr if (!read_cur &&
> >>>+             * !write_cur), but after all this field will be
> >>>+             * meaningless in that case, so let's share the code to
> >>>+             * generate the IOTLBs no matter it's an MAP or UNMAP
> >>>+             */
> >>>+            entry.translated_addr = vtd_get_slpte_addr(slpte);
> >>>+            entry.addr_mask = ~subpage_mask;
> >>>+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> >>>+            if (!entry_valid && !notify_unmap) {
> >>>+                trace_vtd_page_walk_skip_perm(iova, iova_next);
> >>>+                skipped_local++;
> >>>+                goto next;
> >>>+            }
> >>Under which case should we care about unmap here (better with a comment I
> >>think)?
> >When PSIs are for invalidation, rather than newly mapped entries. In
> >that case, notify_unmap will be true, and here we need to notify
> >IOMMU_NONE to do the cache flush or unmap.
> >
> >(this page walk is not only for replaying, it is for cache flushing as
> >  well)
> >
> >Do you have suggestion on the comments?
> 
> I think then we'd better move this to patch 18 which will use notify_unmap.

Do you mean to add some more comment on patch 18?

> 
> >
> >>>+            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
> >>>+                                    entry.addr_mask, entry.perm);
> >>>+            if (hook_fn) {
> >>>+                ret = hook_fn(&entry, private);
> >>For better performance, we could try to merge adjacent mappings here. I
> >>think both vfio and vhost support this and it can save a lot of ioctls.
> >Looks so, and this is in my todo list.
> >
> >Do you mind I do it later after this series merged? I would really
> >appreciate if we can have this codes settled down first (considering
> >that this series has been dangling for half a year, or more, startint
> >from Aviv's series), and I am just afraid this will led to
> >unconvergence of this series (and I believe there are other places
> >that can be enhanced in the future as well).
> 
> Yes, let's do it on top.

Thanks.

> 
> >
> >>>+                if (ret < 0) {
> >>>+                    error_report("Detected error in page walk hook "
> >>>+                                 "function, stop walk.");
> >>>+                    return ret;
> >>>+                }
> >>>+            }
> >>>+        } else {
> >>>+            if (!entry_valid) {
> >>>+                trace_vtd_page_walk_skip_perm(iova, iova_next);
> >>>+                skipped_local++;
> >>>+                goto next;
> >>>+            }
> >>>+            trace_vtd_page_walk_level(vtd_get_slpte_addr(slpte), level - 1,
> >>>+                                      iova, MIN(iova_next, end));
> >>This looks duplicated?
> >I suppose not. The level is different.
> 
> Seem not? level - 1 was passed to vtd_page_walk_level() as level which did:
> 
> trace_vtd_page_walk_level(addr, level, start, end);

Hmm yes I didn't notice that I had one at the entry. :(

Let me only keep that one.

Thanks,

-- peterx
Jason Wang Jan. 23, 2017, 3:12 a.m. UTC | #5
On 2017年01月23日 10:54, Peter Xu wrote:
>>>>> +            trace_vtd_page_walk_skip_read(iova, iova_next);
>>>>> +            skipped_local++;
>>>>> +            goto next;
>>>>> +        }
>>>>> +
>>>>> +        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
>>>>> +            trace_vtd_page_walk_skip_reserve(iova, iova_next);
>>>>> +            skipped_local++;
>>>>> +            goto next;
>>>>> +        }
>>>>> +
>>>>> +        /* Permissions are stacked with parents' */
>>>>> +        read_cur = read && (slpte & VTD_SL_R);
>>>>> +        write_cur = write && (slpte & VTD_SL_W);
>>>>> +
>>>>> +        /*
>>>>> +         * As long as we have either read/write permission, this is
>>>>> +         * a valid entry. The rule works for both page or page tables.
>>>>> +         */
>>>>> +        entry_valid = read_cur | write_cur;
>>>>> +
>>>>> +        if (vtd_is_last_slpte(slpte, level)) {
>>>>> +            entry.target_as = &address_space_memory;
>>>>> +            entry.iova = iova & subpage_mask;
>>>>> +            /*
>>>>> +             * This might be meaningless addr if (!read_cur &&
>>>>> +             * !write_cur), but after all this field will be
>>>>> +             * meaningless in that case, so let's share the code to
>>>>> +             * generate the IOTLBs no matter it's an MAP or UNMAP
>>>>> +             */
>>>>> +            entry.translated_addr = vtd_get_slpte_addr(slpte);
>>>>> +            entry.addr_mask = ~subpage_mask;
>>>>> +            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
>>>>> +            if (!entry_valid && !notify_unmap) {
>>>>> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
>>>>> +                skipped_local++;
>>>>> +                goto next;
>>>>> +            }
>>>> Under which case should we care about unmap here (better with a comment I
>>>> think)?
>>> When PSIs are for invalidation, rather than newly mapped entries. In
>>> that case, notify_unmap will be true, and here we need to notify
>>> IOMMU_NONE to do the cache flush or unmap.
>>>
>>> (this page walk is not only for replaying, it is for cache flushing as
>>>   well)
>>>
>>> Do you have suggestion on the comments?
>> I think then we'd better move this to patch 18 which will use notify_unmap.
> Do you mean to add some more comment on patch 18?
>

I mean move the unmap_nofity and its comment to patch 18 (real user). 
But if you want to keep it in the patch, I'm also fine.

Thanks
Peter Xu Jan. 23, 2017, 3:35 a.m. UTC | #6
On Mon, Jan 23, 2017 at 11:12:27AM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月23日 10:54, Peter Xu wrote:
> >>>>>+            trace_vtd_page_walk_skip_read(iova, iova_next);
> >>>>>+            skipped_local++;
> >>>>>+            goto next;
> >>>>>+        }
> >>>>>+
> >>>>>+        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
> >>>>>+            trace_vtd_page_walk_skip_reserve(iova, iova_next);
> >>>>>+            skipped_local++;
> >>>>>+            goto next;
> >>>>>+        }
> >>>>>+
> >>>>>+        /* Permissions are stacked with parents' */
> >>>>>+        read_cur = read && (slpte & VTD_SL_R);
> >>>>>+        write_cur = write && (slpte & VTD_SL_W);
> >>>>>+
> >>>>>+        /*
> >>>>>+         * As long as we have either read/write permission, this is
> >>>>>+         * a valid entry. The rule works for both page or page tables.
> >>>>>+         */
> >>>>>+        entry_valid = read_cur | write_cur;
> >>>>>+
> >>>>>+        if (vtd_is_last_slpte(slpte, level)) {
> >>>>>+            entry.target_as = &address_space_memory;
> >>>>>+            entry.iova = iova & subpage_mask;
> >>>>>+            /*
> >>>>>+             * This might be meaningless addr if (!read_cur &&
> >>>>>+             * !write_cur), but after all this field will be
> >>>>>+             * meaningless in that case, so let's share the code to
> >>>>>+             * generate the IOTLBs no matter it's an MAP or UNMAP
> >>>>>+             */
> >>>>>+            entry.translated_addr = vtd_get_slpte_addr(slpte);
> >>>>>+            entry.addr_mask = ~subpage_mask;
> >>>>>+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> >>>>>+            if (!entry_valid && !notify_unmap) {
> >>>>>+                trace_vtd_page_walk_skip_perm(iova, iova_next);
> >>>>>+                skipped_local++;
> >>>>>+                goto next;
> >>>>>+            }
> >>>>Under which case should we care about unmap here (better with a comment I
> >>>>think)?
> >>>When PSIs are for invalidation, rather than newly mapped entries. In
> >>>that case, notify_unmap will be true, and here we need to notify
> >>>IOMMU_NONE to do the cache flush or unmap.
> >>>
> >>>(this page walk is not only for replaying, it is for cache flushing as
> >>>  well)
> >>>
> >>>Do you have suggestion on the comments?
> >>I think then we'd better move this to patch 18 which will use notify_unmap.
> >Do you mean to add some more comment on patch 18?
> >
> 
> I mean move the unmap_nofity and its comment to patch 18 (real user). But if
> you want to keep it in the patch, I'm also fine.

(Since we discussed in IRC for this :)

So I'll keep it for this time. Thanks,

-- peterx
Alex Williamson Jan. 23, 2017, 7:33 p.m. UTC | #7
On Sun, 22 Jan 2017 16:51:18 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Sun, Jan 22, 2017 at 03:56:10PM +0800, Jason Wang wrote:
> >   
> > >+            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
> > >+                                    entry.addr_mask, entry.perm);
> > >+            if (hook_fn) {
> > >+                ret = hook_fn(&entry, private);  
> > 
> > For better performance, we could try to merge adjacent mappings here. I
> > think both vfio and vhost support this and it can save a lot of ioctls.  
> 
> Looks so, and this is in my todo list.
> 
> Do you mind I do it later after this series merged? I would really
> appreciate if we can have this codes settled down first (considering
> that this series has been dangling for half a year, or more, startint
> from Aviv's series), and I am just afraid this will led to
> unconvergence of this series (and I believe there are other places
> that can be enhanced in the future as well).

NAK, we can't merge mappings per my comment on 18/20.  You're looking
at an entirely new or at best revised version of the vfio IOMMU
interface to do so.  vfio does not support invalidations at a smaller
granularity than the original mapping.  Thanks,

Alex
Alex Williamson Jan. 23, 2017, 7:34 p.m. UTC | #8
On Mon, 23 Jan 2017 10:54:49 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Mon, Jan 23, 2017 at 09:48:48AM +0800, Jason Wang wrote:
> > 
> > 
> > On 2017年01月22日 16:51, Peter Xu wrote:  
> > >On Sun, Jan 22, 2017 at 03:56:10PM +0800, Jason Wang wrote:
> > >
> > >[...]
> > >  
> > >>>+/**
> > >>>+ * vtd_page_walk_level - walk over specific level for IOVA range
> > >>>+ *
> > >>>+ * @addr: base GPA addr to start the walk
> > >>>+ * @start: IOVA range start address
> > >>>+ * @end: IOVA range end address (start <= addr < end)
> > >>>+ * @hook_fn: hook func to be called when detected page
> > >>>+ * @private: private data to be passed into hook func
> > >>>+ * @read: whether parent level has read permission
> > >>>+ * @write: whether parent level has write permission
> > >>>+ * @skipped: accumulated skipped ranges  
> > >>What's the usage for this parameter? Looks like it was never used in this
> > >>series.  
> > >This was for debugging purpose before, and I kept it in case one day
> > >it can be used again, considering that will not affect much on the
> > >overall performance.  
> > 
> > I think we usually do not keep debugging codes outside debug macros.  
> 
> I'll remove it.

While you're at it, what's the value in using a void* private rather
than just passing around an IOMMUNotifier*.  Seems like unnecessary
abstraction.  Thanks,

Alex
Peter Xu Jan. 24, 2017, 4:04 a.m. UTC | #9
On Mon, Jan 23, 2017 at 12:34:29PM -0700, Alex Williamson wrote:
> On Mon, 23 Jan 2017 10:54:49 +0800
> Peter Xu <peterx@redhat.com> wrote:
> 
> > On Mon, Jan 23, 2017 at 09:48:48AM +0800, Jason Wang wrote:
> > > 
> > > 
> > > On 2017年01月22日 16:51, Peter Xu wrote:  
> > > >On Sun, Jan 22, 2017 at 03:56:10PM +0800, Jason Wang wrote:
> > > >
> > > >[...]
> > > >  
> > > >>>+/**
> > > >>>+ * vtd_page_walk_level - walk over specific level for IOVA range
> > > >>>+ *
> > > >>>+ * @addr: base GPA addr to start the walk
> > > >>>+ * @start: IOVA range start address
> > > >>>+ * @end: IOVA range end address (start <= addr < end)
> > > >>>+ * @hook_fn: hook func to be called when detected page
> > > >>>+ * @private: private data to be passed into hook func
> > > >>>+ * @read: whether parent level has read permission
> > > >>>+ * @write: whether parent level has write permission
> > > >>>+ * @skipped: accumulated skipped ranges  
> > > >>What's the usage for this parameter? Looks like it was never used in this
> > > >>series.  
> > > >This was for debugging purpose before, and I kept it in case one day
> > > >it can be used again, considering that will not affect much on the
> > > >overall performance.  
> > > 
> > > I think we usually do not keep debugging codes outside debug macros.  
> > 
> > I'll remove it.
> 
> While you're at it, what's the value in using a void* private rather
> than just passing around an IOMMUNotifier*.  Seems like unnecessary
> abstraction.  Thanks,

When handling PSIs (in continuous patches, not this one), we were
passing in MemoryRegion* rather than IOMMUNotifier*:

        vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
                        vtd_page_invalidate_notify_hook,
                        (void *)&vtd_as->iommu, true);

So a void* might still be required. Thanks,

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 6f5f68a..f9c5142 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -598,6 +598,22 @@  static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce)
     return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
 }
 
+static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
+{
+    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
+    return 1ULL << MIN(ce_agaw, VTD_MGAW);
+}
+
+/* Return true if IOVA passes range check, otherwise false. */
+static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
+{
+    /*
+     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
+     * in CAP_REG and AW in context-entry.
+     */
+    return !(iova & ~(vtd_iova_limit(ce) - 1));
+}
+
 static const uint64_t vtd_paging_entry_rsvd_field[] = {
     [0] = ~0ULL,
     /* For not large page */
@@ -633,13 +649,9 @@  static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
     uint32_t level = vtd_get_level_from_context_entry(ce);
     uint32_t offset;
     uint64_t slpte;
-    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
     uint64_t access_right_check;
 
-    /* Check if @iova is above 2^X-1, where X is the minimum of MGAW
-     * in CAP_REG and AW in context-entry.
-     */
-    if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
+    if (!vtd_iova_range_check(iova, ce)) {
         trace_vtd_err("IOVA exceeds limits");
         return -VTD_FR_ADDR_BEYOND_MGAW;
     }
@@ -681,6 +693,168 @@  static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
     }
 }
 
+typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
+
+/**
+ * vtd_page_walk_level - walk over specific level for IOVA range
+ *
+ * @addr: base GPA addr to start the walk
+ * @start: IOVA range start address
+ * @end: IOVA range end address (start <= addr < end)
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @read: whether parent level has read permission
+ * @write: whether parent level has write permission
+ * @skipped: accumulated skipped ranges
+ * @notify_unmap: whether we should notify invalid entries
+ */
+static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
+                               uint64_t end, vtd_page_walk_hook hook_fn,
+                               void *private, uint32_t level,
+                               bool read, bool write, uint64_t *skipped,
+                               bool notify_unmap)
+{
+    bool read_cur, write_cur, entry_valid;
+    uint32_t offset;
+    uint64_t slpte;
+    uint64_t subpage_size, subpage_mask;
+    IOMMUTLBEntry entry;
+    uint64_t iova = start;
+    uint64_t iova_next;
+    uint64_t skipped_local = 0;
+    int ret = 0;
+
+    trace_vtd_page_walk_level(addr, level, start, end);
+
+    subpage_size = 1ULL << vtd_slpt_level_shift(level);
+    subpage_mask = vtd_slpt_level_page_mask(level);
+
+    while (iova < end) {
+        iova_next = (iova & subpage_mask) + subpage_size;
+
+        offset = vtd_iova_level_offset(iova, level);
+        slpte = vtd_get_slpte(addr, offset);
+
+        /*
+         * When one of the following case happens, we assume the whole
+         * range is invalid:
+         *
+         * 1. read block failed
+         * 3. reserved area non-zero
+         * 2. both read & write flag are not set
+         */
+
+        if (slpte == (uint64_t)-1) {
+            trace_vtd_page_walk_skip_read(iova, iova_next);
+            skipped_local++;
+            goto next;
+        }
+
+        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
+            trace_vtd_page_walk_skip_reserve(iova, iova_next);
+            skipped_local++;
+            goto next;
+        }
+
+        /* Permissions are stacked with parents' */
+        read_cur = read && (slpte & VTD_SL_R);
+        write_cur = write && (slpte & VTD_SL_W);
+
+        /*
+         * As long as we have either read/write permission, this is
+         * a valid entry. The rule works for both page or page tables.
+         */
+        entry_valid = read_cur | write_cur;
+
+        if (vtd_is_last_slpte(slpte, level)) {
+            entry.target_as = &address_space_memory;
+            entry.iova = iova & subpage_mask;
+            /*
+             * This might be meaningless addr if (!read_cur &&
+             * !write_cur), but after all this field will be
+             * meaningless in that case, so let's share the code to
+             * generate the IOTLBs no matter it's an MAP or UNMAP
+             */
+            entry.translated_addr = vtd_get_slpte_addr(slpte);
+            entry.addr_mask = ~subpage_mask;
+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
+            if (!entry_valid && !notify_unmap) {
+                trace_vtd_page_walk_skip_perm(iova, iova_next);
+                skipped_local++;
+                goto next;
+            }
+            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
+                                    entry.addr_mask, entry.perm);
+            if (hook_fn) {
+                ret = hook_fn(&entry, private);
+                if (ret < 0) {
+                    error_report("Detected error in page walk hook "
+                                 "function, stop walk.");
+                    return ret;
+                }
+            }
+        } else {
+            if (!entry_valid) {
+                trace_vtd_page_walk_skip_perm(iova, iova_next);
+                skipped_local++;
+                goto next;
+            }
+            trace_vtd_page_walk_level(vtd_get_slpte_addr(slpte), level - 1,
+                                      iova, MIN(iova_next, end));
+            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
+                                      MIN(iova_next, end), hook_fn, private,
+                                      level - 1, read_cur, write_cur,
+                                      &skipped_local, notify_unmap);
+            if (ret < 0) {
+                error_report("Detected page walk error on addr 0x%"PRIx64
+                             " level %"PRIu32", stop walk.", addr, level - 1);
+                return ret;
+            }
+        }
+
+next:
+        iova = iova_next;
+    }
+
+    if (skipped) {
+        *skipped += skipped_local;
+    }
+
+    return 0;
+}
+
+/**
+ * vtd_page_walk - walk specific IOVA range, and call the hook
+ *
+ * @ce: context entry to walk upon
+ * @start: IOVA address to start the walk
+ * @end: IOVA range end address (start <= addr < end)
+ * @hook_fn: the hook that to be called for each detected area
+ * @private: private data for the hook function
+ */
+static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
+                         vtd_page_walk_hook hook_fn, void *private)
+{
+    dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
+    uint32_t level = vtd_get_level_from_context_entry(ce);
+
+    if (!vtd_iova_range_check(start, ce)) {
+        error_report("IOVA start 0x%"PRIx64 " end 0x%"PRIx64" exceeds limits",
+                     start, end);
+        return -VTD_FR_ADDR_BEYOND_MGAW;
+    }
+
+    if (!vtd_iova_range_check(end, ce)) {
+        /* Fix end so that it reaches the maximum */
+        end = vtd_iova_limit(ce);
+    }
+
+    trace_vtd_page_walk_level(addr, level, start, end);
+
+    return vtd_page_walk_level(addr, start, end, hook_fn, private,
+                               level, true, true, NULL, false);
+}
+
 /* Map a device to its corresponding domain (context-entry) */
 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
                                     uint8_t devfn, VTDContextEntry *ce)
@@ -2395,6 +2569,37 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
     return vtd_dev_as;
 }
 
+static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
+{
+    memory_region_notify_one((IOMMUNotifier *)private, entry);
+    return 0;
+}
+
+static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
+{
+    VTDAddressSpace *vtd_as = container_of(mr, VTDAddressSpace, iommu);
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    uint8_t bus_n = pci_bus_num(vtd_as->bus);
+    VTDContextEntry ce;
+
+    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
+        /*
+         * Scanned a valid context entry, walk over the pages and
+         * notify when needed.
+         */
+        trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
+                                  PCI_FUNC(vtd_as->devfn),
+                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
+                                  ce.hi, ce.lo);
+        vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n);
+    } else {
+        trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
+                                    PCI_FUNC(vtd_as->devfn));
+    }
+
+    return;
+}
+
 /* Do the initialization. It will also be called when reset, so pay
  * attention when adding new initialization stuff.
  */
@@ -2409,6 +2614,7 @@  static void vtd_init(IntelIOMMUState *s)
 
     s->iommu_ops.translate = vtd_iommu_translate;
     s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
+    s->iommu_ops.replay = vtd_iommu_replay;
     s->root = 0;
     s->root_extended = false;
     s->dmar_enabled = false;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index a273980..a3e1a9d 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -31,6 +31,13 @@  vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t doma
 vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
 vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
 vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
+vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
+vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
+vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
+vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
+vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
+vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
 
 # hw/i386/amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
diff --git a/include/exec/memory.h b/include/exec/memory.h
index bb4e654..9fd3232 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -59,6 +59,8 @@  typedef enum {
     IOMMU_RW   = 3,
 } IOMMUAccessFlags;
 
+#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))
+
 struct IOMMUTLBEntry {
     AddressSpace    *target_as;
     hwaddr           iova;