[v5] x86/p2m: use large pages for MMIO mappings
diff mbox

Message ID 56A658FE02000078000CAC3D@prv-mh.provo.novell.com
State New, archived
Headers show

Commit Message

Jan Beulich Jan. 25, 2016, 4:18 p.m. UTC
When mapping large BARs (e.g. the frame buffer of a graphics card) the
overhead of establishing such mappings using only 4k pages has,
particularly after the XSA-125 fix, become unacceptable. Alter the
XEN_DOMCTL_memory_mapping semantics once again, so that there's no
longer a fixed amount of guest frames that represents the upper limit
of what a single invocation can map. Instead bound execution time by
limiting the number of iterations (regardless of page size).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Open issues (perhaps for subsequent changes):
- ARM side unimplemented (and hence libxc for now made cope with both
  models), the main issue (besides my inability to test any change
  there) being the many internal uses of map_mmio_regions())
- iommu_{,un}map_page() interfaces don't support "order" (hence
  mmio_order() for now returns zero when !iommu_hap_pt_share, which in
  particular means the AMD side isn't being taken care of just yet, but
  note that this also has the intended effect of suppressing non-zero
  order mappings in the shadow mode case)
---
v5: Refine comment in domctl.h.
v4: Move cleanup duty entirely to the caller of the hypercall. Move
    return value description from commit message to domctl.h.
v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
    hap_capabilities". Extend description to spell out new return value
    meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
    of literal 0. Take into consideration r/o MMIO pages.
v2: Produce valid entries for large p2m_mmio_direct mappings in
    p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
    mmio_order(). Update function comment of set_typed_p2m_entry() and
    clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
    {,un}map_mmio_regions() to detect otherwise endless loops.
x86/p2m: use large pages for MMIO mappings

When mapping large BARs (e.g. the frame buffer of a graphics card) the
overhead of establishing such mappings using only 4k pages has,
particularly after the XSA-125 fix, become unacceptable. Alter the
XEN_DOMCTL_memory_mapping semantics once again, so that there's no
longer a fixed amount of guest frames that represents the upper limit
of what a single invocation can map. Instead bound execution time by
limiting the number of iterations (regardless of page size).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Open issues (perhaps for subsequent changes):
- ARM side unimplemented (and hence libxc for now made to cope with both
  models), the main issue (besides my inability to test any change
  there) being the many internal uses of map_mmio_regions()
- iommu_{,un}map_page() interfaces don't support "order" (hence
  mmio_order() for now returns zero when !iommu_hap_pt_share, which in
  particular means the AMD side isn't being taken care of just yet, but
  note that this also has the intended effect of suppressing non-zero
  order mappings in the shadow mode case)
---
v5: Refine comment in domctl.h.
v4: Move cleanup duty entirely to the caller of the hypercall. Move
    return value description from commit message to domctl.h.
v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
    hap_capabilities". Extend description to spell out new return value
    meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
    of literal 0. Take into consideration r/o MMIO pages.
v2: Produce valid entries for large p2m_mmio_direct mappings in
    p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
    mmio_order(). Update function comment of set_typed_p2m_entry() and
    clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
    {,un}map_mmio_regions() to detect otherwise endless loops.

--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2174,7 +2174,7 @@ int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2199,19 +2199,24 @@ int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
+        unsigned long i;
+
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << order); ++i )
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -949,14 +963,21 @@ static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) &&
+         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
+                                  mfn_x(mfn) + (1UL << order) - 1) )
+        return order;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1009,20 +1030,33 @@ int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1035,11 +1069,11 @@ int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return 0;
+
+    if ( !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return 0;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2102,22 +2155,29 @@ int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2125,20 +2185,30 @@ int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d,
     }
     if ( p2mt == p2m_mmio_direct )
     {
-        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
         put_gfn(d, gmfn);
         return 1;
     }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -542,8 +542,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {

Comments

Ian Campbell Jan. 25, 2016, 5:18 p.m. UTC | #1
On Mon, 2016-01-25 at 09:18 -0700, Jan Beulich wrote:
> When mapping large BARs (e.g. the frame buffer of a graphics card) the
> overhead of establishing such mappings using only 4k pages has,
> particularly after the XSA-125 fix, become unacceptable. Alter the
> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
> longer a fixed amount of guest frames that represents the upper limit
> of what a single invocation can map. Instead bound execution time by
> limiting the number of iterations (regardless of page size).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Tools and domctl API side:

Acked-by: Ian Campbell <ian.campbell@citrix.com>

> ---
> Open issues (perhaps for subsequent changes):
> - ARM side unimplemented (and hence libxc for now made cope with both
>   models), the main issue (besides my inability to test any change
>   there) being the many internal uses of map_mmio_regions())
> - iommu_{,un}map_page() interfaces don't support "order" (hence
>   mmio_order() for now returns zero when !iommu_hap_pt_share, which in
>   particular means the AMD side isn't being taken care of just yet, but
>   note that this also has the intended effect of suppressing non-zero
>   order mappings in the shadow mode case)
> ---
> v5: Refine comment in domctl.h.
> v4: Move cleanup duty entirely to the caller of the hypercall. Move
>     return value description to from commit message to domctl.h.
> v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
>     hap_capabilities". Extend description to spell out new return value
>     meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
>     of literal 0. Take into consideration r/o MMIO pages.
> v2: Produce valid entries for large p2m_mmio_direct mappings in
>     p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
>     mmio_order(). Update function comment of set_typed_p2m_entry() and
>     clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
>     {,un}map_mmio_regions() to detect otherwise endless loops.
> 
> --- a/tools/libxc/xc_domain.c
> +++ b/tools/libxc/xc_domain.c
> @@ -2174,7 +2174,7 @@ int xc_domain_memory_mapping(
>  {
>      DECLARE_DOMCTL;
>      xc_dominfo_t info;
> -    int ret = 0, err;
> +    int ret = 0, rc;
>      unsigned long done = 0, nr, max_batch_sz;
>  
>      if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
> @@ -2199,19 +2199,24 @@ int xc_domain_memory_mapping(
>          domctl.u.memory_mapping.nr_mfns = nr;
>          domctl.u.memory_mapping.first_gfn = first_gfn + done;
>          domctl.u.memory_mapping.first_mfn = first_mfn + done;
> -        err = do_domctl(xch, &domctl);
> -        if ( err && errno == E2BIG )
> +        rc = do_domctl(xch, &domctl);
> +        if ( rc < 0 && errno == E2BIG )
>          {
>              if ( max_batch_sz <= 1 )
>                  break;
>              max_batch_sz >>= 1;
>              continue;
>          }
> +        if ( rc > 0 )
> +        {
> +            done += rc;
> +            continue;
> +        }
>          /* Save the first error... */
>          if ( !ret )
> -            ret = err;
> +            ret = rc;
>          /* .. and ignore the rest of them when removing. */
> -        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
> +        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
>              break;
>  
>          done += nr;
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s
>          else
>              a = p2m_access_rw;
>  
> -        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
> +        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
> +                                      PAGE_ORDER_4K, a)) )
>              panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
>                    gfn, mfn, i, rc);
>          if ( !(i & 0xfffff) )
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru
>      share_xen_page_with_guest(pg, d, XENSHARE_writable);
>      d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
>      set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE),
> _mfn(mfn),
> -                       p2m_get_hostp2m(d)->default_access);
> +                       PAGE_ORDER_4K, p2m_get_hostp2m(d)-
> >default_access);
>  
>      return 0;
>  }
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain
>      p2m_unlock(p2m);
>  }
>  
> -/* Returns: 0 for success, -errno for failure */
> +/*
> + * Returns:
> + *    0        for success
> + *    -errno   for failure
> + *    order+1  for caller to retry with order (guaranteed smaller than
> + *             the order value passed in)
> + */
>  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn,
> mfn_t mfn,
> -                               p2m_type_t gfn_p2mt, p2m_access_t access)
> +                               unsigned int order, p2m_type_t gfn_p2mt,
> +                               p2m_access_t access)
>  {
>      int rc = 0;
>      p2m_access_t a;
>      p2m_type_t ot;
>      mfn_t omfn;
> +    unsigned int cur_order = 0;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>      if ( !paging_mode_translate(d) )
>          return -EIO;
>  
> -    gfn_lock(p2m, gfn, 0);
> -    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
> +    gfn_lock(p2m, gfn, order);
> +    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
> +    if ( cur_order < order )
> +    {
> +        gfn_unlock(p2m, gfn, order);
> +        return cur_order + 1;
> +    }
>      if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
>      {
> -        gfn_unlock(p2m, gfn, 0);
> +        gfn_unlock(p2m, gfn, order);
>          domain_crash(d);
>          return -ENOENT;
>      }
>      else if ( p2m_is_ram(ot) )
>      {
> +        unsigned long i;
> +
>          ASSERT(mfn_valid(omfn));
> -        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
> +        for ( i = 0; i < (1UL << order); ++i )
> +            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
>      }
>  
>      P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
> -    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
> -                       access);
> +    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
>      if ( rc )
> -        gdprintk(XENLOG_ERR,
> -                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
> -                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)),
> rc);
> +        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d
> (0x%"PRI_mfn")\n",
> +                 gfn, order, rc, mfn_x(mfn));
>      else if ( p2m_is_pod(ot) )
>      {
>          pod_lock(p2m);
> -        p2m->pod.entry_count--;
> +        p2m->pod.entry_count -= 1UL << order;
>          BUG_ON(p2m->pod.entry_count < 0);
>          pod_unlock(p2m);
>      }
> -    gfn_unlock(p2m, gfn, 0);
> +    gfn_unlock(p2m, gfn, order);
>  
>      return rc;
>  }
> @@ -949,14 +963,21 @@ static int set_typed_p2m_entry(struct do
>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>                                   mfn_t mfn)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K,
> p2m_map_foreign,
>                                 p2m_get_hostp2m(d)->default_access);
>  }
>  
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                       p2m_access_t access)
> +                       unsigned int order, p2m_access_t access)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
> +    if ( order &&
> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
> +                                  mfn_x(mfn) + (1UL << order) - 1) )
> +        return order;
> +
> +    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct,
> access);
>  }
>  
>  int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
> @@ -1009,20 +1030,33 @@ int set_identity_p2m_entry(struct domain
>      return ret;
>  }
>  
> -/* Returns: 0 for success, -errno for failure */
> -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
> +/*
> + * Returns:
> + *    0        for success
> + *    -errno   for failure
> + *    order+1  for caller to retry with order (guaranteed smaller than
> + *             the order value passed in)
> + */
> +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> +                         unsigned int order)
>  {
>      int rc = -EINVAL;
>      mfn_t actual_mfn;
>      p2m_access_t a;
>      p2m_type_t t;
> +    unsigned int cur_order = 0;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>      if ( !paging_mode_translate(d) )
>          return -EIO;
>  
> -    gfn_lock(p2m, gfn, 0);
> -    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
> +    gfn_lock(p2m, gfn, order);
> +    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
> +    if ( cur_order < order )
> +    {
> +        rc = cur_order + 1;
> +        goto out;
> +    }
>  
>      /* Do not use mfn_valid() here as it will usually fail for MMIO
> pages. */
>      if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
> @@ -1035,11 +1069,11 @@ int clear_mmio_p2m_entry(struct domain *
>          gdprintk(XENLOG_WARNING,
>                   "no mapping between mfn %08lx and gfn %08lx\n",
>                   mfn_x(mfn), gfn);
> -    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K,
> p2m_invalid,
> +    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
>                         p2m->default_access);
>  
>   out:
> -    gfn_unlock(p2m, gfn, 0);
> +    gfn_unlock(p2m, gfn, order);
>  
>      return rc;
>  }
> @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain *
>      return map_domain_page(*mfn);
>  }
>  
> +static unsigned int mmio_order(const struct domain *d,
> +                               unsigned long start_fn, unsigned long nr)
> +{
> +    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
> +         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >>
> PAGE_ORDER_2M) )
> +        return 0;
> +
> +    if ( !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >>
> PAGE_ORDER_1G) &&
> +         hap_has_1gb )
> +        return PAGE_ORDER_1G;
> +
> +    if ( hap_has_2mb )
> +        return PAGE_ORDER_2M;
> +
> +    return 0;
> +}
> +
> +#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
> +
>  int map_mmio_regions(struct domain *d,
>                       unsigned long start_gfn,
>                       unsigned long nr,
> @@ -2102,22 +2155,29 @@ int map_mmio_regions(struct domain *d,
>  {
>      int ret = 0;
>      unsigned long i;
> +    unsigned int iter, order;
>  
>      if ( !paging_mode_translate(d) )
>          return 0;
>  
> -    for ( i = 0; !ret && i < nr; i++ )
> +    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
> +          i += 1UL << order, ++iter )
>      {
> -        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
> -                                 p2m_get_hostp2m(d)->default_access);
> -        if ( ret )
> +        /* OR'ing gfn and mfn values will return an order suitable to
> both. */
> +        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr -
> i); ;
> +              order = ret - 1 )
>          {
> -            unmap_mmio_regions(d, start_gfn, i, mfn);
> -            break;
> +            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
> order,
> +                                     p2m_get_hostp2m(d)-
> >default_access);
> +            if ( ret <= 0 )
> +                break;
> +            ASSERT(ret <= order);
>          }
> +        if ( ret < 0 )
> +            break;
>      }
>  
> -    return ret;
> +    return i == nr ? 0 : i ?: ret;
>  }
>  
>  int unmap_mmio_regions(struct domain *d,
> @@ -2125,20 +2185,30 @@ int unmap_mmio_regions(struct domain *d,
>                         unsigned long nr,
>                         unsigned long mfn)
>  {
> -    int err = 0;
> +    int ret = 0;
>      unsigned long i;
> +    unsigned int iter, order;
>  
>      if ( !paging_mode_translate(d) )
>          return 0;
>  
> -    for ( i = 0; i < nr; i++ )
> +    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
> +          i += 1UL << order, ++iter )
>      {
> -        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
> -        if ( ret )
> -            err = ret;
> +        /* OR'ing gfn and mfn values will return an order suitable to
> both. */
> +        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr -
> i); ;
> +              order = ret - 1 )
> +        {
> +            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
> order);
> +            if ( ret <= 0 )
> +                break;
> +            ASSERT(ret <= order);
> +        }
> +        if ( ret < 0 )
> +            break;
>      }
>  
> -    return err;
> +    return i == nr ? 0 : i ?: ret;
>  }
>  
>  unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
>              entry->r = entry->x = 1;
>              entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
>                                                      entry->mfn);
> +            ASSERT(entry->w || !is_epte_superpage(entry));
>              entry->a = !!cpu_has_vmx_ept_ad;
>              entry->d = entry->w && cpu_has_vmx_ept_ad;
>              break;
> --- a/xen/arch/x86/mm/p2m-pt.c
> +++ b/xen/arch/x86/mm/p2m-pt.c
> @@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
>      PGT_l3_page_table
>  };
>  
> -static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
> +static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
> +                                       unsigned int level)
>  {
>      unsigned long flags;
>      /*
> @@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
>      case p2m_mmio_direct:
>          if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
>              flags |= _PAGE_RW;
> +        else
> +            ASSERT(!level);
>          return flags | P2M_BASE_FLAGS | _PAGE_PCD;
>      }
>  }
> @@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain *
>              p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn
> | ~mask)
>                                ? p2m_ram_logdirty : p2m_ram_rw;
>              unsigned long mfn = l1e_get_pfn(e);
> -            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
> +            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn),
> level);
>  
>              if ( level )
>              {
> @@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
>          ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
>          l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
>              ? l3e_from_pfn(mfn_x(mfn),
> -                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
> +                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
>              : l3e_empty();
>          entry_content.l1 = l3e_content.l3;
>  
> @@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
>  
>          if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
>              entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
> -                                             p2m_type_to_flags(p2mt,
> mfn));
> +                                             p2m_type_to_flags(p2mt,
> mfn, 0));
>          else
>              entry_content = l1e_empty();
>  
> @@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
>          ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
>          if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
>              l2e_content = l2e_from_pfn(mfn_x(mfn),
> -                                       p2m_type_to_flags(p2mt, mfn) |
> +                                       p2m_type_to_flags(p2mt, mfn, 1) |
>                                         _PAGE_PSE);
>          else
>              l2e_content = l2e_empty();
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>               (gfn + nr_mfns - 1) < gfn ) /* wrap? */
>              break;
>  
> +#ifndef CONFIG_X86 /* XXX ARM!? */
>          ret = -E2BIG;
>          /* Must break hypercall up as this could take a while. */
>          if ( nr_mfns > 64 )
>              break;
> +#endif
>  
>          ret = -EPERM;
>          if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
> @@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>                     d->domain_id, gfn, mfn, nr_mfns);
>  
>              ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
> -            if ( ret )
> +            if ( ret < 0 )
>                  printk(XENLOG_G_WARNING
>                         "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx
> ret:%ld\n",
>                         d->domain_id, gfn, mfn, nr_mfns, ret);
> @@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>                     d->domain_id, gfn, mfn, nr_mfns);
>  
>              ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
> -            if ( ret && is_hardware_domain(current->domain) )
> +            if ( ret < 0 && is_hardware_domain(current->domain) )
>                  printk(XENLOG_ERR
>                         "memory_map: error %ld removing dom%d access to
> [%lx,%lx]\n",
>                         ret, d->domain_id, mfn, mfn_end);
> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d,
>      }
>      if ( p2mt == p2m_mmio_direct )
>      {
> -        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
> +        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
>          put_gfn(d, gmfn);
>          return 1;
>      }
> --- a/xen/include/asm-x86/p2m.h
> +++ b/xen/include/asm-x86/p2m.h
> @@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom
>  
>  /* Set mmio addresses in the p2m table (for pass-through) */
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                       p2m_access_t access);
> -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t
> mfn);
> +                       unsigned int order, p2m_access_t access);
> +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> +                         unsigned int order);
>  
>  /* Set identity addresses in the p2m table (for pass-through) */
>  int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -542,8 +542,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
>  
>  
>  /* Bind machine I/O address range -> HVM address range. */
> -/* If this returns -E2BIG lower nr_mfns value. */
>  /* XEN_DOMCTL_memory_mapping */
> +/* Returns
> +   - zero     success, everything done
> +   - -E2BIG   passed in nr_mfns value too large for the implementation
> +   - positive partial success for the first <result> page frames (with
> +              <result> less than nr_mfns), requiring re-invocation by
> the
> +              caller after updating inputs
> +   - negative error; other than -E2BIG
> +*/
>  #define DPCI_ADD_MAPPING         1
>  #define DPCI_REMOVE_MAPPING      0
>  struct xen_domctl_memory_mapping {
> 
>
Tian, Kevin Jan. 26, 2016, 10:35 p.m. UTC | #2
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Tuesday, January 26, 2016 12:19 AM
> 
> When mapping large BARs (e.g. the frame buffer of a graphics card) the
> overhead of establishing such mappings using only 4k pages has,
> particularly after the XSA-125 fix, become unacceptable. Alter the
> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
> longer a fixed amount of guest frames that represents the upper limit
> of what a single invocation can map. Instead bound execution time by
> limiting the number of iterations (regardless of page size).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Kevin Tian <kevin.tian@intel.com> for VMX part.

Curious. When you say "become unacceptable", how bad is it? mostly
impact the boot time?

Thanks
Kevin
Jan Beulich Jan. 27, 2016, 10:22 a.m. UTC | #3
>>> On 26.01.16 at 23:35, <kevin.tian@intel.com> wrote:
>>  From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: Tuesday, January 26, 2016 12:19 AM
>> 
>> When mapping large BARs (e.g. the frame buffer of a graphics card) the
>> overhead of establishing such mappings using only 4k pages has,
>> particularly after the XSA-125 fix, become unacceptable. Alter the
>> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
>> longer a fixed amount of guest frames that represents the upper limit
>> of what a single invocation can map. Instead bound execution time by
>> limiting the number of iterations (regardless of page size).
>> 
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Acked-by: Kevin Tian <kevin.tian@intel.com> for VMX part.
> 
> Curious. When you say "become unacceptable", how bad is it? mostly
> impact the boot time?

Yes, guest boot time. I don't have a reference to the original report
at hand, but that was what someone (Konrad?) had reported. I've
never seen the issue myself, largely because I've never made any
attempt at GPU pass-through.

Jan
Andrew Cooper Jan. 27, 2016, 10:28 a.m. UTC | #4
On 27/01/16 10:22, Jan Beulich wrote:
>>>> On 26.01.16 at 23:35, <kevin.tian@intel.com> wrote:
>>>  From: Jan Beulich [mailto:JBeulich@suse.com]
>>> Sent: Tuesday, January 26, 2016 12:19 AM
>>>
>>> When mapping large BARs (e.g. the frame buffer of a graphics card) the
>>> overhead of establishing such mappings using only 4k pages has,
>>> particularly after the XSA-125 fix, become unacceptable. Alter the
>>> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
>>> longer a fixed amount of guest frames that represents the upper limit
>>> of what a single invocation can map. Instead bound execution time by
>>> limiting the number of iterations (regardless of page size).
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Acked-by: Kevin Tian <kevin.tian@intel.com> for VMX part.
>>
>> Curious. When you say "become unacceptable", how bad is it? mostly
>> impact the boot time?
> Yes, guest boot time. I don't have a reference to the original report
> at hand, but that was what someone (Konrad?) had reported. I've
> never seen the issue myself, largely because I've never made any
> attempt at GPU pass-through.

From XenServer testing, with a 1GB GPU BAR, XSA-125 caused an
additional 70s of guest boot time.

Naturally, we had to work around this.  Partly upping the repeat limit,
and deferring VT-d flushes.

~Andrew
Andrew Cooper Jan. 27, 2016, 12:32 p.m. UTC | #5
On 25/01/16 16:18, Jan Beulich wrote:
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru
>      share_xen_page_with_guest(pg, d, XENSHARE_writable);
>      d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
>      set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
> -                       p2m_get_hostp2m(d)->default_access);
> +                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
>  

This should ASSERT() success, in case we make further changes to the
error handling.

>      return 0;
>  }
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain
>      p2m_unlock(p2m);
>  }
>  
> -/* Returns: 0 for success, -errno for failure */
> +/*
> + * Returns:
> + *    0        for success
> + *    -errno   for failure
> + *    order+1  for caller to retry with order (guaranteed smaller than
> + *             the order value passed in)
> + */
>  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                               p2m_type_t gfn_p2mt, p2m_access_t access)
> +                               unsigned int order, p2m_type_t gfn_p2mt,
> +                               p2m_access_t access)
>  {
>      int rc = 0;
>      p2m_access_t a;
>      p2m_type_t ot;
>      mfn_t omfn;
> +    unsigned int cur_order = 0;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>      if ( !paging_mode_translate(d) )
>          return -EIO;
>  
> -    gfn_lock(p2m, gfn, 0);
> -    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
> +    gfn_lock(p2m, gfn, order);
> +    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
> +    if ( cur_order < order )
> +    {
> +        gfn_unlock(p2m, gfn, order);
> +        return cur_order + 1;

> Your comment states that the return value is guaranteed to be less than
the passed-in order, but this is not the case here.  cur_order could, in
principle, be only 1 less than order, at which point your documentation
is incorrect.

Does this rely on the x86 architectural orders to function as documented?

> +    }
>      if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
>      {
> -        gfn_unlock(p2m, gfn, 0);
> +        gfn_unlock(p2m, gfn, order);
>          domain_crash(d);
>          return -ENOENT;
>      }
>      else if ( p2m_is_ram(ot) )
>      {
> +        unsigned long i;
> +
>          ASSERT(mfn_valid(omfn));

Shouldn't this check should be extended to the top of the order?

> -        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
> +        for ( i = 0; i < (1UL << order); ++i )
> +            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
>      }
>  
>      P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
> -    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
> -                       access);
> +    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
>      if ( rc )
> -        gdprintk(XENLOG_ERR,
> -                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
> -                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
> +        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
> +                 gfn, order, rc, mfn_x(mfn));
>      else if ( p2m_is_pod(ot) )
>      {
>          pod_lock(p2m);
> -        p2m->pod.entry_count--;
> +        p2m->pod.entry_count -= 1UL << order;
>          BUG_ON(p2m->pod.entry_count < 0);
>          pod_unlock(p2m);
>      }
> -    gfn_unlock(p2m, gfn, 0);
> +    gfn_unlock(p2m, gfn, order);
>  
>      return rc;
>  }
> @@ -949,14 +963,21 @@ static int set_typed_p2m_entry(struct do
>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>                                   mfn_t mfn)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
>                                 p2m_get_hostp2m(d)->default_access);
>  }
>  
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                       p2m_access_t access)
> +                       unsigned int order, p2m_access_t access)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
> +    if ( order &&
> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
> +                                  mfn_x(mfn) + (1UL << order) - 1) )
> +        return order;

Should this not be a hard error?  Even retrying with a lower order is
going fail.

> +
> +    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
>  }
>  
>  int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
> @@ -1009,20 +1030,33 @@ int set_identity_p2m_entry(struct domain
>      return ret;
>  }
>  
> -/* Returns: 0 for success, -errno for failure */
> -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
> +/*
> + * Returns:
> + *    0        for success
> + *    -errno   for failure
> + *    order+1  for caller to retry with order (guaranteed smaller than
> + *             the order value passed in)
> + */
> +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> +                         unsigned int order)
>  {
>      int rc = -EINVAL;
>      mfn_t actual_mfn;
>      p2m_access_t a;
>      p2m_type_t t;
> +    unsigned int cur_order = 0;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>      if ( !paging_mode_translate(d) )
>          return -EIO;
>  
> -    gfn_lock(p2m, gfn, 0);
> -    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
> +    gfn_lock(p2m, gfn, order);
> +    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
> +    if ( cur_order < order )
> +    {
> +        rc = cur_order + 1;
> +        goto out;
> +    }
>  
>      /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
>      if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
> @@ -1035,11 +1069,11 @@ int clear_mmio_p2m_entry(struct domain *
>          gdprintk(XENLOG_WARNING,
>                   "no mapping between mfn %08lx and gfn %08lx\n",
>                   mfn_x(mfn), gfn);
> -    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
> +    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
>                         p2m->default_access);
>  
>   out:
> -    gfn_unlock(p2m, gfn, 0);
> +    gfn_unlock(p2m, gfn, order);
>  
>      return rc;
>  }
> @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain *
>      return map_domain_page(*mfn);
>  }
>  
> +static unsigned int mmio_order(const struct domain *d,
> +                               unsigned long start_fn, unsigned long nr)
> +{
> +    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
> +         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
> +        return 0;

Perhaps PAGE_ORDER_4K for consistency?

~Andrew
Jan Beulich Jan. 27, 2016, 1:37 p.m. UTC | #6
>>> On 27.01.16 at 13:32, <andrew.cooper3@citrix.com> wrote:
> On 25/01/16 16:18, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru
>>      share_xen_page_with_guest(pg, d, XENSHARE_writable);
>>      d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
>>      set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
>> -                       p2m_get_hostp2m(d)->default_access);
>> +                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
>>  
> 
> This should ASSERT() success, in case we make further changes to the
> error handling.

Maybe, but since it didn't before I don't see why this couldn't /
shouldn't be an independent future patch.

>> --- a/xen/arch/x86/mm/p2m.c
>> +++ b/xen/arch/x86/mm/p2m.c
>> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain
>>      p2m_unlock(p2m);
>>  }
>>  
>> -/* Returns: 0 for success, -errno for failure */
>> +/*
>> + * Returns:
>> + *    0        for success
>> + *    -errno   for failure
>> + *    order+1  for caller to retry with order (guaranteed smaller than
>> + *             the order value passed in)
>> + */
>>  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>> -                               p2m_type_t gfn_p2mt, p2m_access_t access)
>> +                               unsigned int order, p2m_type_t gfn_p2mt,
>> +                               p2m_access_t access)
>>  {
>>      int rc = 0;
>>      p2m_access_t a;
>>      p2m_type_t ot;
>>      mfn_t omfn;
>> +    unsigned int cur_order = 0;
>>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>  
>>      if ( !paging_mode_translate(d) )
>>          return -EIO;
>>  
>> -    gfn_lock(p2m, gfn, 0);
>> -    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
>> +    gfn_lock(p2m, gfn, order);
>> +    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
>> +    if ( cur_order < order )
>> +    {
>> +        gfn_unlock(p2m, gfn, order);
>> +        return cur_order + 1;
> 
> Your comment states that the return value is guarenteed to be less than
> the passed-in order, but this is not the case here.  cur_order could, in
> principle, be only 1 less than order, at which point your documentation
> is incorrect.
> 
> Does this rely on the x86 architectural orders to function as documented?

No. Maybe the comment text is ambiguous, but I don't see how to
improve it without making it too lengthy: The return value is
<order>+1, telling the caller to retry with <order>, which is
guaranteed to be less than the order that got passed in. I.e. taking
the variable naming above, the caller would have to retry with
cur_order, which - due to the if() - is smaller than order.

>> +    }
>>      if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
>>      {
>> -        gfn_unlock(p2m, gfn, 0);
>> +        gfn_unlock(p2m, gfn, order);
>>          domain_crash(d);
>>          return -ENOENT;
>>      }
>>      else if ( p2m_is_ram(ot) )
>>      {
>> +        unsigned long i;
>> +
>>          ASSERT(mfn_valid(omfn));
> 
> Shouldn't this check should be extended to the top of the order?

Well, yes, perhaps better to move it into ...

>> -        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
>> +        for ( i = 0; i < (1UL << order); ++i )
>> +            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);

... the body of the for(). But I'll wait with v6 until we settled on
the other aspects you raise.

>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>> -                       p2m_access_t access)
>> +                       unsigned int order, p2m_access_t access)
>>  {
>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>> +    if ( order &&
>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>> +        return order;
> 
> Should this not be a hard error?  Even retrying with a lower order is
> going fail.

Why? The latest when order == 0, rangeset_overlaps_range()
will return the same as rangeset_contains_range(), and hence
the condition above will always be false (one of the two reasons
for checking order first here).

>> @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain *
>>      return map_domain_page(*mfn);
>>  }
>>  
>> +static unsigned int mmio_order(const struct domain *d,
>> +                               unsigned long start_fn, unsigned long nr)
>> +{
>> +    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
>> +         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
>> +        return 0;
> 
> Perhaps PAGE_ORDER_4K for consistency?

Oh, indeed.

Jan
Andrew Cooper Jan. 27, 2016, 2:28 p.m. UTC | #7
On 27/01/16 13:37, Jan Beulich wrote:
>>>> On 27.01.16 at 13:32, <andrew.cooper3@citrix.com> wrote:
>> On 25/01/16 16:18, Jan Beulich wrote:
>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru
>>>      share_xen_page_with_guest(pg, d, XENSHARE_writable);
>>>      d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
>>>      set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
>>> -                       p2m_get_hostp2m(d)->default_access);
>>> +                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
>>>  
>> This should ASSERT() success, in case we make further changes to the
>> error handling.
> Maybe, but since it didn't before I don't see why this couldn't /
> shouldn't be an independent future patch.

Can be.  IMO it is a bug that it isn't already checked.  (-ENOMEM when
allocating p2m leaves perhaps?)

>
>>> --- a/xen/arch/x86/mm/p2m.c
>>> +++ b/xen/arch/x86/mm/p2m.c
>>> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain
>>>      p2m_unlock(p2m);
>>>  }
>>>  
>>> -/* Returns: 0 for success, -errno for failure */
>>> +/*
>>> + * Returns:
>>> + *    0        for success
>>> + *    -errno   for failure
>>> + *    order+1  for caller to retry with order (guaranteed smaller than
>>> + *             the order value passed in)
>>> + */
>>>  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>> -                               p2m_type_t gfn_p2mt, p2m_access_t access)
>>> +                               unsigned int order, p2m_type_t gfn_p2mt,
>>> +                               p2m_access_t access)
>>>  {
>>>      int rc = 0;
>>>      p2m_access_t a;
>>>      p2m_type_t ot;
>>>      mfn_t omfn;
>>> +    unsigned int cur_order = 0;
>>>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>>  
>>>      if ( !paging_mode_translate(d) )
>>>          return -EIO;
>>>  
>>> -    gfn_lock(p2m, gfn, 0);
>>> -    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
>>> +    gfn_lock(p2m, gfn, order);
>>> +    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
>>> +    if ( cur_order < order )
>>> +    {
>>> +        gfn_unlock(p2m, gfn, order);
>>> +        return cur_order + 1;
>> Your comment states that the return value is guaranteed to be less than
>> the passed-in order, but this is not the case here.  cur_order could, in
>> principle, be only 1 less than order, at which point your documentation
>> is incorrect.
>>
>> Does this rely on the x86 architectural orders to function as documented?
> No. Maybe the comment text is ambiguous, but I don't see how to
> improve it without making it too lengthy: The return value is
> <order>+1, telling the caller to retry with <order>, which is
> guaranteed to be less than the order that got passed in. I.e. taking
> the variable naming above, the caller would have to retry with
> cur_order, which - due to the if() - is smaller than order.

Ah - I see.  The text is indeed confusing.  How about:

"1 + new order: for caller to retry with smaller order (guaranteed to be
smaller than order passed in)"

>
>>> +    }
>>>      if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
>>>      {
>>> -        gfn_unlock(p2m, gfn, 0);
>>> +        gfn_unlock(p2m, gfn, order);
>>>          domain_crash(d);
>>>          return -ENOENT;
>>>      }
>>>      else if ( p2m_is_ram(ot) )
>>>      {
>>> +        unsigned long i;
>>> +
>>>          ASSERT(mfn_valid(omfn));
>> Shouldn't this check should be extended to the top of the order?
> Well, yes, perhaps better to move it into ...
>
>>> -        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
>>> +        for ( i = 0; i < (1UL << order); ++i )
>>> +            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
> ... the body of the for(). But I'll wait with v6 until we settled on
> the other aspects you raise.
>
>>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>> -                       p2m_access_t access)
>>> +                       unsigned int order, p2m_access_t access)
>>>  {
>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>>> +    if ( order &&
>>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>>> +        return order;
>> Should this not be a hard error?  Even retrying with a lower order is
>> going fail.
> Why? The latest when order == 0, rangeset_overlaps_range()
> will return the same as rangeset_contains_range(), and hence
> the condition above will always be false (one of the two reasons
> for checking order first here).

It isn't the order check which is an issue.

One way or another, if the original (mfn/order) fails the rangeset
checks, the overall call is going to fail, but it will be re-executed
repeatedly with an order decreasing to 0.  Wouldn't it be better just to
short-circuit this back&forth?

Relatedly, is there actually anything wrong with making a superpage
read-only mapping over some scattered read-only 4K pages?

~Andrew
Jan Beulich Jan. 27, 2016, 2:40 p.m. UTC | #8
>>> On 27.01.16 at 15:28, <andrew.cooper3@citrix.com> wrote:
> On 27/01/16 13:37, Jan Beulich wrote:
>>>>> On 27.01.16 at 13:32, <andrew.cooper3@citrix.com> wrote:
>>> On 25/01/16 16:18, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>>> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru
>>>>      share_xen_page_with_guest(pg, d, XENSHARE_writable);
>>>>      d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
>>>>      set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
>>>> -                       p2m_get_hostp2m(d)->default_access);
>>>> +                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
>>>>  
>>> This should ASSERT() success, in case we make further changes to the
>>> error handling.
>> Maybe, but since it didn't before I don't see why this couldn't /
>> shouldn't be an independent future patch.
> 
> Can be.  IMO it is a bug that it isn't already checked.  (-ENOMEM when
> allocating p2m leaves perhaps?)

Indeed, albeit that means ASSERT() wouldn't be right anyway. I
hope the VMX maintainers monitor this and will prepare a patch...

>>>> --- a/xen/arch/x86/mm/p2m.c
>>>> +++ b/xen/arch/x86/mm/p2m.c
>>>> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain
>>>>      p2m_unlock(p2m);
>>>>  }
>>>>  
>>>> -/* Returns: 0 for success, -errno for failure */
>>>> +/*
>>>> + * Returns:
>>>> + *    0        for success
>>>> + *    -errno   for failure
>>>> + *    order+1  for caller to retry with order (guaranteed smaller than
>>>> + *             the order value passed in)
>>>> + */
>>>>  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>>> -                               p2m_type_t gfn_p2mt, p2m_access_t access)
>>>> +                               unsigned int order, p2m_type_t gfn_p2mt,
>>>> +                               p2m_access_t access)
>>>>  {
>>>>      int rc = 0;
>>>>      p2m_access_t a;
>>>>      p2m_type_t ot;
>>>>      mfn_t omfn;
>>>> +    unsigned int cur_order = 0;
>>>>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>>>  
>>>>      if ( !paging_mode_translate(d) )
>>>>          return -EIO;
>>>>  
>>>> -    gfn_lock(p2m, gfn, 0);
>>>> -    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
>>>> +    gfn_lock(p2m, gfn, order);
>>>> +    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
>>>> +    if ( cur_order < order )
>>>> +    {
>>>> +        gfn_unlock(p2m, gfn, order);
>>>> +        return cur_order + 1;
>>> Your comment states that the return value is guaranteed to be less than
>>> the passed-in order, but this is not the case here.  cur_order could, in
>>> principle, be only 1 less than order, at which point your documentation
>>> is incorrect.
>>>
>>> Does this rely on the x86 architectural orders to function as documented?
>> No. Maybe the comment text is ambiguous, but I don't see how to
>> improve it without making it too lengthy: The return value is
>> <order>+1, telling the caller to retry with <order>, which is
>> guaranteed to be less than the order that got passed in. I.e. taking
>> the variable naming above, the caller would have to retry with
>> cur_order, which - due to the if() - is smaller than order.
> 
> Ah - I see.  The text is indeed confusing.  How about:
> 
> "1 + new order: for caller to retry with smaller order (guaranteed to be
> smaller than order passed in)"

Okay.

>>>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>>> -                       p2m_access_t access)
>>>> +                       unsigned int order, p2m_access_t access)
>>>>  {
>>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>>>> +    if ( order &&
>>>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>>>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>>>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>>>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>>>> +        return order;
>>> Should this not be a hard error?  Even retrying with a lower order is
>>> going to fail.
>> Why? At the latest, when order == 0, rangeset_overlaps_range()
>> will return the same as rangeset_contains_range(), and hence
>> the condition above will always be false (one of the two reasons
>> for checking order first here).
> 
> It isn't the order check which is an issue.
> 
> One way or another, if the original (mfn/order) fails the rangeset
> checks, the overall call is going to fail, but it will be re-executed
> repeatedly with an order decreasing to 0.  Wouldn't it be better just to
> short-circuit this back&forth?

But this won't necessarily go down to order 0. Short-circuiting
would mean taking PAGE_ORDER_2M and PAGE_ORDER_1G into
account here, which would imo severely hamper readability.

> Relatedly, is there actually anything wrong with making a superpage
> read-only mapping over some scattered read-only 4K pages?

I'm afraid I don't understand: "scattered pages" and "superpage
mapping" don't seem to fit together for me.

Jan
Andrew Cooper Jan. 27, 2016, 2:51 p.m. UTC | #9
On 27/01/16 14:40, Jan Beulich wrote:
>
>>>>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>>>> -                       p2m_access_t access)
>>>>> +                       unsigned int order, p2m_access_t access)
>>>>>  {
>>>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>>>>> +    if ( order &&
>>>>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>>>>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>>>>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>>>>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>>>>> +        return order;
>>>> Should this not be a hard error?  Even retrying with a lower order is
>>>> going to fail.
>>> Why? At the latest, when order == 0, rangeset_overlaps_range()
>>> will return the same as rangeset_contains_range(), and hence
>>> the condition above will always be false (one of the two reasons
>>> for checking order first here).
>> It isn't the order check which is an issue.
>>
>> One way or another, if the original (mfn/order) fails the rangeset
>> checks, the overall call is going to fail, but it will be re-executed
>> repeatedly with an order decreasing to 0.  Wouldn't it be better just to
>> short-circuit this back&forth?
> But this won't necessarily go down to order 0. Short-circuiting
> would mean taking PAGE_ORDER_2M and PAGE_ORDER_1G into
> account here, which would imo severely hamper readability.

Even when this check starts passing, the subsequent
set_typed_p2m_entry() will fail for writeable mappings, after having
constructed small pages up to the boundary of the RO region.

>
>> Relatedly, is there actually anything wrong with making a superpage
>> read-only mapping over some scattered read-only 4K pages?
> I'm afraid I don't understand: "scattered pages" and "superpage
> mapping" don't seem to fit together for me.

If there is a single 4K page in the RO region, and the caller attempts
to create a RO 2M superpage which includes the 4K region, these checks
will force the use of 4K mappings even though the 2M mapping would be fine.

~Andrew
Jan Beulich Jan. 27, 2016, 3:20 p.m. UTC | #10
>>> On 27.01.16 at 15:51, <andrew.cooper3@citrix.com> wrote:
> On 27/01/16 14:40, Jan Beulich wrote:
>>
>>>>>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>>>>> -                       p2m_access_t access)
>>>>>> +                       unsigned int order, p2m_access_t access)
>>>>>>  {
>>>>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>>>>>> +    if ( order &&
>>>>>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>>>>>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>>>>>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>>>>>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>>>>>> +        return order;
>>>>> Should this not be a hard error?  Even retrying with a lower order is
>>>>> going to fail.
>>>> Why? At the latest, when order == 0, rangeset_overlaps_range()
>>>> will return the same as rangeset_contains_range(), and hence
>>>> the condition above will always be false (one of the two reasons
>>>> for checking order first here).
>>> It isn't the order check which is an issue.
>>>
>>> One way or another, if the original (mfn/order) fails the rangeset
>>> checks, the overall call is going to fail, but it will be re-executed
>>> repeatedly with an order decreasing to 0.  Wouldn't it be better just to
>>> short-circuit this back&forth?
>> But this won't necessarily go down to order 0. Short-circuiting
>> would mean taking PAGE_ORDER_2M and PAGE_ORDER_1G into
>> account here, which would imo severely hamper readability.
> 
> Even when this check starts passing, the subsequent
> set_typed_p2m_entry() will fail for writeable mappings, after having
> constructed small pages up to the boundary of the RO region.

I don't see where such failure would come from:
{ept_,}p2m_type_to_flags() silently suppress the mapping
becoming writable. What am I overlooking?

>>> Relatedly, is there actually anything wrong with making a superpage
>>> read-only mapping over some scattered read-only 4K pages?
>> I'm afraid I don't understand: "scattered pages" and "superpage
>> mapping" don't seem to fit together for me.
> 
> If there is a single 4K page in the RO region, and the caller attempts
> to create a RO 2M superpage which includes the 4K region, these checks
> will force the use of 4K mappings even though the 2M mapping would be fine.

Oh, so you want "access" to also be taken into account. Not
sure that's worth it right now - r/o MMIO mappings shouldn't
occur very often (and map_mmio_regions() passes
->default_access anyway).

Jan

Patch
diff mbox

--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2174,7 +2174,7 @@  int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2199,19 +2199,24 @@  int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@  static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2491,7 +2491,7 @@  static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -899,48 +899,62 @@  void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
+        unsigned long i;
+
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << order); ++i )
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -949,14 +963,21 @@  static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) &&
+         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
+                                  mfn_x(mfn) + (1UL << order) - 1) )
+        return order;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1009,20 +1030,33 @@  int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1035,11 +1069,11 @@  int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2095,6 +2129,25 @@  void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return 0;
+
+    if ( !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return 0;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2102,22 +2155,29 @@  int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2125,20 +2185,30 @@  int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@  static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@  static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@  static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@  static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@  int guest_remove_page(struct domain *d,
     }
     if ( p2mt == p2m_mmio_direct )
     {
-        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
         put_gfn(d, gmfn);
         return 1;
     }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@  int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -542,8 +542,15 @@  DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {