diff mbox

[v7] x86/p2m: use large pages for MMIO mappings

Message ID 56B0D61302000078000CD962@prv-mh.provo.novell.com
State New, archived
Headers show

Commit Message

Jan Beulich Feb. 2, 2016, 3:15 p.m. UTC
When mapping large BARs (e.g. the frame buffer of a graphics card) the
overhead of establishing such mappings using only 4k pages has,
particularly after the XSA-125 fix, become unacceptable. Alter the
XEN_DOMCTL_memory_mapping semantics once again, so that there's no
longer a fixed amount of guest frames that represents the upper limit
of what a single invocation can map. Instead bound execution time by
limiting the number of iterations (regardless of page size).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
---
Open issues (perhaps for subsequent changes):
- ARM side unimplemented (and hence libxc for now made cope with both
  models), the main issue (besides my inability to test any change
  there) being the many internal uses of map_mmio_regions())
- iommu_{,un}map_page() interfaces don't support "order" (hence
  mmio_order() for now returns zero when !iommu_hap_pt_share, which in
  particular means the AMD side isn't being taken care of just yet, but
  note that this also has the intended effect of suppressing non-zero
  order mappings in the shadow mode case)
---
v7: Only allow use of 2M mappings for the time being.
v6: Move an mfn_valid() assertion to cover the full MFN range. Use
    PAGE_ORDER_4K in mmio_order(). Improve the return value description
    of set_typed_p2m_entry().
v5: Refine comment in domctl.h.
v4: Move cleanup duty entirely to the caller of the hypercall. Move
    return value description to from commit message to domctl.h.
v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
    hap_capabilities". Extend description to spell out new return value
    meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
    of literal 0. Take into consideration r/o MMIO pages.
v2: Produce valid entries for large p2m_mmio_direct mappings in
    p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
    mmio_order(). Update function comment of set_typed_p2m_entry() and
    clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
    {,un}map_mmio_regions() to detect otherwise endless loops.
x86/p2m: use large pages for MMIO mappings

When mapping large BARs (e.g. the frame buffer of a graphics card) the
overhead of establishing such mappings using only 4k pages has,
particularly after the XSA-125 fix, become unacceptable. Alter the
XEN_DOMCTL_memory_mapping semantics once again, so that there's no
longer a fixed amount of guest frames that represents the upper limit
of what a single invocation can map. Instead bound execution time by
limiting the number of iterations (regardless of page size).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
---
Open issues (perhaps for subsequent changes):
- ARM side unimplemented (and hence libxc for now made cope with both
  models), the main issue (besides my inability to test any change
  there) being the many internal uses of map_mmio_regions())
- iommu_{,un}map_page() interfaces don't support "order" (hence
  mmio_order() for now returns zero when !iommu_hap_pt_share, which in
  particular means the AMD side isn't being taken care of just yet, but
  note that this also has the intended effect of suppressing non-zero
  order mappings in the shadow mode case)
---
v7: Only allow use of 2M mappings for the time being.
v6: Move an mfn_valid() assertion to cover the full MFN range. Use
    PAGE_ORDER_4K in mmio_order(). Improve the return value description
    of set_typed_p2m_entry().
v5: Refine comment in domctl.h.
v4: Move cleanup duty entirely to the caller of the hypercall. Move
    return value description to from commit message to domctl.h.
v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
    hap_capabilities". Extend description to spell out new return value
    meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
    of literal 0. Take into consideration r/o MMIO pages.
v2: Produce valid entries for large p2m_mmio_direct mappings in
    p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
    mmio_order(). Update function comment of set_typed_p2m_entry() and
    clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
    {,un}map_mmio_regions() to detect otherwise endless loops.

--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2229,7 +2229,7 @@ int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2254,19 +2254,24 @@ int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2503,7 +2503,7 @@ static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -899,48 +899,64 @@ void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0              for success
+ *    -errno         for failure
+ *    1 + new order  for caller to retry with smaller order (guaranteed
+ *                   to be smaller than order passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
-        ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        unsigned long i;
+
+        for ( i = 0; i < (1UL << order); ++i )
+        {
+            ASSERT(mfn_valid(_mfn(mfn_x(omfn) + i)));
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+        }
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -949,14 +965,21 @@ static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) &&
+         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
+                                  mfn_x(mfn) + (1UL << order) - 1) )
+        return order;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1009,20 +1032,33 @@ int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1035,11 +1071,11 @@ int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2095,6 +2131,30 @@ void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return PAGE_ORDER_4K;
+
+    if ( 0 /*
+            * Don't use 1Gb pages, to limit the iteration count in
+            * set_typed_p2m_entry() when it needs to zap M2P entries
+            * for a RAM range.
+            */ &&
+         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return PAGE_ORDER_4K;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2102,22 +2162,29 @@ int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2125,20 +2192,30 @@ int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d,
     }
     if ( p2mt == p2m_mmio_direct )
     {
-        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
         put_gfn(d, gmfn);
         return 1;
     }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -542,8 +542,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {

Comments

Andrew Cooper Feb. 2, 2016, 3:27 p.m. UTC | #1
On 02/02/16 15:15, Jan Beulich wrote:
> @@ -2095,6 +2131,30 @@ void *map_domain_gfn(struct p2m_domain *
>      return map_domain_page(*mfn);
>  }
>  
> +static unsigned int mmio_order(const struct domain *d,
> +                               unsigned long start_fn, unsigned long nr)
> +{
> +    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
> +         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
> +        return PAGE_ORDER_4K;
> +
> +    if ( 0 /*
> +            * Don't use 1Gb pages, to limit the iteration count in
> +            * set_typed_p2m_entry() when it needs to zap M2P entries
> +            * for a RAM range.
> +            */ &&
> +         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
> +         hap_has_1gb )
> +        return PAGE_ORDER_1G;
> +
> +    if ( hap_has_2mb )
> +        return PAGE_ORDER_2M;
> +
> +    return PAGE_ORDER_4K;
> +}

Apologises again for only just spotting this.  PV guests need to be
restricted to 4K mappings.  Currently, this function can return
PAGE_ORDER_2M for PV guests, based on hap_has_2mb.

All the callers are presently gated on !paging_mode_translate(d), which
will exclude PV guests, but it mmio_order() isn't in principle wrong to
call for a PV guest.

With either a suitable PV path, or assertion excluding PV guests,

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>  (to avoid
another on-list iteration for just this issue)
Jan Beulich Feb. 2, 2016, 3:41 p.m. UTC | #2
>>> On 02.02.16 at 16:27, <andrew.cooper3@citrix.com> wrote:
> On 02/02/16 15:15, Jan Beulich wrote:
>> @@ -2095,6 +2131,30 @@ void *map_domain_gfn(struct p2m_domain *
>>      return map_domain_page(*mfn);
>>  }
>>  
>> +static unsigned int mmio_order(const struct domain *d,
>> +                               unsigned long start_fn, unsigned long nr)
>> +{
>> +    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
>> +         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
>> +        return PAGE_ORDER_4K;
>> +
>> +    if ( 0 /*
>> +            * Don't use 1Gb pages, to limit the iteration count in
>> +            * set_typed_p2m_entry() when it needs to zap M2P entries
>> +            * for a RAM range.
>> +            */ &&
>> +         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
>> +         hap_has_1gb )
>> +        return PAGE_ORDER_1G;
>> +
>> +    if ( hap_has_2mb )
>> +        return PAGE_ORDER_2M;
>> +
>> +    return PAGE_ORDER_4K;
>> +}
> 
> Apologises again for only just spotting this.  PV guests need to be
> restricted to 4K mappings.  Currently, this function can return
> PAGE_ORDER_2M for PV guests, based on hap_has_2mb.
> 
> All the callers are presently gated on !paging_mode_translate(d), which
> will exclude PV guests, but it mmio_order() isn't in principle wrong to
> call for a PV guest.

And why is the !iommu_use_hap_pt() not good enough?

Jan

> With either a suitable PV path, or assertion excluding PV guests,
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>  (to avoid
> another on-list iteration for just this issue)
George Dunlap Feb. 8, 2016, 6:04 p.m. UTC | #3
On 02/02/16 15:15, Jan Beulich wrote:
> When mapping large BARs (e.g. the frame buffer of a graphics card) the
> overhead of establishing such mappings using only 4k pages has,
> particularly after the XSA-125 fix, become unacceptable. Alter the
> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
> longer a fixed amount of guest frames that represents the upper limit
> of what a single invocation can map. Instead bound execution time by
> limiting the number of iterations (regardless of page size).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> Acked-by: Kevin Tian <kevin.tian@intel.com>

Hey Jan,

Sorry for the delay in reviewing this.  Overall looks correct; just a
couple of comments.

First -- and this isn't a blocker, but just a question (and sorry if
it's been answered before) -- why have the "0 means I did it all, <nr
means I did it partially"?  Why not just return the number of entries
actually handled?

> ---
> Open issues (perhaps for subsequent changes):
> - ARM side unimplemented (and hence libxc for now made cope with both
>   models), the main issue (besides my inability to test any change
>   there) being the many internal uses of map_mmio_regions())
> - iommu_{,un}map_page() interfaces don't support "order" (hence
>   mmio_order() for now returns zero when !iommu_hap_pt_share, which in
>   particular means the AMD side isn't being taken care of just yet, but
>   note that this also has the intended effect of suppressing non-zero
>   order mappings in the shadow mode case)

This information -- about using iommu_hap_pt_share() to guard against
both AMD and shadow mode -- should probably be in the tree somewhere;
preferably in a comment above map_mmio_regions(), but at very least in
the changelog (rather than in this "comment to reviewers" section, which
will be thrown away on check-in).

> @@ -949,14 +965,21 @@ static int set_typed_p2m_entry(struct do
>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>                                   mfn_t mfn)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
>                                 p2m_get_hostp2m(d)->default_access);
>  }
>  
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                       p2m_access_t access)
> +                       unsigned int order, p2m_access_t access)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
> +    if ( order &&
> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
> +                                  mfn_x(mfn) + (1UL << order) - 1) )
> +        return order;

What is this about?  Isn't set_mmio_p2m_entry() now supposed to have a
calling convention like clear_mmio_p2m_entry(), where it returns
recommended_order+1 if you should retry with order, and this value
(recommended_order) is guaranteed to be strictly less than what was
passed in?

Did you mean to return PAGE_ORDER_4k+1 here, perhaps?  (Or (order-9)+1?)

> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
>              entry->r = entry->x = 1;
>              entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
>                                                      entry->mfn);
> +            ASSERT(entry->w || !is_epte_superpage(entry));

What is this about?  Single-page epte entries are allowed to be either
read-only or read-write, but superpage entries have to have write
permission?

>              entry->a = !!cpu_has_vmx_ept_ad;
>              entry->d = entry->w && cpu_has_vmx_ept_ad;
>              break;
> --- a/xen/arch/x86/mm/p2m-pt.c
> +++ b/xen/arch/x86/mm/p2m-pt.c
> @@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
>      PGT_l3_page_table
>  };
>  
> -static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
> +static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
> +                                       unsigned int level)
>  {
>      unsigned long flags;
>      /*
> @@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
>      case p2m_mmio_direct:
>          if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
>              flags |= _PAGE_RW;
> +        else
> +            ASSERT(!level);

Why are we not allowed to have superpage MMIO ranges in this case?  Is
it just because the AMD side is unimplemented and the shadow side it
isn't allowed?

If so, a comment to that effect would be helpful.

Other than that, looks good.

 -George
Jan Beulich Feb. 9, 2016, 8:42 a.m. UTC | #4
>>> On 08.02.16 at 19:04, <george.dunlap@citrix.com> wrote:
> First -- and this isn't a blocker, but just a question (and sorry if
> it's been answered before) -- why have the "0 means I did it all, <nr
> means I did it partially"?  Why not just return the number of entries
> actually handled?

Because I view zero as the conventional "success" indicator. No
other deeper reason.

>> ---
>> Open issues (perhaps for subsequent changes):
>> - ARM side unimplemented (and hence libxc for now made cope with both
>>   models), the main issue (besides my inability to test any change
>>   there) being the many internal uses of map_mmio_regions())
>> - iommu_{,un}map_page() interfaces don't support "order" (hence
>>   mmio_order() for now returns zero when !iommu_hap_pt_share, which in
>>   particular means the AMD side isn't being taken care of just yet, but
>>   note that this also has the intended effect of suppressing non-zero
>>   order mappings in the shadow mode case)
> 
> This information -- about using iommu_hap_pt_share() to guard against
> both AMD and shadow mode -- should probably be in the tree somewhere;
> preferably in a comment above map_mmio_regions(), but at very least in
> the changelog (rather than in this "comment to reviewers" section, which
> will be thrown away on check-in).

Can be done of course.

>> @@ -949,14 +965,21 @@ static int set_typed_p2m_entry(struct do
>>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>>                                   mfn_t mfn)
>>  {
>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
>> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
>>                                 p2m_get_hostp2m(d)->default_access);
>>  }
>>  
>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>> -                       p2m_access_t access)
>> +                       unsigned int order, p2m_access_t access)
>>  {
>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>> +    if ( order &&
>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>> +        return order;
> 
> What is this about?  Isn't set_mmio_p2m_entry() now supposed to have a
> calling convention like clear_mmio_p2m_entry(), where it returns
> recommended_order+1 if you should retry with order, and this value
> (recommended_order) is guaranteed to be strictly less than what was
> passed in?
> 
> Did you mean to return PAGE_ORDER_4k+1 here, perhaps?  (Or (order-9)+1?)

No. This is catering for callers of both {,ept_}p2m_type_to_flags()
not easily being in the position of requesting further page splitting.
Hence rather than complicate the code there, it is here where we
make sure that the r/o enforcement won't be missed.

>> --- a/xen/arch/x86/mm/p2m-ept.c
>> +++ b/xen/arch/x86/mm/p2m-ept.c
>> @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
>>              entry->r = entry->x = 1;
>>              entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
>>                                                      entry->mfn);
>> +            ASSERT(entry->w || !is_epte_superpage(entry));
> 
> What is this about?  Single-page epte entries are allowed to be either
> read-only or read-write, but superpage entries have to have write
> permission?

Yes, due to the lack of an "order" input here, and (as mentioned
above) the difficulties of this function's callers otherwise needing
to be able to deal with further page splitting directives (coming
out of this function).

But now that you mention this I think you're right further up:
That code as it is now indeed seems to put things up for the
ASSERT() here to actually be triggerable (if there would be a
large enough r/o MMIO region). So it looks like code further
up should indeed use PAGE_ORDER_4K+1 here, as you suggest
(which would then also address one of Andrew's concerns).

>> @@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
>>      case p2m_mmio_direct:
>>          if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
>>              flags |= _PAGE_RW;
>> +        else
>> +            ASSERT(!level);
> 
> Why are we not allowed to have superpage MMIO ranges in this case?  Is
> it just because the AMD side is unimplemented and the shadow side it
> isn't allowed?
> 
> If so, a comment to that effect would be helpful.

Actually the primary reason is because the rangeset check is
(and right now can only be) a singleton one. I.e. this is again
related to the complications which would arise if a page split
was to be requested from here.

Jan
George Dunlap Feb. 9, 2016, 10:56 a.m. UTC | #5
On 09/02/16 08:42, Jan Beulich wrote:
>>>> On 08.02.16 at 19:04, <george.dunlap@citrix.com> wrote:
>> First -- and this isn't a blocker, but just a question (and sorry if
>> it's been answered before) -- why have the "0 means I did it all, <nr
>> means I did it partially"?  Why not just return the number of entries
>> actually handled?
> 
> Because I view zero as the conventional "success" indicator. No
> other deeper reason.

For certain classes of functionality, yes.  But for other classes --
where the caller will request N but the OS / hypervisor / whatever may
do some other number fewer than N, then the convention is to always pass
the number of items completed.  read() and write() system calls are like
this, as are most of the XENMEM operations (e.g.,
XENMEM_increase_resevration).

Since this domctl looks a lot like some of those XENMEM operations,
wouldn't it make more sense to follow that convention, and return the
number of extents actually allocated?

>>> @@ -949,14 +965,21 @@ static int set_typed_p2m_entry(struct do
>>>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>>>                                   mfn_t mfn)
>>>  {
>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
>>> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
>>>                                 p2m_get_hostp2m(d)->default_access);
>>>  }
>>>  
>>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>> -                       p2m_access_t access)
>>> +                       unsigned int order, p2m_access_t access)
>>>  {
>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>>> +    if ( order &&
>>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>>> +        return order;
>>
>> What is this about?  Isn't set_mmio_p2m_entry() now supposed to have a
>> calling convention like clear_mmio_p2m_entry(), where it returns
>> recommended_order+1 if you should retry with order, and this value
>> (recommended_order) is guaranteed to be strictly less than what was
>> passed in?
>>
>> Did you mean to return PAGE_ORDER_4k+1 here, perhaps?  (Or (order-9)+1?)
> 
> No. This is catering for callers of both {,ept_}p2m_type_to_flags()
> not easily being in the position of requesting further page splitting.
> Hence rather than complicate the code there, it is here where we
> make sure that the r/o enforcement won't be missed.

I wasn't asking what the check is about; I'm asking about the return
value.  Did you really mean that if someone passed in order 9, that it
would say "Please try with order 8 instead"?  I'm asking because I would
have expected that to be "Please try with order 0 instead".

(Although my question wasn't clear because I thought it was returning
the *same* value, not one less.  A brief comment like, "Request trying
again with order-1" would be helpful if we end up retaining this return
value.)

I'm still becoming somewhat familiar with the p2m code, but it looks to
me like ept_set_entry() at least doesn't tolerate orders that aren't
multiples of EPT_TABLE_ORDER.

This conditional will only actually do anything under the following
conditions:

a. the requested gfn order is 9 (since we're not doing 1G pages yet)

b. at order 9, the mfn range partially overlaps mmio_ro_ranges

c. the p2m entry for that gfn is order 9 (or larger)

(Re c: i.e., if the order in the p2m entry is already 0, then
set_typed_p2m_entry() will already request a 4K page, making this entire
conditional redundant.)

Let's also suppose for simplicity that:

d. at order 8, the mfn range does *not* partially overlap mmio_ro_ranges
anymore.

Under those conditions, it looks to me like the following will happen:
1. map_mmio_regions() will call set_mmio_p2m_entry() with order 9
2. set_mmio_p2m_entry() will return 9 (requesting using order 8)
3. map_mmio_regions() will call set_mmio_p2m_entry() with order 8
4. set_mmio_p2m_entry() will call set_typed_p2m_entry() with order 8
5. set_typed_p2m_entry() will read the current p2m entry of >= 9 and
continue (since 8 <= 9)
6. set_typed_p2m_entry() will call p2m_set_entry() with an order of 8
7. ept_set_entry() will return -EINVAL, since order % EPT_TABLE_ORDER != 0.

Am I missing something?

> 
>>> --- a/xen/arch/x86/mm/p2m-ept.c
>>> +++ b/xen/arch/x86/mm/p2m-ept.c
>>> @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
>>>              entry->r = entry->x = 1;
>>>              entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
>>>                                                      entry->mfn);
>>> +            ASSERT(entry->w || !is_epte_superpage(entry));
>>
>> What is this about?  Single-page epte entries are allowed to be either
>> read-only or read-write, but superpage entries have to have write
>> permission?
> 
> Yes, due to the lack of an "order" input here, and (as mentioned
> above) the difficulties of this function's callers otherwise needing
> to be able to deal with further page splitting directives (coming
> out of this function).
> 
> But now that you mention this I think you're right further up:
> That code as it is now indeed seems to put things up for the
> ASSERT() here to actually be triggerable (if there would be a
> large enough r/o MMIO region). So it looks like code further
> up should indeed use PAGE_ORDER_4K+1 here, as you suggest
> (which would then also address one of Andrew's concerns).

You could either get rid of the "!rangeset_contains_range()" in
set_mmio_p2m_entry() (not allowing >4k orders even if the entire range
fits), or you could change these assertions to check that either the
entire range is contained or that it doesn't overlap (essentially the
inverse of the check at set_mmio_p2m_entry()).

I don't immediately see any reason why you couldn't use superpages if
the entire mfn range fits within mmio_ro_ranges.  (Although if you have
reason to believe that these almost never contain superpages, it would
certainly make the code simpler to just always use 4k pages in case of
an overlap.)

 -George
Jan Beulich Feb. 9, 2016, 11:48 a.m. UTC | #6
>>> On 09.02.16 at 11:56, <george.dunlap@citrix.com> wrote:
> On 09/02/16 08:42, Jan Beulich wrote:
>>>>> On 08.02.16 at 19:04, <george.dunlap@citrix.com> wrote:
>>> First -- and this isn't a blocker, but just a question (and sorry if
>>> it's been answered before) -- why have the "0 means I did it all, <nr
>>> means I did it partially"?  Why not just return the number of entries
>>> actually handled?
>> 
>> Because I view zero as the conventional "success" indicator. No
>> other deeper reason.
> 
> For certain classes of functionality, yes.  But for other classes --
> where the caller will request N but the OS / hypervisor / whatever may
> do some other number fewer than N, then the convention is to always pass
> the number of items completed.  read() and write() system calls are like
> this, as are most of the XENMEM operations (e.g.,
> XENMEM_increase_resevration).
> 
> Since this domctl looks a lot like some of those XENMEM operations,
> wouldn't it make more sense to follow that convention, and return the
> number of extents actually allocated?

Well, if comparing with the XENMEM ones, perhaps. But if comparing
with other domctls, perhaps rather not? I've really been undecided
here from the very beginning, since I had considered that alternative
right away.

>>>> @@ -949,14 +965,21 @@ static int set_typed_p2m_entry(struct do
>>>>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>>>>                                   mfn_t mfn)
>>>>  {
>>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
>>>> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
>>>>                                 p2m_get_hostp2m(d)->default_access);
>>>>  }
>>>>  
>>>>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
>>>> -                       p2m_access_t access)
>>>> +                       unsigned int order, p2m_access_t access)
>>>>  {
>>>> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
>>>> +    if ( order &&
>>>> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
>>>> +                                 mfn_x(mfn) + (1UL << order) - 1) &&
>>>> +         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
>>>> +                                  mfn_x(mfn) + (1UL << order) - 1) )
>>>> +        return order;
>>>
>>> What is this about?  Isn't set_mmio_p2m_entry() now supposed to have a
>>> calling convention like clear_mmio_p2m_entry(), where it returns
>>> recommended_order+1 if you should retry with order, and this value
>>> (recommended_order) is guaranteed to be strictly less than what was
>>> passed in?
>>>
>>> Did you mean to return PAGE_ORDER_4k+1 here, perhaps?  (Or (order-9)+1?)
>> 
>> No. This is catering for callers of both {,ept_}p2m_type_to_flags()
>> not easily being in the position of requesting further page splitting.
>> Hence rather than complicate the code there, it is here where we
>> make sure that the r/o enforcement won't be missed.
> 
> I wasn't asking what the check is about; I'm asking about the return
> value.  Did you really mean that if someone passed in order 9, that it
> would say "Please try with order 8 instead"?  I'm asking because I would
> have expected that to be "Please try with order 0 instead".
> 
> (Although my question wasn't clear because I thought it was returning
> the *same* value, not one less.  A brief comment like, "Request trying
> again with order-1" would be helpful if we end up retaining this return
> value.)
> 
> I'm still becoming somewhat familiar with the p2m code, but it looks to
> me like ept_set_entry() at least doesn't tolerate orders that aren't
> multiples of EPT_TABLE_ORDER.
> 
> This conditional will only actually do anything under the following
> conditions:
> 
> a. the requested gfn order is 9 (since we're not doing 1G pages yet)
> 
> b. at order 9, the mfn range partially overlaps mmio_ro_ranges
> 
> c. the p2m entry for that gfn is order 9 (or larger)
> 
> (Re c: i.e., if the order in the p2m entry is already 0, then
> set_typed_p2m_entry() will already request a 4K page, making this entire
> conditional redundant.)
> 
> Let's also suppose for simplicity that:
> 
> d. at order 8, the mfn range does *not* partially overlap mmio_ro_ranges
> anymore.
> 
> Under those conditions, it looks to me like the following will happen:
> 1. map_mmio_regions() will call set_mmio_p2m_entry() with order 9
> 2. set_mmio_p2m_entry() will return 9 (requesting using order 8)
> 3. map_mmio_regions() will call set_mmio_p2m_entry() with order 8
> 4. set_mmio_p2m_entry() will call set_typed_p2m_entry() with order 8
> 5. set_typed_p2m_entry() will read the current p2m entry of >= 9 and
> continue (since 8 <= 9)
> 6. set_typed_p2m_entry() will call p2m_set_entry() with an order of 8
> 7. ept_set_entry() will return -EINVAL, since order % EPT_TABLE_ORDER != 0.
> 
> Am I missing something?

Between step 6 and step 7 there is actually p2m_set_entry()
breaking up the request into chunks or orders the hardware
supports.

So to answer your first question above - yes, the intention was
to reduce orders in small steps, even if the hardware doesn't
support those orders. Indeed this wasn't the most efficient way
of doing it (as Andrew also noted), but the intention specifically
was for that code to not needlessly depend on hardware
characteristics.

But this is all moot now anyway, with the return value going to
become PAGE_ORDER_4K+1 (due to the below).

>>>> --- a/xen/arch/x86/mm/p2m-ept.c
>>>> +++ b/xen/arch/x86/mm/p2m-ept.c
>>>> @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
>>>>              entry->r = entry->x = 1;
>>>>              entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
>>>>                                                      entry->mfn);
>>>> +            ASSERT(entry->w || !is_epte_superpage(entry));
>>>
>>> What is this about?  Single-page epte entries are allowed to be either
>>> read-only or read-write, but superpage entries have to have write
>>> permission?
>> 
>> Yes, due to the lack of an "order" input here, and (as mentioned
>> above) the difficulties of this function's callers otherwise needing
>> to be able to deal with further page splitting directives (coming
>> out of this function).
>> 
>> But now that you mention this I think you're right further up:
>> That code as it is now indeed seems to put things up for the
>> ASSERT() here to actually be triggerable (if there would be a
>> large enough r/o MMIO region). So it looks like code further
>> up should indeed use PAGE_ORDER_4K+1 here, as you suggest
>> (which would then also address one of Andrew's concerns).
> 
> You could either get rid of the "!rangeset_contains_range()" in
> set_mmio_p2m_entry() (not allowing >4k orders even if the entire range
> fits), or you could change these assertions to check that either the
> entire range is contained or that it doesn't overlap (essentially the
> inverse of the check at set_mmio_p2m_entry()).
> 
> I don't immediately see any reason why you couldn't use superpages if
> the entire mfn range fits within mmio_ro_ranges.

There indeed is no technical reason other than - as said - the
difficulty of enforcing the page split at the time the new entry
is about to be written (when its flags get calculated).

> (Although if you have
> reason to believe that these almost never contain superpages, it would
> certainly make the code simpler to just always use 4k pages in case of
> an overlap.)

The MMCFG area(s) could easily be (a) large one(s), but we don't
expose this to other than Dom0. MSI-X tables, otoh, will most likely
be solitary pages. VT-d's RMRRs are less simple to predict,
considering that they're reportedly also being used for graphics
devices.

But I simply haven't got enough time to do the necessary surgery
right now that would be needed to support large page r/o MMIO
mappings.

Jan
George Dunlap Feb. 9, 2016, 12:17 p.m. UTC | #7
On 09/02/16 11:48, Jan Beulich wrote:
>>>> On 09.02.16 at 11:56, <george.dunlap@citrix.com> wrote:
>> On 09/02/16 08:42, Jan Beulich wrote:
>>>>>> On 08.02.16 at 19:04, <george.dunlap@citrix.com> wrote:
>>>> First -- and this isn't a blocker, but just a question (and sorry if
>>>> it's been answered before) -- why have the "0 means I did it all, <nr
>>>> means I did it partially"?  Why not just return the number of entries
>>>> actually handled?
>>>
>>> Because I view zero as the conventional "success" indicator. No
>>> other deeper reason.
>>
>> For certain classes of functionality, yes.  But for other classes --
>> where the caller will request N but the OS / hypervisor / whatever may
>> do some other number fewer than N, then the convention is to always pass
>> the number of items completed.  read() and write() system calls are like
>> this, as are most of the XENMEM operations (e.g.,
>> XENMEM_increase_resevration).
>>
>> Since this domctl looks a lot like some of those XENMEM operations,
>> wouldn't it make more sense to follow that convention, and return the
>> number of extents actually allocated?
> 
> Well, if comparing with the XENMEM ones, perhaps. But if comparing
> with other domctls, perhaps rather not? I've really been undecided
> here from the very beginning, since I had considered that alternative
> right away.

Well from a brief survey of things that can partially succeed
(getmemlist, gethvmcontext, {get,set}_vcpu_msrs), it seems that most of
them:

1. Use the return value *only* for success or failure; they either
return 0 or -errno, even if they only partially succeed
2. Return the actual number of things accomplished in the struct itself.

You seem to be introducing a new model, where you use the return value
sort of for all three.  (Are there any other hypercalls that behave this
way?)

I don't think sometimes returning the number of things you did and
sometimes returning zero makes any sense.  My suggestion would be either
make "nr_mfns" bidirectional (as similar fields are in the other
domctls) and return 0 on either full or partial success, or just return
the number of mfns actually mapped either on full or partial success.

>> Under those conditions, it looks to me like the following will happen:
>> 1. map_mmio_regions() will call set_mmio_p2m_entry() with order 9
>> 2. set_mmio_p2m_entry() will return 9 (requesting using order 8)
>> 3. map_mmio_regions() will call set_mmio_p2m_entry() with order 8
>> 4. set_mmio_p2m_entry() will call set_typed_p2m_entry() with order 8
>> 5. set_typed_p2m_entry() will read the current p2m entry of >= 9 and
>> continue (since 8 <= 9)
>> 6. set_typed_p2m_entry() will call p2m_set_entry() with an order of 8
>> 7. ept_set_entry() will return -EINVAL, since order % EPT_TABLE_ORDER != 0.
>>
>> Am I missing something?
> 
> Between step 6 and step 7 there is actually p2m_set_entry()
> breaking up the request into chunks or orders the hardware
> supports.

Oh, right -- sorry, I assumed that p2m_set_entry() was just an alias for
the underlying *set_entry() function.  I see now there's a loop handling
arbitrary orders.  Thanks. :-)

 -George
Jan Beulich Feb. 9, 2016, 12:35 p.m. UTC | #8
>>> On 09.02.16 at 13:17, <george.dunlap@citrix.com> wrote:
> On 09/02/16 11:48, Jan Beulich wrote:
>>>>> On 09.02.16 at 11:56, <george.dunlap@citrix.com> wrote:
>>> On 09/02/16 08:42, Jan Beulich wrote:
>>>>>>> On 08.02.16 at 19:04, <george.dunlap@citrix.com> wrote:
>>>>> First -- and this isn't a blocker, but just a question (and sorry if
>>>>> it's been answered before) -- why have the "0 means I did it all, <nr
>>>>> means I did it partially"?  Why not just return the number of entries
>>>>> actually handled?
>>>>
>>>> Because I view zero as the conventional "success" indicator. No
>>>> other deeper reason.
>>>
>>> For certain classes of functionality, yes.  But for other classes --
>>> where the caller will request N but the OS / hypervisor / whatever may
>>> do some other number fewer than N, then the convention is to always pass
>>> the number of items completed.  read() and write() system calls are like
>>> this, as are most of the XENMEM operations (e.g.,
>>> XENMEM_increase_resevration).
>>>
>>> Since this domctl looks a lot like some of those XENMEM operations,
>>> wouldn't it make more sense to follow that convention, and return the
>>> number of extents actually allocated?
>> 
>> Well, if comparing with the XENMEM ones, perhaps. But if comparing
>> with other domctls, perhaps rather not? I've really been undecided
>> here from the very beginning, since I had considered that alternative
>> right away.
> 
> Well from a brief survey of things that can partially succeed
> (getmemlist, gethvmcontext, {get,set}_vcpu_msrs), it seems that most of
> them:
> 
> 1. Use the return value *only* for success or failure; they either
> return 0 or -errno, even if they only partially succeed
> 2. Return the actual number of things accomplished in the struct itself.
> 
> You seem to be introducing a new model, where you use the return value
> sort of for all three.  (Are there any other hypercalls that behave this
> way?)
> 
> I don't think sometimes returning the number of things you did and
> sometimes returning zero makes any sense.  My suggestion would be either
> make "nr_mfns" bidirectional (as similar fields are in the other
> domctls) and return 0 on either full or partial success, or just return
> the number of mfns actually mapped either on full or partial success.

As said - I can see your point, and I've been considering the
alternatives and had to decide for one. Since I've already got
Ian's approval for the currently implementation, and since we're
at v7 and I've already spent way more time on this than I had
expected, I hope you understand that I'm a little hesitant to
make more changes (perhaps even requiring re-obtaining acks,
which has by itself been taking long enough for this patch) than
absolutely necessary to get this in.

So - Ian, do you think the alternative proposed by George
would make for a meaningfully better interface?

Jan
Ian Campbell Feb. 10, 2016, 10:06 a.m. UTC | #9
On Tue, 2016-02-09 at 05:35 -0700, Jan Beulich wrote:
> > On 09.02.16 at 13:17, <george.dunlap@citrix.com> wrote:
> > I don't think sometimes returning the number of things you did and
> > sometimes returning zero makes any sense.  My suggestion would be
> > either
> > make "nr_mfns" bidirectional (as similar fields are in the other
> > domctls) and return 0 on either full or partial success, or just return
> > the number of mfns actually mapped either on full or partial success.
> 
> As said - I can see your point, and I've been considering the
> alternatives and had to decide for one. Since I've already got
> Ian's approval for the currently implementation, and since we're
> at v7 and I've already spent way more time on this than I had
> expected, I hope you understand that I'm a little hesitant to
> make more changes (perhaps even requiring re-obtaining acks,
> which has by itself been taking long enough for this patch) than
> absolutely necessary to get this in.
> 
> So - Ian, do you think the alternative proposed by George
> would make for a meaningfully better interface?

I can see his point, but for a domctl I don't think I'd be inclined to
insist on changing it, given the reasons you explain above for not wanting
to at this stage.

I'd most likely be inclined to ack a follow up patch (from whomsoever is
motivated enough to produce one) which revved the API again though.

Ian.
George Dunlap Feb. 10, 2016, 10:19 a.m. UTC | #10
On 10/02/16 10:06, Ian Campbell wrote:
> On Tue, 2016-02-09 at 05:35 -0700, Jan Beulich wrote:
>>> On 09.02.16 at 13:17, <george.dunlap@citrix.com> wrote:
>>> I don't think sometimes returning the number of things you did and
>>> sometimes returning zero makes any sense.  My suggestion would be
>>> either
>>> make "nr_mfns" bidirectional (as similar fields are in the other
>>> domctls) and return 0 on either full or partial success, or just return
>>> the number of mfns actually mapped either on full or partial success.
>>
>> As said - I can see your point, and I've been considering the
>> alternatives and had to decide for one. Since I've already got
>> Ian's approval for the currently implementation, and since we're
>> at v7 and I've already spent way more time on this than I had
>> expected, I hope you understand that I'm a little hesitant to
>> make more changes (perhaps even requiring re-obtaining acks,
>> which has by itself been taking long enough for this patch) than
>> absolutely necessary to get this in.
>>
>> So - Ian, do you think the alternative proposed by George
>> would make for a meaningfully better interface?
> 
> I can see his point, but for a domctl I don't think I'd be inclined to
> insist on changing it, given the reasons you explain above for not wanting
> to at this stage.
> 
> I'd most likely be inclined to ack a follow up patch (from whomsoever is
> motivated enough to produce one) which revved the API again though.

Yes, I can certainly understand just geting this off the plate.

If we fix the mmio_ro page size checks / assertion, I'm fine with the
current interface.

 -George
diff mbox

Patch

--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2229,7 +2229,7 @@  int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2254,19 +2254,24 @@  int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@  static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2503,7 +2503,7 @@  static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -899,48 +899,64 @@  void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0              for success
+ *    -errno         for failure
+ *    1 + new order  for caller to retry with smaller order (guaranteed
+ *                   to be smaller than order passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
-        ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        unsigned long i;
+
+        for ( i = 0; i < (1UL << order); ++i )
+        {
+            ASSERT(mfn_valid(_mfn(mfn_x(omfn) + i)));
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+        }
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -949,14 +965,21 @@  static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) &&
+         !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn),
+                                  mfn_x(mfn) + (1UL << order) - 1) )
+        return order;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1009,20 +1032,33 @@  int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1035,11 +1071,11 @@  int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2095,6 +2131,30 @@  void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return PAGE_ORDER_4K;
+
+    if ( 0 /*
+            * Don't use 1Gb pages, to limit the iteration count in
+            * set_typed_p2m_entry() when it needs to zap M2P entries
+            * for a RAM range.
+            */ &&
+         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return PAGE_ORDER_4K;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2102,22 +2162,29 @@  int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2125,20 +2192,30 @@  int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@  static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@  static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@  static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@  static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@  int guest_remove_page(struct domain *d,
     }
     if ( p2mt == p2m_mmio_direct )
     {
-        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
         put_gfn(d, gmfn);
         return 1;
     }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@  int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -542,8 +542,15 @@  DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {