diff mbox

superpages lost after migration of HVM domU

Message ID 20170428101521.GA15020@aepfle.de (mailing list archive)
State New, archived
Headers show

Commit Message

Olaf Hering April 28, 2017, 10:15 a.m. UTC
On Wed, Apr 26, Andrew Cooper wrote:

> On 26/04/17 16:43, Olaf Hering wrote:
> > On Thu, Apr 20, Jan Beulich wrote:
> >
> >>>>> On 20.04.17 at 18:04, <olaf@aepfle.de> wrote:
> >>> On Thu, Apr 20, Andrew Cooper wrote:
> >>>
> >>>> As it currently stands, the sending side iterates from 0 to p2m_size,
> >>>> and sends every frame on the first pass.  This means we get PAGE_DATA
> >>>> records linearly, in batches of 1024, or two aligned 2M superpages.
> >>> Is there a way to preserve 1G pages? This 380G domU I'm looking at is
> >>> built with 4k:461390 2M:2341 1G:365 pages.
> >> I think we've hashed out a possible way to deal with this, by
> >> speculatively allocating 1G pages as long as the allocation cap for
> >> the domain allows, subsequently punching holes into those pages
> >> if we can't allocate any new pages anymore (due to otherwise
> >> overrunning the cap).
> > The result is not pretty. This HVM-only approach appears to work for a
> > domU with "memory=3024" and localhost migration.
> > It is required to punch holes as soon as possible to avoid errors in
> > xenforeignmemory_map due to "Over-allocation". Would be nice if the
> > receiver gets a memory map upfront to avoid all stunts...
> 
> Oh - I was about to start working on this.  This is a pleasant surprise. :)

Here is a variant that actually works for migration between two dom0s.
diff mbox

Patch

--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -107,6 +107,9 @@  struct xc_sr_save_ops
  */
 struct xc_sr_restore_ops
 {
+    /* Allocate an MFN for the given PFN */
+    int (*allocate_pfn)(struct xc_sr_context *ctx, xen_pfn_t pfn);
+
     /* Convert a PFN to GFN.  May return ~0UL for an invalid mapping. */
     xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
 
@@ -172,6 +175,52 @@  struct xc_sr_x86_pv_restore_vcpu
     size_t basicsz, extdsz, xsavesz, msrsz;
 };
 
+struct xc_sr_bitmap
+{
+    void *p;            /* Bit storage; may be NULL before the first resize. */
+    unsigned long bits; /* Highest bit index covered by p (set by _xc_sr_bitmap_resize). */
+};
+
+extern bool _xc_sr_bitmap_resize(struct xc_sr_bitmap *bm, unsigned long bits); /* Out-of-line grow path. */
+static inline bool xc_sr_bitmap_resize(struct xc_sr_bitmap *bm, unsigned long bits) /* Ensure bm covers bit index 'bits'. */
+{
+    if (bits > bm->bits) /* Only ever grows; smaller requests are a no-op. */
+        return _xc_sr_bitmap_resize(bm, bits);
+    return true; /* NOTE(review): bits == 0 with bm->bits == 0 also lands here; if bm->p is still NULL it stays NULL. */
+}
+
+static inline void xc_sr_bitmap_free(struct xc_sr_bitmap *bm) /* Release the bit storage. */
+{
+    free(bm->p); /* bm->p and bm->bits are not reset, so bm must not be used again afterwards. */
+}
+
+static inline bool xc_sr_set_bit(unsigned long bit, struct xc_sr_bitmap *bm) /* Set 'bit', growing bm if needed. */
+{
+    if (!xc_sr_bitmap_resize(bm, bit)) /* Grow first so that 'bit' lies inside the storage. */
+        return false; /* Growing failed; nothing was set. */
+
+    set_bit(bit, bm->p);
+    return true;
+}
+
+static inline bool xc_sr_test_bit(unsigned long bit, struct xc_sr_bitmap *bm) /* Is 'bit' set? Never grows bm. */
+{
+    if (!bm->p || bit > bm->bits) /* Unallocated bitmap or out-of-range bit: report "not set". */
+        return false;
+    return !!test_bit(bit, bm->p);
+}
+
+static inline int xc_sr_test_and_clear_bit(unsigned long bit, struct xc_sr_bitmap *bm) /* Clear 'bit' via test_and_clear_bit() and pass its result through. */
+{
+    return (bm->p && bit <= bm->bits) ? test_and_clear_bit(bit, bm->p) : 0; /* Unallocated or out of range: nothing to clear, report 0. */
+}
+
+static inline int xc_sr_test_and_set_bit(unsigned long bit, struct xc_sr_bitmap *bm) /* Thin wrapper around test_and_set_bit(); no range check. */
+{
+    return test_and_set_bit(bit, bm->p); /* Caller must have sized bm to cover 'bit' beforehand. */
+}
+
+
 struct xc_sr_context
 {
     xc_interface *xch;
@@ -256,8 +305,7 @@  struct xc_sr_context
             domid_t      xenstore_domid,  console_domid;
 
             /* Bitmap of currently populated PFNs during restore. */
-            unsigned long *populated_pfns;
-            xen_pfn_t max_populated_pfn;
+            struct xc_sr_bitmap populated_pfns;
 
             /* Sender has invoked verify mode on the stream. */
             bool verify;
@@ -332,6 +380,12 @@  struct xc_sr_context
                     /* HVM context blob. */
                     void *context;
                     size_t contextsz;
+
+                    /* Superpage attempts and allocated PFNs tracked during restore. */
+                    struct xc_sr_bitmap attempted_1g;
+                    struct xc_sr_bitmap attempted_2m;
+                    struct xc_sr_bitmap allocated_pfns;
+                    unsigned long alloc_cnt;
                 } restore;
             };
         } x86_hvm;
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -71,11 +71,9 @@  static int read_headers(struct xc_sr_con
 /*
  * Is a pfn populated?
  */
-static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+static bool pfn_is_populated(struct xc_sr_context *ctx, xen_pfn_t pfn) /* const dropped: xc_sr_test_bit() takes a non-const bitmap. */
 {
-    if ( pfn > ctx->restore.max_populated_pfn )
-        return false;
-    return test_bit(pfn, ctx->restore.populated_pfns);
+    return xc_sr_test_bit(pfn, &ctx->restore.populated_pfns); /* The range check now lives in xc_sr_test_bit(). */
 }
 
 /*
@@ -87,42 +85,12 @@  static int pfn_set_populated(struct xc_s
 {
     xc_interface *xch = ctx->xch;
 
-    if ( pfn > ctx->restore.max_populated_pfn )
+    if ( !xc_sr_set_bit(pfn, &ctx->restore.populated_pfns) )
     {
-        xen_pfn_t new_max;
-        size_t old_sz, new_sz;
-        unsigned long *p;
-
-        /* Round up to the nearest power of two larger than pfn, less 1. */
-        new_max = pfn;
-        new_max |= new_max >> 1;
-        new_max |= new_max >> 2;
-        new_max |= new_max >> 4;
-        new_max |= new_max >> 8;
-        new_max |= new_max >> 16;
-#ifdef __x86_64__
-        new_max |= new_max >> 32;
-#endif
-
-        old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
-        new_sz = bitmap_size(new_max + 1);
-        p = realloc(ctx->restore.populated_pfns, new_sz);
-        if ( !p )
-        {
-            ERROR("Failed to realloc populated bitmap");
-            errno = ENOMEM;
-            return -1;
-        }
-
-        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
-
-        ctx->restore.populated_pfns    = p;
-        ctx->restore.max_populated_pfn = new_max;
+        ERROR("Failed to realloc populated bitmap");
+        errno = ENOMEM;
+        return -1;
     }
-
-    assert(!test_bit(pfn, ctx->restore.populated_pfns));
-    set_bit(pfn, ctx->restore.populated_pfns);
-
     return 0;
 }
 
@@ -135,6 +103,7 @@  int populate_pfns(struct xc_sr_context *
                   const xen_pfn_t *original_pfns, const uint32_t *types)
 {
     xc_interface *xch = ctx->xch;
+    xen_pfn_t min_pfn = original_pfns[0], max_pfn = original_pfns[0];
     xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
         *pfns = malloc(count * sizeof(*pfns));
     unsigned i, nr_pfns = 0;
@@ -149,11 +118,18 @@  int populate_pfns(struct xc_sr_context *
 
     for ( i = 0; i < count; ++i )
     {
+        if (original_pfns[i] < min_pfn)
+            min_pfn = original_pfns[i];
+        if (original_pfns[i] > max_pfn)
+            max_pfn = original_pfns[i];
         if ( (!types || (types &&
                          (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
                           types[i] != XEN_DOMCTL_PFINFO_BROKEN))) &&
              !pfn_is_populated(ctx, original_pfns[i]) )
         {
+            rc = ctx->restore.ops.allocate_pfn(ctx, original_pfns[i]);
+            if ( rc )
+                goto err;
             rc = pfn_set_populated(ctx, original_pfns[i]);
             if ( rc )
                 goto err;
@@ -161,6 +137,21 @@  int populate_pfns(struct xc_sr_context *
             ++nr_pfns;
         }
     }
+    IPRINTF("checking range %lx %lx\n", min_pfn, max_pfn);
+    while (min_pfn < max_pfn) {
+        if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.allocated_pfns, min_pfn))
+        {
+            PERROR("Failed to realloc allocated_pfns %" PRI_xen_pfn, min_pfn);
+            goto err;
+        }
+        if (!pfn_is_populated(ctx, min_pfn) && xc_sr_test_and_clear_bit(min_pfn, &ctx->x86_hvm.restore.allocated_pfns)) {
+            xen_pfn_t pfn = min_pfn;
+            rc = xc_domain_decrease_reservation_exact(xch, ctx->domid, 1, 0, &pfn);
+            IPRINTF("free %lx %lx %d\n", min_pfn, pfn, rc);
+        }
+        min_pfn++;
+    }
+    nr_pfns = 0;
 
     if ( nr_pfns )
     {
@@ -684,10 +675,8 @@  static int setup(struct xc_sr_context *c
     if ( rc )
         goto err;
 
-    ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
-    ctx->restore.populated_pfns = bitmap_alloc(
-        ctx->restore.max_populated_pfn + 1);
-    if ( !ctx->restore.populated_pfns )
+    rc = !xc_sr_bitmap_resize(&ctx->restore.populated_pfns, 32 * 1024 / 4);
+    if ( rc )
     {
         ERROR("Unable to allocate memory for populated_pfns bitmap");
         rc = -1;
@@ -722,7 +711,10 @@  static void cleanup(struct xc_sr_context
         xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
                                    NRPAGES(bitmap_size(ctx->restore.p2m_size)));
     free(ctx->restore.buffered_records);
-    free(ctx->restore.populated_pfns);
+    xc_sr_bitmap_free(&ctx->restore.populated_pfns);
+    xc_sr_bitmap_free(&ctx->x86_hvm.restore.attempted_1g);
+    xc_sr_bitmap_free(&ctx->x86_hvm.restore.attempted_2m);
+    xc_sr_bitmap_free(&ctx->x86_hvm.restore.allocated_pfns);
     if ( ctx->restore.ops.cleanup(ctx) )
         PERROR("Failed to clean up");
 }
@@ -810,6 +802,17 @@  static int restore(struct xc_sr_context
     saved_errno = errno;
     saved_rc = rc;
     PERROR("Restore failed");
+    {
+        unsigned long i;
+        bool a, p;
+        IPRINTF("alloc_cnt %lu\n", ctx->x86_hvm.restore.alloc_cnt);
+        for (i = 0; i < ctx->restore.p2m_size; i++) {
+            p = xc_sr_test_bit(i, &ctx->restore.populated_pfns);
+            a = xc_sr_test_bit(i, &ctx->x86_hvm.restore.allocated_pfns);
+            if (p != a)
+                IPRINTF("%lx a %x p %x\n", i, a, p);
+        }
+    }
 
  done:
     cleanup(ctx);
@@ -888,6 +891,7 @@  int xc_domain_restore(xc_interface *xch,
     }
 
     ctx.restore.p2m_size = nr_pfns;
+    IPRINTF("p2m_size %lx\n", ctx.restore.p2m_size);
 
     if ( ctx.dominfo.hvm )
     {
--- a/tools/libxc/xc_sr_restore_x86_hvm.c
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -3,6 +3,10 @@ 
 
 #include "xc_sr_common_x86.h"
 
+#define SUPERPAGE_2MB_SHIFT   9 /* Extent order passed to xc_domain_populate_physmap for a 2MB request. */
+#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) /* pfns covered by one 2MB superpage */
+#define SUPERPAGE_1GB_SHIFT   18 /* Extent order passed to xc_domain_populate_physmap for a 1GB request. */
+#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) /* pfns covered by one 1GB superpage */
 /*
  * Process an HVM_CONTEXT record from the stream.
  */
@@ -130,6 +134,17 @@  static int x86_hvm_setup(struct xc_sr_co
         return -1;
     }
 
+    if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_1g, (ctx->restore.p2m_size >> SUPERPAGE_1GB_SHIFT) + 1) ||
+        !xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_2m, (ctx->restore.p2m_size >> SUPERPAGE_2MB_SHIFT) + 1) ||
+        !xc_sr_bitmap_resize(&ctx->x86_hvm.restore.allocated_pfns, ctx->restore.p2m_size + 1))
+    {
+        ERROR("Unable to allocate memory for allocated_pfns bitmaps");
+        return -1;
+    }
+    /* No superpage in 1st 2MB due to VGA hole */
+    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_1g);
+    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_2m);
+
     return 0;
 }
 
@@ -209,8 +224,110 @@  static int x86_hvm_cleanup(struct xc_sr_
     return 0;
 }
 
+static bool pfn_is_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn) /* Is 'pfn' marked in the HVM allocated_pfns bitmap? */
+{
+    return xc_sr_test_bit(pfn, &ctx->x86_hvm.restore.allocated_pfns); /* Out-of-range pfns read as not allocated. */
+}
+
+/*
+ * Mark a pfn as allocated, growing the allocated_pfns bitmap if needed (the
+ * growth policy is whatever xc_sr_set_bit() applies).
+ * Returns 0 on success, or -1 with errno set to ENOMEM if growing failed.
+ */
+static int pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch; /* presumably consumed by the ERROR() macro — TODO confirm */
+
+    if ( !xc_sr_set_bit(pfn, &ctx->x86_hvm.restore.allocated_pfns) )
+    {
+        ERROR("Failed to realloc allocated_pfns bitmap");
+        errno = ENOMEM;
+        return -1;
+    }
+    return 0;
+}
+
+static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) /* Back 'pfn' with memory: try 1G, then 2M, then a single page. */
+{
+    xc_interface *xch = ctx->xch;
+    bool success = false;
+    int rc = -1; /* Stays -1 if every allocation attempt below fails. */
+    long done;
+    unsigned long i;
+    unsigned long stat_1g = 0, stat_2m = 0, stat_4k = 0; /* Only feed the summary log line below. */
+    unsigned long idx_1g, idx_2m;
+    unsigned long count;
+    xen_pfn_t base_pfn = 0, sp_extent;
+
+    IPRINTF("pfn %lx\n", (long)pfn);
+    if (pfn_is_allocated(ctx, pfn)) /* Already marked by an earlier call: nothing to do. */
+        return 0;
+
+    idx_1g = pfn >> SUPERPAGE_1GB_SHIFT; /* Slot of this pfn in attempted_1g. */
+    idx_2m = pfn >> SUPERPAGE_2MB_SHIFT; /* Slot of this pfn in attempted_2m. */
+    if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_1g, idx_1g)) /* test_and_set below does no range check. */
+    {
+        PERROR("Failed to realloc attempted_1g");
+        return -1;
+    }
+    if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_2m, idx_2m))
+    {
+        PERROR("Failed to realloc attempted_2m");
+        return -1;
+    }
+    IPRINTF("idx_1g %lu idx_2m %lu\n", idx_1g, idx_2m);
+    if (!xc_sr_test_and_set_bit(idx_1g, &ctx->x86_hvm.restore.attempted_1g)) { /* Each 1G slot is tried at most once. */
+        count = 1UL << SUPERPAGE_1GB_SHIFT;
+        base_pfn = (pfn >> SUPERPAGE_1GB_SHIFT) << SUPERPAGE_1GB_SHIFT; /* Round pfn down to the 1G boundary. */
+        sp_extent = base_pfn;
+        done = xc_domain_populate_physmap(xch, ctx->domid, 1, SUPERPAGE_1GB_SHIFT, 0, &sp_extent);
+        IPRINTF("1G base_pfn %lx count %lu done %ld\n", (long)base_pfn, count, done);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_1g = done;
+            for (i = 0; i < (count >> SUPERPAGE_2MB_SHIFT); i++) /* Mark every 2M slot inside this 1G page as attempted, */
+                xc_sr_set_bit((base_pfn >> SUPERPAGE_2MB_SHIFT) + i, &ctx->x86_hvm.restore.attempted_2m); /* so the 2M path below is skipped. */
+        }
+    }
+
+    if (!xc_sr_test_and_set_bit(idx_2m, &ctx->x86_hvm.restore.attempted_2m)) { /* Each 2M slot is tried at most once. */
+        count = 1UL << SUPERPAGE_2MB_SHIFT;
+        base_pfn = (pfn >> SUPERPAGE_2MB_SHIFT) << SUPERPAGE_2MB_SHIFT; /* Round pfn down to the 2M boundary. */
+        sp_extent = base_pfn;
+        done = xc_domain_populate_physmap(xch, ctx->domid, 1, SUPERPAGE_2MB_SHIFT, 0, &sp_extent);
+        IPRINTF("2M base_pfn %lx count %lu done %ld\n", (long)base_pfn, count, done);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_2m = done;
+        }
+    }
+    if (success == false) { /* No superpage was allocated in this call: fall back to one order-0 page. */
+        count = 1;
+        sp_extent = base_pfn = pfn;
+        done = xc_domain_populate_physmap(xch, ctx->domid, count, 0, 0, &sp_extent);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_4k = count;
+        }
+    }
+    IPRINTF("count %lu 1G %lu 2M %lu 4k %lu\n", count, stat_1g, stat_2m, stat_4k);
+    if (success == true) {
+        do { /* Mark base_pfn + count - 1 down to base_pfn as allocated. */
+            count--;
+            rc = pfn_set_allocated(ctx, base_pfn + count);
+            if (rc) /* Stop at the first bitmap failure; rc carries the error out. */
+                break;
+        } while (count);
+    }
+    return rc;
+}
+
 struct xc_sr_restore_ops restore_ops_x86_hvm =
 {
+    .allocate_pfn    = x86_hvm_allocate_pfn,
     .pfn_is_valid    = x86_hvm_pfn_is_valid,
     .pfn_to_gfn      = x86_hvm_pfn_to_gfn,
     .set_gfn         = x86_hvm_set_gfn,
--- a/tools/libxc/xc_sr_restore_x86_pv.c
+++ b/tools/libxc/xc_sr_restore_x86_pv.c
@@ -1141,8 +1141,15 @@  static int x86_pv_cleanup(struct xc_sr_c
     return 0;
 }
 
+static int x86_pv_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) /* PV stub: allocation is not implemented. */
+{
+    errno = ENOMEM;
+    return -1; /* Always fails, so any populate_pfns() call that reaches this hook errors out. */
+}
+
 struct xc_sr_restore_ops restore_ops_x86_pv =
 {
+    .allocate_pfn    = x86_pv_allocate_pfn,
     .pfn_is_valid    = x86_pv_pfn_is_valid,
     .pfn_to_gfn      = pfn_to_mfn,
     .set_page_type   = x86_pv_set_page_type,
--- a/tools/libxc/xc_sr_common.c
+++ b/tools/libxc/xc_sr_common.c
@@ -153,6 +153,42 @@  static void __attribute__((unused)) buil
     XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params)        != 8);
 }
 
+bool _xc_sr_bitmap_resize(struct xc_sr_bitmap *bm, unsigned long bits) /* Grow bm to cover bit index 'bits'; false if realloc fails. */
+{
+    if (!bm->p || bits > bm->bits) /* Also allocate when nothing has been allocated yet. */
+    {
+        size_t new_max;
+        size_t old_sz, new_sz;
+        void *p;
+
+        /* Round up to the nearest power of two larger than bits, less 1. */
+        new_max = bits;
+        new_max |= new_max >> 1;
+        new_max |= new_max >> 2;
+        new_max |= new_max >> 4;
+        new_max |= new_max >> 8;
+        new_max |= new_max >> 16;
+#ifdef __x86_64__
+        new_max |= new_max >> 32;
+#endif
+
+        old_sz = bitmap_size(bm->bits + 1);
+        new_sz = bitmap_size(new_max + 1);
+        p = realloc(bm->p, new_sz); /* On failure bm is left untouched and still valid. */
+        if (!p)
+            return false;
+
+        if (bm->p)
+            memset((char *)p + old_sz, 0, new_sz - old_sz); /* Zero only the new tail; cast avoids void * arithmetic. */
+        else
+            memset(p, 0, new_sz); /* First allocation: zero everything. */
+
+        bm->p = p;
+        bm->bits = new_max;
+    }
+    return true;
+}
+
 /*
  * Local variables:
  * mode: C