
[v9,3/3] tools/libxc: use superpages during restore of HVM guest

Message ID 20170901160843.9057-4-olaf@aepfle.de (mailing list archive)
State New, archived

Commit Message

Olaf Hering Sept. 1, 2017, 4:08 p.m. UTC
During creation of an HVM domU, meminit_hvm() tries to map superpages.
After save/restore or migration this mapping is lost and everything is
allocated in single pages, which causes a performance degradation after
migration.

Add the necessary code to preallocate a superpage for each chunk of pfns
that is received. If a pfn was not populated on the sending side, it
must be freed on the receiving side to avoid over-allocation.

The existing code for x86_pv is moved unmodified into its own file.

Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
 tools/libxc/xc_sr_common.h          |  26 ++-
 tools/libxc/xc_sr_restore.c         |  75 +-------
 tools/libxc/xc_sr_restore_x86_hvm.c | 341 ++++++++++++++++++++++++++++++++++++
 tools/libxc/xc_sr_restore_x86_pv.c  |  72 +++++++-
 4 files changed, 436 insertions(+), 78 deletions(-)

Comments

Wei Liu Sept. 6, 2017, 11:34 a.m. UTC | #1
On Fri, Sep 01, 2017 at 06:08:43PM +0200, Olaf Hering wrote:
[...]
> diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
> index 734320947a..93141a6e25 100644
> --- a/tools/libxc/xc_sr_common.h
> +++ b/tools/libxc/xc_sr_common.h
> @@ -139,6 +139,16 @@ struct xc_sr_restore_ops
>       */
>      int (*setup)(struct xc_sr_context *ctx);
>  
> +    /**
> +     * Populate PFNs
> +     *
> +     * Given a set of pfns, obtain memory from Xen to fill the physmap for the
> +     * unpopulated subset.
> +     */
> +    int (*populate_pfns)(struct xc_sr_context *ctx, unsigned count,
> +                         const xen_pfn_t *original_pfns, const uint32_t *types);
> +

One blank line is good enough.

> +
>      /**
>       * Process an individual record from the stream.  The caller shall take
>       * care of processing common records (e.g. END, PAGE_DATA).
> @@ -224,6 +234,8 @@ struct xc_sr_context
>  
>              int send_back_fd;
>              unsigned long p2m_size;
> +            unsigned long max_pages;
> +            unsigned long tot_pages;
>              xc_hypercall_buffer_t dirty_bitmap_hbuf;
>  
>              /* From Image Header. */
> @@ -336,6 +348,12 @@ struct xc_sr_context
>                      /* HVM context blob. */
>                      void *context;
>                      size_t contextsz;
> +
> +                    /* Bitmap of currently allocated PFNs during restore. */
> +                    struct xc_sr_bitmap attempted_1g;
> +                    struct xc_sr_bitmap attempted_2m;
> +                    struct xc_sr_bitmap allocated_pfns;
> +                    xen_pfn_t idx1G_prev, idx2M_prev;
>                  } restore;
>              };
>          } x86_hvm;
> @@ -459,14 +477,6 @@ static inline int write_record(struct xc_sr_context *ctx,
>   */
>  int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
>  
> -/*
> - * This would ideally be private in restore.c, but is needed by
> - * x86_pv_localise_page() if we receive pagetables frames ahead of the
> - * contents of the frames they point at.
> - */
> -int populate_pfns(struct xc_sr_context *ctx, unsigned count,
> -                  const xen_pfn_t *original_pfns, const uint32_t *types);
> -
>  #endif
>  /*
>   * Local variables:
[...]
>  
> +struct x86_hvm_sp {

Forgot to ask: what does sp stand for?

> +static bool x86_hvm_punch_hole(struct xc_sr_context *ctx, xen_pfn_t max_pfn)
> +{
> +    xc_interface *xch = ctx->xch;
> +    struct xc_sr_bitmap *bm;
> +    xen_pfn_t _pfn, pfn, min_pfn;
> +    uint32_t domid, freed = 0, order;

unsigned int / long for freed and order.

> +    int rc = -1;
> +
> +    /*
> +     * Scan the entire superpage because several batches will fit into
> +     * a superpage, and it is unknown which pfn triggered the allocation.
> +     */
> +    order = SUPERPAGE_1GB_SHIFT;
> +    pfn = min_pfn = (max_pfn >> order) << order;
> +

min_pfn -> start_pfn?

> +    while ( pfn <= max_pfn )
> +    {

bm can be defined here.

> +        bm = &ctx->x86_hvm.restore.allocated_pfns;
> +        if ( !xc_sr_bitmap_resize(bm, pfn) )
> +        {
> +            PERROR("Failed to realloc allocated_pfns %" PRI_xen_pfn, pfn);
> +            return false;
> +        }
> +        if ( !pfn_is_populated(ctx, pfn) &&
> +            xc_sr_test_and_clear_bit(pfn, bm) ) {

domid and _pfn can be defined here.

> +            domid = ctx->domid;
> +            _pfn = pfn;
> +            rc = xc_domain_decrease_reservation_exact(xch, domid, 1, 0, &_pfn);

Please batch the requests, otherwise it is going to be very slow.

It should be feasible to construct an array of pfns here and issue a
single decrease_reservation outside of this loop.
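
A minimal sketch of that batched variant (untested; it reuses the names
from the hunk above and assumes the bitmap was already resized for the
whole range):

    /* Collect every pfn to free into one array, then issue a single
     * decrease reservation for the whole batch. */
    xen_pfn_t *batch = malloc(SUPERPAGE_1GB_NR_PFNS * sizeof(*batch));
    unsigned long nr = 0;

    if ( !batch )
        return false;

    for ( pfn = min_pfn; pfn <= max_pfn; pfn++ )
        if ( !pfn_is_populated(ctx, pfn) &&
             xc_sr_test_and_clear_bit(pfn, bm) )
            batch[nr++] = pfn;

    if ( nr && xc_domain_decrease_reservation_exact(xch, ctx->domid,
                                                    nr, 0, batch) )
    {
        PERROR("Failed to release %lu pfns", nr);
        free(batch);
        return false;
    }

    ctx->restore.tot_pages -= nr;
    free(batch);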

> +            if ( rc )
> +            {
> +                PERROR("Failed to release pfn %" PRI_xen_pfn, pfn);
> +                return false;
> +            }
> +            ctx->restore.tot_pages--;
> +            freed++;
> +        }
> +        pfn++;
> +    }
> +    if ( freed )
> +        DPRINTF("freed %u between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
> +                freed, min_pfn, max_pfn);
> +    return true;
> +}
> +
> +/*
> + * Try to allocate superpages.
> + * This works without memory map only if the pfns arrive in incremental order.
> + */

I have said several times, one way or another, I don't want to make
assumption on the stream of pfns. So I'm afraid I can't ack a patch like
this.

If Ian or Andrew thinks this is OK, I won't stand in the way.

> +static int x86_hvm_populate_pfns(struct xc_sr_context *ctx, unsigned count,
> +                                 const xen_pfn_t *original_pfns,

original_pfns -> pfns?

The list is not copied and/or altered in any way afaict.

(I skipped the rest)
Andrew Cooper Sept. 6, 2017, 11:39 a.m. UTC | #2
>> +            if ( rc )
>> +            {
>> +                PERROR("Failed to release pfn %" PRI_xen_pfn, pfn);
>> +                return false;
>> +            }
>> +            ctx->restore.tot_pages--;
>> +            freed++;
>> +        }
>> +        pfn++;
>> +    }
>> +    if ( freed )
>> +        DPRINTF("freed %u between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
>> +                freed, min_pfn, max_pfn);
>> +    return true;
>> +}
>> +
>> +/*
>> + * Try to allocate superpages.
>> + * This works without memory map only if the pfns arrive in incremental order.
>> + */
> I have said several times, one way or another, I don't want to make
> assumption on the stream of pfns. So I'm afraid I can't ack a patch like
> this.
>
> If Ian or Andrew thinks this is OK, I won't stand in the way.

The stream has always been in-order for the first pass (even in the
legacy days), and I don't foresee that changing.  Reliance on the order
was suggested by both myself and Jan during the early design.

It is certainly an acceptable assumption until we put a proper address
map into the head of the stream.

~Andrew
Olaf Hering Sept. 6, 2017, 12:02 p.m. UTC | #3
Am Wed, 6 Sep 2017 12:34:10 +0100
schrieb Wei Liu <wei.liu2@citrix.com>:

> > +struct x86_hvm_sp {  
> Forgot to ask: what does sp stand for?

superpage. I will check if there is room to expand this string.

> > + * Try to allocate superpages.
> > + * This works without memory map only if the pfns arrive in incremental order.
> > + */  
> I have said several times, one way or another, I don't want to make
> assumption on the stream of pfns. So I'm afraid I can't ack a patch like
> this.

It will work with any order, I think; it is just that with incremental
order the superpages will not be split once they have been allocated.

Thanks for the review. I will send another series shortly.

Olaf
Andrew Cooper Sept. 6, 2017, 12:13 p.m. UTC | #4
On 01/09/17 17:08, Olaf Hering wrote:
> +    /* No superpage in 1st 2MB due to VGA hole */
> +    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_1g);
> +    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_2m);

This is false for PVH guests.

I still fail to understand why you need the bitmaps at all.  You can
calculate everything you need from the pfn list alone, which will also
let you spot the presence or absence of the VGA hole.

You need to track which pfns you've seen so far in the stream, and which
pfns have been populated.  When you find holes in the pfns in the
stream, you need to undo the prospective superpage allocation.  Unless
I've missed something?
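
Something along these lines, perhaps (an illustrative sketch only;
last_pfn, pfn_is_allocated() and release_pfn() are made-up names for
whatever tracking ends up being used):

    /* For an in-order stream, any gap between the previous pfn and the
     * current one is a hole: undo the speculative superpage allocation
     * for pfns that were allocated but will never arrive. */
    xen_pfn_t p, pfn = pfns[i];

    for ( p = ctx->restore.last_pfn + 1; p < pfn; p++ )
        if ( pfn_is_allocated(ctx, p) && !pfn_is_populated(ctx, p) )
            release_pfn(ctx, p);

    ctx->restore.last_pfn = pfn;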

Also, please take care to use 2M decrease reservations wherever
possible, or you will end up shattering the host superpage as part of
trying to remove the memory.
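
For instance (a sketch; range_is_unpopulated() is a placeholder for
whatever check tracks unpopulated pfns):

    /* Free a 2M-aligned, fully-unpopulated run with a single order-9
     * call rather than 512 order-0 calls, so that the host superpage
     * backing it is not shattered. */
    if ( !(pfn & (SUPERPAGE_2MB_NR_PFNS - 1)) &&
         range_is_unpopulated(ctx, pfn, SUPERPAGE_2MB_NR_PFNS) )
    {
        xen_pfn_t base = pfn;

        if ( xc_domain_decrease_reservation_exact(xch, ctx->domid, 1,
                                            SUPERPAGE_2MB_SHIFT, &base) )
            return false;
        pfn += SUPERPAGE_2MB_NR_PFNS;
    }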

~Andrew

> +
>      return 0;
> +
> +out:
> +    ERROR("Unable to allocate memory for pfn bitmaps");
> +    return -1;
>  }
>  
>  /*
>
Olaf Hering Sept. 6, 2017, 12:17 p.m. UTC | #5
On Wed, Sep 06, Andrew Cooper wrote:

> On 01/09/17 17:08, Olaf Hering wrote:
> > +    /* No superpage in 1st 2MB due to VGA hole */
> > +    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_1g);
> > +    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_2m);
> This is false for PVH guests.

How can I detect a PVH guest?

Olaf
Andrew Cooper Sept. 6, 2017, 12:23 p.m. UTC | #6
On 06/09/17 13:17, Olaf Hering wrote:
> On Wed, Sep 06, Andrew Cooper wrote:
>
>> On 01/09/17 17:08, Olaf Hering wrote:
>>> +    /* No superpage in 1st 2MB due to VGA hole */
>>> +    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_1g);
>>> +    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_2m);
>> This is false for PVH guests.
> How can I detect a PVH guest?

You (hopefully) can't, and it would be a layering violation if you could.

The exact set of emulation available to/used by a guest is not relevant
to how we move its memory.

If a PVH guest has got MTRRs disabled, then it genuinely can run on an
unshattered 1G superpage at 0.

~Andrew
Olaf Hering Sept. 6, 2017, 12:24 p.m. UTC | #7
On Wed, Sep 06, Andrew Cooper wrote:

> I still fail to understand why you need the bitmaps at all?  You can
> calculate everything you need from the pfn list alone, which will also
> let you spot the presence or absence of the VGA hole.

These bitmaps track whether a range has already been allocated as a
superpage. Without them, a second pfn falling within an already-handled
1G or 2M range could trigger a double allocation of a 1G or 2M page.
This is not related to the VGA hole; those two lines are just hints that
no superpage can be allocated in this range.

> You need to track which pfns you've see so far in the stream, and which
> pfns have been populated.  When you find holes in the pfns in the
> stream, you need to undo the prospective superpage allocation.  Unless
> I've missed something?

This is what's happening: holes are punched as soon as they are seen in
the stream.

> Also, please take care to use 2M decrease reservations wherever
> possible, or you will end up shattering the host superpage as part of
> trying to remove the memory.

This is what Wei suggested: build a list of pfns instead of releasing
each pfn individually. I think with this new code it should be possible
to decrease the reservation in 2M steps as needed.

Olaf
Olaf Hering Sept. 6, 2017, 12:25 p.m. UTC | #8
On Wed, Sep 06, Andrew Cooper wrote:

> If a PVH guest has got MTRRs disabled, then it genuinely can run on an
> unshattered 1G superpage at 0.

Ok, the code will detect the holes and will release memory as needed. I
will drop these two lines.

Olaf
Olaf Hering Sept. 8, 2017, 11:45 a.m. UTC | #9
On Wed, Sep 06, Andrew Cooper wrote:

> The stream has always been in-order for the first pass (even in the
> legacy days), and I don't foresee that changing.  Reliance on the order
> was suggested by both myself and Jan during the early design.

A related question: is it safe to increase MAX_BATCH_SIZE from 1024 to
(256*1024) to transfer a whole gigabyte at a time? That way it will be
easier to handle holes within a 1GB superpage.

Olaf
Olaf Hering Oct. 11, 2017, 2:15 p.m. UTC | #10
On Fri, Sep 08, Olaf Hering wrote:

> A related question: is it safe to increase MAX_BATCH_SIZE from 1024 to
> (256*1024) to transfer a whole gigabyte at a time? That way it will be
> easier to handle holes within a 1GB superpage.

To answer my own question:

This change leads to this error:

-#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
+#define MAX_BATCH_SIZE SUPERPAGE_1GB_NR_PFNS   /* up to 1GB at a time */

...
xc: info: Found x86 HVM domain from Xen 4.10
xc: detail: dom 9 p2m_size fee01 max_pages 100100
xc: info: Restoring domain
xc: error: Failed to read Record Header from stream (0 = Success): Internal error
xc: error: Restore failed (0 = Success): Internal error
...

Olaf
Olaf Hering Oct. 11, 2017, 3:09 p.m. UTC | #11
On Wed, Oct 11, Olaf Hering wrote:

> -#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
> +#define MAX_BATCH_SIZE SUPERPAGE_1GB_NR_PFNS   /* up to 1GB at a time */

Actually the error is something else; I missed it in the debug output:

xc: error: Failed to get types for pfn batch (7 = Argument list too long): Internal error

write_batch() should probably split the requests when filling types[] because
Xen has "1024" hardcoded in XEN_DOMCTL_getpageframeinfo3...


Olaf

Patch

diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index 734320947a..93141a6e25 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -139,6 +139,16 @@  struct xc_sr_restore_ops
      */
     int (*setup)(struct xc_sr_context *ctx);
 
+    /**
+     * Populate PFNs
+     *
+     * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+     * unpopulated subset.
+     */
+    int (*populate_pfns)(struct xc_sr_context *ctx, unsigned count,
+                         const xen_pfn_t *original_pfns, const uint32_t *types);
+
+
     /**
      * Process an individual record from the stream.  The caller shall take
      * care of processing common records (e.g. END, PAGE_DATA).
@@ -224,6 +234,8 @@  struct xc_sr_context
 
             int send_back_fd;
             unsigned long p2m_size;
+            unsigned long max_pages;
+            unsigned long tot_pages;
             xc_hypercall_buffer_t dirty_bitmap_hbuf;
 
             /* From Image Header. */
@@ -336,6 +348,12 @@  struct xc_sr_context
                     /* HVM context blob. */
                     void *context;
                     size_t contextsz;
+
+                    /* Bitmap of currently allocated PFNs during restore. */
+                    struct xc_sr_bitmap attempted_1g;
+                    struct xc_sr_bitmap attempted_2m;
+                    struct xc_sr_bitmap allocated_pfns;
+                    xen_pfn_t idx1G_prev, idx2M_prev;
                 } restore;
             };
         } x86_hvm;
@@ -459,14 +477,6 @@  static inline int write_record(struct xc_sr_context *ctx,
  */
 int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
 
-/*
- * This would ideally be private in restore.c, but is needed by
- * x86_pv_localise_page() if we receive pagetables frames ahead of the
- * contents of the frames they point at.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned count,
-                  const xen_pfn_t *original_pfns, const uint32_t *types);
-
 #endif
 /*
  * Local variables:
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index d53948e1a6..8cd9289d1a 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -68,74 +68,6 @@  static int read_headers(struct xc_sr_context *ctx)
     return 0;
 }
 
-/*
- * Given a set of pfns, obtain memory from Xen to fill the physmap for the
- * unpopulated subset.  If types is NULL, no page type checking is performed
- * and all unpopulated pfns are populated.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned count,
-                  const xen_pfn_t *original_pfns, const uint32_t *types)
-{
-    xc_interface *xch = ctx->xch;
-    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
-        *pfns = malloc(count * sizeof(*pfns));
-    unsigned i, nr_pfns = 0;
-    int rc = -1;
-
-    if ( !mfns || !pfns )
-    {
-        ERROR("Failed to allocate %zu bytes for populating the physmap",
-              2 * count * sizeof(*mfns));
-        goto err;
-    }
-
-    for ( i = 0; i < count; ++i )
-    {
-        if ( (!types || (types &&
-                         (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
-                          types[i] != XEN_DOMCTL_PFINFO_BROKEN))) &&
-             !pfn_is_populated(ctx, original_pfns[i]) )
-        {
-            rc = pfn_set_populated(ctx, original_pfns[i]);
-            if ( rc )
-                goto err;
-            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
-            ++nr_pfns;
-        }
-    }
-
-    if ( nr_pfns )
-    {
-        rc = xc_domain_populate_physmap_exact(
-            xch, ctx->domid, nr_pfns, 0, 0, mfns);
-        if ( rc )
-        {
-            PERROR("Failed to populate physmap");
-            goto err;
-        }
-
-        for ( i = 0; i < nr_pfns; ++i )
-        {
-            if ( mfns[i] == INVALID_MFN )
-            {
-                ERROR("Populate physmap failed for pfn %u", i);
-                rc = -1;
-                goto err;
-            }
-
-            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
-        }
-    }
-
-    rc = 0;
-
- err:
-    free(pfns);
-    free(mfns);
-
-    return rc;
-}
-
 /*
  * Given a list of pfns, their types, and a block of page data from the
  * stream, populate and record their types, map the relevant subset and copy
@@ -161,7 +93,7 @@  static int process_page_data(struct xc_sr_context *ctx, unsigned count,
         goto err;
     }
 
-    rc = populate_pfns(ctx, count, pfns, types);
+    rc = ctx->restore.ops.populate_pfns(ctx, count, pfns, types);
     if ( rc )
     {
         ERROR("Failed to populate pfns for batch of %u pages", count);
@@ -826,7 +758,12 @@  int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
         return -1;
     }
 
+    /* See xc_domain_getinfo */
+    ctx.restore.max_pages = ctx.dominfo.max_memkb >> (PAGE_SHIFT-10);
+    ctx.restore.tot_pages = ctx.dominfo.nr_pages;
     ctx.restore.p2m_size = nr_pfns;
+    DPRINTF("dom %u p2m_size %lx max_pages %lx",
+            ctx.domid, ctx.restore.p2m_size, ctx.restore.max_pages);
 
     if ( ctx.dominfo.hvm )
     {
diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c b/tools/libxc/xc_sr_restore_x86_hvm.c
index 1dca85354a..a3e5309a83 100644
--- a/tools/libxc/xc_sr_restore_x86_hvm.c
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -135,6 +135,8 @@  static int x86_hvm_localise_page(struct xc_sr_context *ctx,
 static int x86_hvm_setup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
+    struct xc_sr_bitmap *bm;
+    unsigned long bits;
 
     if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM )
     {
@@ -149,7 +151,30 @@  static int x86_hvm_setup(struct xc_sr_context *ctx)
         return -1;
     }
 
+    bm = &ctx->x86_hvm.restore.attempted_1g;
+    bits = (ctx->restore.p2m_size >> SUPERPAGE_1GB_SHIFT) + 1;
+    if ( xc_sr_bitmap_resize(bm, bits) == false )
+        goto out;
+
+    bm = &ctx->x86_hvm.restore.attempted_2m;
+    bits = (ctx->restore.p2m_size >> SUPERPAGE_2MB_SHIFT) + 1;
+    if ( xc_sr_bitmap_resize(bm, bits) == false )
+        goto out;
+
+    bm = &ctx->x86_hvm.restore.allocated_pfns;
+    bits = ctx->restore.p2m_size + 1;
+    if ( xc_sr_bitmap_resize(bm, bits) == false )
+        goto out;
+
+    /* No superpage in 1st 2MB due to VGA hole */
+    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_1g);
+    xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_2m);
+
     return 0;
+
+out:
+    ERROR("Unable to allocate memory for pfn bitmaps");
+    return -1;
 }
 
 /*
@@ -224,10 +249,325 @@  static int x86_hvm_stream_complete(struct xc_sr_context *ctx)
 static int x86_hvm_cleanup(struct xc_sr_context *ctx)
 {
     free(ctx->x86_hvm.restore.context);
+    xc_sr_bitmap_free(&ctx->x86_hvm.restore.attempted_1g);
+    xc_sr_bitmap_free(&ctx->x86_hvm.restore.attempted_2m);
+    xc_sr_bitmap_free(&ctx->x86_hvm.restore.allocated_pfns);
+
+    return 0;
+}
 
+/*
+ * Set a pfn as allocated, expanding the tracking structures if needed.
+ */
+static int pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch;
+
+    if ( !xc_sr_set_bit(pfn, &ctx->x86_hvm.restore.allocated_pfns) )
+    {
+        ERROR("Failed to realloc allocated_pfns bitmap");
+        errno = ENOMEM;
+        return -1;
+    }
     return 0;
 }
 
+struct x86_hvm_sp {
+    xen_pfn_t pfn;
+    xen_pfn_t base_pfn;
+    unsigned long index;
+    unsigned long count;
+};
+
+/*
+ * Try to allocate a 1GB page for this pfn, but avoid Over-allocation.
+ * If this succeeds, mark the range of 2MB pages as busy.
+ */
+static bool x86_hvm_alloc_1g(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_bitmap *bm;
+    unsigned int order, shift;
+    int i, done;
+    xen_pfn_t extent;
+
+    bm = &ctx->x86_hvm.restore.attempted_1g;
+
+    /* Only one attempt to avoid overlapping allocation */
+    if ( xc_sr_test_and_set_bit(sp->index, bm) )
+        return false;
+
+    order = SUPERPAGE_1GB_SHIFT;
+    sp->count = 1ULL << order;
+
+    /* Allocate only if there is room for another superpage */
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    extent = sp->base_pfn = (sp->pfn >> order) << order;
+    done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( done < 0 ) {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( done == 0 )
+        return false;
+
+    DPRINTF("1G base_pfn %" PRI_xen_pfn "\n", sp->base_pfn);
+
+    /* Mark all 2MB pages as done to avoid overlapping allocation */
+    bm = &ctx->x86_hvm.restore.attempted_2m;
+    shift = SUPERPAGE_1GB_SHIFT - SUPERPAGE_2MB_SHIFT;
+    for ( i = 0; i < (sp->count >> shift); i++ )
+        xc_sr_set_bit((sp->base_pfn >> SUPERPAGE_2MB_SHIFT) + i, bm);
+
+    return true;
+}
+
+/* Allocate a 2MB page if x86_hvm_alloc_1g failed, avoid Over-allocation. */
+static bool x86_hvm_alloc_2m(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_bitmap *bm;
+    unsigned int order;
+    int done;
+    xen_pfn_t extent;
+
+    bm = &ctx->x86_hvm.restore.attempted_2m;
+
+    /* Only one attempt to avoid overlapping allocation */
+    if ( xc_sr_test_and_set_bit(sp->index, bm) )
+        return false;
+
+    order = SUPERPAGE_2MB_SHIFT;
+    sp->count = 1ULL << order;
+
+    /* Allocate only if there is room for another superpage */
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    extent = sp->base_pfn = (sp->pfn >> order) << order;
+    done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( done < 0 ) {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( done == 0 )
+        return false;
+
+    DPRINTF("2M base_pfn %" PRI_xen_pfn "\n", sp->base_pfn);
+    return true;
+}
+
+/* Allocate a single page if x86_hvm_alloc_2m failed. */
+static bool x86_hvm_alloc_4k(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned int order;
+    int done;
+    xen_pfn_t extent;
+
+    order = 0;
+    sp->count = 1ULL << order;
+
+    /* Allocate only if there is room for another page */
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    extent = sp->base_pfn = (sp->pfn >> order) << order;
+    done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( done < 0 ) {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( done == 0 )
+        return false;
+
+    DPRINTF("4K base_pfn %" PRI_xen_pfn "\n", sp->base_pfn);
+    return true;
+}
+/*
+ * Attempt to allocate a superpage where the pfn resides.
+ */
+static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch;
+    bool success;
+    int rc = -1;
+    unsigned long idx_1g, idx_2m;
+    struct x86_hvm_sp sp = {
+        .pfn = pfn
+    };
+
+    if ( xc_sr_test_bit(pfn, &ctx->x86_hvm.restore.allocated_pfns) )
+        return 0;
+
+    idx_1g = pfn >> SUPERPAGE_1GB_SHIFT;
+    idx_2m = pfn >> SUPERPAGE_2MB_SHIFT;
+    if ( !xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_1g, idx_1g) )
+    {
+        PERROR("Failed to realloc attempted_1g");
+        return -1;
+    }
+    if ( !xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_2m, idx_2m) )
+    {
+        PERROR("Failed to realloc attempted_2m");
+        return -1;
+    }
+
+    sp.index = idx_1g;
+    success = x86_hvm_alloc_1g(ctx, &sp);
+
+    if ( success == false ) {
+        sp.index = idx_2m;
+        success = x86_hvm_alloc_2m(ctx, &sp);
+    }
+
+    if ( success == false ) {
+        sp.index = 0;
+        success = x86_hvm_alloc_4k(ctx, &sp);
+    }
+
+    if ( success == true ) {
+        do {
+            sp.count--;
+            ctx->restore.tot_pages++;
+            rc = pfn_set_allocated(ctx, sp.base_pfn + sp.count);
+            if ( rc )
+                break;
+        } while ( sp.count );
+    }
+    return rc;
+}
+
+static bool x86_hvm_punch_hole(struct xc_sr_context *ctx, xen_pfn_t max_pfn)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_bitmap *bm;
+    xen_pfn_t _pfn, pfn, min_pfn;
+    uint32_t domid, freed = 0, order;
+    int rc = -1;
+
+    /*
+     * Scan the entire superpage because several batches will fit into
+     * a superpage, and it is unknown which pfn triggered the allocation.
+     */
+    order = SUPERPAGE_1GB_SHIFT;
+    pfn = min_pfn = (max_pfn >> order) << order;
+
+    while ( pfn <= max_pfn )
+    {
+        bm = &ctx->x86_hvm.restore.allocated_pfns;
+        if ( !xc_sr_bitmap_resize(bm, pfn) )
+        {
+            PERROR("Failed to realloc allocated_pfns %" PRI_xen_pfn, pfn);
+            return false;
+        }
+        if ( !pfn_is_populated(ctx, pfn) &&
+            xc_sr_test_and_clear_bit(pfn, bm) ) {
+            domid = ctx->domid;
+            _pfn = pfn;
+            rc = xc_domain_decrease_reservation_exact(xch, domid, 1, 0, &_pfn);
+            if ( rc )
+            {
+                PERROR("Failed to release pfn %" PRI_xen_pfn, pfn);
+                return false;
+            }
+            ctx->restore.tot_pages--;
+            freed++;
+        }
+        pfn++;
+    }
+    if ( freed )
+        DPRINTF("freed %u between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
+                freed, min_pfn, max_pfn);
+    return true;
+}
+
+/*
+ * Try to allocate superpages.
+ * This works without memory map only if the pfns arrive in incremental order.
+ */
+static int x86_hvm_populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                                 const xen_pfn_t *original_pfns,
+                                 const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t pfn, min_pfn = original_pfns[0], max_pfn = original_pfns[0];
+    xen_pfn_t idx1G, idx2M;
+    unsigned i, order;
+    int rc = -1;
+
+    /* Loop once over the array to show statistics */
+    for ( i = 0; i < count; ++i )
+    {
+        if ( original_pfns[i] < min_pfn )
+            min_pfn = original_pfns[i];
+        if ( original_pfns[i] > max_pfn )
+            max_pfn = original_pfns[i];
+    }
+    DPRINTF("batch of %u pfns between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
+            count, min_pfn, max_pfn);
+
+    for ( i = 0; i < count; ++i )
+    {
+        pfn = original_pfns[i];
+        idx1G = pfn >> SUPERPAGE_1GB_SHIFT;
+        idx2M = pfn >> SUPERPAGE_2MB_SHIFT;
+
+        /*
+         * If this pfn is in another 2MB superpage it is required to punch holes
+         * to release memory, starting from the 1GB boundary up to the highest
+         * pfn within the previous 2MB superpage.
+         */
+        if ( ctx->x86_hvm.restore.idx1G_prev == idx1G &&
+             ctx->x86_hvm.restore.idx2M_prev == idx2M )
+        {
+            /* Same 2MB superpage, nothing to do */
+            ;
+        } else {
+            /*
+             * If this next pfn is within another 1GB or 2MB superpage it is
+             * required to scan the entire previous superpage because there
+             * might be holes between the last pfn and the end of the superpage.
+             */
+            if ( ctx->x86_hvm.restore.idx1G_prev != idx1G )
+            {
+                order = SUPERPAGE_1GB_SHIFT;
+                max_pfn = ((ctx->x86_hvm.restore.idx1G_prev + 1) << order) - 1;
+            }
+            else
+            {
+                order = SUPERPAGE_2MB_SHIFT;
+                max_pfn = ((ctx->x86_hvm.restore.idx2M_prev + 1) << order) - 1;
+            }
+
+            if ( x86_hvm_punch_hole(ctx, max_pfn) == false )
+                goto err;
+        }
+
+        if ( (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
+              types[i] != XEN_DOMCTL_PFINFO_BROKEN) &&
+             !pfn_is_populated(ctx, pfn) )
+        {
+            rc = x86_hvm_allocate_pfn(ctx, pfn);
+            if ( rc )
+                goto err;
+            rc = pfn_set_populated(ctx, pfn);
+            if ( rc )
+                goto err;
+        }
+        ctx->x86_hvm.restore.idx1G_prev = idx1G;
+        ctx->x86_hvm.restore.idx2M_prev = idx2M;
+    }
+
+    rc = 0;
+
+ err:
+    return rc;
+}
+
+
 struct xc_sr_restore_ops restore_ops_x86_hvm =
 {
     .pfn_is_valid    = x86_hvm_pfn_is_valid,
@@ -236,6 +576,7 @@  struct xc_sr_restore_ops restore_ops_x86_hvm =
     .set_page_type   = x86_hvm_set_page_type,
     .localise_page   = x86_hvm_localise_page,
     .setup           = x86_hvm_setup,
+    .populate_pfns   = x86_hvm_populate_pfns,
     .process_record  = x86_hvm_process_record,
     .stream_complete = x86_hvm_stream_complete,
     .cleanup         = x86_hvm_cleanup,
diff --git a/tools/libxc/xc_sr_restore_x86_pv.c b/tools/libxc/xc_sr_restore_x86_pv.c
index 50e25c162c..87957559bc 100644
--- a/tools/libxc/xc_sr_restore_x86_pv.c
+++ b/tools/libxc/xc_sr_restore_x86_pv.c
@@ -936,6 +936,75 @@  static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
         ((uint32_t *)ctx->x86_pv.p2m)[pfn] = mfn;
 }
 
+/*
+ * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+ * unpopulated subset.  If types is NULL, no page type checking is performed
+ * and all unpopulated pfns are populated.
+ */
+static int x86_pv_populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                                const xen_pfn_t *original_pfns,
+                                const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
+        *pfns = malloc(count * sizeof(*pfns));
+    unsigned i, nr_pfns = 0;
+    int rc = -1;
+
+    if ( !mfns || !pfns )
+    {
+        ERROR("Failed to allocate %zu bytes for populating the physmap",
+              2 * count * sizeof(*mfns));
+        goto err;
+    }
+
+    for ( i = 0; i < count; ++i )
+    {
+        if ( (!types || (types &&
+                         (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
+                          types[i] != XEN_DOMCTL_PFINFO_BROKEN))) &&
+             !pfn_is_populated(ctx, original_pfns[i]) )
+        {
+            rc = pfn_set_populated(ctx, original_pfns[i]);
+            if ( rc )
+                goto err;
+            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+            ++nr_pfns;
+        }
+    }
+
+    if ( nr_pfns )
+    {
+        rc = xc_domain_populate_physmap_exact(
+            xch, ctx->domid, nr_pfns, 0, 0, mfns);
+        if ( rc )
+        {
+            PERROR("Failed to populate physmap");
+            goto err;
+        }
+
+        for ( i = 0; i < nr_pfns; ++i )
+        {
+            if ( mfns[i] == INVALID_MFN )
+            {
+                ERROR("Populate physmap failed for pfn %u", i);
+                rc = -1;
+                goto err;
+            }
+
+            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+        }
+    }
+
+    rc = 0;
+
+ err:
+    free(pfns);
+    free(mfns);
+
+    return rc;
+}
+
 /*
  * restore_ops function.  Convert pfns back to mfns in pagetables.  Possibly
  * needs to populate new frames if a PTE is found referring to a frame which
@@ -980,7 +1049,7 @@  static int x86_pv_localise_page(struct xc_sr_context *ctx,
         }
     }
 
-    if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
+    if ( to_populate && x86_pv_populate_pfns(ctx, to_populate, pfns, NULL) )
         return -1;
 
     for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
@@ -1160,6 +1229,7 @@  struct xc_sr_restore_ops restore_ops_x86_pv =
     .set_gfn         = x86_pv_set_gfn,
     .localise_page   = x86_pv_localise_page,
     .setup           = x86_pv_setup,
+    .populate_pfns   = x86_pv_populate_pfns,
     .process_record  = x86_pv_process_record,
     .stream_complete = x86_pv_stream_complete,
     .cleanup         = x86_pv_cleanup,