--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -131,6 +131,9 @@ struct xc_dom_image {
* a hybrid guest this means that it maps GPFNs to GPFNS.
*
* Note that the input is offset by rambase.
+ *
+ * This is not populated when the selected arch_hooks provide their
+ * own p2m_host lookup hook.
*/
xen_pfn_t *p2m_host;
void *p2m_guest;
@@ -274,6 +277,10 @@ struct xc_dom_arch {
int arch_private_size;
struct xc_dom_arch *next;
+
+    /* Arch-specific p2m lookup, which when set replaces the p2m_host
+     * array stored in xc_dom_image. */
+    xen_pfn_t (*p2m_host)(struct xc_dom_image *dom, unsigned long idx);
};
void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks);
@@ -437,7 +444,10 @@ static inline xen_pfn_t xc_dom_p2m(struct xc_dom_image *dom, xen_pfn_t pfn)
return pfn;
if (pfn < dom->rambase_pfn || pfn >= dom->rambase_pfn + dom->total_pages)
return INVALID_MFN;
- return dom->p2m_host[pfn - dom->rambase_pfn];
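+    /* Prefer the arch-specific lookup hook over the p2m_host array. */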
+    return dom->arch_hooks->p2m_host ?
+        dom->arch_hooks->p2m_host(dom, pfn - dom->rambase_pfn) :
+        dom->p2m_host[pfn - dom->rambase_pfn];
}
#endif /* _XC_DOM_H */
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -547,6 +547,7 @@ static struct xc_dom_arch xc_dom_32 = {
.meminit = meminit,
.bootearly = bootearly,
.bootlate = bootlate,
+ .p2m_host = NULL,
};
static struct xc_dom_arch xc_dom_64 = {
@@ -563,6 +564,7 @@ static struct xc_dom_arch xc_dom_64 = {
.meminit = meminit,
.bootearly = bootearly,
.bootlate = bootlate,
+ .p2m_host = NULL,
};
static void __init register_arch_hooks(void)
--- a/tools/libxc/xc_dom_core.c
+++ b/tools/libxc/xc_dom_core.c
@@ -985,7 +985,13 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom)
__FUNCTION__, dom->p2m_size);
p2m_32 = dom->p2m_guest;
for ( i = 0; i < dom->p2m_size; i++ )
- if ( dom->p2m_host[i] != INVALID_PFN )
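+            /*
+             * The arch lookup hook reports holes as (xen_pfn_t)-1, which
+             * truncates to the same (uint32_t)-1 stored below.
+             */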
+ if ( dom->arch_hooks->p2m_host )
+ p2m_32[i] = dom->arch_hooks->p2m_host(dom, i);
+ else if ( dom->p2m_host[i] != INVALID_PFN )
p2m_32[i] = dom->p2m_host[i];
else
p2m_32[i] = (uint32_t) - 1;
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -101,6 +101,11 @@ struct xc_dom_image_x86 {
#define MAPPING_MAX 2
struct xc_dom_x86_mapping maps[MAPPING_MAX];
struct xc_dom_params *params;
+
+    /* Dummy vmemrange info, used when the caller provides no vNUMA
+     * information; the p2m lookup hook reads pfn ranges from here too. */
+ xen_vmemrange_t dummy_vmemrange[2];
+ unsigned int nr_dummy_vmemranges;
};
/* get guest IO ABI protocol */
@@ -1252,13 +1256,13 @@ static int meminit_hvm(struct xc_dom_image *dom)
unsigned int memflags = 0;
int claim_enabled = dom->claim_enabled;
uint64_t total_pages;
- xen_vmemrange_t dummy_vmemrange[2];
unsigned int dummy_vnode_to_pnode[1];
xen_vmemrange_t *vmemranges;
unsigned int *vnode_to_pnode;
unsigned int nr_vmemranges, nr_vnodes;
xc_interface *xch = dom->xch;
uint32_t domid = dom->guest_domid;
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
if ( nr_pages > target_pages )
memflags |= XENMEMF_populate_on_demand;
@@ -1274,25 +1278,26 @@ static int meminit_hvm(struct xc_dom_image *dom)
* has no effect on the actual result.
*/
- dummy_vmemrange[0].start = 0;
- dummy_vmemrange[0].end = dom->lowmem_end;
- dummy_vmemrange[0].flags = 0;
- dummy_vmemrange[0].nid = 0;
- nr_vmemranges = 1;
+ domx86->dummy_vmemrange[0].start = 0;
+ domx86->dummy_vmemrange[0].end = dom->lowmem_end;
+ domx86->dummy_vmemrange[0].flags = 0;
+ domx86->dummy_vmemrange[0].nid = 0;
+ domx86->nr_dummy_vmemranges = 1;
if ( dom->highmem_end > (1ULL << 32) )
{
-        dummy_vmemrange[1].start = 1ULL << 32;
-        dummy_vmemrange[1].end = dom->highmem_end;
-        dummy_vmemrange[1].flags = 0;
-        dummy_vmemrange[1].nid = 0;
-        nr_vmemranges++;
+        domx86->dummy_vmemrange[1].start = 1ULL << 32;
+        domx86->dummy_vmemrange[1].end = dom->highmem_end;
+        domx86->dummy_vmemrange[1].flags = 0;
+        domx86->dummy_vmemrange[1].nid = 0;
+        domx86->nr_dummy_vmemranges++;
}
dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
nr_vnodes = 1;
- vmemranges = dummy_vmemrange;
+ vmemranges = domx86->dummy_vmemrange;
+ nr_vmemranges = domx86->nr_dummy_vmemranges;
vnode_to_pnode = dummy_vnode_to_pnode;
}
else
@@ -1329,25 +1334,6 @@ static int meminit_hvm(struct xc_dom_image *dom)
}
dom->p2m_size = p2m_size;
- dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) *
- dom->p2m_size);
- if ( dom->p2m_host == NULL )
- {
- DOMPRINTF("Could not allocate p2m");
- goto error_out;
- }
-
- for ( i = 0; i < p2m_size; i++ )
- dom->p2m_host[i] = ((xen_pfn_t)-1);
- for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
- {
- uint64_t pfn;
-
- for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT;
- pfn < vmemranges[vmemid].end >> PAGE_SHIFT;
- pfn++ )
- dom->p2m_host[pfn] = pfn;
- }
/*
* Try to claim pages for early warning of insufficient memory available.
@@ -1395,8 +1381,17 @@ static int meminit_hvm(struct xc_dom_image *dom)
*/
if ( dom->device_model )
{
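+        /*
+         * There is no p2m_host array to point into any more; assemble
+         * the first 0xa0 guest pfns into a temporary batch instead.
+         */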
+        xen_pfn_t pfn_batch[0xa0];
+
+        for ( i = 0; i < 0xa0; i++ )
+            pfn_batch[i] = dom->arch_hooks->p2m_host(dom, i);
+
rc = xc_domain_populate_physmap_exact(
- xch, domid, 0xa0, 0, memflags, &dom->p2m_host[0x00]);
+ xch, domid, 0xa0, 0, memflags, &pfn_batch[0x00]);
if ( rc != 0 )
{
DOMPRINTF("Could not populate low memory (< 0xA0).\n");
@@ -1439,7 +1429,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
if ( count > max_pages )
count = max_pages;
- cur_pfn = dom->p2m_host[cur_pages];
+ cur_pfn = dom->arch_hooks->p2m_host(dom, cur_pages);
/* Take care the corner cases of super page tails */
if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
@@ -1465,8 +1455,8 @@ static int meminit_hvm(struct xc_dom_image *dom)
xen_pfn_t sp_extents[nr_extents];
for ( i = 0; i < nr_extents; i++ )
-                        sp_extents[i] =
-                            dom->p2m_host[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
+                        sp_extents[i] = dom->arch_hooks->p2m_host(
+                            dom, cur_pages + (i << SUPERPAGE_1GB_SHIFT));
done = xc_domain_populate_physmap(xch, domid, nr_extents,
SUPERPAGE_1GB_SHIFT,
@@ -1505,8 +1494,8 @@ static int meminit_hvm(struct xc_dom_image *dom)
xen_pfn_t sp_extents[nr_extents];
for ( i = 0; i < nr_extents; i++ )
-                        sp_extents[i] =
-                            dom->p2m_host[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
+                        sp_extents[i] = dom->arch_hooks->p2m_host(
+                            dom, cur_pages + (i << SUPERPAGE_2MB_SHIFT));
done = xc_domain_populate_physmap(xch, domid, nr_extents,
SUPERPAGE_2MB_SHIFT,
@@ -1521,14 +1509,48 @@ static int meminit_hvm(struct xc_dom_image *dom)
}
}
}
-
/* Fall back to 4kB extents. */
if ( count != 0 )
{
- rc = xc_domain_populate_physmap_exact(
- xch, domid, count, 0, new_memflags, &dom->p2m_host[cur_pages]);
- cur_pages += count;
- stat_normal_pages += count;
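+            /*
+             * Without a p2m_host array to hand to the hypervisor in one go,
+             * populate in bounded batches: fill a scratch buffer from the
+             * arch lookup hook, at most SUPERPAGE_1GB_NR_PFNS pfns at a time.
+             */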
+            unsigned long nr_extents;
+            xen_pfn_t *pfn_batch;
+
+            pfn_batch = calloc(SUPERPAGE_1GB_NR_PFNS, sizeof(*pfn_batch));
+            if ( pfn_batch == NULL )
+            {
+                DOMPRINTF("Could not allocate memory for physmap batch.");
+                rc = -1;
+                goto error_out;
+            }
+
+            while ( count > 0 )
+            {
+                nr_extents =
+                    count > SUPERPAGE_1GB_NR_PFNS ? SUPERPAGE_1GB_NR_PFNS : count;
+
+                for ( i = 0; i < nr_extents; i++ )
+                    pfn_batch[i] = dom->arch_hooks->p2m_host(dom, cur_pages + i);
+
+                rc = xc_domain_populate_physmap_exact(xch, domid, nr_extents, 0,
+                                                      new_memflags, &pfn_batch[0]);
+                if ( rc != 0 )
+                {
+                    DOMPRINTF("Could not populate physmap batch.");
+                    free(pfn_batch);
+                    goto error_out;
+                }
+
+                stat_normal_pages += nr_extents;
+                cur_pages += nr_extents;
+                count -= nr_extents;
+            }
+
+            free(pfn_batch);
}
}
@@ -1780,6 +1793,38 @@ static int bootlate_hvm(struct xc_dom_image *dom)
return 0;
}
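+/*
+ * On-the-fly replacement for the p2m_host array: a pfn that falls inside
+ * one of the guest's vmemranges maps to itself, anything else is invalid.
+ */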
+static xen_pfn_t p2m_host_hvm(struct xc_dom_image *dom, unsigned long idx)
+{
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
+    xen_vmemrange_t *vmemranges;
+    unsigned int nr_vmemranges;
+    unsigned int vmemid;
+
+    if ( dom->nr_vmemranges )
+    {
+        vmemranges = dom->vmemranges;
+        nr_vmemranges = dom->nr_vmemranges;
+    }
+    else
+    {
+        vmemranges = domx86->dummy_vmemrange;
+        nr_vmemranges = domx86->nr_dummy_vmemranges;
+    }
+
+    for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
+    {
+        if ( idx >= (vmemranges[vmemid].start >> XC_DOM_PAGE_SHIFT(dom)) &&
+             idx < (vmemranges[vmemid].end >> XC_DOM_PAGE_SHIFT(dom)) )
+            return idx;
+    }
+
+    return ((xen_pfn_t)-1);
+}
+
bool xc_dom_translated(const struct xc_dom_image *dom)
{
/* HVM guests are translated. PV guests are not. */
@@ -1805,6 +1843,7 @@ static struct xc_dom_arch xc_dom_32_pae = {
.meminit = meminit_pv,
.bootearly = bootearly,
.bootlate = bootlate_pv,
+ .p2m_host = NULL,
};
static struct xc_dom_arch xc_dom_64 = {
@@ -1824,6 +1863,7 @@ static struct xc_dom_arch xc_dom_64 = {
.meminit = meminit_pv,
.bootearly = bootearly,
.bootlate = bootlate_pv,
+ .p2m_host = NULL,
};
static struct xc_dom_arch xc_hvm_32 = {
@@ -1831,6 +1871,7 @@ static struct xc_dom_arch xc_hvm_32 = {
.native_protocol = XEN_IO_PROTO_ABI_X86_32,
.page_shift = PAGE_SHIFT_X86,
.sizeof_pfn = 4,
+ .arch_private_size = sizeof(struct xc_dom_image_x86),
.alloc_magic_pages = alloc_magic_pages_hvm,
.alloc_pgtables = alloc_pgtables_hvm,
.setup_pgtables = NULL,
@@ -1840,6 +1881,7 @@ static struct xc_dom_arch xc_hvm_32 = {
.meminit = meminit_hvm,
.bootearly = bootearly,
.bootlate = bootlate_hvm,
+ .p2m_host = p2m_host_hvm,
};
static void __init register_arch_hooks(void)

When allocating guest memory for an HVM domain, libxc builds the P2M
mapping for the entirety of the guest memory up front and keeps it
around for the duration of the launch as xc_dom_image->p2m_host. For
guests with a large amount of memory (3904 GiB), this array costs one
8-byte xen_pfn_t per 4 KiB page, i.e. more than 7.5 GiB, and leaves xl
susceptible to getting OOM-killed on guest creation.

Convert the p2m_host table lookups to an arch-specific function that
returns the mapping on the fly for x86 HVM guests, avoiding the
allocation entirely and bringing xl's memory usage for such launches
down from > 8 GiB to < 70 MiB.

Signed-off-by: Varad Gautam <vrd@amazon.de>
---
Applies to stable-4.11+.

 tools/libxc/include/xc_dom.h |  11 +++-
 tools/libxc/xc_dom_arm.c     |   2 +
 tools/libxc/xc_dom_core.c    |   4 +-
 tools/libxc/xc_dom_x86.c     | 126 ++++++++++++++++++++++++++++---------------
 4 files changed, 99 insertions(+), 44 deletions(-)
When allocating the guest memory for an HVM domain, libxc keeps the P2M mapping for the entirety of the guest memory around for the time of the launch as xc_dom_image->p2m_host. For guests that have a large memory (3904 GiB), the p2m_host allocation takes more than 7.5 GiB of space, and leaves xl susceptible to getting OOM-killed on guest creation. Convert the p2m_host table lookups to an arch-specific function that returns the mapping on-the-fly for x86 HVM guests to avoid this allocation, bringing down xl's memory usage from > 8GiB to < 70Mib for such launches. Signed-off-by: Varad Gautam <vrd@amazon.de> --- Applies to stable-4.11+. tools/libxc/include/xc_dom.h | 11 +++- tools/libxc/xc_dom_arm.c | 2 + tools/libxc/xc_dom_core.c | 4 +- tools/libxc/xc_dom_x86.c | 126 ++++++++++++++++++++++++++++--------------- 4 files changed, 99 insertions(+), 44 deletions(-)