[v4,10/14] xen/x86: populate PVHv2 Dom0 physical memory map

Message ID 20161130164950.43543-11-roger.pau@citrix.com (mailing list archive)
State New, archived

Commit Message

Roger Pau Monné Nov. 30, 2016, 4:49 p.m. UTC
Craft the Dom0 e820 memory map and populate it. Introduce a helper to remove
memory pages that are shared between Xen and a domain, and use it to remove the
low 1MB RAM regions from dom_io so that they can be assigned to a PVHv2 Dom0.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v3:
 - Drop get_order_from_bytes_floor; it was only used by
   hvm_populate_memory_range.
 - Switch hvm_populate_memory_range to use frame numbers instead of full memory
   addresses.
 - Add a helper to steal the low 1MB RAM areas from dom_io and add them to Dom0
   as normal RAM.
 - Introduce unshare_xen_page_with_guest to remove pages from dom_io so they
   can be assigned to other domains. This is needed to remove the low 1MB RAM
   regions from dom_io and assign them to the hardware_domain.
 - Simplify the loop in hvm_steal_ram.
 - Move definition of map_identity_mmio into this patch.

Changes since v2:
 - Introduce get_order_from_bytes_floor as a local function to
   domain_build.c.
 - Remove extra asserts.
 - Make hvm_populate_memory_range return an error code instead of panicking.
 - Fix comments and printks.
 - Use ULL suffix instead of casting to uint64_t.
 - Rename hvm_setup_vmx_unrestricted_guest to
   hvm_setup_vmx_realmode_helpers.
 - Only subtract two pages from the memory calculation; they will be used by
   the MADT replacement.
 - Remove some comments.
 - Remove printing allocation information.
 - Don't stash any pages for the MADT, TSS or ident PT; those will be
   subtracted directly from the RAM regions of the memory map.
 - Count the number of iterations before calling process_pending_softirqs
   when populating the memory map.
 - Move the initial call to process_pending_softirqs into construct_dom0,
   and remove the ones from construct_dom0_hvm and construct_dom0_pv.
 - Make memflags global so it can be shared between alloc_chunk and
   hvm_populate_memory_range.

Changes since RFC:
 - Use IS_ALIGNED instead of checking with PAGE_MASK.
 - Use the new %pB specifier in order to print sizes in human readable form.
 - Create a VM86 TSS for hardware that doesn't support unrestricted mode.
 - Subtract guest RAM for the identity page table and the VM86 TSS.
 - Split the creation of the unrestricted mode helper structures to a
   separate function.
 - Use preemption with paging_set_allocation.
 - Use get_order_from_bytes_floor.
---
 xen/arch/x86/domain_build.c | 310 ++++++++++++++++++++++++++++++++++++++++++--
 xen/arch/x86/mm.c           |  37 ++++++
 xen/include/asm-x86/mm.h    |   2 +
 3 files changed, 340 insertions(+), 9 deletions(-)

Comments

Jan Beulich Dec. 9, 2016, 4:48 p.m. UTC | #1
>>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
> @@ -302,7 +307,8 @@ static unsigned long __init compute_dom0_nr_pages(
>              avail -= max_pdx >> s;
>      }
>  
> -    need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
> +    need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) &&
> +                  (!iommu_hap_pt_share || !paging_mode_hap(d)));

Indentation.

> @@ -545,11 +552,12 @@ static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
>      ASSERT(nr_holes == 0);
>  }
>  
> -static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> +static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)

Why?

> @@ -577,8 +585,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>              continue;
>          }
>  
> -        *entry_guest = *entry;
> -        pages = PFN_UP(entry_guest->size);
> +        /*
> +         * Make sure the start and length are aligned to PAGE_SIZE, because
> +         * that's the minimum granularity of the 2nd stage translation.
> +         */
> +        start = ROUNDUP(entry->addr, PAGE_SIZE);
> +        end = (entry->addr + entry->size) & PAGE_MASK;

Taking the comment into consideration, I wonder whether you
wouldn't better use PAGE_ORDER_4K here, as that's what the
p2m code uses.

> @@ -1010,8 +1029,6 @@ static int __init construct_dom0_pv(
>      BUG_ON(d->vcpu[0] == NULL);
>      BUG_ON(v->is_initialised);
>  
> -    process_pending_softirqs();

Wouldn't this adjustment better fit into the previous patch, together
with its companion below?

> +static int __init hvm_steal_ram(struct domain *d, unsigned long size,
> +                                paddr_t limit, paddr_t *addr)
> +{
> +    unsigned int i = d->arch.nr_e820;
> +
> +    while ( i-- )
> +    {
> +        struct e820entry *entry = &d->arch.e820[i];
> +
> +        if ( entry->type != E820_RAM || entry->size < size )
> +            continue;
> +
> +        /* Subtract from the beginning. */
> +        if ( entry->addr + size < limit && entry->addr >= MB(1) )

<= I think (for the left comparison)?

> +static void __init hvm_steal_low_ram(struct domain *d, unsigned long start,
> +                                     unsigned long nr_pages)
> +{
> +    unsigned long mfn;
> +
> +    ASSERT(start + nr_pages < PFN_DOWN(MB(1)));

<= again I think.

> +static int __init hvm_setup_p2m(struct domain *d)
> +{
> +    struct vcpu *v = d->vcpu[0];
> +    unsigned long nr_pages;
> +    unsigned int i;
> +    int rc;
> +    bool preempted;
> +#define MB1_PAGES PFN_DOWN(MB(1))
> +
> +    nr_pages = compute_dom0_nr_pages(d, NULL, 0);
> +
> +    hvm_setup_e820(d, nr_pages);
> +    do {
> +        preempted = false;
> +        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
> +                              &preempted);
> +        process_pending_softirqs();
> +    } while ( preempted );
> +
> +    /*
> +     * Memory below 1MB is identity mapped.
> +     * NB: this only makes sense when booted from legacy BIOS.
> +     */
> +    rc = modify_identity_mmio(d, 0, PFN_DOWN(MB(1)), true);
> +    if ( rc )
> +    {
> +        printk("Failed to identity map low 1MB: %d\n", rc);
> +        return rc;
> +    }
> +
> +    /* Populate memory map. */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        unsigned long addr, size;
> +
> +        if ( d->arch.e820[i].type != E820_RAM )
> +            continue;
> +
> +        addr = PFN_DOWN(d->arch.e820[i].addr);
> +        size = PFN_DOWN(d->arch.e820[i].size);
> +
> +        if ( addr >= MB1_PAGES )
> +            rc = hvm_populate_memory_range(d, addr, size);
> +        else if ( addr + size > MB1_PAGES )
> +        {
> +            hvm_steal_low_ram(d, addr, MB1_PAGES - addr);
> +            rc = hvm_populate_memory_range(d, MB1_PAGES,
> +                                           size - (MB1_PAGES - addr));

Is this case possible at all? All x86 systems have some form of
BIOS right below the 1Mb boundary, and the E820 map for
Dom0 is being derived from the host one.

> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -475,6 +475,43 @@ void share_xen_page_with_guest(
>      spin_unlock(&d->page_alloc_lock);
>  }
>  
> +int unshare_xen_page_with_guest(struct page_info *page, struct domain *d)

__init

And once it's __init, it may be possible to simplify it, as you don't need
to fear races anymore. E.g. you wouldn't need a loop over cmpxchg().

> +{
> +    unsigned long y, x;
> +    bool drop_dom_ref;
> +
> +    if ( page_get_owner(page) != d || !(page->count_info & PGC_xen_heap) )

Please don't open code is_xen_heap_page().

> +        return -EINVAL;
> +
> +    spin_lock(&d->page_alloc_lock);
> +
> +    /* Drop the page reference so we can change the owner. */
> +    y = page->count_info;
> +    do {
> +        x = y;
> +        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
> +        {
> +            spin_unlock(&d->page_alloc_lock);
> +            return -EINVAL;
> +        }
> +        y = cmpxchg(&page->count_info, x, PGC_xen_heap);
> +    } while ( y != x );
> +
> +    /* Remove the page from the list of domain pages. */
> +    page_list_del(page, &d->xenpage_list);
> +    drop_dom_ref = (--d->xenheap_pages == 0);

Aren't you open coding

    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
        put_page(page);

here (except for the check on the initial value, which could be
moved up)?

> +    /* Remove the owner and clear the flags. */
> +    page_set_owner(page, NULL);
> +    page->u.inuse.type_info = 0;

I think you'd better bail early if this is non-zero. Or else please use
the order used elsewhere (clearing type info, then owner) - while
it's benign, it avoids someone later wondering whether the order
is wrong in either place.

Jan
Roger Pau Monné Dec. 16, 2016, 5:38 p.m. UTC | #2
On Fri, Dec 09, 2016 at 09:48:32AM -0700, Jan Beulich wrote:
> >>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
> > @@ -302,7 +307,8 @@ static unsigned long __init compute_dom0_nr_pages(
> >              avail -= max_pdx >> s;
> >      }
> >  
> > -    need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
> > +    need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) &&
> > +                  (!iommu_hap_pt_share || !paging_mode_hap(d)));
> 
> Indentation.

Fixed.

> > @@ -545,11 +552,12 @@ static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
> >      ASSERT(nr_holes == 0);
> >  }
> >  
> > -static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> > +static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
> 
> Why?

So that afterwards I can remove all the pvh_ functions and leave the hvm_ ones
only. But seeing your response to the other patch, would you prefer me to just
use pvh_ for the new functions also?

> > @@ -577,8 +585,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> >              continue;
> >          }
> >  
> > -        *entry_guest = *entry;
> > -        pages = PFN_UP(entry_guest->size);
> > +        /*
> > +         * Make sure the start and length are aligned to PAGE_SIZE, because
> > +         * that's the minimum granularity of the 2nd stage translation.
> > +         */
> > +        start = ROUNDUP(entry->addr, PAGE_SIZE);
> > +        end = (entry->addr + entry->size) & PAGE_MASK;
> 
> Taking the comment into consideration, I wonder whether you
> wouldn't better use PAGE_ORDER_4K here, as that's what the
> p2m code uses.

That's going to be more cumbersome, since PAGE_SIZE would become 1UL <<
PAGE_ORDER_4K << PAGE_SHIFT, and PAGE_MASK the ~ of (that construct - 1). But
I see your point, maybe I should define PAGE_SIZE_4K and PAGE_MASK_4K in
xen/include/asm-x86/page.h?
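
I.e. something along these lines (untested sketch, exact names and placement
up for discussion):

#define PAGE_SIZE_4K (1UL << PAGE_ORDER_4K << PAGE_SHIFT)
#define PAGE_MASK_4K (~(PAGE_SIZE_4K - 1))

and then the alignment above would become:

        start = ROUNDUP(entry->addr, PAGE_SIZE_4K);
        end = (entry->addr + entry->size) & PAGE_MASK_4K;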

> > @@ -1010,8 +1029,6 @@ static int __init construct_dom0_pv(
> >      BUG_ON(d->vcpu[0] == NULL);
> >      BUG_ON(v->is_initialised);
> >  
> > -    process_pending_softirqs();
> 
> Wouldn't this adjustment better fit into the previous patch, together
> with its companion below?

Yes, I guess I must have forgotten to move this.

> > +static int __init hvm_steal_ram(struct domain *d, unsigned long size,
> > +                                paddr_t limit, paddr_t *addr)
> > +{
> > +    unsigned int i = d->arch.nr_e820;
> > +
> > +    while ( i-- )
> > +    {
> > +        struct e820entry *entry = &d->arch.e820[i];
> > +
> > +        if ( entry->type != E820_RAM || entry->size < size )
> > +            continue;
> > +
> > +        /* Subtract from the beginning. */
> > +        if ( entry->addr + size < limit && entry->addr >= MB(1) )
> 
> <= I think (for the left comparison)?

Yes.
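
I.e. the condition would then read:

        if ( entry->addr + size <= limit && entry->addr >= MB(1) )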

> > +static void __init hvm_steal_low_ram(struct domain *d, unsigned long start,
> > +                                     unsigned long nr_pages)
> > +{
> > +    unsigned long mfn;
> > +
> > +    ASSERT(start + nr_pages < PFN_DOWN(MB(1)));
> 
> <= again I think.

Right.

> > +static int __init hvm_setup_p2m(struct domain *d)
> > +{
> > +    struct vcpu *v = d->vcpu[0];
> > +    unsigned long nr_pages;
> > +    unsigned int i;
> > +    int rc;
> > +    bool preempted;
> > +#define MB1_PAGES PFN_DOWN(MB(1))
> > +
> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0);
> > +
> > +    hvm_setup_e820(d, nr_pages);
> > +    do {
> > +        preempted = false;
> > +        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
> > +                              &preempted);
> > +        process_pending_softirqs();
> > +    } while ( preempted );
> > +
> > +    /*
> > +     * Memory below 1MB is identity mapped.
> > +     * NB: this only makes sense when booted from legacy BIOS.
> > +     */
> > +    rc = modify_identity_mmio(d, 0, PFN_DOWN(MB(1)), true);
> > +    if ( rc )
> > +    {
> > +        printk("Failed to identity map low 1MB: %d\n", rc);
> > +        return rc;
> > +    }
> > +
> > +    /* Populate memory map. */
> > +    for ( i = 0; i < d->arch.nr_e820; i++ )
> > +    {
> > +        unsigned long addr, size;
> > +
> > +        if ( d->arch.e820[i].type != E820_RAM )
> > +            continue;
> > +
> > +        addr = PFN_DOWN(d->arch.e820[i].addr);
> > +        size = PFN_DOWN(d->arch.e820[i].size);
> > +
> > +        if ( addr >= MB1_PAGES )
> > +            rc = hvm_populate_memory_range(d, addr, size);
> > +        else if ( addr + size > MB1_PAGES )
> > +        {
> > +            hvm_steal_low_ram(d, addr, MB1_PAGES - addr);
> > +            rc = hvm_populate_memory_range(d, MB1_PAGES,
> > +                                           size - (MB1_PAGES - addr));
> 
> Is this case possible at all? All x86 systems have some form of
> BIOS right below the 1Mb boundary, and the E820 map for
> Dom0 is being derived from the host one.

Heh, I don't think so but I wanted to cover all possible inputs. TBH I have no
idea how broken e820 memory maps can really be.

Would you be fine with removing this case and adding an ASSERT instead?
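
E.g. something along these lines (untested sketch):

    /* Populate memory map. */
    for ( i = 0; i < d->arch.nr_e820; i++ )
    {
        unsigned long addr, size;

        if ( d->arch.e820[i].type != E820_RAM )
            continue;

        addr = PFN_DOWN(d->arch.e820[i].addr);
        size = PFN_DOWN(d->arch.e820[i].size);

        /* RAM regions below 1MB are not expected to cross the 1MB boundary. */
        ASSERT(addr >= MB1_PAGES || addr + size <= MB1_PAGES);

        if ( addr >= MB1_PAGES )
            rc = hvm_populate_memory_range(d, addr, size);
        else
            hvm_steal_low_ram(d, addr, size);

        if ( rc )
            return rc;
    }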

> > --- a/xen/arch/x86/mm.c
> > +++ b/xen/arch/x86/mm.c
> > @@ -475,6 +475,43 @@ void share_xen_page_with_guest(
> >      spin_unlock(&d->page_alloc_lock);
> >  }
> >  
> > +int unshare_xen_page_with_guest(struct page_info *page, struct domain *d)
> 
> __init
> 
> And once it's __init, it may be possible to simplify it, as you don't need
> to fear races anymore. E.g. you wouldn't need a loop over cmpxchg().

Indeed.

> > +{
> > +    unsigned long y, x;
> > +    bool drop_dom_ref;
> > +
> > +    if ( page_get_owner(page) != d || !(page->count_info & PGC_xen_heap) )
> 
> Please don't open code is_xen_heap_page().

Right, I'm not very knowledgeable about the mm functions yet.

> > +        return -EINVAL;
> > +
> > +    spin_lock(&d->page_alloc_lock);
> > +
> > +    /* Drop the page reference so we can change the owner. */
> > +    y = page->count_info;
> > +    do {
> > +        x = y;
> > +        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
> > +        {
> > +            spin_unlock(&d->page_alloc_lock);
> > +            return -EINVAL;
> > +        }
> > +        y = cmpxchg(&page->count_info, x, PGC_xen_heap);
> > +    } while ( y != x );
> > +
> > +    /* Remove the page from the list of domain pages. */
> > +    page_list_del(page, &d->xenpage_list);
> > +    drop_dom_ref = (--d->xenheap_pages == 0);
> 
> Aren't you open coding
> 
>     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
>         put_page(page);
> 
> here (except for the check on the initial value, which could be
> moved up)?

Yes, that's right.

> > +    /* Remove the owner and clear the flags. */
> > +    page_set_owner(page, NULL);
> > +    page->u.inuse.type_info = 0;
> 
> I think you'd better bail early if this is non-zero. Or else please use
> the order used elsewhere (clearing type info, then owner) - while
> it's benign, it avoids someone later wondering whether the order
> is wrong in either place.

It's certainly going to be non-zero, because share_xen_page_with_guest sets it
to:

page->u.inuse.type_info  = (readonly ? PGT_none : PGT_writable_page);
page->u.inuse.type_info |= PGT_validated | 1;

I've ended up coding it as:

int __init unshare_xen_page_with_guest(struct page_info *page,
                                       struct domain *d)
{
    if ( page_get_owner(page) != d || !is_xen_heap_page(page) )
        return -EINVAL;

    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
        put_page(page);

    /* Remove the owner and clear the flags. */
    page->u.inuse.type_info = 0;
    page_set_owner(page, NULL);

    return 0;
}

(note that put_page does indeed use a cmpxchg, but the benefits of not open
coding it far outweigh the penalty of using cmpxchg IMHO).

Roger.
Jan Beulich Dec. 19, 2016, 7:48 a.m. UTC | #3
>>> On 16.12.16 at 18:38, <roger.pau@citrix.com> wrote:
> On Fri, Dec 09, 2016 at 09:48:32AM -0700, Jan Beulich wrote:
>> >>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
>> > @@ -545,11 +552,12 @@ static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
>> >      ASSERT(nr_holes == 0);
>> >  }
>> >  
>> > -static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>> > +static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
>> 
>> Why?
> 
> So that afterwards I can remove all the pvh_ functions and leave the hvm_ ones
> only. But seeing your response to the other patch, would you prefer me to just
> use pvh_ for the new functions also?

Yes. After all the intention is to rip out all PVHv1 stuff.

>> > @@ -577,8 +585,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>> >              continue;
>> >          }
>> >  
>> > -        *entry_guest = *entry;
>> > -        pages = PFN_UP(entry_guest->size);
>> > +        /*
>> > +         * Make sure the start and length are aligned to PAGE_SIZE, because
>> > +         * that's the minimum granularity of the 2nd stage translation.
>> > +         */
>> > +        start = ROUNDUP(entry->addr, PAGE_SIZE);
>> > +        end = (entry->addr + entry->size) & PAGE_MASK;
>> 
>> Taking the comment into consideration, I wonder whether you
>> wouldn't better use PAGE_ORDER_4K here, as that's what the
>> p2m code uses.
> 
> That's going to be more cumbersome, since PAGE_SIZE would become 1UL <<
> PAGE_ORDER_4K << PAGE_SHIFT, and PAGE_MASK the ~ of (that construct - 1). But
> I see your point, maybe I should define PAGE_SIZE_4K and PAGE_MASK_4K in
> xen/include/asm-x86/page.h?

That's an option, but considering the p2m code has got along
without it so far, I'm not fully convinced we need it. Perhaps
get an opinion from George (as the x86/mm maintainer).

>> > +static int __init hvm_setup_p2m(struct domain *d)
>> > +{
>> > +    struct vcpu *v = d->vcpu[0];
>> > +    unsigned long nr_pages;
>> > +    unsigned int i;
>> > +    int rc;
>> > +    bool preempted;
>> > +#define MB1_PAGES PFN_DOWN(MB(1))
>> > +
>> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0);
>> > +
>> > +    hvm_setup_e820(d, nr_pages);
>> > +    do {
>> > +        preempted = false;
>> > +        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
>> > +                              &preempted);
>> > +        process_pending_softirqs();
>> > +    } while ( preempted );
>> > +
>> > +    /*
>> > +     * Memory below 1MB is identity mapped.
>> > +     * NB: this only makes sense when booted from legacy BIOS.
>> > +     */
>> > +    rc = modify_identity_mmio(d, 0, PFN_DOWN(MB(1)), true);
>> > +    if ( rc )
>> > +    {
>> > +        printk("Failed to identity map low 1MB: %d\n", rc);
>> > +        return rc;
>> > +    }
>> > +
>> > +    /* Populate memory map. */
>> > +    for ( i = 0; i < d->arch.nr_e820; i++ )
>> > +    {
>> > +        unsigned long addr, size;
>> > +
>> > +        if ( d->arch.e820[i].type != E820_RAM )
>> > +            continue;
>> > +
>> > +        addr = PFN_DOWN(d->arch.e820[i].addr);
>> > +        size = PFN_DOWN(d->arch.e820[i].size);
>> > +
>> > +        if ( addr >= MB1_PAGES )
>> > +            rc = hvm_populate_memory_range(d, addr, size);
>> > +        else if ( addr + size > MB1_PAGES )
>> > +        {
>> > +            hvm_steal_low_ram(d, addr, MB1_PAGES - addr);
>> > +            rc = hvm_populate_memory_range(d, MB1_PAGES,
>> > +                                           size - (MB1_PAGES - addr));
>> 
>> Is this case possible at all? All x86 systems have some form of
>> BIOS right below the 1Mb boundary, and the E820 map for
>> Dom0 is being derived from the host one.
> 
> Heh, I don't think so but I wanted to cover all possible inputs. TBH I have no
> idea how broken e820 memory maps can really be.
> 
> Would you be fine with removing this case and adding an ASSERT instead?

Yes; in fact that would be my preference.

>> > +    /* Remove the owner and clear the flags. */
>> > +    page_set_owner(page, NULL);
>> > +    page->u.inuse.type_info = 0;
>> 
>> I think you'd better bail early if this is non-zero. Or else please use
>> the order used elsewhere (clearing type info, then owner) - while
>> it's benign, it avoids someone later wondering whether the order
>> is wrong in either place.
> 
> It's certainly going to be non-zero, because share_xen_page_with_guest sets it
> to:
> 
> page->u.inuse.type_info  = (readonly ? PGT_none : PGT_writable_page);
> page->u.inuse.type_info |= PGT_validated | 1;
> 
> I've ended up coding it as:
> 
> int __init unshare_xen_page_with_guest(struct page_info *page,
>                                        struct domain *d)
> {
>     if ( page_get_owner(page) != d || !is_xen_heap_page(page) )
>         return -EINVAL;
> 
>     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
>         put_page(page);
> 
>     /* Remove the owner and clear the flags. */
>     page->u.inuse.type_info = 0;
>     page_set_owner(page, NULL);
> 
>     return 0;
> }

This looks good, thanks.

Jan

Patch

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 2c9ebf2..8602566 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -22,6 +22,7 @@ 
 #include <xen/compat.h>
 #include <xen/libelf.h>
 #include <xen/pfn.h>
+#include <xen/guest_access.h>
 #include <asm/regs.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -43,6 +44,9 @@  static long __initdata dom0_nrpages;
 static long __initdata dom0_min_nrpages;
 static long __initdata dom0_max_nrpages = LONG_MAX;
 
+/* Size of the VM86 TSS for virtual 8086 mode to use. */
+#define HVM_VM86_TSS_SIZE   128
+
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
  * 
@@ -213,11 +217,12 @@  boolean_param("ro-hpet", ro_hpet);
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
 #define round_pgdown(_p)  ((_p)&PAGE_MASK)
 
+static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
+
 static struct page_info * __init alloc_chunk(
     struct domain *d, unsigned long max_pages)
 {
     static unsigned int __initdata last_order = MAX_ORDER;
-    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
     struct page_info *page;
     unsigned int order = get_order_from_pages(max_pages), free_order;
 
@@ -302,7 +307,8 @@  static unsigned long __init compute_dom0_nr_pages(
             avail -= max_pdx >> s;
     }
 
-    need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
+    need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) &&
+                  (!iommu_hap_pt_share || !paging_mode_hap(d)));
     for ( ; ; need_paging = 0 )
     {
         nr_pages = dom0_nrpages;
@@ -334,7 +340,8 @@  static unsigned long __init compute_dom0_nr_pages(
         avail -= dom0_paging_pages(d, nr_pages);
     }
 
-    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
+    if ( is_pv_domain(d) &&
+         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
          ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
     {
         /*
@@ -545,11 +552,12 @@  static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
     ASSERT(nr_holes == 0);
 }
 
-static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
+static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
 {
     struct e820entry *entry, *entry_guest;
     unsigned int i;
     unsigned long pages, cur_pages = 0;
+    uint64_t start, end;
 
     /*
      * Craft the e820 memory map for Dom0 based on the hardware e820 map.
@@ -577,8 +585,19 @@  static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
             continue;
         }
 
-        *entry_guest = *entry;
-        pages = PFN_UP(entry_guest->size);
+        /*
+         * Make sure the start and length are aligned to PAGE_SIZE, because
+         * that's the minimum granularity of the 2nd stage translation.
+         */
+        start = ROUNDUP(entry->addr, PAGE_SIZE);
+        end = (entry->addr + entry->size) & PAGE_MASK;
+        if ( start >= end )
+            continue;
+
+        entry_guest->type = E820_RAM;
+        entry_guest->addr = start;
+        entry_guest->size = end - start;
+        pages = PFN_DOWN(entry_guest->size);
         if ( (cur_pages + pages) > nr_pages )
         {
             /* Truncate region */
@@ -1010,8 +1029,6 @@  static int __init construct_dom0_pv(
     BUG_ON(d->vcpu[0] == NULL);
     BUG_ON(v->is_initialised);
 
-    process_pending_softirqs();
-
     printk("*** LOADING DOMAIN 0 ***\n");
 
     d->max_pages = ~0U;
@@ -1637,7 +1654,7 @@  static int __init construct_dom0_pv(
         dom0_update_physmap(d, pfn, mfn, 0);
 
         pvh_map_all_iomem(d, nr_pages);
-        pvh_setup_e820(d, nr_pages);
+        hvm_setup_e820(d, nr_pages);
     }
 
     if ( d->domain_id == hardware_domid )
@@ -1653,15 +1670,289 @@  out:
     return rc;
 }
 
+static int __init modify_identity_mmio(struct domain *d, unsigned long pfn,
+                                       unsigned long nr_pages, bool map)
+{
+    int rc;
+
+    for ( ; ; )
+    {
+        rc = (map ? map_mmio_regions : unmap_mmio_regions)
+             (d, _gfn(pfn), nr_pages, _mfn(pfn));
+        if ( rc == 0 )
+            break;
+        if ( rc < 0 )
+        {
+            printk(XENLOG_WARNING
+                   "Failed to identity %smap [%#lx,%#lx) for d%d: %d\n",
+                   map ? "" : "un", pfn, pfn + nr_pages, d->domain_id, rc);
+            break;
+        }
+        nr_pages -= rc;
+        pfn += rc;
+        process_pending_softirqs();
+    }
+
+    return rc;
+}
+
+/* Populate an HVM memory range using the biggest possible order. */
+static int __init hvm_populate_memory_range(struct domain *d,
+                                            unsigned long start,
+                                            unsigned long nr_pages)
+{
+    unsigned int order, i = 0;
+    struct page_info *page;
+    int rc;
+#define MAP_MAX_ITER 64
+
+    order = MAX_ORDER;
+    while ( nr_pages != 0 )
+    {
+        unsigned int range_order = get_order_from_pages(nr_pages + 1);
+
+        order = min(range_order ? range_order - 1 : 0, order);
+        page = alloc_domheap_pages(d, order, memflags);
+        if ( page == NULL )
+        {
+            if ( order == 0 && memflags )
+            {
+                /* Try again without any memflags. */
+                memflags = 0;
+                order = MAX_ORDER;
+                continue;
+            }
+            if ( order == 0 )
+            {
+                printk("Unable to allocate memory with order 0!\n");
+                return -ENOMEM;
+            }
+            order--;
+            continue;
+        }
+
+        rc = guest_physmap_add_page(d, _gfn(start), _mfn(page_to_mfn(page)),
+                                    order);
+        if ( rc != 0 )
+        {
+            printk("Failed to populate memory: [%#lx,%lx): %d\n",
+                   start, start + (1UL << order), rc);
+            return -ENOMEM;
+        }
+        start += 1UL << order;
+        nr_pages -= 1UL << order;
+        if ( (++i % MAP_MAX_ITER) == 0 )
+            process_pending_softirqs();
+    }
+
+    return 0;
+#undef MAP_MAX_ITER
+}
+
+static int __init hvm_steal_ram(struct domain *d, unsigned long size,
+                                paddr_t limit, paddr_t *addr)
+{
+    unsigned int i = d->arch.nr_e820;
+
+    while ( i-- )
+    {
+        struct e820entry *entry = &d->arch.e820[i];
+
+        if ( entry->type != E820_RAM || entry->size < size )
+            continue;
+
+        /* Subtract from the beginning. */
+        if ( entry->addr + size < limit && entry->addr >= MB(1) )
+        {
+            *addr = entry->addr;
+            entry->addr += size;
+            entry->size -= size;
+            return 0;
+        }
+    }
+
+    return -ENOMEM;
+}
+
+static int __init hvm_setup_vmx_realmode_helpers(struct domain *d)
+{
+    p2m_type_t p2mt;
+    uint32_t rc, *ident_pt;
+    uint8_t *tss;
+    mfn_t mfn;
+    paddr_t gaddr;
+    unsigned int i;
+
+    /*
+     * Steal some space from the last found RAM region. One page will be
+     * used for the identity page tables, and the remaining space for the
+     * VM86 TSS. Note that after this not all e820 regions will be aligned
+     * to PAGE_SIZE.
+     */
+    if ( hvm_steal_ram(d, PAGE_SIZE + HVM_VM86_TSS_SIZE, ULONG_MAX, &gaddr) )
+    {
+        printk("Unable to find memory to stash the identity map and TSS\n");
+        return -ENOMEM;
+    }
+
+    /*
+     * Identity-map page table is required for running with CR0.PG=0
+     * when using Intel EPT. Create a 32-bit non-PAE page directory of
+     * superpages.
+     */
+    ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
+                              &mfn, &p2mt, 0, &rc);
+    if ( ident_pt == NULL )
+    {
+        printk("Unable to map identity page tables\n");
+        return -ENOMEM;
+    }
+    for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+        ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+    unmap_domain_page(ident_pt);
+    put_page(mfn_to_page(mfn_x(mfn)));
+    d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = gaddr;
+    gaddr += PAGE_SIZE;
+    ASSERT(IS_ALIGNED(gaddr, PAGE_SIZE));
+
+    tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
+                         &mfn, &p2mt, 0, &rc);
+    if ( tss == NULL )
+    {
+        printk("Unable to map VM86 TSS area\n");
+        return 0;
+    }
+
+    memset(tss, 0, HVM_VM86_TSS_SIZE);
+    unmap_domain_page(tss);
+    put_page(mfn_to_page(mfn_x(mfn)));
+    d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] = gaddr;
+
+    return 0;
+}
+
+static void __init hvm_steal_low_ram(struct domain *d, unsigned long start,
+                                     unsigned long nr_pages)
+{
+    unsigned long mfn;
+
+    ASSERT(start + nr_pages < PFN_DOWN(MB(1)));
+
+    for ( mfn = start; mfn < start + nr_pages; mfn++ )
+    {
+        struct page_info *pg = mfn_to_page(mfn);
+        int rc;
+
+        rc = unshare_xen_page_with_guest(pg, dom_io);
+        if ( rc )
+        {
+            printk("Unable to unshare Xen mfn %#lx: %d\n", mfn, rc);
+            continue;
+        }
+
+        share_xen_page_with_guest(pg, d, XENSHARE_writable);
+        rc = guest_physmap_add_entry(d, _gfn(mfn), _mfn(mfn), 0, p2m_ram_rw);
+        if ( rc )
+            printk("Unable to add mfn %#lx to p2m: %d\n", mfn, rc);
+    }
+}
+
+static int __init hvm_setup_p2m(struct domain *d)
+{
+    struct vcpu *v = d->vcpu[0];
+    unsigned long nr_pages;
+    unsigned int i;
+    int rc;
+    bool preempted;
+#define MB1_PAGES PFN_DOWN(MB(1))
+
+    nr_pages = compute_dom0_nr_pages(d, NULL, 0);
+
+    hvm_setup_e820(d, nr_pages);
+    do {
+        preempted = false;
+        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
+                              &preempted);
+        process_pending_softirqs();
+    } while ( preempted );
+
+    /*
+     * Memory below 1MB is identity mapped.
+     * NB: this only makes sense when booted from legacy BIOS.
+     */
+    rc = modify_identity_mmio(d, 0, PFN_DOWN(MB(1)), true);
+    if ( rc )
+    {
+        printk("Failed to identity map low 1MB: %d\n", rc);
+        return rc;
+    }
+
+    /* Populate memory map. */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        unsigned long addr, size;
+
+        if ( d->arch.e820[i].type != E820_RAM )
+            continue;
+
+        addr = PFN_DOWN(d->arch.e820[i].addr);
+        size = PFN_DOWN(d->arch.e820[i].size);
+
+        if ( addr >= MB1_PAGES )
+            rc = hvm_populate_memory_range(d, addr, size);
+        else if ( addr + size > MB1_PAGES )
+        {
+            hvm_steal_low_ram(d, addr, MB1_PAGES - addr);
+            rc = hvm_populate_memory_range(d, MB1_PAGES,
+                                           size - (MB1_PAGES - addr));
+        }
+        else
+            hvm_steal_low_ram(d, addr, size);
+
+        if ( rc )
+            return rc;
+    }
+
+    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
+    {
+        /*
+         * Since Dom0 cannot be migrated, we will only setup the
+         * unrestricted guest helpers if they are needed by the current
+         * hardware we are running on.
+         */
+        rc = hvm_setup_vmx_realmode_helpers(d);
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+#undef MB1_PAGES
+}
+
 static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
                                      unsigned long image_headroom,
                                      module_t *initrd,
                                      void *(*bootstrap_map)(const module_t *),
                                      char *cmdline)
 {
+    int rc;
 
     printk("** Building a PVH Dom0 **\n");
 
+    /* Sanity! */
+    BUG_ON(d->domain_id);
+    BUG_ON(!d->vcpu[0]);
+
+    iommu_hwdom_init(d);
+
+    rc = hvm_setup_p2m(d);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 physical memory map\n");
+        return rc;
+    }
+
     return 0;
 }
 
@@ -1670,6 +1961,7 @@  int __init construct_dom0(struct domain *d, const module_t *image,
                           void *(*bootstrap_map)(const module_t *),
                           char *cmdline)
 {
+    process_pending_softirqs();
 
     return (is_hvm_domain(d) ? construct_dom0_hvm : construct_dom0_pv)
            (d, image, image_headroom, initrd,bootstrap_map, cmdline);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 03dcd71..a31106c 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -475,6 +475,43 @@  void share_xen_page_with_guest(
     spin_unlock(&d->page_alloc_lock);
 }
 
+int unshare_xen_page_with_guest(struct page_info *page, struct domain *d)
+{
+    unsigned long y, x;
+    bool drop_dom_ref;
+
+    if ( page_get_owner(page) != d || !(page->count_info & PGC_xen_heap) )
+        return -EINVAL;
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* Drop the page reference so we can change the owner. */
+    y = page->count_info;
+    do {
+        x = y;
+        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
+        {
+            spin_unlock(&d->page_alloc_lock);
+            return -EINVAL;
+        }
+        y = cmpxchg(&page->count_info, x, PGC_xen_heap);
+    } while ( y != x );
+
+    /* Remove the page from the list of domain pages. */
+    page_list_del(page, &d->xenpage_list);
+    drop_dom_ref = (--d->xenheap_pages == 0);
+
+    /* Remove the owner and clear the flags. */
+    page_set_owner(page, NULL);
+    page->u.inuse.type_info = 0;
+    spin_unlock(&d->page_alloc_lock);
+
+    if ( drop_dom_ref )
+        put_domain(d);
+
+    return 0;
+}
+
 void share_xen_page_with_privileged_guests(
     struct page_info *page, int readonly)
 {
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 1b4d1c3..041692b 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -276,6 +276,8 @@  struct spage_info
 #define XENSHARE_readonly 1
 extern void share_xen_page_with_guest(
     struct page_info *page, struct domain *d, int readonly);
+extern int unshare_xen_page_with_guest(struct page_info *page,
+                                       struct domain *d);
 extern void share_xen_page_with_privileged_guests(
     struct page_info *page, int readonly);
 extern void free_shared_domheap_page(struct page_info *page);