diff mbox series

[v10,1/3] xen/mem_sharing: VM forking

Message ID 8df741964b56c10ed912f9187dcb31aae7251085.1582658216.git.tamas.lengyel@intel.com (mailing list archive)
State Superseded
Headers show
Series VM forking | expand

Commit Message

Tamas K Lengyel Feb. 25, 2020, 7:17 p.m. UTC
VM forking is the process of creating a domain with an empty memory space and a
parent domain specified from which to populate the memory when necessary. For
the new domain to be functional the VM state is copied over as part of the fork
operation (HVM params, hap allocation, etc).

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
v10: setup vcpu_info pages for vCPUs in the fork if the parent has them
     setup pages for special HVM PFNs if the parent has them
     minor adjustments based on Roger's comments
---
 xen/arch/x86/domain.c             |  11 ++
 xen/arch/x86/hvm/hvm.c            |   4 +-
 xen/arch/x86/mm/hap/hap.c         |   3 +-
 xen/arch/x86/mm/mem_sharing.c     | 287 ++++++++++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c             |   9 +-
 xen/common/domain.c               |   3 +
 xen/include/asm-x86/hap.h         |   1 +
 xen/include/asm-x86/hvm/hvm.h     |   2 +
 xen/include/asm-x86/mem_sharing.h |  17 ++
 xen/include/public/memory.h       |   5 +
 xen/include/xen/sched.h           |   5 +
 11 files changed, 342 insertions(+), 5 deletions(-)

Comments

Roger Pau Monné Feb. 26, 2020, 3:12 p.m. UTC | #1
On Tue, Feb 25, 2020 at 11:17:55AM -0800, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space and a
> parent domain specified from which to populate the memory when necessary. For
> the new domain to be functional the VM state is copied over as part of the fork
> operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> ---
> v10: setup vcpu_info pages for vCPUs in the fork if the parent has them
>      setup pages for special HVM PFNs if the parent has them
>      minor adjustments based on Roger's comments
> ---
>  xen/arch/x86/domain.c             |  11 ++
>  xen/arch/x86/hvm/hvm.c            |   4 +-
>  xen/arch/x86/mm/hap/hap.c         |   3 +-
>  xen/arch/x86/mm/mem_sharing.c     | 287 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |   9 +-
>  xen/common/domain.c               |   3 +
>  xen/include/asm-x86/hap.h         |   1 +
>  xen/include/asm-x86/hvm/hvm.h     |   2 +
>  xen/include/asm-x86/mem_sharing.h |  17 ++
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   5 +
>  11 files changed, 342 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index fe63c23676..1ab0ca0942 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2203,6 +2203,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( d->parent )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;
> +            }
>          }
>  #endif
>  
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index a339b36a0d..c284f3cf5f 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1915,7 +1915,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>      }
>  #endif
>  
> -    /* Spurious fault? PoD and log-dirty also take this path. */
> +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
>      if ( p2m_is_ram(p2mt) )
>      {
>          rc = 1;
> @@ -4429,7 +4429,7 @@ static int hvm_allow_get_param(struct domain *d,
>      return rc;
>  }
>  
> -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
>  {
>      int rc;
>  
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index 3d93f3451c..c7c7ff6e99 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
>  }
>  
>  /* Return the size of the pool, rounded up to the nearest MB */
> -static unsigned int
> -hap_get_allocation(struct domain *d)
> +unsigned int hap_get_allocation(struct domain *d)
>  {
>      unsigned int pg = d->arch.paging.hap.total_pages
>          + d->arch.paging.hap.p2m_pages;
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..8ee37e6943 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -22,6 +22,7 @@
>  
>  #include <xen/types.h>
>  #include <xen/domain_page.h>
> +#include <xen/event.h>
>  #include <xen/spinlock.h>
>  #include <xen/rwlock.h>
>  #include <xen/mm.h>
> @@ -36,6 +37,8 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
>  #include <xsm/xsm.h>
>  
>  #include "mm-locks.h"
> @@ -1444,6 +1447,263 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
>      return 0;
>  }
>  
> +/*
> + * Forking a page only gets called when the VM faults due to no entry being
> + * in the EPT for the access. Depending on the type of access we either
> + * populate the physmap with a shared entry for read-only access or
> + * fork the page if it's a write access.
> + *
> + * The client p2m is already locked so we only need to lock
> + * the parent's here.
> + */
> +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> +{
> +    int rc = -ENOENT;
> +    shr_handle_t handle;
> +    struct domain *parent = d->parent;
> +    struct p2m_domain *p2m;
> +    unsigned long gfn_l = gfn_x(gfn);
> +    mfn_t mfn, new_mfn;
> +    p2m_type_t p2mt;
> +    struct page_info *page;
> +
> +    if ( !mem_sharing_is_fork(d) )
> +        return -ENOENT;
> +
> +    if ( !unsharing )
> +    {
> +        /* For read-only accesses we just add a shared entry to the physmap */
> +        while ( parent )
> +        {
> +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> +                break;
> +
> +            parent = parent->parent;
> +        }
> +
> +        if ( !rc )
> +        {
> +            /* The client's p2m is already locked */
> +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
> +
> +            p2m_lock(pp2m);
> +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> +            p2m_unlock(pp2m);
> +
> +            if ( !rc )
> +                return 0;
> +        }
> +    }
> +
> +    /*
> +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> +     * the physmap failed we'll fork the page directly.
> +     */
> +    p2m = p2m_get_hostp2m(d);
> +    parent = d->parent;
> +
> +    while ( parent )
> +    {
> +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> +
> +        /*
> +         * We can't fork grant memory from the parent, only regular ram.
> +         */
> +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> +            break;
> +
> +        put_gfn(parent, gfn_l);
> +        parent = parent->parent;
> +    }
> +
> +    if ( !parent )
> +        return -ENOENT;
> +
> +    if ( !(page = alloc_domheap_page(d, 0)) )
> +    {
> +        put_gfn(parent, gfn_l);
> +        return -ENOMEM;
> +    }
> +
> +    new_mfn = page_to_mfn(page);
> +    copy_domain_page(new_mfn, mfn);
> +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +    put_gfn(parent, gfn_l);
> +
> +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> +                          p2m->default_access, -1);
> +}
> +
> +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    int ret = -EINVAL;
> +
> +    if ( d->max_vcpus != cd->max_vcpus )
> +        return ret;
> +
> +    if ( (ret = cpupool_move_domain(cd, d->cpupool)) )
> +        return ret;

You can join both ifs into a single one, since both return ret.

> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        mfn_t vcpu_info_mfn;
> +
> +        if ( !d->vcpu[i] || cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;
> +
> +        /*
> +         * Map in a page for the vcpu_info if the guest uses one to the exact
> +         * same spot.
> +         */
> +        vcpu_info_mfn = d->vcpu[i]->vcpu_info_mfn;
> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        {
> +            struct page_info *page;
> +            mfn_t new_mfn;
> +            gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> +            unsigned long gfn_l = gfn_x(gfn);
> +
> +            if ( !(page = alloc_domheap_page(cd, 0)) )
> +                return -ENOMEM;
> +
> +            new_mfn = page_to_mfn(page);
> +            set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +            if ( !(ret = p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K,
> +                                        p2m_ram_rw, p2m->default_access, -1)) )
> +                return ret;
> +
> +            if ( !(ret = map_vcpu_info(cd->vcpu[i], gfn_l,
> +                                       d->vcpu[i]->vcpu_info_offset)) )
> +                return ret;

I think you also need to copy the contents from the parent into those
vcpu_info areas, or else you might discard pending event channels
contained in the evtchn_* fields? (and the masked channels if any).

The runtime area should be handled in a similar way AFAICT (albeit
there's no need to copy the parent's data in that case), see
VCPUOP_register_runstate_memory_area.

> +        }
> +    }
> +
> +    domain_update_node_affinity(cd);
> +    return 0;
> +}
> +
> +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> +{
> +    int rc;
> +    bool preempted;
> +    unsigned long mb = hap_get_allocation(d);
> +
> +    if ( mb == hap_get_allocation(cd) )
> +        return 0;
> +
> +    paging_lock(cd);
> +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> +    paging_unlock(cd);
> +
> +    return preempted ? -ERESTART : rc;
> +}
> +
> +static void fork_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    /* Don't bump incarnation on set */
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> +}
> +
> +static int populate_special_pages(struct domain *cd)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    static const unsigned int params[] =
> +    {
> +        HVM_PARAM_STORE_PFN,
> +        HVM_PARAM_IOREQ_PFN,
> +        HVM_PARAM_BUFIOREQ_PFN,
> +        HVM_PARAM_CONSOLE_PFN
> +    };
> +    unsigned int i;
> +
> +    for ( i=0; i<4; i++ )

Nit: can you please add some spaces around the operators?

> +    {
> +        uint64_t value = 0;
> +        mfn_t new_mfn;
> +        struct page_info *page;
> +
> +        if ( hvm_get_param(cd, params[i], &value) || !value )
> +            continue;
> +
> +        if ( !(page = alloc_domheap_page(cd, 0)) )
> +            return -ENOMEM;
> +
> +        new_mfn = page_to_mfn(page);
> +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> +
> +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> +                              p2m_ram_rw, p2m->default_access, -1);

I think you also need to copy the contents from the parent page here.

> +    }
> +
> +    return 0;
> +}
> +
> +static int fork(struct domain *d, struct domain *cd)
> +{
> +    int rc = -EBUSY;
> +
> +    if ( !cd->controller_pause_count )
> +        return rc;
> +
> +    /*
> +     * We only want to get and pause the parent once, not each time this
> +     * operation is restarted due to preemption.
> +     */
> +    if ( !cd->parent_paused )
> +    {
> +        if ( !get_domain(d) )
> +        {
> +            ASSERT_UNREACHABLE();
> +            return -EBUSY;
> +        }
> +
> +        domain_pause(d);
> +        cd->parent_paused = true;
> +        cd->max_pages = d->max_pages;
> +        cd->max_vcpus = d->max_vcpus;
> +    }
> +
> +    /* this is preemptible so it's the first to get done */
> +    if ( (rc = fork_hap_allocation(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = bring_up_vcpus(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = hvm_copy_context_and_params(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = populate_special_pages(cd)) )
> +        goto done;
> +
> +    fork_tsc(cd, d);

I think you need to copy the contents of the shared info page from the
parent into the child, or else you are discarding any pending event
channels. You should also map such shared info page into the same gfn
as the parent.

Thanks, Roger.
Tamas K Lengyel Feb. 26, 2020, 3:20 p.m. UTC | #2
> > +    if ( (ret = cpupool_move_domain(cd, d->cpupool)) )
> > +        return ret;
>
> You can join both ifs into a single one, since both return ret.

Sure.

> > +
> > +    for ( i = 0; i < cd->max_vcpus; i++ )
> > +    {
> > +        mfn_t vcpu_info_mfn;
> > +
> > +        if ( !d->vcpu[i] || cd->vcpu[i] )
> > +            continue;
> > +
> > +        if ( !vcpu_create(cd, i) )
> > +            return -EINVAL;
> > +
> > +        /*
> > +         * Map in a page for the vcpu_info if the guest uses one to the exact
> > +         * same spot.
> > +         */
> > +        vcpu_info_mfn = d->vcpu[i]->vcpu_info_mfn;
> > +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > +        {
> > +            struct page_info *page;
> > +            mfn_t new_mfn;
> > +            gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> > +            unsigned long gfn_l = gfn_x(gfn);
> > +
> > +            if ( !(page = alloc_domheap_page(cd, 0)) )
> > +                return -ENOMEM;
> > +
> > +            new_mfn = page_to_mfn(page);
> > +            set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> > +
> > +            if ( !(ret = p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K,
> > +                                        p2m_ram_rw, p2m->default_access, -1)) )
> > +                return ret;
> > +
> > +            if ( !(ret = map_vcpu_info(cd->vcpu[i], gfn_l,
> > +                                       d->vcpu[i]->vcpu_info_offset)) )
> > +                return ret;
>
> I think you also need to copy the contents from the parent into those
> vcpu_info areas, or else you might discard pending event channels
> contained in the evtchn_* fields? (and the masked channels if any).
>
> The runtime area should be handled in a similar way AFAICT (albeit
> there's no need to copy the parent's data in that case), see
> VCPUOP_register_runstate_memory_area.

Will do.


> > +static int populate_special_pages(struct domain *cd)
> > +{
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > +    static const unsigned int params[] =
> > +    {
> > +        HVM_PARAM_STORE_PFN,
> > +        HVM_PARAM_IOREQ_PFN,
> > +        HVM_PARAM_BUFIOREQ_PFN,
> > +        HVM_PARAM_CONSOLE_PFN
> > +    };
> > +    unsigned int i;
> > +
> > +    for ( i=0; i<4; i++ )
>
> Nit: can you please add some spaces around the operators?

Sure.

>
> > +    {
> > +        uint64_t value = 0;
> > +        mfn_t new_mfn;
> > +        struct page_info *page;
> > +
> > +        if ( hvm_get_param(cd, params[i], &value) || !value )
> > +            continue;
> > +
> > +        if ( !(page = alloc_domheap_page(cd, 0)) )
> > +            return -ENOMEM;
> > +
> > +        new_mfn = page_to_mfn(page);
> > +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> > +
> > +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> > +                              p2m_ram_rw, p2m->default_access, -1);
>
> I think you also need to copy the contents from the parent page here.

The toolstack simply clears these pages during restore so I'm not sure
(see https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libxc/xc_sr_restore_x86_hvm.c;h=3f78248f32fec239db77b0e483b0195211e6b974;hb=HEAD#l61).
> I don't see why you would have to clear the pages first if they get
> overwritten by saved versions later. Or are these pages expected to be
> torn down by save/restore-aware guests?

> > +static int fork(struct domain *d, struct domain *cd)
> > +{
> > +    int rc = -EBUSY;
> > +
> > +    if ( !cd->controller_pause_count )
> > +        return rc;
> > +
> > +    /*
> > +     * We only want to get and pause the parent once, not each time this
> > +     * operation is restarted due to preemption.
> > +     */
> > +    if ( !cd->parent_paused )
> > +    {
> > +        if ( !get_domain(d) )
> > +        {
> > +            ASSERT_UNREACHABLE();
> > +            return -EBUSY;
> > +        }
> > +
> > +        domain_pause(d);
> > +        cd->parent_paused = true;
> > +        cd->max_pages = d->max_pages;
> > +        cd->max_vcpus = d->max_vcpus;
> > +    }
> > +
> > +    /* this is preemptible so it's the first to get done */
> > +    if ( (rc = fork_hap_allocation(cd, d)) )
> > +        goto done;
> > +
> > +    if ( (rc = bring_up_vcpus(cd, d)) )
> > +        goto done;
> > +
> > +    if ( (rc = hvm_copy_context_and_params(cd, d)) )
> > +        goto done;
> > +
> > +    if ( (rc = populate_special_pages(cd)) )
> > +        goto done;
> > +
> > +    fork_tsc(cd, d);
>
> I think you need to copy the contents of the shared info page from the
> parent into the child, or else you are discarding any pending event
> channels. You should also map such shared info page into the same gfn
> as the parent.
>

I'll look into it, thanks!

Tamas
Roger Pau Monné Feb. 26, 2020, 3:36 p.m. UTC | #3
On Wed, Feb 26, 2020 at 08:20:30AM -0700, Tamas K Lengyel wrote:
> > > +static int populate_special_pages(struct domain *cd)
> > > +{
> > > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > +    static const unsigned int params[] =
> > > +    {
> > > +        HVM_PARAM_STORE_PFN,
> > > +        HVM_PARAM_IOREQ_PFN,
> > > +        HVM_PARAM_BUFIOREQ_PFN,
> > > +        HVM_PARAM_CONSOLE_PFN
> > > +    };
> > > +    unsigned int i;
> > > +
> > > +    for ( i=0; i<4; i++ )
> >
> > Nit: can you please add some spaces around the operators?
> 
> Sure.
> 
> >
> > > +    {
> > > +        uint64_t value = 0;
> > > +        mfn_t new_mfn;
> > > +        struct page_info *page;
> > > +
> > > +        if ( hvm_get_param(cd, params[i], &value) || !value )
> > > +            continue;
> > > +
> > > +        if ( !(page = alloc_domheap_page(cd, 0)) )
> > > +            return -ENOMEM;
> > > +
> > > +        new_mfn = page_to_mfn(page);
> > > +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> > > +
> > > +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> > > +                              p2m_ram_rw, p2m->default_access, -1);
> >
> > I think you also need to copy the contents from the parent page here.
> 
> The toolstack simply clears these pages during restore so I'm not sure
> (see https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libxc/xc_sr_restore_x86_hvm.c;h=3f78248f32fec239db77b0e483b0195211e6b974;hb=HEAD#l61).
> I don't see why you would have to clear the pages first if they get
> overwritten by saved versions later. Or these pages are expected to be
> torn-down by the save/restore aware guests?

Guests using those pages know they are torn down during suspend/resume
and expect to find a clean state when resuming. That's not the case with
forking however, as the guest is completely unaware of the fork
happening.

One thing I'm not sure of is whether the backends (xenstored,
xenconsoled) will cope with those pages being already populated on
guest creation.

AFAICT another issue is that xenstore watches are not copied over from
the parent, so any watches the parent might have set will not fire on
the child. That would require some kind of interaction with xenstored
in order to request a guest state to be copied over to another guest.

> > > +static int fork(struct domain *d, struct domain *cd)
> > > +{
> > > +    int rc = -EBUSY;
> > > +
> > > +    if ( !cd->controller_pause_count )
> > > +        return rc;
> > > +
> > > +    /*
> > > +     * We only want to get and pause the parent once, not each time this
> > > +     * operation is restarted due to preemption.
> > > +     */
> > > +    if ( !cd->parent_paused )
> > > +    {
> > > +        if ( !get_domain(d) )
> > > +        {
> > > +            ASSERT_UNREACHABLE();
> > > +            return -EBUSY;
> > > +        }
> > > +
> > > +        domain_pause(d);
> > > +        cd->parent_paused = true;
> > > +        cd->max_pages = d->max_pages;
> > > +        cd->max_vcpus = d->max_vcpus;
> > > +    }
> > > +
> > > +    /* this is preemptible so it's the first to get done */
> > > +    if ( (rc = fork_hap_allocation(cd, d)) )
> > > +        goto done;
> > > +
> > > +    if ( (rc = bring_up_vcpus(cd, d)) )
> > > +        goto done;
> > > +
> > > +    if ( (rc = hvm_copy_context_and_params(cd, d)) )
> > > +        goto done;
> > > +
> > > +    if ( (rc = populate_special_pages(cd)) )
> > > +        goto done;
> > > +
> > > +    fork_tsc(cd, d);
> >
> > I think you need to copy the contents of the shared info page from the
> > parent into the child, or else you are discarding any pending event
> > channels. You should also map such shared info page into the same gfn
> > as the parent.
> >
> 
> I'll look into it, thanks!

Oh, and the PV timer state should also be copied over, so that PV
timer interrupts are not lost.

Thanks, Roger.
Tamas K Lengyel Feb. 26, 2020, 3:58 p.m. UTC | #4
On Wed, Feb 26, 2020 at 8:36 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Wed, Feb 26, 2020 at 08:20:30AM -0700, Tamas K Lengyel wrote:
> > > > +static int populate_special_pages(struct domain *cd)
> > > > +{
> > > > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > > +    static const unsigned int params[] =
> > > > +    {
> > > > +        HVM_PARAM_STORE_PFN,
> > > > +        HVM_PARAM_IOREQ_PFN,
> > > > +        HVM_PARAM_BUFIOREQ_PFN,
> > > > +        HVM_PARAM_CONSOLE_PFN
> > > > +    };
> > > > +    unsigned int i;
> > > > +
> > > > +    for ( i=0; i<4; i++ )
> > >
> > > Nit: can you please add some spaces around the operators?
> >
> > Sure.
> >
> > >
> > > > +    {
> > > > +        uint64_t value = 0;
> > > > +        mfn_t new_mfn;
> > > > +        struct page_info *page;
> > > > +
> > > > +        if ( hvm_get_param(cd, params[i], &value) || !value )
> > > > +            continue;
> > > > +
> > > > +        if ( !(page = alloc_domheap_page(cd, 0)) )
> > > > +            return -ENOMEM;
> > > > +
> > > > +        new_mfn = page_to_mfn(page);
> > > > +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> > > > +
> > > > +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> > > > +                              p2m_ram_rw, p2m->default_access, -1);
> > >
> > > I think you also need to copy the contents from the parent page here.
> >
> > The toolstack simply clears these pages during restore so I'm not sure
> > (see https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libxc/xc_sr_restore_x86_hvm.c;h=3f78248f32fec239db77b0e483b0195211e6b974;hb=HEAD#l61).
> > I don't see why you would have to clear the pages first if they get
> > overwritten by saved versions later. Or these pages are expected to be
> > torn-down by the save/restore aware guests?
>
> Guests using those pages know they are torn down during suspend/resume
> and expect to find a clean state when resuming. That's not the case with
> forking however, as the guest is completely unaware of the fork
> happening.
>
> One thing I'm not sure of is whether the backends (xenstored,
> xenconsoled) will cope with those pages being already populated on
> guest creation.
>
> AFAICT another issue is that xenstore watches are not copied over from
> the parent, so any watches the parent might have set will not fire on
> the child. That would require some kind of interaction with xenstored
> in order to request a guest state to be copied over to another guest.

Sounds like it most likely would need to be handled if the guest uses
them. I'm not sure if a default Linux HVM guest uses them though. A
Windows HVM guest without the PV drivers is certainly not Xen-aware,
so there things already work just fine, and that is our primary target
for our use-case. PVHVM/PVH Linux guests are not. So that's really
outside the scope of what I can contribute at the moment.

Thanks,
Tamas
Roger Pau Monné Feb. 26, 2020, 4:10 p.m. UTC | #5
On Wed, Feb 26, 2020 at 08:58:05AM -0700, Tamas K Lengyel wrote:
> On Wed, Feb 26, 2020 at 8:36 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Wed, Feb 26, 2020 at 08:20:30AM -0700, Tamas K Lengyel wrote:
> > > > > +static int populate_special_pages(struct domain *cd)
> > > > > +{
> > > > > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > > > +    static const unsigned int params[] =
> > > > > +    {
> > > > > +        HVM_PARAM_STORE_PFN,
> > > > > +        HVM_PARAM_IOREQ_PFN,
> > > > > +        HVM_PARAM_BUFIOREQ_PFN,
> > > > > +        HVM_PARAM_CONSOLE_PFN
> > > > > +    };
> > > > > +    unsigned int i;
> > > > > +
> > > > > +    for ( i=0; i<4; i++ )
> > > >
> > > > Nit: can you please add some spaces around the operators?
> > >
> > > Sure.
> > >
> > > >
> > > > > +    {
> > > > > +        uint64_t value = 0;
> > > > > +        mfn_t new_mfn;
> > > > > +        struct page_info *page;
> > > > > +
> > > > > +        if ( hvm_get_param(cd, params[i], &value) || !value )
> > > > > +            continue;
> > > > > +
> > > > > +        if ( !(page = alloc_domheap_page(cd, 0)) )
> > > > > +            return -ENOMEM;
> > > > > +
> > > > > +        new_mfn = page_to_mfn(page);
> > > > > +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> > > > > +
> > > > > +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> > > > > +                              p2m_ram_rw, p2m->default_access, -1);
> > > >
> > > > I think you also need to copy the contents from the parent page here.
> > >
> > > The toolstack simply clears these pages during restore so I'm not sure
> > > (see https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libxc/xc_sr_restore_x86_hvm.c;h=3f78248f32fec239db77b0e483b0195211e6b974;hb=HEAD#l61).
> > > I don't see why you would have to clear the pages first if they get
> > > overwritten by saved versions later. Or these pages are expected to be
> > > torn-down by the save/restore aware guests?
> >
> > Guests using those pages know they are torn down during suspend/resume
> > and expect to find a clean state when resuming. That's not the case with
> > forking however, as the guest is completely unaware of the fork
> > happening.
> >
> > One thing I'm not sure of is whether the backends (xenstored,
> > xenconsoled) will cope with those pages being already populated on
> > guest creation.
> >
> > AFAICT another issue is that xenstore watches are not copied over from
> > the parent, so any watches the parent might have set will not fire on
> > the child. That would require some kind of interaction with xenstored
> > in order to request a guest state to be copied over to another guest.
> 
> Sounds like it most likely would need to be handled if the guest uses
> them. I'm not sure if a default Linux HVM guest uses them though.

Linux PVHVM does use xenstore watches to monitor backend state
changes, but it would require a non-trivial amount of work to clone
the state of PV devices, so I guess it's something to be left as a
TODO item.

> A
> Windows HVM guest without the PV drivers is certainly not Xen aware so
> there things already work just fine and that is our primary target for
> our use-case. PVHVM/PVH Linux guests are not. So that's really outside
> the scope of what I can contribute at the moment.

Sure.

Can you please add a TODO item here to note that the contents of those
special pages likely need to be copied over, and that the state of PV
devices and interfaces that rely on backends running in userspace is
not handled at all.

Thanks, Roger.
Tamas K Lengyel Feb. 26, 2020, 4:17 p.m. UTC | #6
On Wed, Feb 26, 2020 at 9:10 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Wed, Feb 26, 2020 at 08:58:05AM -0700, Tamas K Lengyel wrote:
> > On Wed, Feb 26, 2020 at 8:36 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >
> > > On Wed, Feb 26, 2020 at 08:20:30AM -0700, Tamas K Lengyel wrote:
> > > > > > +static int populate_special_pages(struct domain *cd)
> > > > > > +{
> > > > > > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > > > > +    static const unsigned int params[] =
> > > > > > +    {
> > > > > > +        HVM_PARAM_STORE_PFN,
> > > > > > +        HVM_PARAM_IOREQ_PFN,
> > > > > > +        HVM_PARAM_BUFIOREQ_PFN,
> > > > > > +        HVM_PARAM_CONSOLE_PFN
> > > > > > +    };
> > > > > > +    unsigned int i;
> > > > > > +
> > > > > > +    for ( i=0; i<4; i++ )
> > > > >
> > > > > Nit: can you please add some spaces around the operators?
> > > >
> > > > Sure.
> > > >
> > > > >
> > > > > > +    {
> > > > > > +        uint64_t value = 0;
> > > > > > +        mfn_t new_mfn;
> > > > > > +        struct page_info *page;
> > > > > > +
> > > > > > +        if ( hvm_get_param(cd, params[i], &value) || !value )
> > > > > > +            continue;
> > > > > > +
> > > > > > +        if ( !(page = alloc_domheap_page(cd, 0)) )
> > > > > > +            return -ENOMEM;
> > > > > > +
> > > > > > +        new_mfn = page_to_mfn(page);
> > > > > > +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> > > > > > +
> > > > > > +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> > > > > > +                              p2m_ram_rw, p2m->default_access, -1);
> > > > >
> > > > > I think you also need to copy the contents from the parent page here.
> > > >
> > > > The toolstack simply clears these pages during restore so I'm not sure
> > > > (see https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libxc/xc_sr_restore_x86_hvm.c;h=3f78248f32fec239db77b0e483b0195211e6b974;hb=HEAD#l61).
> > > > I don't see why you would have to clear the pages first if they get
> > > > overwritten by saved versions later. Or these pages are expected to be
> > > > torn-down by the save/restore aware guests?
> > >
> > > Guests using those pages know they are torn down during suspend/resume
> > > and expect to find a clean state when resuming. That's not the case with
> > > forking however, as the guest is completely unaware of the fork
> > > happening.
> > >
> > > One thing I'm not sure of is whether the backends (xenstored,
> > > xenconsoled) will cope with those pages being already populated on
> > > guest creation.
> > >
> > > AFAICT another issue is that xenstore watches are not copied over from
> > > the parent, so any watches the parent might have set will not fire on
> > > the child. That would require some kind of interaction with xenstored
> > > in order to request a guest state to be copied over to another guest.
> >
> > Sounds like it most likely would need to be handled if the guest uses
> > them. I'm not sure if a default Linux HVM guest uses them though.
>
> Linux PVHVM does use xenstore watches to monitor backend state
> changes, but it would require a non-trivial amount of work to clone
> the state of PV devices, so I guess it's something to be left as a
> TODO item.
>
> > A
> > Windows HVM guest without the PV drivers is certainly not Xen aware so
> > there things already work just fine and that is our primary target for
> > our use-case. PVHVM/PVH Linux guests are not. So that's really outside
> > the scope of what I can contribute at the moment.
>
> Sure.
>
> Can you please add a TODO item here to note that the contents of those
> special pages likely need to be copied over, and that the state of PV
> devices and interfaces that rely on backends running in userspace is
> not handled at all.
>

Of course. I will finish up the PV timer parts and copying the shared
info page, but will leave the rest as TODO. Hopefully this will get at
least a default Linux HVM fork working the same way as Windows does,
but if not then I'll unfortunately have to leave it as-is for now.

Tamas
Tamas K Lengyel Feb. 26, 2020, 5:21 p.m. UTC | #7
On Wed, Feb 26, 2020 at 9:17 AM Tamas K Lengyel <tamas@tklengyel.com> wrote:
>
> On Wed, Feb 26, 2020 at 9:10 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Wed, Feb 26, 2020 at 08:58:05AM -0700, Tamas K Lengyel wrote:
> > > On Wed, Feb 26, 2020 at 8:36 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > >
> > > > On Wed, Feb 26, 2020 at 08:20:30AM -0700, Tamas K Lengyel wrote:
> > > > > > > +static int populate_special_pages(struct domain *cd)
> > > > > > > +{
> > > > > > > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > > > > > +    static const unsigned int params[] =
> > > > > > > +    {
> > > > > > > +        HVM_PARAM_STORE_PFN,
> > > > > > > +        HVM_PARAM_IOREQ_PFN,
> > > > > > > +        HVM_PARAM_BUFIOREQ_PFN,
> > > > > > > +        HVM_PARAM_CONSOLE_PFN
> > > > > > > +    };
> > > > > > > +    unsigned int i;
> > > > > > > +
> > > > > > > +    for ( i=0; i<4; i++ )
> > > > > >
> > > > > > Nit: can you please add some spaces around the operators?
> > > > >
> > > > > Sure.
> > > > >
> > > > > >
> > > > > > > +    {
> > > > > > > +        uint64_t value = 0;
> > > > > > > +        mfn_t new_mfn;
> > > > > > > +        struct page_info *page;
> > > > > > > +
> > > > > > > +        if ( hvm_get_param(cd, params[i], &value) || !value )
> > > > > > > +            continue;
> > > > > > > +
> > > > > > > +        if ( !(page = alloc_domheap_page(cd, 0)) )
> > > > > > > +            return -ENOMEM;
> > > > > > > +
> > > > > > > +        new_mfn = page_to_mfn(page);
> > > > > > > +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> > > > > > > +
> > > > > > > +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> > > > > > > +                              p2m_ram_rw, p2m->default_access, -1);
> > > > > >
> > > > > > I think you also need to copy the contents from the parent page here.
> > > > >
> > > > > The toolstack simply clears these pages during restore so I'm not sure
> > > > > (see https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libxc/xc_sr_restore_x86_hvm.c;h=3f78248f32fec239db77b0e483b0195211e6b974;hb=HEAD#l61).
> > > > > I don't see why you would have to clear the pages first if they get
> > > > > overwritten by saved versions later. Or these pages are expected to be
> > > > > torn-down by the save/restore aware guests?
> > > >
> > > > Guests using those pages know they are torn down during suspend/resume
> > > > and expect to find a clean state when resuming. That's not the case with
> > > > forking however, as the guest is completely unaware of the fork
> > > > happening.
> > > >
> > > > One thing I'm not sure of is whether the backends (xenstored,
> > > > xenconsoled) will cope with those pages being already populated on
> > > > guest creation.
> > > >
> > > > AFAICT another issue is that xenstore watches are not copied over from
> > > > the parent, so any watches the parent might have set will not fire on
> > > > the child. That would require some kind of interaction with xenstored
> > > > in order to request a guest state to be copied over to another guest.
> > >
> > > Sounds like it most likely would need to be handled if the guest uses
> > > them. I'm not sure if a default Linux HVM guest uses them though.
> >
> > Linux PVHVM does use xenstore watches to monitor backend state
> > changes, but it would require a non-trivial amount of work to clone
> > the state of PV devices, so I guess it's something to be left as a
> > TODO item.
> >
> > > A
> > > Windows HVM guest without the PV drivers is certainly not Xen aware so
> > > there things already work just fine and that is our primary target for
> > > our use-case. PVHVM/PVH Linux guests are not. So that's really outside
> > > the scope of what I can contribute at the moment.
> >
> > Sure.
> >
> > Can you please add a TODO item here to note that the contents of those
> > special pages likely need to be copied over, and that the state of PV
> > devices and interfaces that rely on backends running in userspace is
> > not handled at all.
> >
>
> Of course. I will finish up the PV timer parts and copying the shared
> info page, but will leave the rest as TODO. Hopefully this will get at
> least a default Linux HVM fork working the same way as Windows does,
> but if not then I'll unfortunately have to leave it as-is for now.
>

Unfortunately Linux's VNC console is still only responsive if the
parent was just restored from a save file. There must be some other
pieces missing here but I'm just going to document it as a TODO as
it's really outside our scope.

Tamas
diff mbox series

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index fe63c23676..1ab0ca0942 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2203,6 +2203,17 @@  int domain_relinquish_resources(struct domain *d)
             ret = relinquish_shared_pages(d);
             if ( ret )
                 return ret;
+
+            /*
+             * If the domain is forked, decrement the parent's pause count
+             * and release the domain.
+             */
+            if ( d->parent )
+            {
+                domain_unpause(d->parent);
+                put_domain(d->parent);
+                d->parent = NULL;
+            }
         }
 #endif
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index a339b36a0d..c284f3cf5f 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1915,7 +1915,7 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     }
 #endif
 
-    /* Spurious fault? PoD and log-dirty also take this path. */
+    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
     if ( p2m_is_ram(p2mt) )
     {
         rc = 1;
@@ -4429,7 +4429,7 @@  static int hvm_allow_get_param(struct domain *d,
     return rc;
 }
 
-static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
+int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
 {
     int rc;
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 3d93f3451c..c7c7ff6e99 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -321,8 +321,7 @@  static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
 }
 
 /* Return the size of the pool, rounded up to the nearest MB */
-static unsigned int
-hap_get_allocation(struct domain *d)
+unsigned int hap_get_allocation(struct domain *d)
 {
     unsigned int pg = d->arch.paging.hap.total_pages
         + d->arch.paging.hap.p2m_pages;
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 3835bc928f..8ee37e6943 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -22,6 +22,7 @@ 
 
 #include <xen/types.h>
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <xen/spinlock.h>
 #include <xen/rwlock.h>
 #include <xen/mm.h>
@@ -36,6 +37,8 @@ 
 #include <asm/altp2m.h>
 #include <asm/atomic.h>
 #include <asm/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/hvm.h>
 #include <xsm/xsm.h>
 
 #include "mm-locks.h"
@@ -1444,6 +1447,263 @@  static inline int mem_sharing_control(struct domain *d, bool enable)
     return 0;
 }
 
+/*
+ * Forking a page only gets called when the VM faults due to no entry being
+ * in the EPT for the access. Depending on the type of access we either
+ * populate the physmap with a shared entry for read-only access or
+ * fork the page if it's a write access.
+ *
+ * The client p2m is already locked so we only need to lock
+ * the parent's here.
+ */
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
+{
+    int rc = -ENOENT;
+    shr_handle_t handle;
+    struct domain *parent = d->parent;
+    struct p2m_domain *p2m;
+    unsigned long gfn_l = gfn_x(gfn);
+    mfn_t mfn, new_mfn;
+    p2m_type_t p2mt;
+    struct page_info *page;
+
+    /* Nothing to do unless d is a fork (has a parent and sharing enabled). */
+    if ( !mem_sharing_is_fork(d) )
+        return -ENOENT;
+
+    if ( !unsharing )
+    {
+        /*
+         * For read-only accesses we just add a shared entry to the physmap.
+         * Walk up the ancestor chain until a domain is found whose page at
+         * this gfn can be nominated for sharing.
+         */
+        while ( parent )
+        {
+            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+                break;
+
+            parent = parent->parent;
+        }
+
+        if ( !rc )
+        {
+            /* The client's p2m is already locked */
+            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
+
+            p2m_lock(pp2m);
+            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
+            p2m_unlock(pp2m);
+
+            if ( !rc )
+                return 0;
+        }
+        /* On failure fall through and fork (copy) the page instead. */
+    }
+
+    /*
+     * If it's a write access (i.e. unsharing) or if adding a shared entry to
+     * the physmap failed we'll fork the page directly.
+     */
+    p2m = p2m_get_hostp2m(d);
+    parent = d->parent;
+
+    /* Again walk the ancestor chain, this time for a copyable RAM page. */
+    while ( parent )
+    {
+        mfn = get_gfn_query(parent, gfn_l, &p2mt);
+
+        /*
+         * We can't fork grant memory from the parent, only regular ram.
+         */
+        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
+            break;
+
+        /* Drop the gfn ref taken by get_gfn_query before moving up. */
+        put_gfn(parent, gfn_l);
+        parent = parent->parent;
+    }
+
+    if ( !parent )
+        return -ENOENT;
+
+    if ( !(page = alloc_domheap_page(d, 0)) )
+    {
+        put_gfn(parent, gfn_l);
+        return -ENOMEM;
+    }
+
+    /* Copy the parent's page contents into the fork's fresh page. */
+    new_mfn = page_to_mfn(page);
+    copy_domain_page(new_mfn, mfn);
+    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
+
+    put_gfn(parent, gfn_l);
+
+    /* Install the copy as writable RAM in the fork's p2m. */
+    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                          p2m->default_access, -1);
+}
+
+/*
+ * Create the fork's vCPUs to mirror the parent's, and replicate any
+ * guest-relocated vcpu_info pages at the same guest frames.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+static int bring_up_vcpus(struct domain *cd, struct domain *d)
+{
+    unsigned int i;
+    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
+    int ret = -EINVAL;
+
+    if ( d->max_vcpus != cd->max_vcpus )
+        return ret;
+
+    if ( (ret = cpupool_move_domain(cd, d->cpupool)) )
+        return ret;
+
+    for ( i = 0; i < cd->max_vcpus; i++ )
+    {
+        mfn_t vcpu_info_mfn;
+
+        /* Skip holes in the parent and vCPUs already created in the fork. */
+        if ( !d->vcpu[i] || cd->vcpu[i] )
+            continue;
+
+        if ( !vcpu_create(cd, i) )
+            return -EINVAL;
+
+        /*
+         * Map in a page for the vcpu_info if the guest uses one to the exact
+         * same spot.
+         */
+        vcpu_info_mfn = d->vcpu[i]->vcpu_info_mfn;
+        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
+        {
+            struct page_info *page;
+            mfn_t new_mfn;
+            gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
+            unsigned long gfn_l = gfn_x(gfn);
+
+            if ( !(page = alloc_domheap_page(cd, 0)) )
+                return -ENOMEM;
+
+            new_mfn = page_to_mfn(page);
+            set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
+
+            /*
+             * Bail only on error: set_entry/map_vcpu_info return 0 on
+             * success, in which case we must carry on with the remaining
+             * vCPUs.
+             */
+            if ( (ret = p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K,
+                                       p2m_ram_rw, p2m->default_access, -1)) )
+                return ret;
+
+            if ( (ret = map_vcpu_info(cd->vcpu[i], gfn_l,
+                                      d->vcpu[i]->vcpu_info_offset)) )
+                return ret;
+        }
+    }
+
+    domain_update_node_affinity(cd);
+    return 0;
+}
+
+/*
+ * Grow the fork's HAP pool to match the parent's (rounded to MB).
+ * Preemptible: returns -ERESTART if the allocation was interrupted.
+ */
+static int fork_hap_allocation(struct domain *cd, struct domain *d)
+{
+    int rc;
+    /*
+     * Must be initialized: hap_set_allocation() only writes *preempted
+     * when it actually preempts, so leaving this uninitialized makes the
+     * return expression below read an indeterminate value.
+     */
+    bool preempted = false;
+    unsigned long mb = hap_get_allocation(d);
+
+    if ( mb == hap_get_allocation(cd) )
+        return 0;
+
+    paging_lock(cd);
+    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
+    paging_unlock(cd);
+
+    return preempted ? -ERESTART : rc;
+}
+
+/* Copy the parent's TSC configuration into the fork. */
+static void fork_tsc(struct domain *cd, struct domain *d)
+{
+    uint32_t tsc_mode, gtsc_khz, incarnation;
+    uint64_t elapsed_nsec;
+
+    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
+
+    /*
+     * Pass incarnation - 1 so the set operation doesn't end up bumping
+     * the fork's incarnation past the parent's.
+     */
+    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
+}
+
+/*
+ * Allocate and map fresh pages in the fork for the special HVM PFNs
+ * (xenstore, ioreq, buffered ioreq, console) that the parent has set up.
+ *
+ * TODO: the contents of these pages likely need to be copied from the
+ * parent; the state of PV devices/interfaces relying on userspace
+ * backends is not handled at all.
+ */
+static int populate_special_pages(struct domain *cd)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
+    static const unsigned int params[] =
+    {
+        HVM_PARAM_STORE_PFN,
+        HVM_PARAM_IOREQ_PFN,
+        HVM_PARAM_BUFIOREQ_PFN,
+        HVM_PARAM_CONSOLE_PFN
+    };
+    unsigned int i;
+
+    for ( i = 0; i < ARRAY_SIZE(params); i++ )
+    {
+        uint64_t value = 0;
+        mfn_t new_mfn;
+        struct page_info *page;
+        int rc;
+
+        /* Skip params the parent never set (copied earlier into cd). */
+        if ( hvm_get_param(cd, params[i], &value) || !value )
+            continue;
+
+        if ( !(page = alloc_domheap_page(cd, 0)) )
+            return -ENOMEM;
+
+        new_mfn = page_to_mfn(page);
+        set_gpfn_from_mfn(mfn_x(new_mfn), value);
+
+        /*
+         * Only bail on error; the loop must keep going so that ALL
+         * special PFNs get populated, not just the first one found.
+         */
+        if ( (rc = p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
+                                  p2m_ram_rw, p2m->default_access, -1)) )
+            return rc;
+    }
+
+    return 0;
+}
+
+/*
+ * Turn cd into a fork of d: copy HAP allocation, vCPUs, HVM context and
+ * params, special pages and TSC info, then link cd->parent to d.
+ *
+ * Preemptible: may return -ERESTART, in which case the caller re-invokes
+ * it via a hypercall continuation and the parent stays paused/referenced.
+ */
+static int fork(struct domain *d, struct domain *cd)
+{
+    int rc = -EBUSY;
+
+    /* The fork must have been paused by the toolstack before this op. */
+    if ( !cd->controller_pause_count )
+        return rc;
+
+    /*
+     * We only want to get and pause the parent once, not each time this
+     * operation is restarted due to preemption.
+     */
+    if ( !cd->parent_paused )
+    {
+        if ( !get_domain(d) )
+        {
+            ASSERT_UNREACHABLE();
+            return -EBUSY;
+        }
+
+        domain_pause(d);
+        cd->parent_paused = true;
+        cd->max_pages = d->max_pages;
+        cd->max_vcpus = d->max_vcpus;
+    }
+
+    /* this is preemptible so it's the first to get done */
+    if ( (rc = fork_hap_allocation(cd, d)) )
+        goto done;
+
+    if ( (rc = bring_up_vcpus(cd, d)) )
+        goto done;
+
+    if ( (rc = hvm_copy_context_and_params(cd, d)) )
+        goto done;
+
+    if ( (rc = populate_special_pages(cd)) )
+        goto done;
+
+    fork_tsc(cd, d);
+
+    /* Marks cd as a fork; undone in domain_relinquish_resources(). */
+    cd->parent = d;
+
+ done:
+    /*
+     * On hard failure undo the pause/reference; on -ERESTART keep them so
+     * the continuation doesn't pause/ref the parent a second time.
+     */
+    if ( rc && rc != -ERESTART )
+    {
+        domain_unpause(d);
+        put_domain(d);
+        cd->parent_paused = false;
+    }
+
+    return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
     int rc;
@@ -1698,6 +1958,33 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         rc = debug_gref(d, mso.u.debug.u.gref);
         break;
 
+    case XENMEM_sharing_op_fork:
+    {
+        struct domain *pd;
+
+        rc = -EINVAL;
+        if ( mso.u.fork._pad[0] || mso.u.fork._pad[1] ||
+             mso.u.fork._pad[2] )
+            goto out;
+
+        rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
+                                               &pd);
+        if ( rc )
+            goto out;
+
+        if ( !mem_sharing_enabled(pd) && (rc = mem_sharing_control(pd, true)) )
+            goto out;
+
+        rc = fork(pd, d);
+
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
+                                               "lh", XENMEM_sharing_op,
+                                               arg);
+        rcu_unlock_domain(pd);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index c5f428d67c..2358808227 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -509,6 +509,12 @@  mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
 
     mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
 
+    /* Check if we need to fork the page */
+    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
+         !mem_sharing_fork_page(p2m->domain, gfn, !!(q & P2M_UNSHARE)) )
+        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
+
+    /* Check if we need to unshare the page */
     if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
     {
         ASSERT(p2m_is_hostp2m(p2m));
@@ -588,7 +594,8 @@  struct page_info *p2m_get_page_from_gfn(
             return page;
 
         /* Error path: not a suitable GFN at all */
-        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
+        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
+             !mem_sharing_is_fork(p2m->domain) )
             return NULL;
     }
 
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 6ad458fa6b..02998235dd 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1269,6 +1269,9 @@  int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
 
     v->vcpu_info = new_info;
     v->vcpu_info_mfn = page_to_mfn(page);
+#ifdef CONFIG_MEM_SHARING
+    v->vcpu_info_offset = offset;
+#endif
 
     /* Set new vcpu_info pointer /before/ setting pending flags. */
     smp_wmb();
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index b94bfb4ed0..1bf07e49fe 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -45,6 +45,7 @@  int   hap_track_dirty_vram(struct domain *d,
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
 int hap_set_allocation(struct domain *d, unsigned int pages, bool *preempted);
+unsigned int hap_get_allocation(struct domain *d);
 
 #endif /* XEN_HAP_H */
 
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 24da824cbf..35e970b030 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -339,6 +339,8 @@  bool hvm_flush_vcpu_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
 
 int hvm_copy_context_and_params(struct domain *src, struct domain *dst);
 
+int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value);
+
 #ifdef CONFIG_HVM
 
 #define hvm_get_guest_tsc(v) hvm_get_guest_tsc_fixed(v, 0)
diff --git a/xen/include/asm-x86/mem_sharing.h b/xen/include/asm-x86/mem_sharing.h
index 53760a2896..ac968fae3f 100644
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -39,6 +39,9 @@  struct mem_sharing_domain
 
 #define mem_sharing_enabled(d) ((d)->arch.hvm.mem_sharing.enabled)
 
+#define mem_sharing_is_fork(d) \
+    (mem_sharing_enabled(d) && !!((d)->parent))
+
 /* Auditing of memory sharing code? */
 #ifndef NDEBUG
 #define MEM_SHARING_AUDIT 1
@@ -88,6 +91,9 @@  static inline int mem_sharing_unshare_page(struct domain *d,
     return rc;
 }
 
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
+                          bool unsharing);
+
 /*
  * If called by a foreign domain, possible errors are
  *   -EBUSY -> ring full
@@ -117,6 +123,7 @@  int relinquish_shared_pages(struct domain *d);
 #else
 
 #define mem_sharing_enabled(d) false
+#define mem_sharing_is_fork(p2m) false
 
 static inline unsigned int mem_sharing_get_nr_saved_mfns(void)
 {
@@ -141,6 +148,16 @@  static inline int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
     return -EOPNOTSUPP;
 }
 
+/* Stubs for !CONFIG_MEM_SHARING builds: VM forking is unavailable. */
+static inline int mem_sharing_fork(struct domain *d, struct domain *cd, bool vcpu)
+{
+    return -EOPNOTSUPP;
+}
+
+/* Parameter named to match the real declaration above (bool unsharing). */
+static inline int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
+                                        bool unsharing)
+{
+    return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __MEM_SHARING_H__ */
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 126d0ff06e..c1dbad060e 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -482,6 +482,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_add_physmap       6
 #define XENMEM_sharing_op_audit             7
 #define XENMEM_sharing_op_range_share       8
+#define XENMEM_sharing_op_fork              9
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
@@ -532,6 +533,10 @@  struct xen_mem_sharing_op {
                 uint32_t gref;     /* IN: gref to debug         */
             } u;
         } debug;
+        struct mem_sharing_op_fork {      /* OP_FORK */
+            domid_t parent_domain;        /* IN: parent's domain id */
+            uint16_t _pad[3];             /* Must be set to 0 */
+        } fork;
     } u;
 };
 typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 3a4f43098c..c6ba5a52a4 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -248,6 +248,9 @@  struct vcpu
 
     /* Guest-specified relocation of vcpu_info. */
     mfn_t            vcpu_info_mfn;
+#ifdef CONFIG_MEM_SHARING
+    uint32_t         vcpu_info_offset;
+#endif
 
     struct evtchn_fifo_vcpu *evtchn_fifo;
 
@@ -503,6 +506,8 @@  struct domain
     /* Memory sharing support */
 #ifdef CONFIG_MEM_SHARING
     struct vm_event_domain *vm_event_share;
+    struct domain *parent; /* VM fork parent */
+    bool parent_paused;
 #endif
     /* Memory paging support */
 #ifdef CONFIG_HAS_MEM_PAGING