diff mbox series

[v12,1/3] xen/mem_sharing: VM forking

Message ID a8cf8742054d04760f2f5060cfeef5bef1edbd6f.1584981438.git.tamas.lengyel@intel.com (mailing list archive)
State Superseded
Headers show
Series VM forking | expand

Commit Message

Tamas K Lengyel March 23, 2020, 5:04 p.m. UTC
VM forking is the process of creating a domain with an empty memory space and a
parent domain specified from which to populate the memory when necessary. For
the new domain to be functional the VM state is copied over as part of the fork
operation (HVM params, hap allocation, etc).

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
v12: Minor style adjustments Jan pointed out
     Convert mem_sharing_is_fork to inline function
v11: Fully copy vcpu_info pages
     Setup vcpu_runstate for forks
     Added TODO note for PV timers
     Copy shared_info page
     Add copy_settings function, to be shared with fork_reset in the next patch
---
 xen/arch/x86/domain.c             |  11 +
 xen/arch/x86/hvm/hvm.c            |   4 +-
 xen/arch/x86/mm/hap/hap.c         |   3 +-
 xen/arch/x86/mm/mem_sharing.c     | 368 ++++++++++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c             |   9 +-
 xen/common/domain.c               |   3 +
 xen/include/asm-x86/hap.h         |   1 +
 xen/include/asm-x86/hvm/hvm.h     |   2 +
 xen/include/asm-x86/mem_sharing.h |  18 ++
 xen/include/public/memory.h       |   5 +
 xen/include/xen/sched.h           |   5 +
 11 files changed, 424 insertions(+), 5 deletions(-)

Comments

Roger Pau Monné March 25, 2020, 3:47 p.m. UTC | #1
Sorry it has taken me a while to get to this.

On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space and a
> parent domain specified from which to populate the memory when necessary. For
> the new domain to be functional the VM state is copied over as part of the fork
> operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> ---
> v12: Minor style adjustments Jan pointed out
>      Convert mem_sharing_is_fork to inline function
> v11: Fully copy vcpu_info pages
>      Setup vcpu_runstate for forks
>      Added TODO note for PV timers
>      Copy shared_info page
>      Add copy_settings function, to be shared with fork_reset in the next patch
> ---
>  xen/arch/x86/domain.c             |  11 +
>  xen/arch/x86/hvm/hvm.c            |   4 +-
>  xen/arch/x86/mm/hap/hap.c         |   3 +-
>  xen/arch/x86/mm/mem_sharing.c     | 368 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |   9 +-
>  xen/common/domain.c               |   3 +
>  xen/include/asm-x86/hap.h         |   1 +
>  xen/include/asm-x86/hvm/hvm.h     |   2 +
>  xen/include/asm-x86/mem_sharing.h |  18 ++
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   5 +
>  11 files changed, 424 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index caf2ecad7e..11d3c2216e 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( mem_sharing_is_fork(d) )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;
> +            }
>          }
>  #endif
>  
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index a3d115b650..304b3d1562 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>      }
>  #endif
>  
> -    /* Spurious fault? PoD and log-dirty also take this path. */
> +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
>      if ( p2m_is_ram(p2mt) )
>      {
>          rc = 1;
> @@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
>      return rc;
>  }
>  
> -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
>  {
>      int rc;
>  
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index a6d5e39b02..814d0c3253 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
>  }
>  
>  /* Return the size of the pool, rounded up to the nearest MB */
> -static unsigned int
> -hap_get_allocation(struct domain *d)
> +unsigned int hap_get_allocation(struct domain *d)
>  {
>      unsigned int pg = d->arch.paging.hap.total_pages
>          + d->arch.paging.hap.p2m_pages;
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..23deeddff2 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -22,6 +22,7 @@
>  
>  #include <xen/types.h>
>  #include <xen/domain_page.h>
> +#include <xen/event.h>
>  #include <xen/spinlock.h>
>  #include <xen/rwlock.h>
>  #include <xen/mm.h>
> @@ -36,6 +37,8 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
>  #include <xsm/xsm.h>
>  
>  #include "mm-locks.h"
> @@ -1444,6 +1447,334 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
>      return 0;
>  }
>  
> +/*
> + * Forking a page only gets called when the VM faults due to no entry being
> + * in the EPT for the access. Depending on the type of access we either
> + * populate the physmap with a shared entry for read-only access or
> + * fork the page if its a write access.
> + *
> + * The client p2m is already locked so we only need to lock
> + * the parent's here.
> + */
> +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> +{
> +    int rc = -ENOENT;
> +    shr_handle_t handle;
> +    struct domain *parent = d->parent;
> +    struct p2m_domain *p2m;
> +    unsigned long gfn_l = gfn_x(gfn);
> +    mfn_t mfn, new_mfn;
> +    p2m_type_t p2mt;
> +    struct page_info *page;
> +
> +    if ( !mem_sharing_is_fork(d) )
> +        return -ENOENT;
> +
> +    if ( !unsharing )
> +    {
> +        /* For read-only accesses we just add a shared entry to the physmap */
> +        while ( parent )
> +        {
> +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> +                break;
> +
> +            parent = parent->parent;
> +        }
> +
> +        if ( !rc )
> +        {
> +            /* The client's p2m is already locked */
> +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);

Nit: I think you could just use the existing p2m local variable.

> +            p2m_lock(pp2m);
> +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> +            p2m_unlock(pp2m);
> +
> +            if ( !rc )
> +                return 0;
> +        }
> +    }
> +
> +    /*
> +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> +     * the physmap failed we'll fork the page directly.
> +     */
> +    p2m = p2m_get_hostp2m(d);
> +    parent = d->parent;
> +
> +    while ( parent )
> +    {
> +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> +
> +        /*
> +         * We can't fork grant memory from the parent, only regular ram.
> +         */

Nit: single line comments should use /* ... */ (here and below).

> +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> +            break;
> +
> +        put_gfn(parent, gfn_l);
> +        parent = parent->parent;
> +    }
> +
> +    if ( !parent )
> +        return -ENOENT;
> +
> +    if ( !(page = alloc_domheap_page(d, 0)) )
> +    {
> +        put_gfn(parent, gfn_l);
> +        return -ENOMEM;
> +    }
> +
> +    new_mfn = page_to_mfn(page);
> +    copy_domain_page(new_mfn, mfn);
> +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +    put_gfn(parent, gfn_l);
> +
> +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> +                          p2m->default_access, -1);
> +}
> +
> +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    int ret = -EINVAL;

Nit: you can get rid of ret...

> +
> +    if ( d->max_vcpus != cd->max_vcpus ||
> +        (ret = cpupool_move_domain(cd, d->cpupool)) )
> +        return ret;

...and just return -EINVAL here. Seeing as it's not used anywhere
else.

> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        if ( !d->vcpu[i] || cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;
> +    }
> +
> +    domain_update_node_affinity(cd);
> +    return 0;
> +}
> +
> +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    int ret = -EINVAL;
> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        const struct vcpu *d_vcpu = d->vcpu[i];
> +        struct vcpu *cd_vcpu = cd->vcpu[i];
> +        struct vcpu_runstate_info runstate;
> +        mfn_t vcpu_info_mfn;
> +
> +        if ( !d_vcpu || !cd_vcpu )
> +            continue;
> +
> +        /*
> +         * Copy & map in the vcpu_info page if the guest uses one
> +         */
> +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        {
> +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> +
> +            /*
> +             * Allocate & map the page for it if it hasn't been already
> +             */
> +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> +            {
> +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> +                unsigned long gfn_l = gfn_x(gfn);
> +                struct page_info *page;
> +
> +                if ( !(page = alloc_domheap_page(cd, 0)) )
> +                    return -ENOMEM;
> +
> +                new_vcpu_info_mfn = page_to_mfn(page);
> +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> +
> +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> +                                     p2m_ram_rw, p2m->default_access, -1);
> +                if ( ret )
> +                    return ret;
> +
> +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> +                                    d_vcpu->vcpu_info_offset);
> +                if ( ret )
> +                    return ret;
> +            }
> +
> +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> +        }
> +
> +        /*
> +         * Setup the vCPU runstate area
> +         */
> +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )

Maybe I'm confused, but isn't this the other way around and you need
to check? If the parent runstate is not null copy it to the fork,
ie:

if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
{
    ...

> +        {
> +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> +            vcpu_runstate_get(cd_vcpu, &runstate);
> +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);

You should check the return code I think.

> +        }
> +
> +        /*
> +         * TODO: to support VMs with PV interfaces copy additional
> +         * settings here, such as PV timers.
> +         */
> +    }
> +
> +    return 0;
> +}
> +
> +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> +{
> +    int rc;
> +    bool preempted;
> +    unsigned long mb = hap_get_allocation(d);
> +
> +    if ( mb == hap_get_allocation(cd) )
> +        return 0;
> +
> +    paging_lock(cd);
> +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> +    paging_unlock(cd);
> +
> +    return preempted ? -ERESTART : rc;
> +}
> +
> +static void copy_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    /* Don't bump incarnation on set */
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> +}
> +
> +static int copy_special_pages(struct domain *cd, struct domain *d)
> +{
> +    mfn_t new_mfn, old_mfn;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    static const unsigned int params[] =
> +    {
> +        HVM_PARAM_STORE_PFN,
> +        HVM_PARAM_IOREQ_PFN,
> +        HVM_PARAM_BUFIOREQ_PFN,
> +        HVM_PARAM_CONSOLE_PFN
> +    };
> +    unsigned int i;
> +    int rc;
> +
> +    for ( i = 0; i < 4; i++ )

Please use ARRAY_SIZE instead of hard coding 4.

> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 9f51370327..1ed7d13084 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
>  
>      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>  
> +    /* Check if we need to fork the page */
> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> +
> +    /* Check if we need to unshare the page */
>      if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
>      {
>          ASSERT(p2m_is_hostp2m(p2m));
> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
>              return page;
>  
>          /* Error path: not a suitable GFN at all */
> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> +             !mem_sharing_is_fork(p2m->domain) )
>              return NULL;
>      }
>  
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index b4eb476a9c..62aed53a16 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
>  
>      v->vcpu_info = new_info;
>      v->vcpu_info_mfn = page_to_mfn(page);
> +#ifdef CONFIG_MEM_SHARING
> +    v->vcpu_info_offset = offset;

There's no need to introduce this field, you can just use v->vcpu_info
& ~PAGE_MASK AFAICT.

Thanks, Roger.
Tamas K Lengyel March 25, 2020, 4:04 p.m. UTC | #2
On Wed, Mar 25, 2020 at 9:47 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> Sorry it has taken me a while to get to this.

Thanks for the review, I'm addressing all the items you noticed in the
next revision.

Tamas
Tamas K Lengyel March 25, 2020, 4:16 p.m. UTC | #3
> > +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > +{
> > +    unsigned int i;
> > +    int ret = -EINVAL;
>
> Nit: you can get rid of ret...
>
> > +
> > +    if ( d->max_vcpus != cd->max_vcpus ||
> > +        (ret = cpupool_move_domain(cd, d->cpupool)) )
> > +        return ret;
>
> ...and just return -EINVAL here. Seeing as it's not used anywhere
> else.
>

It is actually still needed, note that we store the return value of
cpupool_move_domain in it.

Tamas
Tamas K Lengyel March 25, 2020, 4:34 p.m. UTC | #4
On Wed, Mar 25, 2020 at 9:47 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> Sorry it has taken me a while to get to this.
>
> On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> > VM forking is the process of creating a domain with an empty memory space and a
> > parent domain specified from which to populate the memory when necessary. For
> > the new domain to be functional the VM state is copied over as part of the fork
> > operation (HVM params, hap allocation, etc).
> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > ---
> > v12: Minor style adjustments Jan pointed out
> >      Convert mem_sharing_is_fork to inline function
> > v11: Fully copy vcpu_info pages
> >      Setup vcpu_runstate for forks
> >      Added TODO note for PV timers
> >      Copy shared_info page
> >      Add copy_settings function, to be shared with fork_reset in the next patch
> > ---
> >  xen/arch/x86/domain.c             |  11 +
> >  xen/arch/x86/hvm/hvm.c            |   4 +-
> >  xen/arch/x86/mm/hap/hap.c         |   3 +-
> >  xen/arch/x86/mm/mem_sharing.c     | 368 ++++++++++++++++++++++++++++++
> >  xen/arch/x86/mm/p2m.c             |   9 +-
> >  xen/common/domain.c               |   3 +
> >  xen/include/asm-x86/hap.h         |   1 +
> >  xen/include/asm-x86/hvm/hvm.h     |   2 +
> >  xen/include/asm-x86/mem_sharing.h |  18 ++
> >  xen/include/public/memory.h       |   5 +
> >  xen/include/xen/sched.h           |   5 +
> >  11 files changed, 424 insertions(+), 5 deletions(-)
> >
> > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> > index caf2ecad7e..11d3c2216e 100644
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
> >              ret = relinquish_shared_pages(d);
> >              if ( ret )
> >                  return ret;
> > +
> > +            /*
> > +             * If the domain is forked, decrement the parent's pause count
> > +             * and release the domain.
> > +             */
> > +            if ( mem_sharing_is_fork(d) )
> > +            {
> > +                domain_unpause(d->parent);
> > +                put_domain(d->parent);
> > +                d->parent = NULL;
> > +            }
> >          }
> >  #endif
> >
> > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> > index a3d115b650..304b3d1562 100644
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
> >      }
> >  #endif
> >
> > -    /* Spurious fault? PoD and log-dirty also take this path. */
> > +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
> >      if ( p2m_is_ram(p2mt) )
> >      {
> >          rc = 1;
> > @@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
> >      return rc;
> >  }
> >
> > -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> > +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> >  {
> >      int rc;
> >
> > diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> > index a6d5e39b02..814d0c3253 100644
> > --- a/xen/arch/x86/mm/hap/hap.c
> > +++ b/xen/arch/x86/mm/hap/hap.c
> > @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
> >  }
> >
> >  /* Return the size of the pool, rounded up to the nearest MB */
> > -static unsigned int
> > -hap_get_allocation(struct domain *d)
> > +unsigned int hap_get_allocation(struct domain *d)
> >  {
> >      unsigned int pg = d->arch.paging.hap.total_pages
> >          + d->arch.paging.hap.p2m_pages;
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index 3835bc928f..23deeddff2 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -22,6 +22,7 @@
> >
> >  #include <xen/types.h>
> >  #include <xen/domain_page.h>
> > +#include <xen/event.h>
> >  #include <xen/spinlock.h>
> >  #include <xen/rwlock.h>
> >  #include <xen/mm.h>
> > @@ -36,6 +37,8 @@
> >  #include <asm/altp2m.h>
> >  #include <asm/atomic.h>
> >  #include <asm/event.h>
> > +#include <asm/hap.h>
> > +#include <asm/hvm/hvm.h>
> >  #include <xsm/xsm.h>
> >
> >  #include "mm-locks.h"
> > @@ -1444,6 +1447,334 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
> >      return 0;
> >  }
> >
> > +/*
> > + * Forking a page only gets called when the VM faults due to no entry being
> > + * in the EPT for the access. Depending on the type of access we either
> > + * populate the physmap with a shared entry for read-only access or
> > + * fork the page if its a write access.
> > + *
> > + * The client p2m is already locked so we only need to lock
> > + * the parent's here.
> > + */
> > +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> > +{
> > +    int rc = -ENOENT;
> > +    shr_handle_t handle;
> > +    struct domain *parent = d->parent;
> > +    struct p2m_domain *p2m;
> > +    unsigned long gfn_l = gfn_x(gfn);
> > +    mfn_t mfn, new_mfn;
> > +    p2m_type_t p2mt;
> > +    struct page_info *page;
> > +
> > +    if ( !mem_sharing_is_fork(d) )
> > +        return -ENOENT;
> > +
> > +    if ( !unsharing )
> > +    {
> > +        /* For read-only accesses we just add a shared entry to the physmap */
> > +        while ( parent )
> > +        {
> > +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> > +                break;
> > +
> > +            parent = parent->parent;
> > +        }
> > +
> > +        if ( !rc )
> > +        {
> > +            /* The client's p2m is already locked */
> > +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
>
> Nit: I think you could just use the existing p2m local variable.
>
> > +            p2m_lock(pp2m);
> > +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> > +            p2m_unlock(pp2m);
> > +
> > +            if ( !rc )
> > +                return 0;
> > +        }
> > +    }
> > +
> > +    /*
> > +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> > +     * the physmap failed we'll fork the page directly.
> > +     */
> > +    p2m = p2m_get_hostp2m(d);
> > +    parent = d->parent;
> > +
> > +    while ( parent )
> > +    {
> > +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> > +
> > +        /*
> > +         * We can't fork grant memory from the parent, only regular ram.
> > +         */
>
> Nit: single line comments should use /* ... */ (here and below).
>
> > +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> > +            break;
> > +
> > +        put_gfn(parent, gfn_l);
> > +        parent = parent->parent;
> > +    }
> > +
> > +    if ( !parent )
> > +        return -ENOENT;
> > +
> > +    if ( !(page = alloc_domheap_page(d, 0)) )
> > +    {
> > +        put_gfn(parent, gfn_l);
> > +        return -ENOMEM;
> > +    }
> > +
> > +    new_mfn = page_to_mfn(page);
> > +    copy_domain_page(new_mfn, mfn);
> > +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> > +
> > +    put_gfn(parent, gfn_l);
> > +
> > +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> > +                          p2m->default_access, -1);
> > +}
> > +
> > +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > +{
> > +    unsigned int i;
> > +    int ret = -EINVAL;
>
> Nit: you can get rid of ret...
>
> > +
> > +    if ( d->max_vcpus != cd->max_vcpus ||
> > +        (ret = cpupool_move_domain(cd, d->cpupool)) )
> > +        return ret;
>
> ...and just return -EINVAL here. Seeing as it's not used anywhere
> else.
>
> > +
> > +    for ( i = 0; i < cd->max_vcpus; i++ )
> > +    {
> > +        if ( !d->vcpu[i] || cd->vcpu[i] )
> > +            continue;
> > +
> > +        if ( !vcpu_create(cd, i) )
> > +            return -EINVAL;
> > +    }
> > +
> > +    domain_update_node_affinity(cd);
> > +    return 0;
> > +}
> > +
> > +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> > +{
> > +    unsigned int i;
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > +    int ret = -EINVAL;
> > +
> > +    for ( i = 0; i < cd->max_vcpus; i++ )
> > +    {
> > +        const struct vcpu *d_vcpu = d->vcpu[i];
> > +        struct vcpu *cd_vcpu = cd->vcpu[i];
> > +        struct vcpu_runstate_info runstate;
> > +        mfn_t vcpu_info_mfn;
> > +
> > +        if ( !d_vcpu || !cd_vcpu )
> > +            continue;
> > +
> > +        /*
> > +         * Copy & map in the vcpu_info page if the guest uses one
> > +         */
> > +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > +        {
> > +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > +
> > +            /*
> > +             * Allocate & map the page for it if it hasn't been already
> > +             */
> > +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> > +            {
> > +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> > +                unsigned long gfn_l = gfn_x(gfn);
> > +                struct page_info *page;
> > +
> > +                if ( !(page = alloc_domheap_page(cd, 0)) )
> > +                    return -ENOMEM;
> > +
> > +                new_vcpu_info_mfn = page_to_mfn(page);
> > +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> > +
> > +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> > +                                     p2m_ram_rw, p2m->default_access, -1);
> > +                if ( ret )
> > +                    return ret;
> > +
> > +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> > +                                    d_vcpu->vcpu_info_offset);
> > +                if ( ret )
> > +                    return ret;
> > +            }
> > +
> > +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> > +        }
> > +
> > +        /*
> > +         * Setup the vCPU runstate area
> > +         */
> > +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )
>
> Maybe I'm confused, but isn't this the other way around and you need
> to check? If the parent runstate is not null copy it to the fork,
> ie:
>
> if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
> {
>     ...
>
> > +        {
> > +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> > +            vcpu_runstate_get(cd_vcpu, &runstate);
> > +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);
>
> You should check the return code I think.
>
> > +        }
> > +
> > +        /*
> > +         * TODO: to support VMs with PV interfaces copy additional
> > +         * settings here, such as PV timers.
> > +         */
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> > +{
> > +    int rc;
> > +    bool preempted;
> > +    unsigned long mb = hap_get_allocation(d);
> > +
> > +    if ( mb == hap_get_allocation(cd) )
> > +        return 0;
> > +
> > +    paging_lock(cd);
> > +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> > +    paging_unlock(cd);
> > +
> > +    return preempted ? -ERESTART : rc;
> > +}
> > +
> > +static void copy_tsc(struct domain *cd, struct domain *d)
> > +{
> > +    uint32_t tsc_mode;
> > +    uint32_t gtsc_khz;
> > +    uint32_t incarnation;
> > +    uint64_t elapsed_nsec;
> > +
> > +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> > +    /* Don't bump incarnation on set */
> > +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> > +}
> > +
> > +static int copy_special_pages(struct domain *cd, struct domain *d)
> > +{
> > +    mfn_t new_mfn, old_mfn;
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > +    static const unsigned int params[] =
> > +    {
> > +        HVM_PARAM_STORE_PFN,
> > +        HVM_PARAM_IOREQ_PFN,
> > +        HVM_PARAM_BUFIOREQ_PFN,
> > +        HVM_PARAM_CONSOLE_PFN
> > +    };
> > +    unsigned int i;
> > +    int rc;
> > +
> > +    for ( i = 0; i < 4; i++ )
>
> Please use ARRAY_SIZE instead of hard coding 4.
>
> > diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> > index 9f51370327..1ed7d13084 100644
> > --- a/xen/arch/x86/mm/p2m.c
> > +++ b/xen/arch/x86/mm/p2m.c
> > @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> >
> >      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >
> > +    /* Check if we need to fork the page */
> > +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> > +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > +
> > +    /* Check if we need to unshare the page */
> >      if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> >      {
> >          ASSERT(p2m_is_hostp2m(p2m));
> > @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> >              return page;
> >
> >          /* Error path: not a suitable GFN at all */
> > -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> > +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> > +             !mem_sharing_is_fork(p2m->domain) )
> >              return NULL;
> >      }
> >
> > diff --git a/xen/common/domain.c b/xen/common/domain.c
> > index b4eb476a9c..62aed53a16 100644
> > --- a/xen/common/domain.c
> > +++ b/xen/common/domain.c
> > @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> >
> >      v->vcpu_info = new_info;
> >      v->vcpu_info_mfn = page_to_mfn(page);
> > +#ifdef CONFIG_MEM_SHARING
> > +    v->vcpu_info_offset = offset;
>
> There's no need to introduce this field, you can just use v->vcpu_info
> & ~PAGE_MASK AFAICT.

Just doing what you suggest above results in:

mem_sharing.c:1603:55: error: invalid operands to binary & (have
‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
int’)
                                     d_vcpu->vcpu_info & ~PAGE_MASK);

I can of course cast the vcpu_info pointer to (long int), it's just a
bit ugly. Thoughts?

Tamas
Julien Grall March 25, 2020, 4:42 p.m. UTC | #5
Hi,

On 25/03/2020 16:34, Tamas K Lengyel wrote:
>>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
>>> index 9f51370327..1ed7d13084 100644
>>> --- a/xen/arch/x86/mm/p2m.c
>>> +++ b/xen/arch/x86/mm/p2m.c
>>> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
>>>
>>>       mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>>>
>>> +    /* Check if we need to fork the page */
>>> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
>>> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
>>> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>>> +
>>> +    /* Check if we need to unshare the page */
>>>       if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
>>>       {
>>>           ASSERT(p2m_is_hostp2m(p2m));
>>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
>>>               return page;
>>>
>>>           /* Error path: not a suitable GFN at all */
>>> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
>>> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
>>> +             !mem_sharing_is_fork(p2m->domain) )
>>>               return NULL;
>>>       }
>>>
>>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>>> index b4eb476a9c..62aed53a16 100644
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
>>>
>>>       v->vcpu_info = new_info;
>>>       v->vcpu_info_mfn = page_to_mfn(page);
>>> +#ifdef CONFIG_MEM_SHARING
>>> +    v->vcpu_info_offset = offset;
>>
>> There's no need to introduce this field, you can just use v->vcpu_info
>> & ~PAGE_MASK AFAICT.
> 
> Just doing what you suggest above results in:
> 
> mem_sharing.c:1603:55: error: invalid operands to binary & (have
> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> int’)
>                                       d_vcpu->vcpu_info & ~PAGE_MASK);
> 
> I can of course cast the vcpu_info pointer to (long int), it's just a
> bit ugly. Thoughts?

FWIW, I will also need the offset for liveupdate. I have used (unsigned 
long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.

So I am all for either a new field or a macro hiding this uglyness.

Cheers,
Tamas K Lengyel March 25, 2020, 4:47 p.m. UTC | #6
On Wed, Mar 25, 2020 at 10:42 AM Julien Grall <julien@xen.org> wrote:
>
> Hi,
>
> On 25/03/2020 16:34, Tamas K Lengyel wrote:
> >>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> >>> index 9f51370327..1ed7d13084 100644
> >>> --- a/xen/arch/x86/mm/p2m.c
> >>> +++ b/xen/arch/x86/mm/p2m.c
> >>> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> >>>
> >>>       mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >>>
> >>> +    /* Check if we need to fork the page */
> >>> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> >>> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> >>> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >>> +
> >>> +    /* Check if we need to unshare the page */
> >>>       if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> >>>       {
> >>>           ASSERT(p2m_is_hostp2m(p2m));
> >>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> >>>               return page;
> >>>
> >>>           /* Error path: not a suitable GFN at all */
> >>> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> >>> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> >>> +             !mem_sharing_is_fork(p2m->domain) )
> >>>               return NULL;
> >>>       }
> >>>
> >>> diff --git a/xen/common/domain.c b/xen/common/domain.c
> >>> index b4eb476a9c..62aed53a16 100644
> >>> --- a/xen/common/domain.c
> >>> +++ b/xen/common/domain.c
> >>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> >>>
> >>>       v->vcpu_info = new_info;
> >>>       v->vcpu_info_mfn = page_to_mfn(page);
> >>> +#ifdef CONFIG_MEM_SHARING
> >>> +    v->vcpu_info_offset = offset;
> >>
> >> There's no need to introduce this field, you can just use v->vcpu_info
> >> & ~PAGE_MASK AFAICT.
> >
> > Just doing what you suggest above results in:
> >
> > mem_sharing.c:1603:55: error: invalid operands to binary & (have
> > ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> > int’)
> >                                       d_vcpu->vcpu_info & ~PAGE_MASK);
> >
> > I can of course cast the vcpu_info pointer to (long int), it's just a
> > bit ugly. Thoughts?
>
> FWIW, I will also need the offset for liveupdate. I have used (unsigned
> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
>
> So I am all for either a new field or a macro hiding this uglyness.

A macro sounds like a good way to go, no need for an extra field if we
can calculate it based on the currently existing one. How about

#define VCPU_INFO_OFFSET(v) (((unsigned long)v->vcpu_info) & ~PAGE_MASK)

?

Tamas
Julien Grall March 25, 2020, 4:51 p.m. UTC | #7
On 25/03/2020 16:47, Tamas K Lengyel wrote:
> On Wed, Mar 25, 2020 at 10:42 AM Julien Grall <julien@xen.org> wrote:
>>
>> Hi,
>>
>> On 25/03/2020 16:34, Tamas K Lengyel wrote:
>>>>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
>>>>> index 9f51370327..1ed7d13084 100644
>>>>> --- a/xen/arch/x86/mm/p2m.c
>>>>> +++ b/xen/arch/x86/mm/p2m.c
>>>>> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
>>>>>
>>>>>        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>>>>>
>>>>> +    /* Check if we need to fork the page */
>>>>> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
>>>>> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
>>>>> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>>>>> +
>>>>> +    /* Check if we need to unshare the page */
>>>>>        if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
>>>>>        {
>>>>>            ASSERT(p2m_is_hostp2m(p2m));
>>>>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
>>>>>                return page;
>>>>>
>>>>>            /* Error path: not a suitable GFN at all */
>>>>> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
>>>>> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
>>>>> +             !mem_sharing_is_fork(p2m->domain) )
>>>>>                return NULL;
>>>>>        }
>>>>>
>>>>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>>>>> index b4eb476a9c..62aed53a16 100644
>>>>> --- a/xen/common/domain.c
>>>>> +++ b/xen/common/domain.c
>>>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
>>>>>
>>>>>        v->vcpu_info = new_info;
>>>>>        v->vcpu_info_mfn = page_to_mfn(page);
>>>>> +#ifdef CONFIG_MEM_SHARING
>>>>> +    v->vcpu_info_offset = offset;
>>>>
>>>> There's no need to introduce this field, you can just use v->vcpu_info
>>>> & ~PAGE_MASK AFAICT.
>>>
>>> Just doing what you suggest above results in:
>>>
>>> mem_sharing.c:1603:55: error: invalid operands to binary & (have
>>> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
>>> int’)
>>>                                        d_vcpu->vcpu_info & ~PAGE_MASK);
>>>
>>> I can of course cast the vcpu_info pointer to (long int), it's just a
>>> bit ugly. Thoughts?
>>
>> FWIW, I will also need the offset for liveupdate. I have used (unsigned
>> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
>>
>> So I am all for either a new field or a macro hiding this uglyness.
> 
> A macro sounds like a good way to go, no need for an extra field if we
> can calculate it based on the currently existing one. How about
> 
> #define VCPU_INFO_OFFSET(v) (((unsigned long)v->vcpu_info) & ~PAGE_MASK)

I was more thinking a generic macro to find the offset in a page.

PAGE_OFFSET(ptr) ((unsigned long)(ptr) & ~PAGE_MASK)

Anyway, I am happy either way.

Cheers,
Roger Pau Monné March 25, 2020, 4:54 p.m. UTC | #8
On Wed, Mar 25, 2020 at 04:42:07PM +0000, Julien Grall wrote:
> Hi,
> 
> On 25/03/2020 16:34, Tamas K Lengyel wrote:
> > > > diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> > > > index 9f51370327..1ed7d13084 100644
> > > > --- a/xen/arch/x86/mm/p2m.c
> > > > +++ b/xen/arch/x86/mm/p2m.c
> > > > @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> > > > 
> > > >       mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > > > 
> > > > +    /* Check if we need to fork the page */
> > > > +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > > > +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> > > > +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > > > +
> > > > +    /* Check if we need to unshare the page */
> > > >       if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> > > >       {
> > > >           ASSERT(p2m_is_hostp2m(p2m));
> > > > @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> > > >               return page;
> > > > 
> > > >           /* Error path: not a suitable GFN at all */
> > > > -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> > > > +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> > > > +             !mem_sharing_is_fork(p2m->domain) )
> > > >               return NULL;
> > > >       }
> > > > 
> > > > diff --git a/xen/common/domain.c b/xen/common/domain.c
> > > > index b4eb476a9c..62aed53a16 100644
> > > > --- a/xen/common/domain.c
> > > > +++ b/xen/common/domain.c
> > > > @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> > > > 
> > > >       v->vcpu_info = new_info;
> > > >       v->vcpu_info_mfn = page_to_mfn(page);
> > > > +#ifdef CONFIG_MEM_SHARING
> > > > +    v->vcpu_info_offset = offset;
> > > 
> > > There's no need to introduce this field, you can just use v->vcpu_info
> > > & ~PAGE_MASK AFAICT.
> > 
> > Just doing what you suggest above results in:
> > 
> > mem_sharing.c:1603:55: error: invalid operands to binary & (have
> > ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> > int’)
> >                                       d_vcpu->vcpu_info & ~PAGE_MASK);
> > 
> > I can of course cast the vcpu_info pointer to (long int), it's just a
> > bit ugly. Thoughts?
> 
> FWIW, I will also need the offset for liveupdate. I have used (unsigned
> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.

I think using:

(vaddr_t)v->vcpu_info & ~PAGE_MASK

is acceptable, but that would require adding a vaddr_t typedef to x86.

Adding a macro to do so would be OK by me, maybe PAGE_OFFSET or some
such?

> So I am all for either a new field or a macro hiding this uglyness.

Macro would be better IMO, as it's redundant to store the offset when
we can obtain it from an existing field.

Thanks, Roger.
Tamas K Lengyel March 25, 2020, 5 p.m. UTC | #9
On Wed, Mar 25, 2020 at 10:52 AM Julien Grall <julien@xen.org> wrote:
>
>
>
> On 25/03/2020 16:47, Tamas K Lengyel wrote:
> > On Wed, Mar 25, 2020 at 10:42 AM Julien Grall <julien@xen.org> wrote:
> >>
> >> Hi,
> >>
> >> On 25/03/2020 16:34, Tamas K Lengyel wrote:
> >>>>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> >>>>> index 9f51370327..1ed7d13084 100644
> >>>>> --- a/xen/arch/x86/mm/p2m.c
> >>>>> +++ b/xen/arch/x86/mm/p2m.c
> >>>>> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> >>>>>
> >>>>>        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >>>>>
> >>>>> +    /* Check if we need to fork the page */
> >>>>> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> >>>>> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> >>>>> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >>>>> +
> >>>>> +    /* Check if we need to unshare the page */
> >>>>>        if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> >>>>>        {
> >>>>>            ASSERT(p2m_is_hostp2m(p2m));
> >>>>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> >>>>>                return page;
> >>>>>
> >>>>>            /* Error path: not a suitable GFN at all */
> >>>>> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> >>>>> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> >>>>> +             !mem_sharing_is_fork(p2m->domain) )
> >>>>>                return NULL;
> >>>>>        }
> >>>>>
> >>>>> diff --git a/xen/common/domain.c b/xen/common/domain.c
> >>>>> index b4eb476a9c..62aed53a16 100644
> >>>>> --- a/xen/common/domain.c
> >>>>> +++ b/xen/common/domain.c
> >>>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> >>>>>
> >>>>>        v->vcpu_info = new_info;
> >>>>>        v->vcpu_info_mfn = page_to_mfn(page);
> >>>>> +#ifdef CONFIG_MEM_SHARING
> >>>>> +    v->vcpu_info_offset = offset;
> >>>>
> >>>> There's no need to introduce this field, you can just use v->vcpu_info
> >>>> & ~PAGE_MASK AFAICT.
> >>>
> >>> Just doing what you suggest above results in:
> >>>
> >>> mem_sharing.c:1603:55: error: invalid operands to binary & (have
> >>> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> >>> int’)
> >>>                                        d_vcpu->vcpu_info & ~PAGE_MASK);
> >>>
> >>> I can of course cast the vcpu_info pointer to (long int), it's just a
> >>> bit ugly. Thoughts?
> >>
> >> FWIW, I will also need the offset for liveupdate. I have used (unsigned
> >> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
> >>
> >> So I am all for either a new field or a macro hiding this uglyness.
> >
> > A macro sounds like a good way to go, no need for an extra field if we
> > can calculate it based on the currently existing one. How about
> >
> > #define VCPU_INFO_OFFSET(v) (((unsigned long)v->vcpu_info) & ~PAGE_MASK)
>
> I was more thinking a generic macro to find the offset in a page.
>
> PAGE_OFFSET(ptr) ((unsigned long)(ptr) & ~PAGE_MASK)

LGTM. Should we stuff this into xen/sched.h or asm/page.h?

Tamas
Roger Pau Monné March 25, 2020, 5:16 p.m. UTC | #10
On Wed, Mar 25, 2020 at 11:00:05AM -0600, Tamas K Lengyel wrote:
> On Wed, Mar 25, 2020 at 10:52 AM Julien Grall <julien@xen.org> wrote:
> >
> >
> >
> > On 25/03/2020 16:47, Tamas K Lengyel wrote:
> > > On Wed, Mar 25, 2020 at 10:42 AM Julien Grall <julien@xen.org> wrote:
> > >>
> > >> Hi,
> > >>
> > >> On 25/03/2020 16:34, Tamas K Lengyel wrote:
> > >>>>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> > >>>>> index 9f51370327..1ed7d13084 100644
> > >>>>> --- a/xen/arch/x86/mm/p2m.c
> > >>>>> +++ b/xen/arch/x86/mm/p2m.c
> > >>>>> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> > >>>>>
> > >>>>>        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > >>>>>
> > >>>>> +    /* Check if we need to fork the page */
> > >>>>> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > >>>>> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> > >>>>> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > >>>>> +
> > >>>>> +    /* Check if we need to unshare the page */
> > >>>>>        if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> > >>>>>        {
> > >>>>>            ASSERT(p2m_is_hostp2m(p2m));
> > >>>>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> > >>>>>                return page;
> > >>>>>
> > >>>>>            /* Error path: not a suitable GFN at all */
> > >>>>> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> > >>>>> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> > >>>>> +             !mem_sharing_is_fork(p2m->domain) )
> > >>>>>                return NULL;
> > >>>>>        }
> > >>>>>
> > >>>>> diff --git a/xen/common/domain.c b/xen/common/domain.c
> > >>>>> index b4eb476a9c..62aed53a16 100644
> > >>>>> --- a/xen/common/domain.c
> > >>>>> +++ b/xen/common/domain.c
> > >>>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> > >>>>>
> > >>>>>        v->vcpu_info = new_info;
> > >>>>>        v->vcpu_info_mfn = page_to_mfn(page);
> > >>>>> +#ifdef CONFIG_MEM_SHARING
> > >>>>> +    v->vcpu_info_offset = offset;
> > >>>>
> > >>>> There's no need to introduce this field, you can just use v->vcpu_info
> > >>>> & ~PAGE_MASK AFAICT.
> > >>>
> > >>> Just doing what you suggest above results in:
> > >>>
> > >>> mem_sharing.c:1603:55: error: invalid operands to binary & (have
> > >>> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> > >>> int’)
> > >>>                                        d_vcpu->vcpu_info & ~PAGE_MASK);
> > >>>
> > >>> I can of course cast the vcpu_info pointer to (long int), it's just a
> > >>> bit ugly. Thoughts?
> > >>
> > >> FWIW, I will also need the offset for liveupdate. I have used (unsigned
> > >> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
> > >>
> > >> So I am all for either a new field or a macro hiding this uglyness.
> > >
> > > A macro sounds like a good way to go, no need for an extra field if we
> > > can calculate it based on the currently existing one. How about
> > >
> > > #define VCPU_INFO_OFFSET(v) (((unsigned long)v->vcpu_info) & ~PAGE_MASK)
> >
> > I was more thinking a generic macro to find the offset in a page.
> >
> > PAGE_OFFSET(ptr) ((unsigned long)(ptr) & ~PAGE_MASK)
> 
> LGTM. Should we stuff this into xen/sched.h or asm/page.h?

page.h would be better, albeit you will have to duplicate it for x86
and Arm. There's xen/pfn.h which is not arch specific, but feels a bit
weird to place it there.

Thanks, Roger.
Tamas K Lengyel March 25, 2020, 5:47 p.m. UTC | #11
On Wed, Mar 25, 2020 at 11:16 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Wed, Mar 25, 2020 at 11:00:05AM -0600, Tamas K Lengyel wrote:
> > On Wed, Mar 25, 2020 at 10:52 AM Julien Grall <julien@xen.org> wrote:
> > >
> > >
> > >
> > > On 25/03/2020 16:47, Tamas K Lengyel wrote:
> > > > On Wed, Mar 25, 2020 at 10:42 AM Julien Grall <julien@xen.org> wrote:
> > > >>
> > > >> Hi,
> > > >>
> > > >> On 25/03/2020 16:34, Tamas K Lengyel wrote:
> > > >>>>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> > > >>>>> index 9f51370327..1ed7d13084 100644
> > > >>>>> --- a/xen/arch/x86/mm/p2m.c
> > > >>>>> +++ b/xen/arch/x86/mm/p2m.c
> > > >>>>> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> > > >>>>>
> > > >>>>>        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > > >>>>>
> > > >>>>> +    /* Check if we need to fork the page */
> > > >>>>> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > > >>>>> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> > > >>>>> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > > >>>>> +
> > > >>>>> +    /* Check if we need to unshare the page */
> > > >>>>>        if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> > > >>>>>        {
> > > >>>>>            ASSERT(p2m_is_hostp2m(p2m));
> > > >>>>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> > > >>>>>                return page;
> > > >>>>>
> > > >>>>>            /* Error path: not a suitable GFN at all */
> > > >>>>> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> > > >>>>> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> > > >>>>> +             !mem_sharing_is_fork(p2m->domain) )
> > > >>>>>                return NULL;
> > > >>>>>        }
> > > >>>>>
> > > >>>>> diff --git a/xen/common/domain.c b/xen/common/domain.c
> > > >>>>> index b4eb476a9c..62aed53a16 100644
> > > >>>>> --- a/xen/common/domain.c
> > > >>>>> +++ b/xen/common/domain.c
> > > >>>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> > > >>>>>
> > > >>>>>        v->vcpu_info = new_info;
> > > >>>>>        v->vcpu_info_mfn = page_to_mfn(page);
> > > >>>>> +#ifdef CONFIG_MEM_SHARING
> > > >>>>> +    v->vcpu_info_offset = offset;
> > > >>>>
> > > >>>> There's no need to introduce this field, you can just use v->vcpu_info
> > > >>>> & ~PAGE_MASK AFAICT.
> > > >>>
> > > >>> Just doing what you suggest above results in:
> > > >>>
> > > >>> mem_sharing.c:1603:55: error: invalid operands to binary & (have
> > > >>> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> > > >>> int’)
> > > >>>                                        d_vcpu->vcpu_info & ~PAGE_MASK);
> > > >>>
> > > >>> I can of course cast the vcpu_info pointer to (long int), it's just a
> > > >>> bit ugly. Thoughts?
> > > >>
> > > >> FWIW, I will also need the offset for liveupdate. I have used (unsigned
> > > >> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
> > > >>
> > > >> So I am all for either a new field or a macro hiding this uglyness.
> > > >
> > > > A macro sounds like a good way to go, no need for an extra field if we
> > > > can calculate it based on the currently existing one. How about
> > > >
> > > > #define VCPU_INFO_OFFSET(v) (((unsigned long)v->vcpu_info) & ~PAGE_MASK)
> > >
> > > I was more thinking a generic macro to find the offset in a page.
> > >
> > > PAGE_OFFSET(ptr) ((unsigned long)(ptr) & ~PAGE_MASK)
> >
> > LGTM. Should we stuff this into xen/sched.h or asm/page.h?
>
> page.h would be better, albeit you will have to duplicate it for x86
> and Arm. There's xen/pfn.h which is not arch specific, but feels a bit
> weird to place it there.
>

I'll go with page.h, that allows us to use vaddr_t for the ARM macro.

Tamas
Jan Beulich March 26, 2020, 7:02 a.m. UTC | #12
On 25.03.2020 17:54, Roger Pau Monné wrote:
> On Wed, Mar 25, 2020 at 04:42:07PM +0000, Julien Grall wrote:
>> On 25/03/2020 16:34, Tamas K Lengyel wrote:
>>>>> --- a/xen/common/domain.c
>>>>> +++ b/xen/common/domain.c
>>>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
>>>>>
>>>>>       v->vcpu_info = new_info;
>>>>>       v->vcpu_info_mfn = page_to_mfn(page);
>>>>> +#ifdef CONFIG_MEM_SHARING
>>>>> +    v->vcpu_info_offset = offset;
>>>>
>>>> There's no need to introduce this field, you can just use v->vcpu_info
>>>> & ~PAGE_MASK AFAICT.
>>>
>>> Just doing what you suggest above results in:
>>>
>>> mem_sharing.c:1603:55: error: invalid operands to binary & (have
>>> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
>>> int’)
>>>                                       d_vcpu->vcpu_info & ~PAGE_MASK);
>>>
>>> I can of course cast the vcpu_info pointer to (long int), it's just a
>>> bit ugly. Thoughts?
>>
>> FWIW, I will also need the offset for liveupdate. I have used (unsigned
>> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
> 
> I think using:
> 
> (vaddr_t)v->vcpu_info & ~PAGE_MASK
> 
> is acceptable, but that would require adding a vaddr_t typedef to x86.

I don't think vaddr_t is necessary given that all over the place we
assume conversions between pointers and unsigned long to be fine.

Jan
Jan Beulich March 26, 2020, 7:07 a.m. UTC | #13
On 25.03.2020 16:47, Roger Pau Monné wrote:
> On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
>> +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
>> +{
>> +    unsigned int i;
>> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
>> +    int ret = -EINVAL;
>> +
>> +    for ( i = 0; i < cd->max_vcpus; i++ )
>> +    {
>> +        const struct vcpu *d_vcpu = d->vcpu[i];
>> +        struct vcpu *cd_vcpu = cd->vcpu[i];
>> +        struct vcpu_runstate_info runstate;
>> +        mfn_t vcpu_info_mfn;
>> +
>> +        if ( !d_vcpu || !cd_vcpu )
>> +            continue;
>> +
>> +        /*
>> +         * Copy & map in the vcpu_info page if the guest uses one
>> +         */
>> +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
>> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
>> +        {
>> +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
>> +
>> +            /*
>> +             * Allocate & map the page for it if it hasn't been already
>> +             */
>> +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
>> +            {
>> +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
>> +                unsigned long gfn_l = gfn_x(gfn);
>> +                struct page_info *page;
>> +
>> +                if ( !(page = alloc_domheap_page(cd, 0)) )
>> +                    return -ENOMEM;
>> +
>> +                new_vcpu_info_mfn = page_to_mfn(page);
>> +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
>> +
>> +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
>> +                                     p2m_ram_rw, p2m->default_access, -1);
>> +                if ( ret )
>> +                    return ret;
>> +
>> +                ret = map_vcpu_info(cd_vcpu, gfn_l,
>> +                                    d_vcpu->vcpu_info_offset);
>> +                if ( ret )
>> +                    return ret;
>> +            }
>> +
>> +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
>> +        }
>> +
>> +        /*
>> +         * Setup the vCPU runstate area
>> +         */
>> +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )
> 
> Maybe I'm confused, but isn't this the other way around and you need
> to check? If the parent runstate is not null copy it to the fork,
> ie:
> 
> if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
> {
>     ...
> 
>> +        {
>> +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
>> +            vcpu_runstate_get(cd_vcpu, &runstate);
>> +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);
> 
> You should check the return code I think.

I don't think so - this is a best effort operation just like e.g.
in the handling of VCPUOP_register_runstate_memory_area.

Jan
Roger Pau Monné March 26, 2020, 8:42 a.m. UTC | #14
On Thu, Mar 26, 2020 at 08:02:22AM +0100, Jan Beulich wrote:
> On 25.03.2020 17:54, Roger Pau Monné wrote:
> > On Wed, Mar 25, 2020 at 04:42:07PM +0000, Julien Grall wrote:
> >> On 25/03/2020 16:34, Tamas K Lengyel wrote:
> >>>>> --- a/xen/common/domain.c
> >>>>> +++ b/xen/common/domain.c
> >>>>> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> >>>>>
> >>>>>       v->vcpu_info = new_info;
> >>>>>       v->vcpu_info_mfn = page_to_mfn(page);
> >>>>> +#ifdef CONFIG_MEM_SHARING
> >>>>> +    v->vcpu_info_offset = offset;
> >>>>
> >>>> There's no need to introduce this field, you can just use v->vcpu_info
> >>>> & ~PAGE_MASK AFAICT.
> >>>
> >>> Just doing what you suggest above results in:
> >>>
> >>> mem_sharing.c:1603:55: error: invalid operands to binary & (have
> >>> ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long
> >>> int’)
> >>>                                       d_vcpu->vcpu_info & ~PAGE_MASK);
> >>>
> >>> I can of course cast the vcpu_info pointer to (long int), it's just a
> >>> bit ugly. Thoughts?
> >>
> >> FWIW, I will also need the offset for liveupdate. I have used (unsigned
> >> long)v->vcpu_info & ~PAGE_MASK so far but this is not really pretty.
> > 
> > I think using:
> > 
> > (vaddr_t)v->vcpu_info & ~PAGE_MASK
> > 
> > is acceptable, but that would require adding a vaddr_t typedef to x86.
> 
> I don't think vaddr_t is necessary given that all over the place we
> assume conversions between pointers and unsigned long to be fine.

Right, using unsigned long is indeed fine, but I also agree with Tamas
that it's slightly ugly and hence wanted to provide a 'cleaner'
suggestion.

Roger.
Roger Pau Monné March 26, 2020, 9:10 a.m. UTC | #15
On Thu, Mar 26, 2020 at 08:07:09AM +0100, Jan Beulich wrote:
> On 25.03.2020 16:47, Roger Pau Monné wrote:
> > On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> >> +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> >> +{
> >> +    unsigned int i;
> >> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> >> +    int ret = -EINVAL;
> >> +
> >> +    for ( i = 0; i < cd->max_vcpus; i++ )
> >> +    {
> >> +        const struct vcpu *d_vcpu = d->vcpu[i];
> >> +        struct vcpu *cd_vcpu = cd->vcpu[i];
> >> +        struct vcpu_runstate_info runstate;
> >> +        mfn_t vcpu_info_mfn;
> >> +
> >> +        if ( !d_vcpu || !cd_vcpu )
> >> +            continue;
> >> +
> >> +        /*
> >> +         * Copy & map in the vcpu_info page if the guest uses one
> >> +         */
> >> +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> >> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> >> +        {
> >> +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> >> +
> >> +            /*
> >> +             * Allocate & map the page for it if it hasn't been already
> >> +             */
> >> +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> >> +            {
> >> +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> >> +                unsigned long gfn_l = gfn_x(gfn);
> >> +                struct page_info *page;
> >> +
> >> +                if ( !(page = alloc_domheap_page(cd, 0)) )
> >> +                    return -ENOMEM;
> >> +
> >> +                new_vcpu_info_mfn = page_to_mfn(page);
> >> +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> >> +
> >> +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> >> +                                     p2m_ram_rw, p2m->default_access, -1);
> >> +                if ( ret )
> >> +                    return ret;
> >> +
> >> +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> >> +                                    d_vcpu->vcpu_info_offset);
> >> +                if ( ret )
> >> +                    return ret;
> >> +            }
> >> +
> >> +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> >> +        }
> >> +
> >> +        /*
> >> +         * Setup the vCPU runstate area
> >> +         */
> >> +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )
> > 
> > Maybe I'm confused, but isn't this the other way around and you need
> > to check? If the parent runstate is not null copy it to the fork,
> > ie:
> > 
> > if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
> > {
> >     ...
> > 
> >> +        {
> >> +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> >> +            vcpu_runstate_get(cd_vcpu, &runstate);
> >> +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);
> > 
> > You should check the return code I think.
> 
> I don't think so - this is a best effort operation just like e.g.
> in the handling of VCPUOP_register_runstate_memory_area.

I think printing a debug message might be helpful, not so much as for
the importance of failing to copy the runstate area, but it could
signal that something went wrong, anyway I don't have such a strong
opinion.

Just to confirm, __copy_to_guest will cause the forked domain memory
to be populated and the whole page to be copied over right? (and will
also cause the page tables to be added to the fork physmap in write
mode to set the accessed/dirty bits)

Thanks, Roger.
Jan Beulich March 26, 2020, 12:33 p.m. UTC | #16
On 23.03.2020 18:04, Tamas K Lengyel wrote:
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( mem_sharing_is_fork(d) )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;

I think you want to clear the field before putting the reference,
to make sure possible readers of it won't see it non-NULL when
the domain is already being cleaned up, or even gone.

With this, applicable parts of the change
Acked-by: Jan Beulich <jbeulich@suse.com>

I'll try to keep an eye on when you and Roger have settled on the
remaining aspects, to determine when this (probably v13) can be
committed.

> --- a/xen/include/asm-x86/mem_sharing.h
> +++ b/xen/include/asm-x86/mem_sharing.h
> @@ -77,6 +77,14 @@ static inline int mem_sharing_unshare_page(struct domain *d,
>      return rc;
>  }
>  
> +static inline bool mem_sharing_is_fork(struct domain *d)

const? (then also in the stub further down)

Jan
Tamas K Lengyel March 26, 2020, 2:52 p.m. UTC | #17
On Thu, Mar 26, 2020 at 6:33 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 23.03.2020 18:04, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
> >              ret = relinquish_shared_pages(d);
> >              if ( ret )
> >                  return ret;
> > +
> > +            /*
> > +             * If the domain is forked, decrement the parent's pause count
> > +             * and release the domain.
> > +             */
> > +            if ( mem_sharing_is_fork(d) )
> > +            {
> > +                domain_unpause(d->parent);
> > +                put_domain(d->parent);
> > +                d->parent = NULL;
>
> I think you want to clear the field before putting the reference,
> to make sure possible readers of it won't see it non-NULL when
> the domain is already being cleaned up, or even gone.

Sure.

>
> With this, applicable parts of the change
> Acked-by: Jan Beulich <jbeulich@suse.com>
>
> I'll try to keep an eye on when you and Roger have settled on the
> remaining aspects, to determine when this (probably v13) can be
> committed.

Thanks!

>
> > --- a/xen/include/asm-x86/mem_sharing.h
> > +++ b/xen/include/asm-x86/mem_sharing.h
> > @@ -77,6 +77,14 @@ static inline int mem_sharing_unshare_page(struct domain *d,
> >      return rc;
> >  }
> >
> > +static inline bool mem_sharing_is_fork(struct domain *d)
>
> const? (then also in the stub further down)

Sure.

Tamas
Tamas K Lengyel March 26, 2020, 5:01 p.m. UTC | #18
On Thu, Mar 26, 2020 at 3:10 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Thu, Mar 26, 2020 at 08:07:09AM +0100, Jan Beulich wrote:
> > On 25.03.2020 16:47, Roger Pau Monné wrote:
> > > On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> > >> +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> > >> +{
> > >> +    unsigned int i;
> > >> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > >> +    int ret = -EINVAL;
> > >> +
> > >> +    for ( i = 0; i < cd->max_vcpus; i++ )
> > >> +    {
> > >> +        const struct vcpu *d_vcpu = d->vcpu[i];
> > >> +        struct vcpu *cd_vcpu = cd->vcpu[i];
> > >> +        struct vcpu_runstate_info runstate;
> > >> +        mfn_t vcpu_info_mfn;
> > >> +
> > >> +        if ( !d_vcpu || !cd_vcpu )
> > >> +            continue;
> > >> +
> > >> +        /*
> > >> +         * Copy & map in the vcpu_info page if the guest uses one
> > >> +         */
> > >> +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > >> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > >> +        {
> > >> +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > >> +
> > >> +            /*
> > >> +             * Allocate & map the page for it if it hasn't been already
> > >> +             */
> > >> +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> > >> +            {
> > >> +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> > >> +                unsigned long gfn_l = gfn_x(gfn);
> > >> +                struct page_info *page;
> > >> +
> > >> +                if ( !(page = alloc_domheap_page(cd, 0)) )
> > >> +                    return -ENOMEM;
> > >> +
> > >> +                new_vcpu_info_mfn = page_to_mfn(page);
> > >> +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> > >> +
> > >> +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> > >> +                                     p2m_ram_rw, p2m->default_access, -1);
> > >> +                if ( ret )
> > >> +                    return ret;
> > >> +
> > >> +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> > >> +                                    d_vcpu->vcpu_info_offset);
> > >> +                if ( ret )
> > >> +                    return ret;
> > >> +            }
> > >> +
> > >> +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> > >> +        }
> > >> +
> > >> +        /*
> > >> +         * Setup the vCPU runstate area
> > >> +         */
> > >> +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )
> > >
> > > Maybe I'm confused, but isn't this the other way around and you need
> > > to check? If the parent runstate is not null copy it to the fork,
> > > ie:
> > >
> > > if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
> > > {
> > >     ...
> > >
> > >> +        {
> > >> +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> > >> +            vcpu_runstate_get(cd_vcpu, &runstate);
> > >> +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);
> > >
> > > You should check the return code I think.
> >
> > I don't think so - this is a best effort operation just like e.g.
> > in the handling of VCPUOP_register_runstate_memory_area.
>
> I think printing a debug message might be helpful, not so much as for
> the importance of failing to copy the runstate area, but it could
> signal that something went wrong, anyway I don't have such a strong
> opinion.
>
> Just to confirm, __copy_to_guest will cause the forked domain memory
> to be populated and the whole page to be copied over right? (and will
> also cause the page tables to be added to the fork physmap in write
> mode to set the accessed/dirty bits)

I checked this and it ends up calling hvm_translate_get_page which
issues a call to get_page_from_gfn already with P2M_UNSHARE already.
The problem is that we are still in the process of forking the VM, so
mem_sharing_is_fork is not yet true, since we haven't finished the
process yet completely. So what I'll do is set the parent pointer
early which will allow memory to be populated from the parent. If
there is an error during the fork operation the fork domain is getting
destroyed by the toolstack anyway so we don't have to worry about
unwinding a half-way completed fork.

Tamas
diff mbox series

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index caf2ecad7e..11d3c2216e 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2202,6 +2202,17 @@  int domain_relinquish_resources(struct domain *d)
             ret = relinquish_shared_pages(d);
             if ( ret )
                 return ret;
+
+            /*
+             * If the domain is forked, decrement the parent's pause count
+             * and release the domain.
+             */
+            if ( mem_sharing_is_fork(d) )
+            {
+                domain_unpause(d->parent);
+                put_domain(d->parent);
+                d->parent = NULL;
+            }
         }
 #endif
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index a3d115b650..304b3d1562 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1917,7 +1917,7 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     }
 #endif
 
-    /* Spurious fault? PoD and log-dirty also take this path. */
+    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
     if ( p2m_is_ram(p2mt) )
     {
         rc = 1;
@@ -4377,7 +4377,7 @@  static int hvm_allow_get_param(struct domain *d,
     return rc;
 }
 
-static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
+int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
 {
     int rc;
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index a6d5e39b02..814d0c3253 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -321,8 +321,7 @@  static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
 }
 
 /* Return the size of the pool, rounded up to the nearest MB */
-static unsigned int
-hap_get_allocation(struct domain *d)
+unsigned int hap_get_allocation(struct domain *d)
 {
     unsigned int pg = d->arch.paging.hap.total_pages
         + d->arch.paging.hap.p2m_pages;
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 3835bc928f..23deeddff2 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -22,6 +22,7 @@ 
 
 #include <xen/types.h>
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <xen/spinlock.h>
 #include <xen/rwlock.h>
 #include <xen/mm.h>
@@ -36,6 +37,8 @@ 
 #include <asm/altp2m.h>
 #include <asm/atomic.h>
 #include <asm/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/hvm.h>
 #include <xsm/xsm.h>
 
 #include "mm-locks.h"
@@ -1444,6 +1447,334 @@  static inline int mem_sharing_control(struct domain *d, bool enable)
     return 0;
 }
 
+/*
+ * Forking a page only gets called when the VM faults due to no entry being
+ * in the EPT for the access. Depending on the type of access we either
+ * populate the physmap with a shared entry for read-only access or
+ * fork the page if its a write access.
+ *
+ * The client p2m is already locked so we only need to lock
+ * the parent's here.
+ */
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
+{
+    int rc = -ENOENT;
+    shr_handle_t handle;
+    struct domain *parent = d->parent;
+    struct p2m_domain *p2m;
+    unsigned long gfn_l = gfn_x(gfn);
+    mfn_t mfn, new_mfn;
+    p2m_type_t p2mt;
+    struct page_info *page;
+
+    if ( !mem_sharing_is_fork(d) )
+        return -ENOENT;
+
+    if ( !unsharing )
+    {
+        /* For read-only accesses we just add a shared entry to the physmap */
+        while ( parent )
+        {
+            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+                break;
+
+            parent = parent->parent;
+        }
+
+        if ( !rc )
+        {
+            /* The client's p2m is already locked */
+            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
+
+            p2m_lock(pp2m);
+            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
+            p2m_unlock(pp2m);
+
+            if ( !rc )
+                return 0;
+        }
+    }
+
+    /*
+     * If it's a write access (ie. unsharing) or if adding a shared entry to
+     * the physmap failed we'll fork the page directly.
+     */
+    p2m = p2m_get_hostp2m(d);
+    parent = d->parent;
+
+    while ( parent )
+    {
+        mfn = get_gfn_query(parent, gfn_l, &p2mt);
+
+        /*
+         * We can't fork grant memory from the parent, only regular ram.
+         */
+        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
+            break;
+
+        put_gfn(parent, gfn_l);
+        parent = parent->parent;
+    }
+
+    if ( !parent )
+        return -ENOENT;
+
+    if ( !(page = alloc_domheap_page(d, 0)) )
+    {
+        put_gfn(parent, gfn_l);
+        return -ENOMEM;
+    }
+
+    new_mfn = page_to_mfn(page);
+    copy_domain_page(new_mfn, mfn);
+    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
+
+    put_gfn(parent, gfn_l);
+
+    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                          p2m->default_access, -1);
+}
+
+static int bring_up_vcpus(struct domain *cd, struct domain *d)
+{
+    unsigned int i;
+    int ret = -EINVAL;
+
+    if ( d->max_vcpus != cd->max_vcpus ||
+        (ret = cpupool_move_domain(cd, d->cpupool)) )
+        return ret;
+
+    for ( i = 0; i < cd->max_vcpus; i++ )
+    {
+        if ( !d->vcpu[i] || cd->vcpu[i] )
+            continue;
+
+        if ( !vcpu_create(cd, i) )
+            return -EINVAL;
+    }
+
+    domain_update_node_affinity(cd);
+    return 0;
+}
+
+static int copy_vcpu_settings(struct domain *cd, struct domain *d)
+{
+    unsigned int i;
+    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
+    int ret = -EINVAL;
+
+    for ( i = 0; i < cd->max_vcpus; i++ )
+    {
+        const struct vcpu *d_vcpu = d->vcpu[i];
+        struct vcpu *cd_vcpu = cd->vcpu[i];
+        struct vcpu_runstate_info runstate;
+        mfn_t vcpu_info_mfn;
+
+        if ( !d_vcpu || !cd_vcpu )
+            continue;
+
+        /*
+         * Copy & map in the vcpu_info page if the guest uses one
+         */
+        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
+        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
+        {
+            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
+
+            /*
+             * Allocate & map the page for it if it hasn't been already
+             */
+            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
+            {
+                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
+                unsigned long gfn_l = gfn_x(gfn);
+                struct page_info *page;
+
+                if ( !(page = alloc_domheap_page(cd, 0)) )
+                    return -ENOMEM;
+
+                new_vcpu_info_mfn = page_to_mfn(page);
+                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
+
+                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
+                                     p2m_ram_rw, p2m->default_access, -1);
+                if ( ret )
+                    return ret;
+
+                ret = map_vcpu_info(cd_vcpu, gfn_l,
+                                    d_vcpu->vcpu_info_offset);
+                if ( ret )
+                    return ret;
+            }
+
+            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
+        }
+
+        /*
+         * Setup the vCPU runstate area
+         */
+        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )
+        {
+            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
+            vcpu_runstate_get(cd_vcpu, &runstate);
+            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);
+        }
+
+        /*
+         * TODO: to support VMs with PV interfaces copy additional
+         * settings here, such as PV timers.
+         */
+    }
+
+    return 0;
+}
+
+static int fork_hap_allocation(struct domain *cd, struct domain *d)
+{
+    int rc;
+    bool preempted;
+    unsigned long mb = hap_get_allocation(d);
+
+    if ( mb == hap_get_allocation(cd) )
+        return 0;
+
+    paging_lock(cd);
+    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
+    paging_unlock(cd);
+
+    return preempted ? -ERESTART : rc;
+}
+
+static void copy_tsc(struct domain *cd, struct domain *d)
+{
+    uint32_t tsc_mode;
+    uint32_t gtsc_khz;
+    uint32_t incarnation;
+    uint64_t elapsed_nsec;
+
+    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
+    /* Don't bump incarnation on set */
+    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
+}
+
+static int copy_special_pages(struct domain *cd, struct domain *d)
+{
+    mfn_t new_mfn, old_mfn;
+    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
+    static const unsigned int params[] =
+    {
+        HVM_PARAM_STORE_PFN,
+        HVM_PARAM_IOREQ_PFN,
+        HVM_PARAM_BUFIOREQ_PFN,
+        HVM_PARAM_CONSOLE_PFN
+    };
+    unsigned int i;
+    int rc;
+
+    for ( i = 0; i < 4; i++ )
+    {
+        p2m_type_t t;
+        uint64_t value = 0;
+        struct page_info *page;
+
+        if ( hvm_get_param(cd, params[i], &value) || !value )
+            continue;
+
+        old_mfn = get_gfn_query_unlocked(d, value, &t);
+        new_mfn = get_gfn_query_unlocked(cd, value, &t);
+
+        /*
+         * Allocate the page and map it in if it's not present
+         */
+        if ( mfn_eq(new_mfn, INVALID_MFN) )
+        {
+            if ( !(page = alloc_domheap_page(cd, 0)) )
+                return -ENOMEM;
+
+            new_mfn = page_to_mfn(page);
+            set_gpfn_from_mfn(mfn_x(new_mfn), value);
+
+            rc = p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
+                                p2m_ram_rw, p2m->default_access, -1);
+            if ( rc )
+                return rc;
+        }
+
+        copy_domain_page(new_mfn, old_mfn);
+    }
+
+    old_mfn = _mfn(virt_to_mfn(d->shared_info));
+    new_mfn = _mfn(virt_to_mfn(cd->shared_info));
+    copy_domain_page(new_mfn, old_mfn);
+
+    return 0;
+}
+
+static int copy_settings(struct domain *cd, struct domain *d)
+{
+    int rc;
+
+    if ( (rc = copy_vcpu_settings(cd, d)) )
+        return rc;
+
+    if ( (rc = hvm_copy_context_and_params(cd, d)) )
+        return rc;
+
+    if ( (rc = copy_special_pages(cd, d)) )
+        return rc;
+
+    copy_tsc(cd, d);
+
+    return rc;
+}
+
+static int fork(struct domain *cd, struct domain *d)
+{
+    int rc = -EBUSY;
+
+    if ( !cd->controller_pause_count )
+        return rc;
+
+    /*
+     * We only want to get and pause the parent once, not each time this
+     * operation is restarted due to preemption.
+     */
+    if ( !cd->parent_paused )
+    {
+        if ( !get_domain(d) )
+        {
+            ASSERT_UNREACHABLE();
+            return -EBUSY;
+        }
+
+        domain_pause(d);
+        cd->parent_paused = true;
+        cd->max_pages = d->max_pages;
+    }
+
+    /* this is preemptible so it's the first to get done */
+    if ( (rc = fork_hap_allocation(cd, d)) )
+        goto done;
+
+    if ( (rc = bring_up_vcpus(cd, d)) )
+        goto done;
+
+    if ( (rc = copy_settings(cd, d)) )
+        goto done;
+
+    cd->parent = d;
+
+ done:
+    if ( rc && rc != -ERESTART )
+    {
+        domain_unpause(d);
+        put_domain(d);
+        cd->parent_paused = false;
+    }
+
+    return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
     int rc;
@@ -1698,6 +2029,43 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         rc = debug_gref(d, mso.u.debug.u.gref);
         break;
 
+    case XENMEM_sharing_op_fork:
+    {
+        struct domain *pd;
+
+        rc = -EINVAL;
+        if ( mso.u.fork.pad[0] || mso.u.fork.pad[1] ||
+             mso.u.fork.pad[2] )
+            goto out;
+
+        rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
+                                               &pd);
+        if ( rc )
+            goto out;
+
+        rc = -EINVAL;
+        if ( pd->max_vcpus != d->max_vcpus )
+        {
+            rcu_unlock_domain(pd);
+            goto out;
+        }
+
+        if ( !mem_sharing_enabled(pd) && (rc = mem_sharing_control(pd, true)) )
+        {
+            rcu_unlock_domain(pd);
+            goto out;
+        }
+
+        rc = fork(d, pd);
+
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
+                                               "lh", XENMEM_sharing_op,
+                                               arg);
+        rcu_unlock_domain(pd);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 9f51370327..1ed7d13084 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -509,6 +509,12 @@  mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
 
     mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
 
+    /* Check if we need to fork the page */
+    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
+         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
+        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
+
+    /* Check if we need to unshare the page */
     if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
     {
         ASSERT(p2m_is_hostp2m(p2m));
@@ -588,7 +594,8 @@  struct page_info *p2m_get_page_from_gfn(
             return page;
 
         /* Error path: not a suitable GFN at all */
-        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
+        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
+             !mem_sharing_is_fork(p2m->domain) )
             return NULL;
     }
 
diff --git a/xen/common/domain.c b/xen/common/domain.c
index b4eb476a9c..62aed53a16 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1270,6 +1270,9 @@  int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
 
     v->vcpu_info = new_info;
     v->vcpu_info_mfn = page_to_mfn(page);
+#ifdef CONFIG_MEM_SHARING
+    v->vcpu_info_offset = offset;
+#endif
 
     /* Set new vcpu_info pointer /before/ setting pending flags. */
     smp_wmb();
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index b94bfb4ed0..1bf07e49fe 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -45,6 +45,7 @@  int   hap_track_dirty_vram(struct domain *d,
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
 int hap_set_allocation(struct domain *d, unsigned int pages, bool *preempted);
+unsigned int hap_get_allocation(struct domain *d);
 
 #endif /* XEN_HAP_H */
 
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index b007b2e343..f283c7d187 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -336,6 +336,8 @@  unsigned long hvm_cr4_guest_valid_bits(const struct domain *d, bool restore);
 
 int hvm_copy_context_and_params(struct domain *src, struct domain *dst);
 
+int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value);
+
 #ifdef CONFIG_HVM
 
 #define hvm_get_guest_tsc(v) hvm_get_guest_tsc_fixed(v, 0)
diff --git a/xen/include/asm-x86/mem_sharing.h b/xen/include/asm-x86/mem_sharing.h
index 53b7929d0e..78c3a2c343 100644
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -77,6 +77,14 @@  static inline int mem_sharing_unshare_page(struct domain *d,
     return rc;
 }
 
+static inline bool mem_sharing_is_fork(struct domain *d)
+{
+    return d->parent;
+}
+
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
+                          bool unsharing);
+
 /*
  * If called by a foreign domain, possible errors are
  *   -EBUSY -> ring full
@@ -130,6 +138,16 @@  static inline int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
     return -EOPNOTSUPP;
 }
 
+static inline bool mem_sharing_is_fork(struct domain *d)
+{
+    return false;
+}
+
+static inline int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool lock)
+{
+    return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __MEM_SHARING_H__ */
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 126d0ff06e..5ee4e0da12 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -482,6 +482,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_add_physmap       6
 #define XENMEM_sharing_op_audit             7
 #define XENMEM_sharing_op_range_share       8
+#define XENMEM_sharing_op_fork              9
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
@@ -532,6 +533,10 @@  struct xen_mem_sharing_op {
                 uint32_t gref;     /* IN: gref to debug         */
             } u;
         } debug;
+        struct mem_sharing_op_fork {      /* OP_FORK */
+            domid_t parent_domain;        /* IN: parent's domain id */
+            uint16_t pad[3];              /* Must be set to 0 */
+        } fork;
     } u;
 };
 typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index e6813288ab..881f2bb0c2 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -247,6 +247,9 @@  struct vcpu
 
     /* Guest-specified relocation of vcpu_info. */
     mfn_t            vcpu_info_mfn;
+#ifdef CONFIG_MEM_SHARING
+    unsigned short   vcpu_info_offset;
+#endif
 
     struct evtchn_fifo_vcpu *evtchn_fifo;
 
@@ -480,6 +483,8 @@  struct domain
     /* Memory sharing support */
 #ifdef CONFIG_MEM_SHARING
     struct vm_event_domain *vm_event_share;
+    struct domain *parent; /* VM fork parent */
+    bool parent_paused;
 #endif
     /* Memory paging support */
 #ifdef CONFIG_HAS_MEM_PAGING