diff mbox series

[v8,3/5] xen/mem_sharing: VM forking

Message ID 0f7d636910c45e9ca32fda4ef864a9b7d6e32745.1581362050.git.tamas.lengyel@intel.com (mailing list archive)
State Superseded
Headers show
Series VM forking | expand

Commit Message

Tamas K Lengyel Feb. 10, 2020, 7:21 p.m. UTC
VM forking is the process of creating a domain with an empty memory space and a
parent domain specified from which to populate the memory when necessary. For
the new domain to be functional the VM state is copied over as part of the fork
operation (HVM params, hap allocation, etc).

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 xen/arch/x86/domain.c             |  11 ++
 xen/arch/x86/hvm/hvm.c            |   2 +-
 xen/arch/x86/mm/mem_sharing.c     | 221 ++++++++++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c             |  11 +-
 xen/include/asm-x86/mem_sharing.h |  17 +++
 xen/include/public/memory.h       |   5 +
 xen/include/xen/sched.h           |   2 +
 7 files changed, 267 insertions(+), 2 deletions(-)

Comments

Andrew Cooper Feb. 21, 2020, 1:43 p.m. UTC | #1
On 10/02/2020 19:21, Tamas K Lengyel wrote:
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..ccf338918d 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -36,6 +37,9 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
> +#include <asm/hvm/save.h>

This include is stale, I think.

> +static void fork_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation);

Sadly, get and set are asymmetric.  For reasons best understood by the
original authors, incarnation gets automatically incremented on set,
rather than happening as part of migration where it logically lives.

As a result, you probably want to set incarnation - 1, and leave a
comment saying "Don't bump the incarnation" or similar.

~Andrew
Andrew Cooper Feb. 21, 2020, 2:02 p.m. UTC | #2
On 21/02/2020 13:43, Andrew Cooper wrote:
> On 10/02/2020 19:21, Tamas K Lengyel wrote:
>> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
>> index 3835bc928f..ccf338918d 100644
>> --- a/xen/arch/x86/mm/mem_sharing.c
>> +++ b/xen/arch/x86/mm/mem_sharing.c
>> @@ -36,6 +37,9 @@
>>  #include <asm/altp2m.h>
>>  #include <asm/atomic.h>
>>  #include <asm/event.h>
>> +#include <asm/hap.h>
>> +#include <asm/hvm/hvm.h>
>> +#include <asm/hvm/save.h>
> This include is stale, I think.
>
>> +static void fork_tsc(struct domain *cd, struct domain *d)
>> +{
>> +    uint32_t tsc_mode;
>> +    uint32_t gtsc_khz;
>> +    uint32_t incarnation;
>> +    uint64_t elapsed_nsec;
>> +
>> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
>> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation);
> Sadly, get and set are asymetric.  For reasons best understood by the
> original authors, incarnation gets automatically incremented on set,
> rather than happing as part of migration where it logically lives.
>
> As a result, you probably want to set incarnation - 1, and leave a
> comment saying "Don't bump the incarnation" or similar.

P.S. Can both be fixed on commit if you agree.  Seems pointless sending
a v9 just for these two.

~Andrew
Tamas K Lengyel Feb. 21, 2020, 2:22 p.m. UTC | #3
On Fri, Feb 21, 2020 at 7:02 AM Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> On 21/02/2020 13:43, Andrew Cooper wrote:
> > On 10/02/2020 19:21, Tamas K Lengyel wrote:
> >> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> >> index 3835bc928f..ccf338918d 100644
> >> --- a/xen/arch/x86/mm/mem_sharing.c
> >> +++ b/xen/arch/x86/mm/mem_sharing.c
> >> @@ -36,6 +37,9 @@
> >>  #include <asm/altp2m.h>
> >>  #include <asm/atomic.h>
> >>  #include <asm/event.h>
> >> +#include <asm/hap.h>
> >> +#include <asm/hvm/hvm.h>
> >> +#include <asm/hvm/save.h>
> > This include is stale, I think.
> >
> >> +static void fork_tsc(struct domain *cd, struct domain *d)
> >> +{
> >> +    uint32_t tsc_mode;
> >> +    uint32_t gtsc_khz;
> >> +    uint32_t incarnation;
> >> +    uint64_t elapsed_nsec;
> >> +
> >> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> >> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation);
> > Sadly, get and set are asymetric.  For reasons best understood by the
> > original authors, incarnation gets automatically incremented on set,
> > rather than happing as part of migration where it logically lives.
> >
> > As a result, you probably want to set incarnation - 1, and leave a
> > comment saying "Don't bump the incarnation" or similar.
>
> P.S. Can both be fixed on commit if you agree.  Seems pointless sending
> a v9 just for these two.

Great, I have no issue with these changes.

Thanks!
Tamas
Andrew Cooper Feb. 21, 2020, 2:42 p.m. UTC | #4
On 10/02/2020 19:21, Tamas K Lengyel wrote:
> +static int mem_sharing_fork(struct domain *d, struct domain *cd)
> +{
> +    int rc = -EINVAL;
> +
> +    if ( !cd->controller_pause_count )
> +        return rc;
> +
> +    /*
> +     * We only want to get and pause the parent once, not each time this
> +     * operation is restarted due to preemption.
> +     */
> +    if ( !cd->parent_paused )
> +    {
> +        ASSERT(get_domain(d));
> +        domain_pause(d);
> +
> +        cd->parent_paused = true;
> +        cd->max_pages = d->max_pages;
> +        cd->max_vcpus = d->max_vcpus;

Sorry, I spoke too soon.  You can't modify max_vcpus here, because it
violates the invariant that domain_vcpu() depends upon for safety.

If the toolstack gets things wrong, Xen will either leak struct vcpu's
on cd's teardown, or corrupt memory beyond the end of the cd->vcpu[] array.

Looking at the hypercall semantics, userspace creates a new domain
(which specifies max_cpus), then calls mem_sharing_fork(parent_dom,
new_dom);  Forking should be rejected if toolstack hasn't chosen the
same number of vcpus for the new domain.

This raises the question of whether the same should be true for
max_pages as well.

~Andrew
Tamas K Lengyel Feb. 21, 2020, 5:07 p.m. UTC | #5
On Fri, Feb 21, 2020 at 7:42 AM Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> On 10/02/2020 19:21, Tamas K Lengyel wrote:
> > +static int mem_sharing_fork(struct domain *d, struct domain *cd)
> > +{
> > +    int rc = -EINVAL;
> > +
> > +    if ( !cd->controller_pause_count )
> > +        return rc;
> > +
> > +    /*
> > +     * We only want to get and pause the parent once, not each time this
> > +     * operation is restarted due to preemption.
> > +     */
> > +    if ( !cd->parent_paused )
> > +    {
> > +        ASSERT(get_domain(d));
> > +        domain_pause(d);
> > +
> > +        cd->parent_paused = true;
> > +        cd->max_pages = d->max_pages;
> > +        cd->max_vcpus = d->max_vcpus;
>
> Sorry, I spoke too soon.  You can't modify max_vcpus here, because it
> violates the invariant that domain_vcpu() depends upon for safety.
>
> If the toolstack gets things wrong, Xen will either leak struct vcpu's
> on cd's teardown, or corrupt memory beyond the end of the cd->vcpu[] array.
>
> Looking at the hypercall semantics, userspace creates a new domain
> (which specifies max_cpus), then calls mem_sharing_fork(parent_dom,
> new_dom);  Forking should be rejected if toolstack hasn't chosen the
> same number of vcpus for the new domain.

That's unfortunate since this would require an extra hypercall just to
get information Xen already has. I think instead of what you recommend
what I'll do is extend XEN_DOMCTL_createdomain to include the parent
domain's ID already so Xen can gather these information automatically
without the toolstack having to do it this roundabout way.

>
> This raises the question of whether the same should be true for
> max_pages as well.

Could you expand on this?

Thanks,
Tamas
Andrew Cooper Feb. 21, 2020, 5:47 p.m. UTC | #6
On 21/02/2020 17:07, Tamas K Lengyel wrote:
> On Fri, Feb 21, 2020 at 7:42 AM Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 10/02/2020 19:21, Tamas K Lengyel wrote:
>>> +static int mem_sharing_fork(struct domain *d, struct domain *cd)
>>> +{
>>> +    int rc = -EINVAL;
>>> +
>>> +    if ( !cd->controller_pause_count )
>>> +        return rc;
>>> +
>>> +    /*
>>> +     * We only want to get and pause the parent once, not each time this
>>> +     * operation is restarted due to preemption.
>>> +     */
>>> +    if ( !cd->parent_paused )
>>> +    {
>>> +        ASSERT(get_domain(d));
>>> +        domain_pause(d);
>>> +
>>> +        cd->parent_paused = true;
>>> +        cd->max_pages = d->max_pages;
>>> +        cd->max_vcpus = d->max_vcpus;
>> Sorry, I spoke too soon.  You can't modify max_vcpus here, because it
>> violates the invariant that domain_vcpu() depends upon for safety.
>>
>> If the toolstack gets things wrong, Xen will either leak struct vcpu's
>> on cd's teardown, or corrupt memory beyond the end of the cd->vcpu[] array.
>>
>> Looking at the hypercall semantics, userspace creates a new domain
>> (which specifies max_cpus), then calls mem_sharing_fork(parent_dom,
>> new_dom);  Forking should be rejected if toolstack hasn't chosen the
>> same number of vcpus for the new domain.
> That's unfortunate since this would require an extra hypercall just to
> get information Xen already has. I think instead of what you recommend
> what I'll do is extend XEN_DOMCTL_createdomain to include the parent
> domain's ID already so Xen can gather these information automatically
> without the toolstack having to do it this roundabout way.

Conceptually, I think that is cleaner.  You really do want to duplicate
the parent domain, whatever its settings are.

However, I'd suggest not extending createdomain.  What should the
semantics be for such a call which passes conflicting settings?

How about a new top level domctl for clone_domain, taking a parent
domain identifier, and optionally providing a specific new domid?  This
way, it is impossible to provide conflicting settings, and the semantics
should be quite clear.  Internally, Xen can do whatever it needs to copy
appropriate settings, and get things sorted before the domain becomes
globally visible.

One question needing considering is whether such a hypercall could in
theory be useful without the mem_sharing infrastructure, or not.  (i.e.
how tightly integrated we should aim for.)

>> This raises the question of whether the same should be true for
>> max_pages as well.
> Could you expand on this?

Well - these two are a very odd subset of things to blindly copy.  The
max_cpus side is wrong, which makes max_pages likely to be wrong as well.

~Andrew
Tamas K Lengyel Feb. 21, 2020, 5:56 p.m. UTC | #7
On Fri, Feb 21, 2020 at 10:47 AM Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>
> On 21/02/2020 17:07, Tamas K Lengyel wrote:
> > On Fri, Feb 21, 2020 at 7:42 AM Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> >> On 10/02/2020 19:21, Tamas K Lengyel wrote:
> >>> +static int mem_sharing_fork(struct domain *d, struct domain *cd)
> >>> +{
> >>> +    int rc = -EINVAL;
> >>> +
> >>> +    if ( !cd->controller_pause_count )
> >>> +        return rc;
> >>> +
> >>> +    /*
> >>> +     * We only want to get and pause the parent once, not each time this
> >>> +     * operation is restarted due to preemption.
> >>> +     */
> >>> +    if ( !cd->parent_paused )
> >>> +    {
> >>> +        ASSERT(get_domain(d));
> >>> +        domain_pause(d);
> >>> +
> >>> +        cd->parent_paused = true;
> >>> +        cd->max_pages = d->max_pages;
> >>> +        cd->max_vcpus = d->max_vcpus;
> >> Sorry, I spoke too soon.  You can't modify max_vcpus here, because it
> >> violates the invariant that domain_vcpu() depends upon for safety.
> >>
> >> If the toolstack gets things wrong, Xen will either leak struct vcpu's
> >> on cd's teardown, or corrupt memory beyond the end of the cd->vcpu[] array.
> >>
> >> Looking at the hypercall semantics, userspace creates a new domain
> >> (which specifies max_cpus), then calls mem_sharing_fork(parent_dom,
> >> new_dom);  Forking should be rejected if toolstack hasn't chosen the
> >> same number of vcpus for the new domain.
> > That's unfortunate since this would require an extra hypercall just to
> > get information Xen already has. I think instead of what you recommend
> > what I'll do is extend XEN_DOMCTL_createdomain to include the parent
> > domain's ID already so Xen can gather these information automatically
> > without the toolstack having to do it this roundabout way.
>
> Conceptually, I think that is cleaner.  You really do want to duplicate
> the parent domain, whatever its settings are.
>
> However, I'd suggest not extending createdomain.  What should the
> semantics be for such a call which passes conflicting settings?

I'm not sure what conflicting settings you worry about? I simply add a
field to xen_domctl_createdomain called parent_domain, then in the
domctl handler if its set we copy the max_vcpus value directly from
there:

        parent_dom = op->u.createdomain.parent_domid;
        if ( parent_dom )
        {
            struct domain *pd = rcu_lock_domain_by_id(parent_dom);

            ret = -EINVAL;
            if ( !pd )
                break;

            op->u.createdomain.max_vcpus = pd->max_vcpus;
            rcu_unlock_domain(pd);
        }

>
> How about a new top level domctl for clone_domain, taking a parent
> domain identifier, and optionally providing a specific new domid?  This
> way, it is impossible to provide conflicting settings, and the semantics
> should be quite clear.  Internally, Xen can do whatever it needs to copy
> appropriate settings, and get things sorted before the domain becomes
> globally visible.

I already have a hypercall added for fork_domain and I even tried
doing everything in a single hypercall. The problem I ran into with
that was it required a lot of refactoring within Xen since
domain_create is not preemptible right now, while some other parts of
forking need to be preemptible. So it was just easier to do it with 2
hypercalls. One just creates the domain shell via
XEN_DOMCTL_createdomain and the second actually sets it up as a fork.

>
> One question needing considering is whether such a hypercall could in
> theory be useful without the mem_sharing infrastructure, or not.  (i.e.
> how tightly integrated we should aim for.)
>
> >> This raises the question of whether the same should be true for
> >> max_pages as well.
> > Could you expand on this?
>
> Well - these two are a very odd subset of things to blindly copy.  The
> max_cpus side is wrong, which makes max_pages likely to be wrong as well.

The max_cpus side is clearer why it's wrong since there is an
allocation during domain_create based on that number. I haven't run
into that issue it seems because all the domains I tested with had
only a single vCPU. But max_pages should be safe to copy, since any
page that would get accessed up to max_pages would get forked from the
parent. I haven't run into any issues with that. So I don't really see
what's the problem there.

Tamas
Tamas K Lengyel Feb. 21, 2020, 6:08 p.m. UTC | #8
> One question needing considering is whether such a hypercall could in
> theory be useful without the mem_sharing infrastructure, or not.  (i.e.
> how tightly integrated we should aim for.)

It would be possible to create domain forks without mem_sharing. The
mem_sharing part just adds an extra optimization on top so we don't
end up copying all accessed pages needlessly, we only do that when the
page is written to. In one of the earlier revisions of the series
(~v4) due to a bug we actually ran the system with all pages being
deduplicated and no mem_sharing and it was working just fine. I'm not
sure if that's what you were asking though :)

Tamas
diff mbox series

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f53ae5ff86..a98e2e0479 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2189,6 +2189,17 @@  int domain_relinquish_resources(struct domain *d)
             ret = relinquish_shared_pages(d);
             if ( ret )
                 return ret;
+
+            /*
+             * If the domain is forked, decrement the parent's pause count
+             * and release the domain.
+             */
+            if ( d->parent )
+            {
+                domain_unpause(d->parent);
+                put_domain(d->parent);
+                d->parent = NULL;
+            }
         }
 #endif
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 00a9e70b7c..55520bbd23 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1915,7 +1915,7 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     }
 #endif
 
-    /* Spurious fault? PoD and log-dirty also take this path. */
+    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
     if ( p2m_is_ram(p2mt) )
     {
         rc = 1;
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 3835bc928f..ccf338918d 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -22,6 +22,7 @@ 
 
 #include <xen/types.h>
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <xen/spinlock.h>
 #include <xen/rwlock.h>
 #include <xen/mm.h>
@@ -36,6 +37,9 @@ 
 #include <asm/altp2m.h>
 #include <asm/atomic.h>
 #include <asm/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/save.h>
 #include <xsm/xsm.h>
 
 #include "mm-locks.h"
@@ -1444,6 +1448,193 @@  static inline int mem_sharing_control(struct domain *d, bool enable)
     return 0;
 }
 
+/*
+ * Forking a page only gets called when the VM faults due to no entry being
+ * in the EPT for the access. Depending on the type of access we either
+ * populate the physmap with a shared entry for read-only access or
+ * fork the page if its a write access.
+ *
+ * The client p2m is already locked so we only need to lock
+ * the parent's here.
+ */
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
+{
+    int rc = -ENOENT;
+    shr_handle_t handle;
+    struct domain *parent;
+    struct p2m_domain *p2m;
+    unsigned long gfn_l = gfn_x(gfn);
+    mfn_t mfn, new_mfn;
+    p2m_type_t p2mt;
+    struct page_info *page;
+
+    if ( !mem_sharing_is_fork(d) )
+        return -ENOENT;
+
+    parent = d->parent;
+
+    if ( !unsharing )
+    {
+        /* For read-only accesses we just add a shared entry to the physmap */
+        while ( parent )
+        {
+            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+                break;
+
+            parent = parent->parent;
+        }
+
+        if ( !rc )
+        {
+            /* The client's p2m is already locked */
+            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
+
+            p2m_lock(pp2m);
+            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
+            p2m_unlock(pp2m);
+
+            if ( !rc )
+                return 0;
+        }
+    }
+
+    /*
+     * If it's a write access (ie. unsharing) or if adding a shared entry to
+     * the physmap failed we'll fork the page directly.
+     */
+    p2m = p2m_get_hostp2m(d);
+    parent = d->parent;
+
+    while ( parent )
+    {
+        mfn = get_gfn_query(parent, gfn_l, &p2mt);
+
+        if ( mfn_valid(mfn) && p2m_is_any_ram(p2mt) )
+            break;
+
+        put_gfn(parent, gfn_l);
+        parent = parent->parent;
+    }
+
+    if ( !parent )
+        return -ENOENT;
+
+    if ( !(page = alloc_domheap_page(d, 0)) )
+    {
+        put_gfn(parent, gfn_l);
+        return -ENOMEM;
+    }
+
+    new_mfn = page_to_mfn(page);
+    copy_domain_page(new_mfn, mfn);
+    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
+
+    put_gfn(parent, gfn_l);
+
+    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                          p2m->default_access, -1);
+}
+
+static int bring_up_vcpus(struct domain *cd, struct cpupool *cpupool)
+{
+    int ret;
+    unsigned int i;
+
+    if ( (ret = cpupool_move_domain(cd, cpupool)) )
+        return ret;
+
+    for ( i = 0; i < cd->max_vcpus; i++ )
+    {
+        if ( cd->vcpu[i] )
+            continue;
+
+        if ( !vcpu_create(cd, i) )
+            return -EINVAL;
+    }
+
+    domain_update_node_affinity(cd);
+    return 0;
+}
+
+static int fork_hap_allocation(struct domain *cd, struct domain *d)
+{
+    int rc;
+    bool preempted;
+    unsigned long mb = hap_get_allocation(d);
+
+    if ( mb == hap_get_allocation(cd) )
+        return 0;
+
+    paging_lock(cd);
+    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
+    paging_unlock(cd);
+
+    if ( rc )
+        return rc;
+
+    if ( preempted )
+        return -ERESTART;
+
+    return 0;
+}
+
+static void fork_tsc(struct domain *cd, struct domain *d)
+{
+    uint32_t tsc_mode;
+    uint32_t gtsc_khz;
+    uint32_t incarnation;
+    uint64_t elapsed_nsec;
+
+    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
+    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation);
+}
+
+static int mem_sharing_fork(struct domain *d, struct domain *cd)
+{
+    int rc = -EINVAL;
+
+    if ( !cd->controller_pause_count )
+        return rc;
+
+    /*
+     * We only want to get and pause the parent once, not each time this
+     * operation is restarted due to preemption.
+     */
+    if ( !cd->parent_paused )
+    {
+        ASSERT(get_domain(d));
+        domain_pause(d);
+
+        cd->parent_paused = true;
+        cd->max_pages = d->max_pages;
+        cd->max_vcpus = d->max_vcpus;
+    }
+
+    /* this is preemptible so it's the first to get done */
+    if ( (rc = fork_hap_allocation(cd, d)) )
+        goto done;
+
+    if ( (rc = bring_up_vcpus(cd, d->cpupool)) )
+        goto done;
+
+    if ( (rc = hvm_copy_context_and_params(cd, d)) )
+        goto done;
+
+    fork_tsc(cd, d);
+
+    cd->parent = d;
+
+ done:
+    if ( rc && rc != -ERESTART )
+    {
+        domain_unpause(d);
+        put_domain(d);
+        cd->parent_paused = false;
+    }
+
+    return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
     int rc;
@@ -1698,6 +1889,36 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         rc = debug_gref(d, mso.u.debug.u.gref);
         break;
 
+    case XENMEM_sharing_op_fork:
+    {
+        struct domain *pd;
+
+        rc = -EINVAL;
+        if ( mso.u.fork._pad[0] || mso.u.fork._pad[1] ||
+             mso.u.fork._pad[2] )
+            goto out;
+
+        rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
+                                               &pd);
+        if ( rc )
+            goto out;
+
+        if ( !mem_sharing_enabled(pd) )
+        {
+            if ( (rc = mem_sharing_control(pd, true)) )
+                goto out;
+        }
+
+        rc = mem_sharing_fork(pd, d);
+
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
+                                               "lh", XENMEM_sharing_op,
+                                               arg);
+        rcu_unlock_domain(pd);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 2c0bb7e869..72b4485970 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -509,6 +509,14 @@  mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
 
     mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
 
+    /* Check if we need to fork the page */
+    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
+         !mem_sharing_fork_page(p2m->domain, gfn, !!(q & P2M_UNSHARE)) )
+    {
+        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
+    }
+
+    /* Check if we need to unshare the page */
     if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
     {
         ASSERT(p2m_is_hostp2m(p2m));
@@ -587,7 +595,8 @@  struct page_info *p2m_get_page_from_gfn(
             return page;
 
         /* Error path: not a suitable GFN at all */
-        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
+        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
+             !mem_sharing_is_fork(p2m->domain) )
             return NULL;
     }
 
diff --git a/xen/include/asm-x86/mem_sharing.h b/xen/include/asm-x86/mem_sharing.h
index 53760a2896..ac968fae3f 100644
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -39,6 +39,9 @@  struct mem_sharing_domain
 
 #define mem_sharing_enabled(d) ((d)->arch.hvm.mem_sharing.enabled)
 
+#define mem_sharing_is_fork(d) \
+    (mem_sharing_enabled(d) && !!((d)->parent))
+
 /* Auditing of memory sharing code? */
 #ifndef NDEBUG
 #define MEM_SHARING_AUDIT 1
@@ -88,6 +91,9 @@  static inline int mem_sharing_unshare_page(struct domain *d,
     return rc;
 }
 
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
+                          bool unsharing);
+
 /*
  * If called by a foreign domain, possible errors are
  *   -EBUSY -> ring full
@@ -117,6 +123,7 @@  int relinquish_shared_pages(struct domain *d);
 #else
 
 #define mem_sharing_enabled(d) false
+#define mem_sharing_is_fork(p2m) false
 
 static inline unsigned int mem_sharing_get_nr_saved_mfns(void)
 {
@@ -141,6 +148,16 @@  static inline int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
     return -EOPNOTSUPP;
 }
 
+static inline int mem_sharing_fork(struct domain *d, struct domain *cd, bool vcpu)
+{
+    return -EOPNOTSUPP;
+}
+
+static inline int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool lock)
+{
+    return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __MEM_SHARING_H__ */
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index cfdda6e2a8..90a3f4498e 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -482,6 +482,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_add_physmap       6
 #define XENMEM_sharing_op_audit             7
 #define XENMEM_sharing_op_range_share       8
+#define XENMEM_sharing_op_fork              9
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
@@ -532,6 +533,10 @@  struct xen_mem_sharing_op {
                 uint32_t gref;     /* IN: gref to debug         */
             } u;
         } debug;
+        struct mem_sharing_op_fork {
+            domid_t parent_domain;
+            uint16_t _pad[3];                /* Must be set to 0 */
+        } fork;
     } u;
 };
 typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 7c5c437247..8ed727e10c 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -507,6 +507,8 @@  struct domain
     /* Memory sharing support */
 #ifdef CONFIG_MEM_SHARING
     struct vm_event_domain *vm_event_share;
+    struct domain *parent; /* VM fork parent */
+    bool parent_paused;
 #endif
     /* Memory paging support */
 #ifdef CONFIG_HAS_MEM_PAGING