diff mbox series

[v5,15/18] xen/mem_sharing: VM forking

Message ID d2a7c532b8ff412cdda102d81c114e0c642dfcbc.1579628566.git.tamas.lengyel@intel.com (mailing list archive)
State Superseded
Headers show
Series VM forking | expand

Commit Message

Tamas K Lengyel Jan. 21, 2020, 5:49 p.m. UTC
VM forking is the process of creating a domain with an empty memory space and a
parent domain specified from which to populate the memory when necessary. For
the new domain to be functional the VM state is copied over as part of the fork
operation (HVM params, hap allocation, etc).

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 xen/arch/x86/domain.c             |   9 ++
 xen/arch/x86/hvm/hvm.c            |   2 +-
 xen/arch/x86/mm/mem_sharing.c     | 220 ++++++++++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c             |  11 +-
 xen/include/asm-x86/mem_sharing.h |  20 ++-
 xen/include/public/memory.h       |   5 +
 xen/include/xen/sched.h           |   2 +
 7 files changed, 265 insertions(+), 4 deletions(-)

Comments

George Dunlap Jan. 23, 2020, 5:21 p.m. UTC | #1
On 1/21/20 5:49 PM, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space and a
> parent domain specified from which to populate the memory when necessary. For
> the new domain to be functional the VM state is copied over as part of the fork
> operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>

Overall this looks really good.  Just a few questions.

> ---
>  xen/arch/x86/domain.c             |   9 ++
>  xen/arch/x86/hvm/hvm.c            |   2 +-
>  xen/arch/x86/mm/mem_sharing.c     | 220 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |  11 +-
>  xen/include/asm-x86/mem_sharing.h |  20 ++-
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   2 +
>  7 files changed, 265 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 28fefa1f81..953abcc1fc 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2198,6 +2198,15 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count.
> +             */
> +            if ( d->parent )
> +            {
> +                domain_unpause(d->parent);
> +                d->parent = NULL;

Did this want to be `if ( d->parent_paused )`?

> +static int bring_up_vcpus(struct domain *cd, struct cpupool *cpupool)
> +{
> +    int ret;
> +    unsigned int i;
> +
> +    if ( (ret = cpupool_move_domain(cd, cpupool)) )
> +        return ret;
> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        if ( cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;

You're not copying the contents of the vcpu registers or anything here
-- is that something you're leaving to your dom0 agent?

> +static int mem_sharing_fork(struct domain *d, struct domain *cd)
> +{
> +    int rc = -EINVAL;
> +
> +    if ( !cd->controller_pause_count )
> +        return rc;
> +
> +    /*
> +     * We only want to pause the parent once, not each time this
> +     * operation is restarted due to preemption.
> +     */
> +    if ( !cd->parent_paused )
> +    {
> +        domain_pause(d);
> +        cd->parent_paused = true;
> +    }
> +
> +    cd->max_pages = d->max_pages;
> +    cd->max_vcpus = d->max_vcpus;
> +
> +    /* this is preemptible so it's the first to get done */
> +    if ( (rc = fork_hap_allocation(d, cd)) )
> +        goto done;
> +
> +    if ( (rc = bring_up_vcpus(cd, d->cpupool)) )
> +        goto done;
> +
> +    if ( (rc = hvm_copy_context_and_params(d, cd)) )
> +        goto done;
> +
> +    fork_tsc(d, cd);
> +
> +    cd->parent = d;

What happens if the parent dies?

It seems like we might want to do get_domain(parent) here, and
put_domain(parent) in domain_destroy.

I'll probably need to come back to this, at which point I may have more
questions.

 -George
Tamas K Lengyel Jan. 23, 2020, 5:30 p.m. UTC | #2
On Thu, Jan 23, 2020 at 10:21 AM George Dunlap <george.dunlap@citrix.com> wrote:
>
> On 1/21/20 5:49 PM, Tamas K Lengyel wrote:
> > VM forking is the process of creating a domain with an empty memory space and a
> > parent domain specified from which to populate the memory when necessary. For
> > the new domain to be functional the VM state is copied over as part of the fork
> > operation (HVM params, hap allocation, etc).
> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
>
> Overall this looks really good.  Just a few questions.
>
> > ---
> >  xen/arch/x86/domain.c             |   9 ++
> >  xen/arch/x86/hvm/hvm.c            |   2 +-
> >  xen/arch/x86/mm/mem_sharing.c     | 220 ++++++++++++++++++++++++++++++
> >  xen/arch/x86/mm/p2m.c             |  11 +-
> >  xen/include/asm-x86/mem_sharing.h |  20 ++-
> >  xen/include/public/memory.h       |   5 +
> >  xen/include/xen/sched.h           |   2 +
> >  7 files changed, 265 insertions(+), 4 deletions(-)
> >
> > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> > index 28fefa1f81..953abcc1fc 100644
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -2198,6 +2198,15 @@ int domain_relinquish_resources(struct domain *d)
> >              ret = relinquish_shared_pages(d);
> >              if ( ret )
> >                  return ret;
> > +
> > +            /*
> > +             * If the domain is forked, decrement the parent's pause count.
> > +             */
> > +            if ( d->parent )
> > +            {
> > +                domain_unpause(d->parent);
> > +                d->parent = NULL;
>
> Did this want to be `if ( d->parent_paused )`?

If the domain has the parent pointer set, it's guaranteed that the
parent is paused. It's paused before the parent pointer is set during
the fork hypercall.

>
> > +static int bring_up_vcpus(struct domain *cd, struct cpupool *cpupool)
> > +{
> > +    int ret;
> > +    unsigned int i;
> > +
> > +    if ( (ret = cpupool_move_domain(cd, cpupool)) )
> > +        return ret;
> > +
> > +    for ( i = 0; i < cd->max_vcpus; i++ )
> > +    {
> > +        if ( cd->vcpu[i] )
> > +            continue;
> > +
> > +        if ( !vcpu_create(cd, i) )
> > +            return -EINVAL;
>
> You're not copying the contents of the vcpu registers or anything here
> -- is that something you're leaving to your dom0 agent?

The registers are being copied as part of the HVM contexts.

>
> > +static int mem_sharing_fork(struct domain *d, struct domain *cd)
> > +{
> > +    int rc = -EINVAL;
> > +
> > +    if ( !cd->controller_pause_count )
> > +        return rc;
> > +
> > +    /*
> > +     * We only want to pause the parent once, not each time this
> > +     * operation is restarted due to preemption.
> > +     */
> > +    if ( !cd->parent_paused )
> > +    {
> > +        domain_pause(d);
> > +        cd->parent_paused = true;
> > +    }
> > +
> > +    cd->max_pages = d->max_pages;
> > +    cd->max_vcpus = d->max_vcpus;
> > +
> > +    /* this is preemptible so it's the first to get done */
> > +    if ( (rc = fork_hap_allocation(d, cd)) )
> > +        goto done;
> > +
> > +    if ( (rc = bring_up_vcpus(cd, d->cpupool)) )
> > +        goto done;
> > +
> > +    if ( (rc = hvm_copy_context_and_params(d, cd)) )
> > +        goto done;
> > +
> > +    fork_tsc(d, cd);
> > +
> > +    cd->parent = d;
>
> What happens if the parent dies?
>
> It seems like we might want to do get_domain(parent) here, and
> put_domain(parent) in domain_destroy.

If forks are still active when someone destroys the parent than Xen
will crash I assume. Right now it's a requirement that the parent
remains in existence - and it's paused - while there are forks active.
We enforce the pause state but making the parent undestroyable is not
implemented right now. We just trust that the user of this
experimental system won't do that.

But yes, doing the get_domain()/put_domain() dance would be an easy
way to do that. Will add that and then we don't have to worry about
the parent getting pulled from under our feat.

>
> I'll probably need to come back to this, at which point I may have more
> questions.

Thanks!
Tamas
diff mbox series

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 28fefa1f81..953abcc1fc 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2198,6 +2198,15 @@  int domain_relinquish_resources(struct domain *d)
             ret = relinquish_shared_pages(d);
             if ( ret )
                 return ret;
+
+            /*
+             * If the domain is forked, decrement the parent's pause count.
+             */
+            if ( d->parent )
+            {
+                domain_unpause(d->parent);
+                d->parent = NULL;
+            }
         }
 #endif
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index e60b4931bf..6012b88845 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1906,7 +1906,7 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     }
 #endif
 
-    /* Spurious fault? PoD and log-dirty also take this path. */
+    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
     if ( p2m_is_ram(p2mt) )
     {
         rc = 1;
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index e3ddb63b4f..749305417c 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -22,11 +22,13 @@ 
 
 #include <xen/types.h>
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <xen/spinlock.h>
 #include <xen/rwlock.h>
 #include <xen/mm.h>
 #include <xen/grant_table.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/rcupdate.h>
 #include <xen/guest_access.h>
 #include <xen/vm_event.h>
@@ -36,6 +38,9 @@ 
 #include <asm/altp2m.h>
 #include <asm/atomic.h>
 #include <asm/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/save.h>
 #include <xsm/xsm.h>
 
 #include "mm-locks.h"
@@ -1421,6 +1426,191 @@  static inline int mem_sharing_control(struct domain *d, bool enable)
     return 0;
 }
 
+/*
+ * Forking a page only gets called when the VM faults due to no entry being
+ * in the EPT for the access. Depending on the type of access we either
+ * populate the physmap with a shared entry for read-only access or
+ * fork the page if its a write access.
+ *
+ * The client p2m is already locked so we only need to lock
+ * the parent's here.
+ */
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
+{
+    int rc = -ENOENT;
+    shr_handle_t handle;
+    struct domain *parent;
+    struct p2m_domain *p2m;
+    unsigned long gfn_l = gfn_x(gfn);
+    mfn_t mfn, new_mfn;
+    p2m_type_t p2mt;
+    struct page_info *page;
+
+    if ( !mem_sharing_is_fork(d) )
+        return -ENOENT;
+
+    parent = d->parent;
+
+    if ( !unsharing )
+    {
+        /* For read-only accesses we just add a shared entry to the physmap */
+        while ( parent )
+        {
+            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+                break;
+
+            parent = parent->parent;
+        }
+
+        if ( !rc )
+        {
+            /* The client's p2m is already locked */
+            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
+
+            p2m_lock(pp2m);
+            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
+            p2m_unlock(pp2m);
+
+            if ( !rc )
+                return 0;
+        }
+    }
+
+    /*
+     * If it's a write access (ie. unsharing) or if adding a shared entry to
+     * the physmap failed we'll fork the page directly.
+     */
+    p2m = p2m_get_hostp2m(d);
+    parent = d->parent;
+
+    while ( parent )
+    {
+        mfn = get_gfn_query(parent, gfn_l, &p2mt);
+
+        if ( mfn_valid(mfn) && p2m_is_any_ram(p2mt) )
+            break;
+
+        put_gfn(parent, gfn_l);
+        parent = parent->parent;
+    }
+
+    if ( !parent )
+        return -ENOENT;
+
+    if ( !(page = alloc_domheap_page(d, 0)) )
+    {
+        put_gfn(parent, gfn_l);
+        return -ENOMEM;
+    }
+
+    new_mfn = page_to_mfn(page);
+    copy_domain_page(new_mfn, mfn);
+    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
+
+    put_gfn(parent, gfn_l);
+
+    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                          p2m->default_access, -1);
+}
+
+static int bring_up_vcpus(struct domain *cd, struct cpupool *cpupool)
+{
+    int ret;
+    unsigned int i;
+
+    if ( (ret = cpupool_move_domain(cd, cpupool)) )
+        return ret;
+
+    for ( i = 0; i < cd->max_vcpus; i++ )
+    {
+        if ( cd->vcpu[i] )
+            continue;
+
+        if ( !vcpu_create(cd, i) )
+            return -EINVAL;
+    }
+
+    domain_update_node_affinity(cd);
+    return 0;
+}
+
+static int fork_hap_allocation(struct domain *d, struct domain *cd)
+{
+    int rc;
+    bool preempted;
+    unsigned long mb = hap_get_allocation(d);
+
+    if ( mb == hap_get_allocation(cd) )
+        return 0;
+
+    paging_lock(cd);
+    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
+    paging_unlock(cd);
+
+    if ( rc )
+        return rc;
+
+    if ( preempted )
+        return -ERESTART;
+
+    return 0;
+}
+
+static void fork_tsc(struct domain *d, struct domain *cd)
+{
+    uint32_t tsc_mode;
+    uint32_t gtsc_khz;
+    uint32_t incarnation;
+    uint64_t elapsed_nsec;
+
+    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
+    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation);
+}
+
+static int mem_sharing_fork(struct domain *d, struct domain *cd)
+{
+    int rc = -EINVAL;
+
+    if ( !cd->controller_pause_count )
+        return rc;
+
+    /*
+     * We only want to pause the parent once, not each time this
+     * operation is restarted due to preemption.
+     */
+    if ( !cd->parent_paused )
+    {
+        domain_pause(d);
+        cd->parent_paused = true;
+    }
+
+    cd->max_pages = d->max_pages;
+    cd->max_vcpus = d->max_vcpus;
+
+    /* this is preemptible so it's the first to get done */
+    if ( (rc = fork_hap_allocation(d, cd)) )
+        goto done;
+
+    if ( (rc = bring_up_vcpus(cd, d->cpupool)) )
+        goto done;
+
+    if ( (rc = hvm_copy_context_and_params(d, cd)) )
+        goto done;
+
+    fork_tsc(d, cd);
+
+    cd->parent = d;
+
+ done:
+    if ( rc && rc != -ERESTART )
+    {
+        domain_unpause(d);
+        cd->parent_paused = false;
+    }
+
+    return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
     int rc;
@@ -1675,6 +1865,36 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         rc = debug_gref(d, mso.u.debug.u.gref);
         break;
 
+    case XENMEM_sharing_op_fork:
+    {
+        struct domain *pd;
+
+        rc = -EINVAL;
+        if ( mso.u.fork._pad[0] || mso.u.fork._pad[1] ||
+             mso.u.fork._pad[2] )
+            goto out;
+
+        rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
+                                               &pd);
+        if ( rc )
+            goto out;
+
+        if ( !mem_sharing_enabled(pd) )
+        {
+            if ( (rc = mem_sharing_control(pd, true)) )
+                goto out;
+        }
+
+        rc = mem_sharing_fork(pd, d);
+
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
+                                               "lh", XENMEM_sharing_op,
+                                               arg);
+        rcu_unlock_domain(pd);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 43dd46a32f..c73f5aefbb 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -508,6 +508,14 @@  mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
 
     mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
 
+    /* Check if we need to fork the page */
+    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
+         !mem_sharing_fork_page(p2m->domain, gfn, !!(q & P2M_UNSHARE)) )
+    {
+        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
+    }
+
+    /* Check if we need to unshare the page */
     if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
     {
         ASSERT(p2m_is_hostp2m(p2m));
@@ -585,7 +593,8 @@  struct page_info *p2m_get_page_from_gfn(
             return page;
 
         /* Error path: not a suitable GFN at all */
-        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
+        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
+             !mem_sharing_is_fork(p2m->domain) )
             return NULL;
     }
 
diff --git a/xen/include/asm-x86/mem_sharing.h b/xen/include/asm-x86/mem_sharing.h
index 53760a2896..812171284f 100644
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -26,8 +26,7 @@ 
 
 #ifdef CONFIG_MEM_SHARING
 
-struct mem_sharing_domain
-{
+struct mem_sharing_domain {
     bool enabled;
 
     /*
@@ -39,6 +38,9 @@  struct mem_sharing_domain
 
 #define mem_sharing_enabled(d) ((d)->arch.hvm.mem_sharing.enabled)
 
+#define mem_sharing_is_fork(d) \
+    (mem_sharing_enabled(d) && !!((d)->parent))
+
 /* Auditing of memory sharing code? */
 #ifndef NDEBUG
 #define MEM_SHARING_AUDIT 1
@@ -88,6 +90,9 @@  static inline int mem_sharing_unshare_page(struct domain *d,
     return rc;
 }
 
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
+                          bool unsharing);
+
 /*
  * If called by a foreign domain, possible errors are
  *   -EBUSY -> ring full
@@ -117,6 +122,7 @@  int relinquish_shared_pages(struct domain *d);
 #else
 
 #define mem_sharing_enabled(d) false
+#define mem_sharing_is_fork(p2m) false
 
 static inline unsigned int mem_sharing_get_nr_saved_mfns(void)
 {
@@ -141,6 +147,16 @@  static inline int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
     return -EOPNOTSUPP;
 }
 
+static inline int mem_sharing_fork(struct domain *d, struct domain *cd, bool vcpu)
+{
+    return -EOPNOTSUPP;
+}
+
+static inline int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool lock)
+{
+    return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __MEM_SHARING_H__ */
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index cfdda6e2a8..90a3f4498e 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -482,6 +482,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_add_physmap       6
 #define XENMEM_sharing_op_audit             7
 #define XENMEM_sharing_op_range_share       8
+#define XENMEM_sharing_op_fork              9
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
@@ -532,6 +533,10 @@  struct xen_mem_sharing_op {
                 uint32_t gref;     /* IN: gref to debug         */
             } u;
         } debug;
+        struct mem_sharing_op_fork {
+            domid_t parent_domain;
+            uint16_t _pad[3];                /* Must be set to 0 */
+        } fork;
     } u;
 };
 typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index cc942a3621..67d44e219c 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -504,6 +504,8 @@  struct domain
     /* Memory sharing support */
 #ifdef CONFIG_MEM_SHARING
     struct vm_event_domain *vm_event_share;
+    struct domain *parent; /* VM fork parent */
+    bool parent_paused;
 #endif
     /* Memory paging support */
 #ifdef CONFIG_HAS_MEM_PAGING