diff mbox series

[1/3] x86/mem_sharing: option to skip populating special pages during fork

Message ID fb927228a8f68ce983ae0b46e6665b5b8dd0764e.1647970630.git.tamas.lengyel@intel.com (mailing list archive)
State New, archived
Headers show
Series [1/3] x86/mem_sharing: option to skip populating special pages during fork | expand

Commit Message

Tamas K Lengyel March 22, 2022, 5:41 p.m. UTC
Add option to the fork memop to skip populating the fork with special pages.
These special pages are only necessary when setting up forks to be fully
functional with a toolstack. For short-lived forks where no toolstack is active
these pages are uneccesary.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
 xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
 xen/include/public/memory.h           |  4 ++--
 3 files changed, 26 insertions(+), 15 deletions(-)

Comments

Roger Pau Monné March 24, 2022, 2:50 p.m. UTC | #1
On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> Add option to the fork memop to skip populating the fork with special pages.
> These special pages are only necessary when setting up forks to be fully
> functional with a toolstack. For short-lived forks where no toolstack is active
> these pages are uneccesary.

I'm not sure those are strictly related to having a toolstack. For
example the vcpu_info has nothing to do with having a toolstack, and
is only used by the guest in order to receive events or fetch time
related data. So while a short-lived fork might not make use of those,
that has nothing to do with having a toolstack or not.

> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> ---
>  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
>  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
>  xen/include/public/memory.h           |  4 ++--
>  3 files changed, 26 insertions(+), 15 deletions(-)
> 
> diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> index 698455444e..446cd06411 100644
> --- a/xen/arch/x86/include/asm/hvm/domain.h
> +++ b/xen/arch/x86/include/asm/hvm/domain.h
> @@ -31,7 +31,9 @@
>  #ifdef CONFIG_MEM_SHARING
>  struct mem_sharing_domain
>  {
> -    bool enabled, block_interrupts;
> +    bool enabled;
> +    bool block_interrupts;
> +    bool skip_special_pages;
>  
>      /*
>       * When releasing shared gfn's in a preemptible manner, recall where
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 15e6a7ed81..84c04ddfa3 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
>      return 0;
>  }
>  
> -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> +                              bool skip_special_pages)
>  {
>      unsigned int i;
>      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
>  
>          /* Copy & map in the vcpu_info page if the guest uses one */
>          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
>          {
>              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
>  
> @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
>      return 0;
>  }
>  
> -static int copy_settings(struct domain *cd, struct domain *d)
> +static int copy_settings(struct domain *cd, struct domain *d,
> +                         bool skip_special_pages)
>  {
>      int rc;
>  
> -    if ( (rc = copy_vcpu_settings(cd, d)) )
> +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
>          return rc;
>  
>      if ( (rc = hvm_copy_context_and_params(cd, d)) )
>          return rc;
>  
> -    if ( (rc = copy_special_pages(cd, d)) )
> +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
>          return rc;
>  
>      copy_tsc(cd, d);
> @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
>      return rc;
>  }
>  
> -static int fork(struct domain *cd, struct domain *d)
> +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
>  {
>      int rc = -EBUSY;
> +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
>  
>      if ( !cd->controller_pause_count )
>          return rc;
> @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
>      if ( (rc = bring_up_vcpus(cd, d)) )
>          goto done;
>  
> -    rc = copy_settings(cd, d);
> +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )

Can you set
cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
so that you don't need to pass the options around to copy_settings and
copy_vcpu_settings?

> +    {
> +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> +        cd->creation_finished = skip_special_pages;

I think this is dangerous. While it might be true at the moment that
the arch_domain_creation_finished only maps the vAPIC page, there's no
guarantee it couldn't do other stuff in the future that could be
required for the VM to be started.

Does it add much overhead to map the vAPIC page?

Thanks, Roger.
Tamas K Lengyel March 24, 2022, 3:15 p.m. UTC | #2
On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > Add option to the fork memop to skip populating the fork with special pages.
> > These special pages are only necessary when setting up forks to be fully
> > functional with a toolstack. For short-lived forks where no toolstack is active
> > these pages are uneccesary.
>
> I'm not sure those are strictly related to having a toolstack. For
> example the vcpu_info has nothing to do with having a toolstack, and
> is only used by the guest in order to receive events or fetch time
> related data. So while a short-lived fork might not make use of those,
> that has nothing to do with having a toolstack or not.

Fair enough, the point is that the short live fork doesn't use these pages.

> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > ---
> >  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
> >  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
> >  xen/include/public/memory.h           |  4 ++--
> >  3 files changed, 26 insertions(+), 15 deletions(-)
> >
> > diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> > index 698455444e..446cd06411 100644
> > --- a/xen/arch/x86/include/asm/hvm/domain.h
> > +++ b/xen/arch/x86/include/asm/hvm/domain.h
> > @@ -31,7 +31,9 @@
> >  #ifdef CONFIG_MEM_SHARING
> >  struct mem_sharing_domain
> >  {
> > -    bool enabled, block_interrupts;
> > +    bool enabled;
> > +    bool block_interrupts;
> > +    bool skip_special_pages;
> >
> >      /*
> >       * When releasing shared gfn's in a preemptible manner, recall where
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index 15e6a7ed81..84c04ddfa3 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
> >      return 0;
> >  }
> >
> > -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> > +                              bool skip_special_pages)
> >  {
> >      unsigned int i;
> >      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> >
> >          /* Copy & map in the vcpu_info page if the guest uses one */
> >          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> >          {
> >              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> >
> > @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
> >      return 0;
> >  }
> >
> > -static int copy_settings(struct domain *cd, struct domain *d)
> > +static int copy_settings(struct domain *cd, struct domain *d,
> > +                         bool skip_special_pages)
> >  {
> >      int rc;
> >
> > -    if ( (rc = copy_vcpu_settings(cd, d)) )
> > +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
> >          return rc;
> >
> >      if ( (rc = hvm_copy_context_and_params(cd, d)) )
> >          return rc;
> >
> > -    if ( (rc = copy_special_pages(cd, d)) )
> > +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
> >          return rc;
> >
> >      copy_tsc(cd, d);
> > @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
> >      return rc;
> >  }
> >
> > -static int fork(struct domain *cd, struct domain *d)
> > +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> >  {
> >      int rc = -EBUSY;
> > +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> > +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
> >
> >      if ( !cd->controller_pause_count )
> >          return rc;
> > @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
> >      if ( (rc = bring_up_vcpus(cd, d)) )
> >          goto done;
> >
> > -    rc = copy_settings(cd, d);
> > +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
>
> Can you set
> cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
> so that you don't need to pass the options around to copy_settings and
> copy_vcpu_settings?

Would be possible yes, but then we would have to clear them in case
the forking failed at any point. Setting them only at the end when the
fork finished ensures that those fields are only ever valid if the VM
is a fork. Both are valid approaches and I prefer this over the other.

>
> > +    {
> > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > +        cd->creation_finished = skip_special_pages;
>
> I think this is dangerous. While it might be true at the moment that
> the arch_domain_creation_finished only maps the vAPIC page, there's no
> guarantee it couldn't do other stuff in the future that could be
> required for the VM to be started.

I understand this domain_creation_finished route could be expanded in
the future to include other stuff, but IMHO we can evaluate what to do
about that when and if it becomes necessary. This is all experimental
to begin with.

> Does it add much overhead to map the vAPIC page?

I don't have numbers but it does add overhead. When we do a fork reset
we loop through all pages in the physmap to determine what needs to be
removed. So having an extra page means that loop is always larger than
it actually needs to be. Considering we do the reset thousands of
times per second per core, you can imagine it adding up over time.

Tamas
Roger Pau Monné March 24, 2022, 3:51 p.m. UTC | #3
On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > Add option to the fork memop to skip populating the fork with special pages.
> > > These special pages are only necessary when setting up forks to be fully
> > > functional with a toolstack. For short-lived forks where no toolstack is active
> > > these pages are uneccesary.
> >
> > I'm not sure those are strictly related to having a toolstack. For
> > example the vcpu_info has nothing to do with having a toolstack, and
> > is only used by the guest in order to receive events or fetch time
> > related data. So while a short-lived fork might not make use of those,
> > that has nothing to do with having a toolstack or not.
> 
> Fair enough, the point is that the short live fork doesn't use these pages.
> 
> > >
> > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > > ---
> > >  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
> > >  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
> > >  xen/include/public/memory.h           |  4 ++--
> > >  3 files changed, 26 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> > > index 698455444e..446cd06411 100644
> > > --- a/xen/arch/x86/include/asm/hvm/domain.h
> > > +++ b/xen/arch/x86/include/asm/hvm/domain.h
> > > @@ -31,7 +31,9 @@
> > >  #ifdef CONFIG_MEM_SHARING
> > >  struct mem_sharing_domain
> > >  {
> > > -    bool enabled, block_interrupts;
> > > +    bool enabled;
> > > +    bool block_interrupts;
> > > +    bool skip_special_pages;
> > >
> > >      /*
> > >       * When releasing shared gfn's in a preemptible manner, recall where
> > > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > > index 15e6a7ed81..84c04ddfa3 100644
> > > --- a/xen/arch/x86/mm/mem_sharing.c
> > > +++ b/xen/arch/x86/mm/mem_sharing.c
> > > @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > >      return 0;
> > >  }
> > >
> > > -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > > +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> > > +                              bool skip_special_pages)
> > >  {
> > >      unsigned int i;
> > >      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > >
> > >          /* Copy & map in the vcpu_info page if the guest uses one */
> > >          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > > -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > > +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > >          {
> > >              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > >
> > > @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
> > >      return 0;
> > >  }
> > >
> > > -static int copy_settings(struct domain *cd, struct domain *d)
> > > +static int copy_settings(struct domain *cd, struct domain *d,
> > > +                         bool skip_special_pages)
> > >  {
> > >      int rc;
> > >
> > > -    if ( (rc = copy_vcpu_settings(cd, d)) )
> > > +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
> > >          return rc;
> > >
> > >      if ( (rc = hvm_copy_context_and_params(cd, d)) )
> > >          return rc;
> > >
> > > -    if ( (rc = copy_special_pages(cd, d)) )
> > > +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
> > >          return rc;
> > >
> > >      copy_tsc(cd, d);
> > > @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
> > >      return rc;
> > >  }
> > >
> > > -static int fork(struct domain *cd, struct domain *d)
> > > +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> > >  {
> > >      int rc = -EBUSY;
> > > +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> > > +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
> > >
> > >      if ( !cd->controller_pause_count )
> > >          return rc;
> > > @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
> > >      if ( (rc = bring_up_vcpus(cd, d)) )
> > >          goto done;
> > >
> > > -    rc = copy_settings(cd, d);
> > > +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
> >
> > Can you set
> > cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
> > so that you don't need to pass the options around to copy_settings and
> > copy_vcpu_settings?
> 
> Would be possible yes, but then we would have to clear them in case
> the forking failed at any point. Setting them only at the end when the
> fork finished ensures that those fields are only ever valid if the VM
> is a fork. Both are valid approaches and I prefer this over the other.

I think I'm confused, doesn't the fork get destroyed if there's a
failure? In which case the values
cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages}
shouldn't really matter?

> >
> > > +    {
> > > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > > +        cd->creation_finished = skip_special_pages;
> >
> > I think this is dangerous. While it might be true at the moment that
> > the arch_domain_creation_finished only maps the vAPIC page, there's no
> > guarantee it couldn't do other stuff in the future that could be
> > required for the VM to be started.
> 
> I understand this domain_creation_finished route could be expanded in
> the future to include other stuff, but IMHO we can evaluate what to do
> about that when and if it becomes necessary. This is all experimental
> to begin with.

Maybe you could check the skip_special_pages field from
domain_creation_finished in order to decide whether the vAPIC page
needs to be mapped?

Thanks, Roger.
Tamas K Lengyel March 24, 2022, 4:27 p.m. UTC | #4
On Thu, Mar 24, 2022 at 11:51 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >
> > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > Add option to the fork memop to skip populating the fork with special pages.
> > > > These special pages are only necessary when setting up forks to be fully
> > > > functional with a toolstack. For short-lived forks where no toolstack is active
> > > > these pages are uneccesary.
> > >
> > > I'm not sure those are strictly related to having a toolstack. For
> > > example the vcpu_info has nothing to do with having a toolstack, and
> > > is only used by the guest in order to receive events or fetch time
> > > related data. So while a short-lived fork might not make use of those,
> > > that has nothing to do with having a toolstack or not.
> >
> > Fair enough, the point is that the short live fork doesn't use these pages.
> >
> > > >
> > > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > > > ---
> > > >  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
> > > >  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
> > > >  xen/include/public/memory.h           |  4 ++--
> > > >  3 files changed, 26 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> > > > index 698455444e..446cd06411 100644
> > > > --- a/xen/arch/x86/include/asm/hvm/domain.h
> > > > +++ b/xen/arch/x86/include/asm/hvm/domain.h
> > > > @@ -31,7 +31,9 @@
> > > >  #ifdef CONFIG_MEM_SHARING
> > > >  struct mem_sharing_domain
> > > >  {
> > > > -    bool enabled, block_interrupts;
> > > > +    bool enabled;
> > > > +    bool block_interrupts;
> > > > +    bool skip_special_pages;
> > > >
> > > >      /*
> > > >       * When releasing shared gfn's in a preemptible manner, recall where
> > > > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > > > index 15e6a7ed81..84c04ddfa3 100644
> > > > --- a/xen/arch/x86/mm/mem_sharing.c
> > > > +++ b/xen/arch/x86/mm/mem_sharing.c
> > > > @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > > >      return 0;
> > > >  }
> > > >
> > > > -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > > > +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> > > > +                              bool skip_special_pages)
> > > >  {
> > > >      unsigned int i;
> > > >      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > > @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > > >
> > > >          /* Copy & map in the vcpu_info page if the guest uses one */
> > > >          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > > > -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > > > +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > > >          {
> > > >              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > > >
> > > > @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
> > > >      return 0;
> > > >  }
> > > >
> > > > -static int copy_settings(struct domain *cd, struct domain *d)
> > > > +static int copy_settings(struct domain *cd, struct domain *d,
> > > > +                         bool skip_special_pages)
> > > >  {
> > > >      int rc;
> > > >
> > > > -    if ( (rc = copy_vcpu_settings(cd, d)) )
> > > > +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
> > > >          return rc;
> > > >
> > > >      if ( (rc = hvm_copy_context_and_params(cd, d)) )
> > > >          return rc;
> > > >
> > > > -    if ( (rc = copy_special_pages(cd, d)) )
> > > > +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
> > > >          return rc;
> > > >
> > > >      copy_tsc(cd, d);
> > > > @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
> > > >      return rc;
> > > >  }
> > > >
> > > > -static int fork(struct domain *cd, struct domain *d)
> > > > +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> > > >  {
> > > >      int rc = -EBUSY;
> > > > +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> > > > +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
> > > >
> > > >      if ( !cd->controller_pause_count )
> > > >          return rc;
> > > > @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
> > > >      if ( (rc = bring_up_vcpus(cd, d)) )
> > > >          goto done;
> > > >
> > > > -    rc = copy_settings(cd, d);
> > > > +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
> > >
> > > Can you set
> > > cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
> > > so that you don't need to pass the options around to copy_settings and
> > > copy_vcpu_settings?
> >
> > Would be possible yes, but then we would have to clear them in case
> > the forking failed at any point. Setting them only at the end when the
> > fork finished ensures that those fields are only ever valid if the VM
> > is a fork. Both are valid approaches and I prefer this over the other.
>
> I think I'm confused, doesn't the fork get destroyed if there's a
> failure? In which case the values
> cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages}
> shouldn't really matter?

No, the domain that will be a fork is just a regular domain until the
fork operation completes. If the fork operation fails the domain
remains.

> > >
> > > > +    {
> > > > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > > > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > > > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > > > +        cd->creation_finished = skip_special_pages;
> > >
> > > I think this is dangerous. While it might be true at the moment that
> > > the arch_domain_creation_finished only maps the vAPIC page, there's no
> > > guarantee it couldn't do other stuff in the future that could be
> > > required for the VM to be started.
> >
> > I understand this domain_creation_finished route could be expanded in
> > the future to include other stuff, but IMHO we can evaluate what to do
> > about that when and if it becomes necessary. This is all experimental
> > to begin with.
>
> Maybe you could check the skip_special_pages field from
> domain_creation_finished in order to decide whether the vAPIC page
> needs to be mapped?

Could certainly do that but it means adding experimental code in an
#ifdef to the vAPIC mapping logic. Technically nothing wrong with that
but I would prefer keeping all this code in a single-place if
possible.

Tamas
Roger Pau Monné March 25, 2022, 10:25 a.m. UTC | #5
On Thu, Mar 24, 2022 at 12:27:02PM -0400, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 11:51 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> > > On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > >
> > > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > > +    {
> > > > > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > > > > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > > > > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > > > > +        cd->creation_finished = skip_special_pages;
> > > >
> > > > I think this is dangerous. While it might be true at the moment that
> > > > the arch_domain_creation_finished only maps the vAPIC page, there's no
> > > > guarantee it couldn't do other stuff in the future that could be
> > > > required for the VM to be started.
> > >
> > > I understand this domain_creation_finished route could be expanded in
> > > the future to include other stuff, but IMHO we can evaluate what to do
> > > about that when and if it becomes necessary. This is all experimental
> > > to begin with.
> >
> > Maybe you could check the skip_special_pages field from
> > domain_creation_finished in order to decide whether the vAPIC page
> > needs to be mapped?
> 
> Could certainly do that but it means adding experimental code in an
> #ifdef to the vAPIC mapping logic. Technically nothing wrong with that
> but I would prefer keeping all this code in a single-place if
> possible.

I see, while I agree it's best to keep the code contained when
possible, I think in this case it's better to modify the hook,
specially because further changes to domain_creation_finished will
easily spot that this path is special cases for VM forks.

While the code is experimental it doesn't mean it shouldn't be
properly integrated with the rest of the code base.

Thanks, Roger.
Tamas K Lengyel March 25, 2022, 10:51 a.m. UTC | #6
On Fri, Mar 25, 2022, 6:25 AM Roger Pau Monné <roger.pau@citrix.com> wrote:

> On Thu, Mar 24, 2022 at 12:27:02PM -0400, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 11:51 AM Roger Pau Monné <roger.pau@citrix.com>
> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> > > > On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <
> roger.pau@citrix.com> wrote:
> > > > >
> > > > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > > > +    {
> > > > > > +        cd->arch.hvm.mem_sharing.block_interrupts =
> block_interrupts;
> > > > > > +        cd->arch.hvm.mem_sharing.skip_special_pages =
> skip_special_pages;
> > > > > > +        /* skip mapping the vAPIC page on unpause if skipping
> special pages */
> > > > > > +        cd->creation_finished = skip_special_pages;
> > > > >
> > > > > I think this is dangerous. While it might be true at the moment
> that
> > > > > the arch_domain_creation_finished only maps the vAPIC page,
> there's no
> > > > > guarantee it couldn't do other stuff in the future that could be
> > > > > required for the VM to be started.
> > > >
> > > > I understand this domain_creation_finished route could be expanded in
> > > > the future to include other stuff, but IMHO we can evaluate what to
> do
> > > > about that when and if it becomes necessary. This is all experimental
> > > > to begin with.
> > >
> > > Maybe you could check the skip_special_pages field from
> > > domain_creation_finished in order to decide whether the vAPIC page
> > > needs to be mapped?
> >
> > Could certainly do that but it means adding experimental code in an
> > #ifdef to the vAPIC mapping logic. Technically nothing wrong with that
> > but I would prefer keeping all this code in a single-place if
> > possible.
>
> I see, while I agree it's best to keep the code contained when
> possible, I think in this case it's better to modify the hook,
> specially because further changes to domain_creation_finished will
> easily spot that this path is special cases for VM forks.
>
> While the code is experimental it doesn't mean it shouldn't be
> properly integrated with the rest of the code base.
>

Sure, I'm fine with moving it there.

Tamas

>
Roger Pau Monné March 25, 2022, 10:59 a.m. UTC | #7
On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> Add option to the fork memop to skip populating the fork with special pages.
> These special pages are only necessary when setting up forks to be fully
> functional with a toolstack. For short-lived forks where no toolstack is active
> these pages are uneccesary.

Replying here because there's no cover letter AFAICT.

For this kind of performance related changes it would be better if you
could provide some figures about the performance impact. It would help
if we knew whether avoiding mapping the vAPIC page means you can
create 0.1% more forks per-minute or 20%.

If you really want to speed up the forking path it might be good to start
by perf sampling Xen in order to find the bottlenecks?

Thanks, Roger.
Tamas K Lengyel March 25, 2022, 11:15 a.m. UTC | #8
On Fri, Mar 25, 2022, 6:59 AM Roger Pau Monné <roger.pau@citrix.com> wrote:

> On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > Add option to the fork memop to skip populating the fork with special
> pages.
> > These special pages are only necessary when setting up forks to be fully
> > functional with a toolstack. For short-lived forks where no toolstack is
> active
> > these pages are uneccesary.
>
> Replying here because there's no cover letter AFAICT.
>
> For this kind of performance related changes it would be better if you
> could provide some figures about the performance impact. It would help
> if we knew whether avoiding mapping the vAPIC page means you can
> create 0.1% more forks per-minute or 20%.
>
> If you really want to speed up the forking path it might be good to start
> by perf sampling Xen in order to find the bottlenecks?
>

Sure but for experiment systems I don't think its necessary to collect that
data.

There is also a non-performance reason why we want to keep special pages
from being populated, in cases we really want the forks physmap to start
empty for better control over its state. There was already a case where
having special pages mapped in ended up triggering unexpected Xen behaviors
leading to chain of events not easy to follow. For example if page 0 gets
brought in while the vCPU is being created it ends up as a misconfigured
ept entry if nested virtualization is enabled. That leads to ept
misconfiguration exits instead of epf faults. Simply enforcing no entry in
the physmap until forking is complete eliminates the chance of something
like that happening again and makes reasoning about the VM's behavior from
the start easier.

Tamas

>
Roger Pau Monné March 25, 2022, 11:31 a.m. UTC | #9
On Fri, Mar 25, 2022 at 07:15:59AM -0400, Tamas K Lengyel wrote:
> On Fri, Mar 25, 2022, 6:59 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
> > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > Add option to the fork memop to skip populating the fork with special
> > pages.
> > > These special pages are only necessary when setting up forks to be fully
> > > functional with a toolstack. For short-lived forks where no toolstack is
> > active
> > > these pages are uneccesary.
> >
> > Replying here because there's no cover letter AFAICT.
> >
> > For this kind of performance related changes it would be better if you
> > could provide some figures about the performance impact. It would help
> > if we knew whether avoiding mapping the vAPIC page means you can
> > create 0.1% more forks per-minute or 20%.
> >
> > If you really want to speed up the forking path it might be good to start
> > by perf sampling Xen in order to find the bottlenecks?
> >
> 
> Sure but for experiment systems I don't think its necessary to collect that
> data.

It helps weight whether the extra logic is worth the performance
benefit IMO. Here it might not matter that much since you say there's
also a non-performance reason for the change.

> There is also a non-performance reason why we want to keep special pages
> from being populated, in cases we really want the forks physmap to start
> empty for better control over its state. There was already a case where
> having special pages mapped in ended up triggering unexpected Xen behaviors
> leading to chain of events not easy to follow. For example if page 0 gets
> brought in while the vCPU is being created it ends up as a misconfigured
> ept entry if nested virtualization is enabled. That leads to ept
> misconfiguration exits instead of epf faults. Simply enforcing no entry in
> the physmap until forking is complete eliminates the chance of something
> like that happening again and makes reasoning about the VM's behavior from
> the start easier.

Could we have this added to the commit message then, and the option
renamed to 'empty_p2m' or some such. Then you should also ASSERT that
at the end of the fork process the p2m is indeed empty, not sure if
checking d->arch.paging.hap.p2m_pages == 0 would accomplish that?

Thanks, Roger.
Tamas K Lengyel March 25, 2022, 12:13 p.m. UTC | #10
On Fri, Mar 25, 2022, 7:31 AM Roger Pau Monné <roger.pau@citrix.com> wrote:

> On Fri, Mar 25, 2022 at 07:15:59AM -0400, Tamas K Lengyel wrote:
> > On Fri, Mar 25, 2022, 6:59 AM Roger Pau Monné <roger.pau@citrix.com>
> wrote:
> >
> > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > Add option to the fork memop to skip populating the fork with special
> > > pages.
> > > > These special pages are only necessary when setting up forks to be
> fully
> > > > functional with a toolstack. For short-lived forks where no
> toolstack is
> > > active
> > > > these pages are uneccesary.
> > >
> > > Replying here because there's no cover letter AFAICT.
> > >
> > > For this kind of performance related changes it would be better if you
> > > could provide some figures about the performance impact. It would help
> > > if we knew whether avoiding mapping the vAPIC page means you can
> > > create 0.1% more forks per-minute or 20%.
> > >
> > > If you really want to speed up the forking path it might be good to
> start
> > > by perf sampling Xen in order to find the bottlenecks?
> > >
> >
> > Sure but for experiment systems I don't think its necessary to collect
> that
> > data.
>
> It helps weight whether the extra logic is worth the performance
> benefit IMO. Here it might not matter that much since you say there's
> also a non-performance reason for the change.
>
> > There is also a non-performance reason why we want to keep special pages
> > from being populated, in cases we really want the forks physmap to start
> > empty for better control over its state. There was already a case where
> > having special pages mapped in ended up triggering unexpected Xen
> behaviors
> > leading to chain of events not easy to follow. For example if page 0 gets
> > brought in while the vCPU is being created it ends up as a misconfigured
> > ept entry if nested virtualization is enabled. That leads to ept
> > misconfiguration exits instead of epf faults. Simply enforcing no entry
> in
> > the physmap until forking is complete eliminates the chance of something
> > like that happening again and makes reasoning about the VM's behavior
> from
> > the start easier.
>
> Could we have this added to the commit message then, and the option
> renamed to 'empty_p2m' or some such. Then you should also ASSERT that
> at the end of the fork process the p2m is indeed empty, not sure if
> checking d->arch.paging.hap.p2m_pages == 0 would accomplish that?
>

Sure, I can do that.

Thanks,
Tamas

>
diff mbox series

Patch

diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
index 698455444e..446cd06411 100644
--- a/xen/arch/x86/include/asm/hvm/domain.h
+++ b/xen/arch/x86/include/asm/hvm/domain.h
@@ -31,7 +31,9 @@ 
 #ifdef CONFIG_MEM_SHARING
 struct mem_sharing_domain
 {
-    bool enabled, block_interrupts;
+    bool enabled;
+    bool block_interrupts;
+    bool skip_special_pages;
 
     /*
      * When releasing shared gfn's in a preemptible manner, recall where
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 15e6a7ed81..84c04ddfa3 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1643,7 +1643,8 @@  static int bring_up_vcpus(struct domain *cd, struct domain *d)
     return 0;
 }
 
-static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
+static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
+                              bool skip_special_pages)
 {
     unsigned int i;
     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
@@ -1660,7 +1661,7 @@  static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
 
         /* Copy & map in the vcpu_info page if the guest uses one */
         vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
-        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
+        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
         {
             mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
 
@@ -1807,17 +1808,18 @@  static int copy_special_pages(struct domain *cd, struct domain *d)
     return 0;
 }
 
-static int copy_settings(struct domain *cd, struct domain *d)
+static int copy_settings(struct domain *cd, struct domain *d,
+                         bool skip_special_pages)
 {
     int rc;
 
-    if ( (rc = copy_vcpu_settings(cd, d)) )
+    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
         return rc;
 
     if ( (rc = hvm_copy_context_and_params(cd, d)) )
         return rc;
 
-    if ( (rc = copy_special_pages(cd, d)) )
+    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
         return rc;
 
     copy_tsc(cd, d);
@@ -1826,9 +1828,11 @@  static int copy_settings(struct domain *cd, struct domain *d)
     return rc;
 }
 
-static int fork(struct domain *cd, struct domain *d)
+static int fork(struct domain *cd, struct domain *d, uint16_t flags)
 {
     int rc = -EBUSY;
+    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
+    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
 
     if ( !cd->controller_pause_count )
         return rc;
@@ -1856,7 +1860,13 @@  static int fork(struct domain *cd, struct domain *d)
     if ( (rc = bring_up_vcpus(cd, d)) )
         goto done;
 
-    rc = copy_settings(cd, d);
+    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
+    {
+        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
+        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
+        /* skip mapping the vAPIC page on unpause if skipping special pages */
+        cd->creation_finished = skip_special_pages;
+    }
 
  done:
     if ( rc && rc != -ERESTART )
@@ -1920,7 +1930,7 @@  static int mem_sharing_fork_reset(struct domain *d)
     }
     spin_unlock_recursive(&d->page_alloc_lock);
 
-    rc = copy_settings(d, pd);
+    rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
 
     domain_unpause(d);
 
@@ -2190,7 +2200,8 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         if ( mso.u.fork.pad )
             goto out;
         if ( mso.u.fork.flags &
-             ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS) )
+             ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS |
+               XENMEM_FORK_SKIP_SPECIAL_PAGES) )
             goto out;
 
         rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
@@ -2212,14 +2223,12 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
             goto out;
         }
 
-        rc = fork(d, pd);
+        rc = fork(d, pd, mso.u.fork.flags);
 
         if ( rc == -ERESTART )
             rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
                                                "lh", XENMEM_sharing_op,
                                                arg);
-        else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_BLOCK_INTERRUPTS) )
-            d->arch.hvm.mem_sharing.block_interrupts = true;
 
         rcu_unlock_domain(pd);
         break;
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index a1a0f0233a..208d8dcbd9 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -543,10 +543,10 @@  struct xen_mem_sharing_op {
         } debug;
         struct mem_sharing_op_fork {      /* OP_FORK */
             domid_t parent_domain;        /* IN: parent's domain id */
-/* Only makes sense for short-lived forks */
+/* These flags only makes sense for short-lived forks */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
-/* Only makes sense for short-lived forks */
 #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
+#define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
             uint16_t flags;               /* IN: optional settings */
             uint32_t pad;                 /* Must be set to 0 */
         } fork;