
[1/2] xen/arm: Convert runstate address during hypercall

Message ID 8b450dddb2c855225c97741dce66453a80b9add2.1591806713.git.bertrand.marquis@arm.com (mailing list archive)
State Superseded
Series xen/arm: Convert runstate address during hypercall

Commit Message

Bertrand Marquis June 11, 2020, 11:58 a.m. UTC
At the moment on Arm, a Linux guest running with KPTI enabled will
cause the following error when a context switch happens in user mode:
(XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0

This patch is modifying the register runstate area handling on arm to
convert the guest address during the hypercall. During runtime context
switches the area is mapped to update the guest runstate copy.
The patch is also removing the initial copy which was done during the
hypercall handling as this is done on the current core when the context
is restored to go back to guest execution on arm.

As a consequence, if the register runstate hypercall is called on one
vcpu for another vcpu, the area will not be updated until that vcpu
is executed (on arm only).

On x86, the behaviour is not modified, the address conversion is done
during the context switch and the area is updated fully during the
hypercall.
Inline functions in headers could not be used as the architecture
domain.h is included before the global domain.h making it impossible
to use the struct vcpu inside the architecture header.
This should not have any performance impact as the hypercall is only
called once per vcpu usually.

Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
---
 xen/arch/arm/domain.c        | 109 +++++++++++++++++++++++++++++------
 xen/arch/x86/domain.c        |  30 +++++++++-
 xen/arch/x86/x86_64/domain.c |   4 +-
 xen/common/domain.c          |  19 ++----
 xen/include/asm-arm/domain.h |   8 +++
 xen/include/asm-x86/domain.h |  16 +++++
 xen/include/xen/domain.h     |   4 ++
 xen/include/xen/sched.h      |  16 +----
 8 files changed, 153 insertions(+), 53 deletions(-)

Comments

Stefano Stabellini June 11, 2020, 6:16 p.m. UTC | #1
On Thu, 11 Jun 2020, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> This patch is modifying the register runstate area handling on arm to
> convert the guest address during the hypercall. During runtime context
> switches the area is mapped to update the guest runstate copy.
> The patch is also removing the initial copy which was done during the
> hypercall handling as this is done on the current core when the context
> is restored to go back to guest execution on arm.
> 
> As a consequence, if the register runstate hypercall is called on one
> vcpu for another vcpu, the area will not be updated until that vcpu
> is executed (on arm only).
> 
> On x86, the behaviour is not modified, the address conversion is done
> during the context switch and the area is updated fully during the
> hypercall.
> Inline functions in headers could not be used as the architecture
> domain.h is included before the global domain.h making it impossible
> to use the struct vcpu inside the architecture header.
> This should not have any performance impact as the hypercall is only
> called once per vcpu usually.
> 
> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
> ---
>  xen/arch/arm/domain.c        | 109 +++++++++++++++++++++++++++++------
>  xen/arch/x86/domain.c        |  30 +++++++++-
>  xen/arch/x86/x86_64/domain.c |   4 +-
>  xen/common/domain.c          |  19 ++----
>  xen/include/asm-arm/domain.h |   8 +++
>  xen/include/asm-x86/domain.h |  16 +++++
>  xen/include/xen/domain.h     |   4 ++
>  xen/include/xen/sched.h      |  16 +----
>  8 files changed, 153 insertions(+), 53 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 31169326b2..739059234f 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -19,6 +19,7 @@
>  #include <xen/sched.h>
>  #include <xen/softirq.h>
>  #include <xen/wait.h>
> +#include <xen/domain_page.h>
>  
>  #include <asm/alternative.h>
>  #include <asm/cpuerrata.h>
> @@ -275,36 +276,104 @@ static void ctxt_switch_to(struct vcpu *n)
>      virt_timer_restore(n);
>  }
>  
> -/* Update per-VCPU guest runstate shared memory area (if registered). */
> -static void update_runstate_area(struct vcpu *v)
> +void arch_cleanup_runstate_guest(struct vcpu *v)
>  {
> -    void __user *guest_handle = NULL;
> -    struct vcpu_runstate_info runstate;
> +    spin_lock(&v->arch.runstate_guest.lock);
>  
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> -        return;
> +    /* cleanup previous page if any */
> +    if ( v->arch.runstate_guest.page )
> +    {
> +        put_page_and_type(v->arch.runstate_guest.page);
> +        v->arch.runstate_guest.page = NULL;
> +        v->arch.runstate_guest.offset = 0;
> +    }
>  
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +    spin_unlock(&v->arch.runstate_guest.lock);
> +}
>  
> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +int arch_setup_runstate_guest(struct vcpu *v,
> +                            struct vcpu_register_runstate_memory_area area)
> +{
> +    struct page_info *page;
> +    unsigned offset;
> +
> +    spin_lock(&v->arch.runstate_guest.lock);
> +
> +    /* cleanup previous page if any */
> +    if ( v->arch.runstate_guest.page )
>      {
> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> -        guest_handle--;
> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> -        smp_wmb();
> +        put_page_and_type(v->arch.runstate_guest.page);
> +        v->arch.runstate_guest.page = NULL;
> +        v->arch.runstate_guest.offset = 0;
> +    }
> +
> +    offset = ((vaddr_t)area.addr.v) & ~PAGE_MASK;
> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
> +    {
> +        spin_unlock(&v->arch.runstate_guest.lock);
> +        gprintk(XENLOG_DEBUG, "Runstate is crossing pages\n");
> +        return -EINVAL;
> +    }
> +
> +    /* provided address must be aligned to a 64bit */
> +    if ( offset % alignof(struct vcpu_runstate_info) )
> +    {
> +        spin_unlock(&v->arch.runstate_guest.lock);
> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not aligned\n");
> +        return -EINVAL;
> +    }
> +
> +    page = get_page_from_gva(v, (vaddr_t)area.addr.v, GV2M_WRITE);
> +    if ( !page )
> +    {
> +        spin_unlock(&v->arch.runstate_guest.lock);
> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");

I would make all these XENLOG_WARNING or XENLOG_ERR, they are errors
after all. This last one I'd say "Could not map runstate pointer" and also
print the address.
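
For instance, something along these lines (just a sketch, exact wording and
format specifier up to you):

    gprintk(XENLOG_WARNING, "Could not map runstate pointer %p\n",
            area.addr.v);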


> +        return -EINVAL;
>      }
>  
> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    v->arch.runstate_guest.page = page;
> +    v->arch.runstate_guest.offset = offset;
> +
> +    spin_unlock(&v->arch.runstate_guest.lock);
> +
> +    return 0;
> +}
> +
> +
> +/* Update per-VCPU guest runstate shared memory area (if registered). */
> +static void update_runstate_area(struct vcpu *v)
> +{
> +    struct vcpu_runstate_info *guest_runstate;
> +    void *p;
> +
> +    spin_lock(&v->arch.runstate_guest.lock);
>  
> -    if ( guest_handle )
> +    if ( v->arch.runstate_guest.page )
>      {
> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        p = __map_domain_page(v->arch.runstate_guest.page);
> +        guest_runstate = p + v->arch.runstate_guest.offset;
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;

I think that this write to guest_runstate should use write_atomic or
another atomic write operation.
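
E.g. something like this (only a sketch, assuming the generic write_atomic()
helper is usable on this field):

    write_atomic(&guest_runstate->state_entry_time,
                 v->runstate.state_entry_time | XEN_RUNSTATE_UPDATE);

and the equivalent with the flag cleared once the update is done.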


> +            smp_wmb();
> +        }
> +
> +        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
>          smp_wmb();
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;

same here


> +            smp_wmb();
> +        }
> +
> +        unmap_domain_page(p);
>      }
> +
> +    spin_unlock(&v->arch.runstate_guest.lock);
>  }
>  
>  static void schedule_tail(struct vcpu *prev)
> @@ -560,6 +629,8 @@ int arch_vcpu_create(struct vcpu *v)
>      v->arch.saved_context.sp = (register_t)v->arch.cpu_info;
>      v->arch.saved_context.pc = (register_t)continue_new_vcpu;
>  
> +    spin_lock_init(&v->arch.runstate_guest.lock);
> +
>      /* Idle VCPUs don't need the rest of this setup */
>      if ( is_idle_vcpu(v) )
>          return rc;
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index fee6c3931a..32bbed87d5 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1642,6 +1642,30 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>          wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
>  }
>  
> +int arch_setup_runstate_guest(struct vcpu *v,
> +                             struct vcpu_register_runstate_memory_area area)
> +{
> +    struct vcpu_runstate_info runstate;
> +
> +    runstate_guest(v) = area.addr.h;
> +
> +    if ( v == current )
> +    {
> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
> +    }
> +    else
> +    {
> +        vcpu_runstate_get(v, &runstate);
> +        __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    }
> +    return 0;
> +}
> +
> +void arch_cleanup_runstate_guest(struct vcpu *v)
> +{
> +    set_xen_guest_handle(runstate_guest(v), NULL);
> +}
> +
>  /* Update per-VCPU guest runstate shared memory area (if registered). */
>  bool update_runstate_area(struct vcpu *v)
>  {
> @@ -1660,8 +1684,8 @@ bool update_runstate_area(struct vcpu *v)
>      if ( VM_ASSIST(v->domain, runstate_update_flag) )
>      {
>          guest_handle = has_32bit_shinfo(v->domain)
> -            ? &v->runstate_guest.compat.p->state_entry_time + 1
> -            : &v->runstate_guest.native.p->state_entry_time + 1;
> +            ? &v->arch.runstate_guest.compat.p->state_entry_time + 1
> +            : &v->arch.runstate_guest.native.p->state_entry_time + 1;
>          guest_handle--;
>          runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>          __raw_copy_to_guest(guest_handle,
> @@ -1674,7 +1698,7 @@ bool update_runstate_area(struct vcpu *v)
>          struct compat_vcpu_runstate_info info;
>  
>          XLAT_vcpu_runstate_info(&info, &runstate);
> -        __copy_to_guest(v->runstate_guest.compat, &info, 1);
> +        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
>          rc = true;
>      }
>      else
> diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c
> index c46dccc25a..b879e8dd2c 100644
> --- a/xen/arch/x86/x86_64/domain.c
> +++ b/xen/arch/x86/x86_64/domain.c
> @@ -36,7 +36,7 @@ arch_compat_vcpu_op(
>              break;
>  
>          rc = 0;
> -        guest_from_compat_handle(v->runstate_guest.compat, area.addr.h);
> +        guest_from_compat_handle(v->arch.runstate_guest.compat, area.addr.h);
>  
>          if ( v == current )
>          {
> @@ -49,7 +49,7 @@ arch_compat_vcpu_op(
>              vcpu_runstate_get(v, &runstate);
>              XLAT_vcpu_runstate_info(&info, &runstate);
>          }
> -        __copy_to_guest(v->runstate_guest.compat, &info, 1);
> +        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
>  
>          break;
>      }
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7cc9526139..0ca6bf4dbc 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -727,7 +727,10 @@ int domain_kill(struct domain *d)
>          if ( cpupool_move_domain(d, cpupool0) )
>              return -ERESTART;
>          for_each_vcpu ( d, v )
> +        {
> +            arch_cleanup_runstate_guest(v);
>              unmap_vcpu_info(v);
> +        }
>          d->is_dying = DOMDYING_dead;
>          /* Mem event cleanup has to go here because the rings 
>           * have to be put before we call put_domain. */
> @@ -1167,7 +1170,7 @@ int domain_soft_reset(struct domain *d)
>  
>      for_each_vcpu ( d, v )
>      {
> -        set_xen_guest_handle(runstate_guest(v), NULL);
> +        arch_cleanup_runstate_guest(v);
>          unmap_vcpu_info(v);
>      }
>  
> @@ -1484,7 +1487,6 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
>      case VCPUOP_register_runstate_memory_area:
>      {
>          struct vcpu_register_runstate_memory_area area;
> -        struct vcpu_runstate_info runstate;
>  
>          rc = -EFAULT;
>          if ( copy_from_guest(&area, arg, 1) )
> @@ -1493,18 +1495,7 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
>          if ( !guest_handle_okay(area.addr.h, 1) )
>              break;
>  
> -        rc = 0;
> -        runstate_guest(v) = area.addr.h;
> -
> -        if ( v == current )
> -        {
> -            __copy_to_guest(runstate_guest(v), &v->runstate, 1);
> -        }
> -        else
> -        {
> -            vcpu_runstate_get(v, &runstate);
> -            __copy_to_guest(runstate_guest(v), &runstate, 1);
> -        }
> +        rc = arch_setup_runstate_guest(v, area);
>  
>          break;
>      }
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 4e2f582006..3a7f53e13d 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -11,6 +11,7 @@
>  #include <asm/vgic.h>
>  #include <asm/vpl011.h>
>  #include <public/hvm/params.h>
> +#include <public/vcpu.h>
>  #include <xen/serial.h>
>  #include <xen/rbtree.h>
>  
> @@ -206,6 +207,13 @@ struct arch_vcpu
>       */
>      bool need_flush_to_ram;
>  
> +    /* runstate guest info */
> +    struct {
> +        struct page_info *page;  /* guest page */
> +        unsigned         offset; /* offset in page */
> +        spinlock_t       lock;   /* lock to access page */
> +    } runstate_guest;
> +
>  }  __cacheline_aligned;
>  
>  void vcpu_show_execution_state(struct vcpu *);
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index e8cee3d5e5..f917b48cb8 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -11,6 +11,11 @@
>  #include <asm/x86_emulate.h>
>  #include <public/vcpu.h>
>  #include <public/hvm/hvm_info_table.h>
> +#ifdef CONFIG_COMPAT
> +#include <compat/vcpu.h>
> +DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
> +#endif
> +
>  
>  #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
>  
> @@ -626,6 +631,17 @@ struct arch_vcpu
>      struct {
>          bool next_interrupt_enabled;
>      } monitor;
> +
> +#ifndef CONFIG_COMPAT
> +# define runstate_guest(v) ((v)->arch.runstate_guest)
> +    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
> +#else
> +# define runstate_guest(v) ((v)->arch.runstate_guest.native)
> +    union {
> +        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
> +        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
> +    } runstate_guest;
> +#endif
>  };
>  
>  struct guest_memory_policy
> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
> index 7e51d361de..d5d73ce997 100644
> --- a/xen/include/xen/domain.h
> +++ b/xen/include/xen/domain.h
> @@ -63,6 +63,10 @@ void arch_vcpu_destroy(struct vcpu *v);
>  int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset);
>  void unmap_vcpu_info(struct vcpu *v);
>  
> +int arch_setup_runstate_guest(struct vcpu *v,
> +                            struct vcpu_register_runstate_memory_area area);

NIT: code style, the indentation is off


> +void arch_cleanup_runstate_guest(struct vcpu *v);
> +
>  int arch_domain_create(struct domain *d,
>                         struct xen_domctl_createdomain *config);
>  
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index ac53519d7f..fac030fb83 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -29,11 +29,6 @@
>  #include <public/vcpu.h>
>  #include <public/event_channel.h>
>  
> -#ifdef CONFIG_COMPAT
> -#include <compat/vcpu.h>
> -DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
> -#endif
> -
>  /*
>   * Stats
>   *
> @@ -166,16 +161,7 @@ struct vcpu
>      struct sched_unit *sched_unit;
>  
>      struct vcpu_runstate_info runstate;
> -#ifndef CONFIG_COMPAT
> -# define runstate_guest(v) ((v)->runstate_guest)
> -    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
> -#else
> -# define runstate_guest(v) ((v)->runstate_guest.native)
> -    union {
> -        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
> -        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
> -    } runstate_guest; /* guest address */
> -#endif
> +
>      unsigned int     new_state;
>  
>      /* Has the FPU been initialised? */
> -- 
> 2.17.1
>
Julien Grall June 11, 2020, 6:24 p.m. UTC | #2
> > +        return -EINVAL;
> >      }
> >
> > -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> > +    v->arch.runstate_guest.page = page;
> > +    v->arch.runstate_guest.offset = offset;
> > +
> > +    spin_unlock(&v->arch.runstate_guest.lock);
> > +
> > +    return 0;
> > +}
> > +
> > +
> > +/* Update per-VCPU guest runstate shared memory area (if registered). */
> > +static void update_runstate_area(struct vcpu *v)
> > +{
> > +    struct vcpu_runstate_info *guest_runstate;
> > +    void *p;
> > +
> > +    spin_lock(&v->arch.runstate_guest.lock);
> >
> > -    if ( guest_handle )
> > +    if ( v->arch.runstate_guest.page )
> >      {
> > -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> > +        p = __map_domain_page(v->arch.runstate_guest.page);
> > +        guest_runstate = p + v->arch.runstate_guest.offset;
> > +
> > +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> > +        {
> > +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> > +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>
> I think that this write to guest_runstate should use write_atomic or
> another atomic write operation.

I thought about suggesting the same, but  guest_copy_* helpers may not
do a single memory write to state_entry_time.
What are you trying to prevent with the write_atomic()?

Cheers,
Stefano Stabellini June 11, 2020, 6:50 p.m. UTC | #3
On Thu, 11 Jun 2020, Julien Grall wrote:
> > > +        return -EINVAL;
> > >      }
> > >
> > > -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> > > +    v->arch.runstate_guest.page = page;
> > > +    v->arch.runstate_guest.offset = offset;
> > > +
> > > +    spin_unlock(&v->arch.runstate_guest.lock);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +
> > > +/* Update per-VCPU guest runstate shared memory area (if registered). */
> > > +static void update_runstate_area(struct vcpu *v)
> > > +{
> > > +    struct vcpu_runstate_info *guest_runstate;
> > > +    void *p;
> > > +
> > > +    spin_lock(&v->arch.runstate_guest.lock);
> > >
> > > -    if ( guest_handle )
> > > +    if ( v->arch.runstate_guest.page )
> > >      {
> > > -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> > > +        p = __map_domain_page(v->arch.runstate_guest.page);
> > > +        guest_runstate = p + v->arch.runstate_guest.offset;
> > > +
> > > +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> > > +        {
> > > +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> > > +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> >
> > I think that this write to guest_runstate should use write_atomic or
> > another atomic write operation.
> 
> I thought about suggesting the same, but  guest_copy_* helpers may not
> do a single memory write to state_entry_time.
> What are you trying to prevent with the write_atomic()?

I am thinking that without using an atomic write, it would be (at least
theoretically) possible for a guest to see a partial write to
state_entry_time, which is not good. In theory, the set of assembly
instructions generated by the compiler could go through an intermediate
state that we don't want the guest to see. In practice, I doubt that any
possible combination of assembly instructions generated by the compiler
could lead to something harmful.
Julien Grall June 11, 2020, 7:38 p.m. UTC | #4
Hi Stefano,

On 11/06/2020 19:50, Stefano Stabellini wrote:
> On Thu, 11 Jun 2020, Julien Grall wrote:
>>>> +        return -EINVAL;
>>>>       }
>>>>
>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>>>> +    v->arch.runstate_guest.page = page;
>>>> +    v->arch.runstate_guest.offset = offset;
>>>> +
>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +
>>>> +/* Update per-VCPU guest runstate shared memory area (if registered). */
>>>> +static void update_runstate_area(struct vcpu *v)
>>>> +{
>>>> +    struct vcpu_runstate_info *guest_runstate;
>>>> +    void *p;
>>>> +
>>>> +    spin_lock(&v->arch.runstate_guest.lock);
>>>>
>>>> -    if ( guest_handle )
>>>> +    if ( v->arch.runstate_guest.page )
>>>>       {
>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>>>> +
>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>>> +        {
>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>
>>> I think that this write to guest_runstate should use write_atomic or
>>> another atomic write operation.
>>
>> I thought about suggesting the same, but  guest_copy_* helpers may not
>> do a single memory write to state_entry_time.
>> What are you trying to prevent with the write_atomic()?
> 
> I am thinking that without using an atomic write, it would be (at least
> theoretically) possible for a guest to see a partial write to
> state_entry_time, which is not good. 

It is already the case with existing implementation as Xen may write 
byte by byte. So are you suggesting the existing code is also buggy?

> In theory, the set of assembly
> instructions generated by the compiler could go through an intermediate
> state that we don't want the guest to see. In practice, I doubt that any
> possible combination of assembly instructions generated by the compiler
> could lead to something harmful.

Well, I think you first need to define the theoretical state you are
worried about. Then we can discuss whether they can happen in practice 
and how we can prevent them in the existing and new code.

So what sort of state are you concerned about?

Cheers,
Stefano Stabellini June 12, 2020, 1:09 a.m. UTC | #5
On Thu, 11 Jun 2020, Julien Grall wrote:
> Hi Stefano,
> 
> On 11/06/2020 19:50, Stefano Stabellini wrote:
> > On Thu, 11 Jun 2020, Julien Grall wrote:
> > > > > +        return -EINVAL;
> > > > >       }
> > > > > 
> > > > > -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> > > > > +    v->arch.runstate_guest.page = page;
> > > > > +    v->arch.runstate_guest.offset = offset;
> > > > > +
> > > > > +    spin_unlock(&v->arch.runstate_guest.lock);
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +
> > > > > +/* Update per-VCPU guest runstate shared memory area (if registered).
> > > > > */
> > > > > +static void update_runstate_area(struct vcpu *v)
> > > > > +{
> > > > > +    struct vcpu_runstate_info *guest_runstate;
> > > > > +    void *p;
> > > > > +
> > > > > +    spin_lock(&v->arch.runstate_guest.lock);
> > > > > 
> > > > > -    if ( guest_handle )
> > > > > +    if ( v->arch.runstate_guest.page )
> > > > >       {
> > > > > -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> > > > > +        p = __map_domain_page(v->arch.runstate_guest.page);
> > > > > +        guest_runstate = p + v->arch.runstate_guest.offset;
> > > > > +
> > > > > +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> > > > > +        {
> > > > > +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> > > > > +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> > > > 
> > > > I think that this write to guest_runstate should use write_atomic or
> > > > another atomic write operation.
> > > 
> > > I thought about suggesting the same, but  guest_copy_* helpers may not
> > > do a single memory write to state_entry_time.
> > > What are you trying to prevent with the write_atomic()?
> > 
> > I am thinking that without using an atomic write, it would be (at least
> > theoretically) possible for a guest to see a partial write to
> > state_entry_time, which is not good. 
> 
> It is already the case with existing implementation as Xen may write byte by
> byte. So are you suggesting the existing code is also buggy?

Writing byte by byte is a different case. That is OK. In that case, the
guest could see the state after 3 bytes written and it would be fine and
consistent. If this hadn't been the case, then yes, the existing code
would also be buggy.

So if we did the write with a memcpy, it would be fine, no need for
atomics:

  memcpy(&guest_runstate->state_entry_time,
         &v->runstate.state_entry_time,
         XXX);


The |= case is different: GCC could implement it in any way it likes,
including going through a zero-write to any of the bytes in the word, or
doing an addition then a subtraction. GCC doesn't make any guarantees.
If we want guarantees we need to use atomics.
Bertrand Marquis June 12, 2020, 8:07 a.m. UTC | #6
> On 11 Jun 2020, at 19:16, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Thu, 11 Jun 2020, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KPTI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>> 
>> This patch is modifying the register runstate area handling on arm to
>> convert the guest address during the hypercall. During runtime context
>> switches the area is mapped to update the guest runstate copy.
>> The patch is also removing the initial copy which was done during the
>> hypercall handling as this is done on the current core when the context
>> is restored to go back to guest execution on arm.
>> 
>> As a consequence, if the register runstate hypercall is called on one
>> vcpu for another vcpu, the area will not be updated until that vcpu
>> is executed (on arm only).
>> 
>> On x86, the behaviour is not modified, the address conversion is done
>> during the context switch and the area is updated fully during the
>> hypercall.
>> Inline functions in headers could not be used as the architecture
>> domain.h is included before the global domain.h making it impossible
>> to use the struct vcpu inside the architecture header.
>> This should not have any performance impact as the hypercall is only
>> called once per vcpu usually.
>> 
>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>> ---
>> xen/arch/arm/domain.c        | 109 +++++++++++++++++++++++++++++------
>> xen/arch/x86/domain.c        |  30 +++++++++-
>> xen/arch/x86/x86_64/domain.c |   4 +-
>> xen/common/domain.c          |  19 ++----
>> xen/include/asm-arm/domain.h |   8 +++
>> xen/include/asm-x86/domain.h |  16 +++++
>> xen/include/xen/domain.h     |   4 ++
>> xen/include/xen/sched.h      |  16 +----
>> 8 files changed, 153 insertions(+), 53 deletions(-)
>> 
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 31169326b2..739059234f 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -19,6 +19,7 @@
>> #include <xen/sched.h>
>> #include <xen/softirq.h>
>> #include <xen/wait.h>
>> +#include <xen/domain_page.h>
>> 
>> #include <asm/alternative.h>
>> #include <asm/cpuerrata.h>
>> @@ -275,36 +276,104 @@ static void ctxt_switch_to(struct vcpu *n)
>>     virt_timer_restore(n);
>> }
>> 
>> -/* Update per-VCPU guest runstate shared memory area (if registered). */
>> -static void update_runstate_area(struct vcpu *v)
>> +void arch_cleanup_runstate_guest(struct vcpu *v)
>> {
>> -    void __user *guest_handle = NULL;
>> -    struct vcpu_runstate_info runstate;
>> +    spin_lock(&v->arch.runstate_guest.lock);
>> 
>> -    if ( guest_handle_is_null(runstate_guest(v)) )
>> -        return;
>> +    /* cleanup previous page if any */
>> +    if ( v->arch.runstate_guest.page )
>> +    {
>> +        put_page_and_type(v->arch.runstate_guest.page);
>> +        v->arch.runstate_guest.page = NULL;
>> +        v->arch.runstate_guest.offset = 0;
>> +    }
>> 
>> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> +}
>> 
>> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +int arch_setup_runstate_guest(struct vcpu *v,
>> +                            struct vcpu_register_runstate_memory_area area)
>> +{
>> +    struct page_info *page;
>> +    unsigned offset;
>> +
>> +    spin_lock(&v->arch.runstate_guest.lock);
>> +
>> +    /* cleanup previous page if any */
>> +    if ( v->arch.runstate_guest.page )
>>     {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> -        smp_wmb();
>> +        put_page_and_type(v->arch.runstate_guest.page);
>> +        v->arch.runstate_guest.page = NULL;
>> +        v->arch.runstate_guest.offset = 0;
>> +    }
>> +
>> +    offset = ((vaddr_t)area.addr.v) & ~PAGE_MASK;
>> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate is crossing pages\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    /* provided address must be aligned to a 64bit */
>> +    if ( offset % alignof(struct vcpu_runstate_info) )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not aligned\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    page = get_page_from_gva(v, (vaddr_t)area.addr.v, GV2M_WRITE);
>> +    if ( !page )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");
> 
> I would make all these XENLOG_WARNING or XENLOG_ERR, they are errors
> after all. This last one I'd say "Could not map runstate pointer" and also
> print the address.

Ok I will fix it to Warning and change the message like this.

> 
> 
>> +        return -EINVAL;
>>     }
>> 
>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    v->arch.runstate_guest.page = page;
>> +    v->arch.runstate_guest.offset = offset;
>> +
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> +
>> +    return 0;
>> +}
>> +
>> +
>> +/* Update per-VCPU guest runstate shared memory area (if registered). */
>> +static void update_runstate_area(struct vcpu *v)
>> +{
>> +    struct vcpu_runstate_info *guest_runstate;
>> +    void *p;
>> +
>> +    spin_lock(&v->arch.runstate_guest.lock);
>> 
>> -    if ( guest_handle )
>> +    if ( v->arch.runstate_guest.page )
>>     {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> 
> I think that this write to guest_runstate should use write_atomic or
> another atomic write operation.

I will answer at the end of the discussion on the subject.

> 
> 
>> +            smp_wmb();
>> +        }
>> +
>> +        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
>>         smp_wmb();
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> 
> same here
> 
> 
>> +            smp_wmb();
>> +        }
>> +
>> +        unmap_domain_page(p);
>>     }
>> +
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> }
>> 
>> static void schedule_tail(struct vcpu *prev)
>> @@ -560,6 +629,8 @@ int arch_vcpu_create(struct vcpu *v)
>>     v->arch.saved_context.sp = (register_t)v->arch.cpu_info;
>>     v->arch.saved_context.pc = (register_t)continue_new_vcpu;
>> 
>> +    spin_lock_init(&v->arch.runstate_guest.lock);
>> +
>>     /* Idle VCPUs don't need the rest of this setup */
>>     if ( is_idle_vcpu(v) )
>>         return rc;
>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>> index fee6c3931a..32bbed87d5 100644
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -1642,6 +1642,30 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>>         wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
>> }
>> 
>> +int arch_setup_runstate_guest(struct vcpu *v,
>> +                             struct vcpu_register_runstate_memory_area area)
>> +{
>> +    struct vcpu_runstate_info runstate;
>> +
>> +    runstate_guest(v) = area.addr.h;
>> +
>> +    if ( v == current )
>> +    {
>> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>> +    }
>> +    else
>> +    {
>> +        vcpu_runstate_get(v, &runstate);
>> +        __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    }
>> +    return 0;
>> +}
>> +
>> +void arch_cleanup_runstate_guest(struct vcpu *v)
>> +{
>> +    set_xen_guest_handle(runstate_guest(v), NULL);
>> +}
>> +
>> /* Update per-VCPU guest runstate shared memory area (if registered). */
>> bool update_runstate_area(struct vcpu *v)
>> {
>> @@ -1660,8 +1684,8 @@ bool update_runstate_area(struct vcpu *v)
>>     if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>     {
>>         guest_handle = has_32bit_shinfo(v->domain)
>> -            ? &v->runstate_guest.compat.p->state_entry_time + 1
>> -            : &v->runstate_guest.native.p->state_entry_time + 1;
>> +            ? &v->arch.runstate_guest.compat.p->state_entry_time + 1
>> +            : &v->arch.runstate_guest.native.p->state_entry_time + 1;
>>         guest_handle--;
>>         runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>         __raw_copy_to_guest(guest_handle,
>> @@ -1674,7 +1698,7 @@ bool update_runstate_area(struct vcpu *v)
>>         struct compat_vcpu_runstate_info info;
>> 
>>         XLAT_vcpu_runstate_info(&info, &runstate);
>> -        __copy_to_guest(v->runstate_guest.compat, &info, 1);
>> +        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
>>         rc = true;
>>     }
>>     else
>> diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c
>> index c46dccc25a..b879e8dd2c 100644
>> --- a/xen/arch/x86/x86_64/domain.c
>> +++ b/xen/arch/x86/x86_64/domain.c
>> @@ -36,7 +36,7 @@ arch_compat_vcpu_op(
>>             break;
>> 
>>         rc = 0;
>> -        guest_from_compat_handle(v->runstate_guest.compat, area.addr.h);
>> +        guest_from_compat_handle(v->arch.runstate_guest.compat, area.addr.h);
>> 
>>         if ( v == current )
>>         {
>> @@ -49,7 +49,7 @@ arch_compat_vcpu_op(
>>             vcpu_runstate_get(v, &runstate);
>>             XLAT_vcpu_runstate_info(&info, &runstate);
>>         }
>> -        __copy_to_guest(v->runstate_guest.compat, &info, 1);
>> +        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
>> 
>>         break;
>>     }
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index 7cc9526139..0ca6bf4dbc 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -727,7 +727,10 @@ int domain_kill(struct domain *d)
>>         if ( cpupool_move_domain(d, cpupool0) )
>>             return -ERESTART;
>>         for_each_vcpu ( d, v )
>> +        {
>> +            arch_cleanup_runstate_guest(v);
>>             unmap_vcpu_info(v);
>> +        }
>>         d->is_dying = DOMDYING_dead;
>>         /* Mem event cleanup has to go here because the rings 
>>          * have to be put before we call put_domain. */
>> @@ -1167,7 +1170,7 @@ int domain_soft_reset(struct domain *d)
>> 
>>     for_each_vcpu ( d, v )
>>     {
>> -        set_xen_guest_handle(runstate_guest(v), NULL);
>> +        arch_cleanup_runstate_guest(v);
>>         unmap_vcpu_info(v);
>>     }
>> 
>> @@ -1484,7 +1487,6 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
>>     case VCPUOP_register_runstate_memory_area:
>>     {
>>         struct vcpu_register_runstate_memory_area area;
>> -        struct vcpu_runstate_info runstate;
>> 
>>         rc = -EFAULT;
>>         if ( copy_from_guest(&area, arg, 1) )
>> @@ -1493,18 +1495,7 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
>>         if ( !guest_handle_okay(area.addr.h, 1) )
>>             break;
>> 
>> -        rc = 0;
>> -        runstate_guest(v) = area.addr.h;
>> -
>> -        if ( v == current )
>> -        {
>> -            __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>> -        }
>> -        else
>> -        {
>> -            vcpu_runstate_get(v, &runstate);
>> -            __copy_to_guest(runstate_guest(v), &runstate, 1);
>> -        }
>> +        rc = arch_setup_runstate_guest(v, area);
>> 
>>         break;
>>     }
>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>> index 4e2f582006..3a7f53e13d 100644
>> --- a/xen/include/asm-arm/domain.h
>> +++ b/xen/include/asm-arm/domain.h
>> @@ -11,6 +11,7 @@
>> #include <asm/vgic.h>
>> #include <asm/vpl011.h>
>> #include <public/hvm/params.h>
>> +#include <public/vcpu.h>
>> #include <xen/serial.h>
>> #include <xen/rbtree.h>
>> 
>> @@ -206,6 +207,13 @@ struct arch_vcpu
>>      */
>>     bool need_flush_to_ram;
>> 
>> +    /* runstate guest info */
>> +    struct {
>> +        struct page_info *page;  /* guest page */
>> +        unsigned         offset; /* offset in page */
>> +        spinlock_t       lock;   /* lock to access page */
>> +    } runstate_guest;
>> +
>> }  __cacheline_aligned;
>> 
>> void vcpu_show_execution_state(struct vcpu *);
>> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
>> index e8cee3d5e5..f917b48cb8 100644
>> --- a/xen/include/asm-x86/domain.h
>> +++ b/xen/include/asm-x86/domain.h
>> @@ -11,6 +11,11 @@
>> #include <asm/x86_emulate.h>
>> #include <public/vcpu.h>
>> #include <public/hvm/hvm_info_table.h>
>> +#ifdef CONFIG_COMPAT
>> +#include <compat/vcpu.h>
>> +DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
>> +#endif
>> +
>> 
>> #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
>> 
>> @@ -626,6 +631,17 @@ struct arch_vcpu
>>     struct {
>>         bool next_interrupt_enabled;
>>     } monitor;
>> +
>> +#ifndef CONFIG_COMPAT
>> +# define runstate_guest(v) ((v)->arch.runstate_guest)
>> +    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
>> +#else
>> +# define runstate_guest(v) ((v)->arch.runstate_guest.native)
>> +    union {
>> +        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>> +        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>> +    } runstate_guest;
>> +#endif
>> };
>> 
>> struct guest_memory_policy
>> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
>> index 7e51d361de..d5d73ce997 100644
>> --- a/xen/include/xen/domain.h
>> +++ b/xen/include/xen/domain.h
>> @@ -63,6 +63,10 @@ void arch_vcpu_destroy(struct vcpu *v);
>> int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset);
>> void unmap_vcpu_info(struct vcpu *v);
>> 
>> +int arch_setup_runstate_guest(struct vcpu *v,
>> +                            struct vcpu_register_runstate_memory_area area);
> 
> NIT: code style, the indentation is off

Ok

> 
> 
>> +void arch_cleanup_runstate_guest(struct vcpu *v);
>> +
>> int arch_domain_create(struct domain *d,
>>                        struct xen_domctl_createdomain *config);
>> 
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index ac53519d7f..fac030fb83 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -29,11 +29,6 @@
>> #include <public/vcpu.h>
>> #include <public/event_channel.h>
>> 
>> -#ifdef CONFIG_COMPAT
>> -#include <compat/vcpu.h>
>> -DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
>> -#endif
>> -
>> /*
>>  * Stats
>>  *
>> @@ -166,16 +161,7 @@ struct vcpu
>>     struct sched_unit *sched_unit;
>> 
>>     struct vcpu_runstate_info runstate;
>> -#ifndef CONFIG_COMPAT
>> -# define runstate_guest(v) ((v)->runstate_guest)
>> -    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
>> -#else
>> -# define runstate_guest(v) ((v)->runstate_guest.native)
>> -    union {
>> -        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>> -        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>> -    } runstate_guest; /* guest address */
>> -#endif
>> +
>>     unsigned int     new_state;
>> 
>>     /* Has the FPU been initialised? */
>> -- 
>> 2.17.1
Bertrand Marquis June 12, 2020, 8:13 a.m. UTC | #7
> On 12 Jun 2020, at 02:09, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Thu, 11 Jun 2020, Julien Grall wrote:
>> Hi Stefano,
>> 
>> On 11/06/2020 19:50, Stefano Stabellini wrote:
>>> On Thu, 11 Jun 2020, Julien Grall wrote:
>>>>>> +        return -EINVAL;
>>>>>>      }
>>>>>> 
>>>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>>>>>> +    v->arch.runstate_guest.page = page;
>>>>>> +    v->arch.runstate_guest.offset = offset;
>>>>>> +
>>>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +
>>>>>> +/* Update per-VCPU guest runstate shared memory area (if registered).
>>>>>> */
>>>>>> +static void update_runstate_area(struct vcpu *v)
>>>>>> +{
>>>>>> +    struct vcpu_runstate_info *guest_runstate;
>>>>>> +    void *p;
>>>>>> +
>>>>>> +    spin_lock(&v->arch.runstate_guest.lock);
>>>>>> 
>>>>>> -    if ( guest_handle )
>>>>>> +    if ( v->arch.runstate_guest.page )
>>>>>>      {
>>>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>>>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>>>>>> +
>>>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>>>>> +        {
>>>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>> 
>>>>> I think that this write to guest_runstate should use write_atomic or
>>>>> another atomic write operation.
>>>> 
>>>> I thought about suggesting the same, but  guest_copy_* helpers may not
>>>> do a single memory write to state_entry_time.
>>>> What are you trying to prevent with the write_atomic()?
>>> 
>>> I am thinking that without using an atomic write, it would be (at least
>>> theoretically) possible for a guest to see a partial write to
>>> state_entry_time, which is not good. 
>> 
>> It is already the case with existing implementation as Xen may write byte by
>> byte. So are you suggesting the existing code is also buggy?
> 
> Writing byte by byte is a different case. That is OK. In that case, the
> guest could see the state after 3 bytes written and it would be fine and
> consistent. If this hadn't been the case, then yes, the existing code
> would also be buggy.
> 
> So if we did the write with a memcpy, it would be fine, no need for
> atomics:
> 
>  memcpy(&guest_runstate->state_entry_time,
>         &v->runstate.state_entry_time,
>         XXX);
> 
> 
> The |= case is different: GCC could implement it in any way it likes,
> including going through a zero-write to any of the bytes in the word, or
> doing an addition then a subtraction. GCC doesn't make any guarantees.
> If we want guarantees we need to use atomics.

Wouldn't that require all accesses to state_entry_time to also use atomic operations?
In this case we could not propagate the changes to a guest without changing the interface itself.

As the duration of the copy needs to be protected, the write barriers are there to make sure that the bit is set during the copy and that the copy is done before we unset it.
For this purpose I added a barrier after the memcpy to make sure that, when/if we unset the bit, the copy has already been done.

Cheers

Bertrand
Julien Grall June 12, 2020, 9:53 a.m. UTC | #8
On 12/06/2020 02:09, Stefano Stabellini wrote:
> On Thu, 11 Jun 2020, Julien Grall wrote:
>> Hi Stefano,
>>
>> On 11/06/2020 19:50, Stefano Stabellini wrote:
>>> On Thu, 11 Jun 2020, Julien Grall wrote:
>>>>>> +        return -EINVAL;
>>>>>>        }
>>>>>>
>>>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>>>>>> +    v->arch.runstate_guest.page = page;
>>>>>> +    v->arch.runstate_guest.offset = offset;
>>>>>> +
>>>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +
>>>>>> +/* Update per-VCPU guest runstate shared memory area (if registered).
>>>>>> */
>>>>>> +static void update_runstate_area(struct vcpu *v)
>>>>>> +{
>>>>>> +    struct vcpu_runstate_info *guest_runstate;
>>>>>> +    void *p;
>>>>>> +
>>>>>> +    spin_lock(&v->arch.runstate_guest.lock);
>>>>>>
>>>>>> -    if ( guest_handle )
>>>>>> +    if ( v->arch.runstate_guest.page )
>>>>>>        {
>>>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>>>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>>>>>> +
>>>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>>>>> +        {
>>>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>>
>>>>> I think that this write to guest_runstate should use write_atomic or
>>>>> another atomic write operation.
>>>>
>>>> I thought about suggesting the same, but  guest_copy_* helpers may not
>>>> do a single memory write to state_entry_time.
>>>> What are you trying to prevent with the write_atomic()?
>>>
>>> I am thinking that without using an atomic write, it would be (at least
>>> theoretically) possible for a guest to see a partial write to
>>> state_entry_time, which is not good.
>>
>> It is already the case with existing implementation as Xen may write byte by
>> byte. So are you suggesting the existing code is also buggy?

It looks like I may have misread the code as we only copy one byte. But 
I still think this is fragile.

For this context, I agree that a write_atomic() should do the job.

However, I still want to answer to your comments below.

> 
> Writing byte by byte is a different case. That is OK. In that case, the
> guest could see the state after 3 bytes written and it would be fine and
> consistent.

Why? What actually prevents a guest from seeing an in-between value?

To give a concrete example, suppose the original value is 0xabc and you want
to write 0xdef. Why would the guest never see 0xabf or 0xaec?

> If this hadn't been the case, then yes, the existing code
> would also be buggy.
> 
> So if we did the write with a memcpy, it would be fine, no need for
> atomics:
> 
>    memcpy(&guest_runstate->state_entry_time,
>           &v->runstate.state_entry_time,
>           XXX);
> 
> 
> The |= case is different: GCC could implement it in any way it likes,
> including going through a zero-write to any of the bytes in the word, or
> doing an addition then a subtraction. GCC doesn't make any guarantees.
> If we want guarantees we need to use atomics.

Yes, GCC can generate assembly for |= in any way it likes. But the same is true
for memcpy(). So I still don't understand why one would be fine for you
and not the other...

Cheers,
Julien Grall June 12, 2020, 10:53 a.m. UTC | #9
Hi Bertrand,

On 11/06/2020 12:58, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0

I think you want to add a bit more context explaining the reason for the 
failure. I.e. this is because the virtual address used by the runstate 
is only accessible when running in kernel space.

> 
> This patch is modifying the register runstate area handling on arm to
> convert the guest address during the hypercall. During runtime context
> switches the area is mapped to update the guest runstate copy.
> The patch is also removing the initial copy which was done during the
> hypercall handling as this is done on the current core when the context
> is restored to go back to guest execution on arm.

This is explaining what the commit does but not why we want to translate 
the virtual address at hypercall time. More importantly, this doesn't 
explain the restrictions added on the hypercall and why they are fine.

Note that they also should be documented in the public header.
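
For instance something along these lines next to
VCPUOP_register_runstate_memory_area in public/vcpu.h (only a sketch, wording
to be refined):

    /*
     * Arm: the area must not cross a page boundary, must be 64-bit aligned
     * and the virtual address must be mapped (writable) in the registering
     * vCPU's page-tables at the time of the hypercall, as the address is
     * translated at registration time.
     */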

> 
> As a consequence, if the register runstate hypercall is called on one
> vcpu for another vcpu, the area will not be updated until that vcpu
> is executed (on arm only).

The code below suggests the hypercall will just fail if you call it from 
a different vCPU. Is that correct?

> 
> On x86, the behaviour is not modified, the address conversion is done
> during the context switch and the area is updated fully during the
> hypercall.
> Inline functions in headers could not be used as the architecture
> domain.h is included before the global domain.h making it impossible
> to use the struct vcpu inside the architecture header.
> This should not have any performance impact as the hypercall is only
> called once per vcpu usually.
> 
> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
> ---
>   xen/arch/arm/domain.c        | 109 +++++++++++++++++++++++++++++------
>   xen/arch/x86/domain.c        |  30 +++++++++-
>   xen/arch/x86/x86_64/domain.c |   4 +-
>   xen/common/domain.c          |  19 ++----
>   xen/include/asm-arm/domain.h |   8 +++
>   xen/include/asm-x86/domain.h |  16 +++++
>   xen/include/xen/domain.h     |   4 ++
>   xen/include/xen/sched.h      |  16 +----
>   8 files changed, 153 insertions(+), 53 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 31169326b2..739059234f 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -19,6 +19,7 @@
>   #include <xen/sched.h>
>   #include <xen/softirq.h>
>   #include <xen/wait.h>
> +#include <xen/domain_page.h>
>   
>   #include <asm/alternative.h>
>   #include <asm/cpuerrata.h>
> @@ -275,36 +276,104 @@ static void ctxt_switch_to(struct vcpu *n)
>       virt_timer_restore(n);
>   }
>   
> -/* Update per-VCPU guest runstate shared memory area (if registered). */
> -static void update_runstate_area(struct vcpu *v)
> +void arch_cleanup_runstate_guest(struct vcpu *v)

I would prefer if this was named arch_vcpu_cleanup_runstate() as this is 
per-vCPU and not per-domain information.

>   {
> -    void __user *guest_handle = NULL;
> -    struct vcpu_runstate_info runstate;
> +    spin_lock(&v->arch.runstate_guest.lock);
>   
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> -        return;
> +    /* cleanup previous page if any */
> +    if ( v->arch.runstate_guest.page )
> +    {
> +        put_page_and_type(v->arch.runstate_guest.page);

get_page_from_gva() is only grabbing a reference on the page. So you 
want to use put_page() here.

Note that we don't have type references on Arm, so it is equivalent to 
put_page(). But this wouldn't be technically correct :).

> +        v->arch.runstate_guest.page = NULL;
> +        v->arch.runstate_guest.offset = 0;
> +    }
>   
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +    spin_unlock(&v->arch.runstate_guest.lock);
> +}
>   
> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +int arch_setup_runstate_guest(struct vcpu *v,
> +                            struct vcpu_register_runstate_memory_area area)

The indentation looks off here.

Also, same remark for the naming.


> +{
> +    struct page_info *page;
> +    unsigned offset;
> +
> +    spin_lock(&v->arch.runstate_guest.lock);
> +
> +    /* cleanup previous page if any */
> +    if ( v->arch.runstate_guest.page )
>       {
> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> -        guest_handle--;
> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> -        smp_wmb();
> +        put_page_and_type(v->arch.runstate_guest.page);

Same remark here. Although I would prefer if we try to have a common 
helper for cleaning up the runstate. Maybe cleanup_runstate_vcpu_locked()?
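
Something like (untested sketch):

    /* Caller must hold v->arch.runstate_guest.lock. */
    static void cleanup_runstate_vcpu_locked(struct vcpu *v)
    {
        if ( v->arch.runstate_guest.page )
        {
            put_page(v->arch.runstate_guest.page);
            v->arch.runstate_guest.page = NULL;
            v->arch.runstate_guest.offset = 0;
        }
    }

so that the registration path and arch_cleanup_runstate_guest() can share it.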

> +        v->arch.runstate_guest.page = NULL;
> +        v->arch.runstate_guest.offset = 0;
> +    }
> +
> +    offset = ((vaddr_t)area.addr.v) & ~PAGE_MASK;
> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
> +    {
> +        spin_unlock(&v->arch.runstate_guest.lock);
> +        gprintk(XENLOG_DEBUG, "Runstate is crossing pages\n");
> +        return -EINVAL;
> +    }
> +
> +    /* provided address must be aligned to a 64bit */
> +    if ( offset % alignof(struct vcpu_runstate_info) )
> +    {
> +        spin_unlock(&v->arch.runstate_guest.lock);
> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not aligned\n");
> +        return -EINVAL;
> +    }
> +
> +    page = get_page_from_gva(v, (vaddr_t)area.addr.v, GV2M_WRITE);

In the current implementation, 0 was used to unregister the runstate 
area. I think we want to keep that feature and not throw an error.
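
I.e. after the cleanup of the previous page at the top of the function,
something like (only a sketch):

    /* A NULL address simply unregisters the area. */
    if ( area.addr.v == NULL )
    {
        spin_unlock(&v->arch.runstate_guest.lock);
        return 0;
    }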

> +    if ( !page )
> +    {
> +        spin_unlock(&v->arch.runstate_guest.lock);
> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");
> +        return -EINVAL;
>       }
>   
> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    v->arch.runstate_guest.page = page;
> +    v->arch.runstate_guest.offset = offset;
> +

In the current implementation, the runstate area was updated with the 
latest information during the hypercall. This doesn't seem to happen 
anymore. Is there any specific reason?

> +    spin_unlock(&v->arch.runstate_guest.lock);
> +
> +    return 0;
> +}
> +
> +
> +/* Update per-VCPU guest runstate shared memory area (if registered). */
> +static void update_runstate_area(struct vcpu *v)
> +{
> +    struct vcpu_runstate_info *guest_runstate;
> +    void *p;
> +
> +    spin_lock(&v->arch.runstate_guest.lock);
>   
> -    if ( guest_handle )
> +    if ( v->arch.runstate_guest.page )
>       {
> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        p = __map_domain_page(v->arch.runstate_guest.page);
> +        guest_runstate = p + v->arch.runstate_guest.offset;
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            smp_wmb();
> +        }
> +
> +        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
>           smp_wmb();
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            smp_wmb();

Why do you need this barrier?


[...]

> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 4e2f582006..3a7f53e13d 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -11,6 +11,7 @@
>   #include <asm/vgic.h>
>   #include <asm/vpl011.h>
>   #include <public/hvm/params.h>
> +#include <public/vcpu.h>

Why do you need to add this new include?

>   #include <xen/serial.h>
>   #include <xen/rbtree.h>
>   
> @@ -206,6 +207,13 @@ struct arch_vcpu
>        */
>       bool need_flush_to_ram;
>   
> +    /* runstate guest info */
> +    struct {
> +        struct page_info *page;  /* guest page */
> +        unsigned         offset; /* offset in page */

Please use unsigned int.

Cheers,
Bertrand Marquis June 12, 2020, 2:13 p.m. UTC | #10
Hi Julien,

> On 12 Jun 2020, at 11:53, Julien Grall <julien@xen.org> wrote:
> 
> Hi Bertrand,
> 
> On 11/06/2020 12:58, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KTPI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> I think you want to add a bit more context explaining the reason on the failure. I.e. this is because the virtual address used by the runstate is only accessible when running in kernel space.

Ok

> 
>> This patch is modifying the register runstate area handling on arm to
>> convert the guest address during the hypercall. During runtime context
>> switches the area is mapped to update the guest runstate copy.
>> The patch is also removing the initial copy which was done during the
>> hypercall handling as this is done on the current core when the context
>> is restore to go back to guest execution on arm.
> 
> This is explaining what the commit does but not why we want to translate the virtual address at hypercall time. More importantly, this doesn't explain the restrictions added on the hypercall and why they are fine.
> 
> Note that they also should be documented in the public header.

Ok

> 
>> As a consequence if the register runstate hypercall is called on one
>> vcpu for an other vcpu, the area will not be updated until the vcpu
>> will be executed (on arm only).
> 
> The code below suggests the hypercall will just fail if you call it from a different vCPU. Is that correct?

No, the hypercall will work, but in this case the area won't be updated.
When the hypercall is called on the current vcpu, the area will be updated when the context is restored before returning to the guest, so there is no need to do it an extra time in the hypercall itself.
When this is done for a different vcpu, the current code no longer updates the area.

I did think at first of doing it, but the complexity it added (mainly due to the dual page case) was very high and I could not really think of a use case or find one in Linux.
But now that I think of it, I could, in that specific case, use a copy_to_guest.

Do you think this is something that would make sense to restore?

Anyway, I will fix the commit message to be clearer on that part.

> 
>> On x86, the behaviour is not modified, the address conversion is done
>> during the context switch and the area is updated fully during the
>> hypercall.
>> inline functions in headers could not be used as the architecture
>> domain.h is included before the global domain.h making it impossible
>> to use the struct vcpu inside the architecture header.
>> This should not have any performance impact as the hypercall is only
>> called once per vcpu usually.
>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>> ---
>>  xen/arch/arm/domain.c        | 109 +++++++++++++++++++++++++++++------
>>  xen/arch/x86/domain.c        |  30 +++++++++-
>>  xen/arch/x86/x86_64/domain.c |   4 +-
>>  xen/common/domain.c          |  19 ++----
>>  xen/include/asm-arm/domain.h |   8 +++
>>  xen/include/asm-x86/domain.h |  16 +++++
>>  xen/include/xen/domain.h     |   4 ++
>>  xen/include/xen/sched.h      |  16 +----
>>  8 files changed, 153 insertions(+), 53 deletions(-)
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 31169326b2..739059234f 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -19,6 +19,7 @@
>>  #include <xen/sched.h>
>>  #include <xen/softirq.h>
>>  #include <xen/wait.h>
>> +#include <xen/domain_page.h>
>>    #include <asm/alternative.h>
>>  #include <asm/cpuerrata.h>
>> @@ -275,36 +276,104 @@ static void ctxt_switch_to(struct vcpu *n)
>>      virt_timer_restore(n);
>>  }
>>  -/* Update per-VCPU guest runstate shared memory area (if registered). */
>> -static void update_runstate_area(struct vcpu *v)
>> +void arch_cleanup_runstate_guest(struct vcpu *v)
> 
> I would prefer if this is name arch_vcpu_cleanup_runstate() as this is per-vCPU and not per-domain information.

Ok

> 
>>  {
>> -    void __user *guest_handle = NULL;
>> -    struct vcpu_runstate_info runstate;
>> +    spin_lock(&v->arch.runstate_guest.lock);
>>  -    if ( guest_handle_is_null(runstate_guest(v)) )
>> -        return;
>> +    /* cleanup previous page if any */
>> +    if ( v->arch.runstate_guest.page )
>> +    {
>> +        put_page_and_type(v->arch.runstate_guest.page);
> 
> get_page_from_gva() is only grabbing a reference on the page. So you want to use put_page() here.
> 
> Note that we don't have type reference on Arm, so it equivalent to put_page(). But this wouldn't be technically correct :).

Ok, thanks for the explanation.

> 
>> +        v->arch.runstate_guest.page = NULL;
>> +        v->arch.runstate_guest.offset = 0;
>> +    }
>>  -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> +}
>>  -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +int arch_setup_runstate_guest(struct vcpu *v,
>> +                            struct vcpu_register_runstate_memory_area area)
> 
> The indentation looks off here.
> 
> Also, same remark for the naming.

Ok

> 
> 
>> +{
>> +    struct page_info *page;
>> +    unsigned offset;
>> +
>> +    spin_lock(&v->arch.runstate_guest.lock);
>> +
>> +    /* cleanup previous page if any */
>> +    if ( v->arch.runstate_guest.page )
>>      {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> -        smp_wmb();
>> +        put_page_and_type(v->arch.runstate_guest.page);
> 
> Same remark here. Although I would prefer if we try to have a common helper to cleaning up the runstate. Maybe cleanup_runstate_vcpu_locked()?

Agreed, I will add a static function and use it in those 2 locations.

> 
>> +        v->arch.runstate_guest.page = NULL;
>> +        v->arch.runstate_guest.offset = 0;
>> +    }
>> +
>> +    offset = ((vaddr_t)area.addr.v) & ~PAGE_MASK;
>> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate is crossing pages\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    /* provided address must be aligned to a 64bit */
>> +    if ( offset % alignof(struct vcpu_runstate_info) )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not aligned\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    page = get_page_from_gva(v, (vaddr_t)area.addr.v, GV2M_WRITE);
> 
> In the current implementation, 0 was used to unregister the runstate area. I think we want to keep that feature and not throw an error.

I will add this use case (I missed it).

> 
>> +    if ( !page )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");
>> +        return -EINVAL;
>>      }
>>  -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    v->arch.runstate_guest.page = page;
>> +    v->arch.runstate_guest.offset = offset;
>> +
> 
> In the current implementation, the runstate area was update with the latest information during the hypercall. This doesn't seem to happen anymore. Is there any specific reason?

As explained before, this is still happening on the current vcpu as a result of the context switch back to the guest at the end of the hypercall.
This is no longer done when called for a different vcpu, but I could add it back using copy_to_guest.

> 
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> +
>> +    return 0;
>> +}
>> +
>> +
>> +/* Update per-VCPU guest runstate shared memory area (if registered). */
>> +static void update_runstate_area(struct vcpu *v)
>> +{
>> +    struct vcpu_runstate_info *guest_runstate;
>> +    void *p;
>> +
>> +    spin_lock(&v->arch.runstate_guest.lock);
>>  -    if ( guest_handle )
>> +    if ( v->arch.runstate_guest.page )
>>      {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
>> +        }
>> +
>> +        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
>>          smp_wmb();
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
> 
> Why do you need this barrier?

As the bit is supposed to be used to wait for an update to be done, I felt it was better to push this out as soon as possible.
As there is already a barrier after the memcpy, the cost should be fairly limited here.

> 
> 
> [...]
> 
>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>> index 4e2f582006..3a7f53e13d 100644
>> --- a/xen/include/asm-arm/domain.h
>> +++ b/xen/include/asm-arm/domain.h
>> @@ -11,6 +11,7 @@
>>  #include <asm/vgic.h>
>>  #include <asm/vpl011.h>
>>  #include <public/hvm/params.h>
>> +#include <public/vcpu.h>
> 
> Why do you need to add this new include?
> 
>>  #include <xen/serial.h>
>>  #include <xen/rbtree.h>
>>  @@ -206,6 +207,13 @@ struct arch_vcpu
>>       */
>>      bool need_flush_to_ram;
>>  +    /* runstate guest info */
>> +    struct {
>> +        struct page_info *page;  /* guest page */
>> +        unsigned         offset; /* offset in page */
> 
> Please use unsigned int.
Ok

Thanks for your comments.
Bertrand
Bertrand Marquis June 12, 2020, 4:51 p.m. UTC | #11
Hi,

> On 12 Jun 2020, at 11:53, Julien Grall <julien@xen.org> wrote:
> 
> Hi Bertrand,
> 
> On 11/06/2020 12:58, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KTPI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> I think you want to add a bit more context explaining the reason on the failure. I.e. this is because the virtual address used by the runstate is only accessible when running in kernel space.
> 
>> This patch is modifying the register runstate area handling on arm to
>> convert the guest address during the hypercall. During runtime context
>> switches the area is mapped to update the guest runstate copy.
>> The patch is also removing the initial copy which was done during the
>> hypercall handling as this is done on the current core when the context
>> is restore to go back to guest execution on arm.
> 
> This is explaining what the commit does but not why we want to translate the virtual address at hypercall time. More importantly, this doesn't explain the restrictions added on the hypercall and why they are fine.
> 
> Note that they also should be documented in the public header.
> 
>> As a consequence if the register runstate hypercall is called on one
>> vcpu for an other vcpu, the area will not be updated until the vcpu
>> will be executed (on arm only).
> 
> The code below suggests the hypercall will just fail if you call it from a different vCPU. Is that correct?
> 
>> On x86, the behaviour is not modified, the address conversion is done
>> during the context switch and the area is updated fully during the
>> hypercall.
>> inline functions in headers could not be used as the architecture
>> domain.h is included before the global domain.h making it impossible
>> to use the struct vcpu inside the architecture header.
>> This should not have any performance impact as the hypercall is only
>> called once per vcpu usually.
>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>> ---
>>  xen/arch/arm/domain.c        | 109 +++++++++++++++++++++++++++++------
>>  xen/arch/x86/domain.c        |  30 +++++++++-
>>  xen/arch/x86/x86_64/domain.c |   4 +-
>>  xen/common/domain.c          |  19 ++----
>>  xen/include/asm-arm/domain.h |   8 +++
>>  xen/include/asm-x86/domain.h |  16 +++++
>>  xen/include/xen/domain.h     |   4 ++
>>  xen/include/xen/sched.h      |  16 +----
>>  8 files changed, 153 insertions(+), 53 deletions(-)
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 31169326b2..739059234f 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -19,6 +19,7 @@
>>  #include <xen/sched.h>
>>  #include <xen/softirq.h>
>>  #include <xen/wait.h>
>> +#include <xen/domain_page.h>
>>    #include <asm/alternative.h>
>>  #include <asm/cpuerrata.h>
>> @@ -275,36 +276,104 @@ static void ctxt_switch_to(struct vcpu *n)
>>      virt_timer_restore(n);
>>  }
>>  -/* Update per-VCPU guest runstate shared memory area (if registered). */
>> -static void update_runstate_area(struct vcpu *v)
>> +void arch_cleanup_runstate_guest(struct vcpu *v)
> 
> I would prefer if this is name arch_vcpu_cleanup_runstate() as this is per-vCPU and not per-domain information.
> 
>>  {
>> -    void __user *guest_handle = NULL;
>> -    struct vcpu_runstate_info runstate;
>> +    spin_lock(&v->arch.runstate_guest.lock);
>>  -    if ( guest_handle_is_null(runstate_guest(v)) )
>> -        return;
>> +    /* cleanup previous page if any */
>> +    if ( v->arch.runstate_guest.page )
>> +    {
>> +        put_page_and_type(v->arch.runstate_guest.page);
> 
> get_page_from_gva() is only grabbing a reference on the page. So you want to use put_page() here.
> 
> Note that we don't have type reference on Arm, so it equivalent to put_page(). But this wouldn't be technically correct :).
> 
>> +        v->arch.runstate_guest.page = NULL;
>> +        v->arch.runstate_guest.offset = 0;
>> +    }
>>  -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> +}
>>  -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +int arch_setup_runstate_guest(struct vcpu *v,
>> +                            struct vcpu_register_runstate_memory_area area)
> 
> The indentation looks off here.
> 
> Also, same remark for the naming.
> 
> 
>> +{
>> +    struct page_info *page;
>> +    unsigned offset;
>> +
>> +    spin_lock(&v->arch.runstate_guest.lock);
>> +
>> +    /* cleanup previous page if any */
>> +    if ( v->arch.runstate_guest.page )
>>      {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> -        smp_wmb();
>> +        put_page_and_type(v->arch.runstate_guest.page);
> 
> Same remark here. Although I would prefer if we try to have a common helper to cleaning up the runstate. Maybe cleanup_runstate_vcpu_locked()?
> 
>> +        v->arch.runstate_guest.page = NULL;
>> +        v->arch.runstate_guest.offset = 0;
>> +    }
>> +
>> +    offset = ((vaddr_t)area.addr.v) & ~PAGE_MASK;
>> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate is crossing pages\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    /* provided address must be aligned to a 64bit */
>> +    if ( offset % alignof(struct vcpu_runstate_info) )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not aligned\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    page = get_page_from_gva(v, (vaddr_t)area.addr.v, GV2M_WRITE);
> 
> In the current implementation, 0 was used to unregister the runstate area. I think we want to keep that feature and not throw an error.
> 
>> +    if ( !page )
>> +    {
>> +        spin_unlock(&v->arch.runstate_guest.lock);
>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");
>> +        return -EINVAL;
>>      }
>>  -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    v->arch.runstate_guest.page = page;
>> +    v->arch.runstate_guest.offset = offset;
>> +
> 
> In the current implementation, the runstate area was update with the latest information during the hypercall. This doesn't seem to happen anymore. Is there any specific reason?
> 
>> +    spin_unlock(&v->arch.runstate_guest.lock);
>> +
>> +    return 0;
>> +}
>> +
>> +
>> +/* Update per-VCPU guest runstate shared memory area (if registered). */
>> +static void update_runstate_area(struct vcpu *v)
>> +{
>> +    struct vcpu_runstate_info *guest_runstate;
>> +    void *p;
>> +
>> +    spin_lock(&v->arch.runstate_guest.lock);
>>  -    if ( guest_handle )
>> +    if ( v->arch.runstate_guest.page )
>>      {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
>> +        }
>> +
>> +        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
>>          smp_wmb();
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
> 
> Why do you need this barrier?
> 
> 
> [...]
> 
>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>> index 4e2f582006..3a7f53e13d 100644
>> --- a/xen/include/asm-arm/domain.h
>> +++ b/xen/include/asm-arm/domain.h
>> @@ -11,6 +11,7 @@
>>  #include <asm/vgic.h>
>>  #include <asm/vpl011.h>
>>  #include <public/hvm/params.h>
>> +#include <public/vcpu.h>
> 
> Why do you need to add this new include?

Sorry, I forgot to answer this one.
This is needed to have the definition of vcpu_register_runstate_memory_area.

Bertrand
Julien Grall June 12, 2020, 7:56 p.m. UTC | #12
On 12/06/2020 15:13, Bertrand Marquis wrote:
> Hi Julien,

Hi,

>> On 12 Jun 2020, at 11:53, Julien Grall <julien@xen.org> wrote:
>> The code below suggests the hypercall will just fail if you call it from a different vCPU. Is that correct?
> 
> No the hypercall will work, but the area in this case won’t be updated.
> When the hypercall is called on the current vcpu, the area will be updated when the context will be restored before returning to the guest so there is no need to do it an extra time in the hypercall itself.

I am afraid this is not correct: update_runstate_area() is only called 
when context switching between 2 vCPUs. So the only way this could be 
called on return from the hypercall is if the vCPU gets preempted.

> When this is done for a different vcpu the current code is not updating the area anymore.
> 
> I did think at first to do it but the complexity it was adding (mainly due to the dual page case) was very high and I could not really think of a use case or find one in Linux.

You may want to have an updated view without forcing the vCPU to exit to 
the hypervisor and do a context switch.

> But now that I think of it I could, in that specific case, use a copy_to_guest.

I think you should be able to call update_runstate_area() directly.
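
For illustration, a rough, untested sketch of how the end of 
arch_setup_runstate_guest() in this patch could do that (it assumes 
update_runstate_area() gets a forward declaration or is moved above the 
registration path):

    v->arch.runstate_guest.page = page;
    v->arch.runstate_guest.offset = offset;

    spin_unlock(&v->arch.runstate_guest.lock);

    /*
     * Refresh the area straight away so the guest sees up-to-date values
     * even when v != current. update_runstate_area() takes the runstate
     * lock itself, hence the call after the unlock above.
     */
    update_runstate_area(v);

    return 0;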

> 
> Do you think this is something which would make sense to restore ?

I would like to avoid diverging too much from the current definition of 
the hypercall. In this case, I don't think the restriction you add is 
necessary.

>>
>>> +    if ( !page )
>>> +    {
>>> +        spin_unlock(&v->arch.runstate_guest.lock);
>>> +        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");
>>> +        return -EINVAL;
>>>       }
>>>   -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>>> +    v->arch.runstate_guest.page = page;
>>> +    v->arch.runstate_guest.offset = offset;
>>> +
>>
>> In the current implementation, the runstate area was update with the latest information during the hypercall. This doesn't seem to happen anymore. Is there any specific reason?
> 
> As explained before, this is still happening on the current vcpu as a result of the context switch back to the guest at the end of the hypercall.

See above, I don't believe this is correct. :).

>>> +    spin_unlock(&v->arch.runstate_guest.lock);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +
>>> +/* Update per-VCPU guest runstate shared memory area (if registered). */
>>> +static void update_runstate_area(struct vcpu *v)
>>> +{
>>> +    struct vcpu_runstate_info *guest_runstate;
>>> +    void *p;
>>> +
>>> +    spin_lock(&v->arch.runstate_guest.lock);
>>>   -    if ( guest_handle )
>>> +    if ( v->arch.runstate_guest.page )
>>>       {
>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>>> +
>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>> +        {
>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>> +            smp_wmb();
>>> +        }
>>> +
>>> +        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
>>>           smp_wmb();
>>> -        __raw_copy_to_guest(guest_handle,
>>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>>> +
>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>> +        {
>>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>> +            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>> +            smp_wmb();
>>
>> Why do you need this barrier?
> 
> As the bit is supposed to be used to wait for an update to be done, I felt it was better to push this out as soon as possible.

smp_wmb() only ensures that any write before the barrier will be seen 
before any write after the barrier. It doesn't guarantee the other end 
will see it right away...

> As there is already one after the memcpy, the cost should be fairly limited here.

... so this feels more like a micro-optimization (happy to be proved 
wrong). The memory barriers are already tricky enough to use/understand, 
so I would rather not use more than necessary.

Cheers,
Julien Grall June 12, 2020, 8:31 p.m. UTC | #13
On 12/06/2020 17:51, Bertrand Marquis wrote:

Hi,

>>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>>> index 4e2f582006..3a7f53e13d 100644
>>> --- a/xen/include/asm-arm/domain.h
>>> +++ b/xen/include/asm-arm/domain.h
>>> @@ -11,6 +11,7 @@
>>>   #include <asm/vgic.h>
>>>   #include <asm/vpl011.h>
>>>   #include <public/hvm/params.h>
>>> +#include <public/vcpu.h>
>>
>> Why do you need to add this new include?
> 
> Sorry I forgot to answer to this one.
> This is needed to have the definition of vcpu_register_runstate_memory_area.

I might be missing something because nothing you have added in this header 
seems to require vcpu_register_runstate_memory_area. So should the 
include be done in a different header?

Cheers,
Stefano Stabellini June 13, 2020, 12:24 a.m. UTC | #14
On Fri, 12 Jun 2020, Julien Grall wrote:
> > Writing byte by byte is a different case. That is OK. In that case, the
> > guest could see the state after 3 bytes written and it would be fine and
> > consistent.
> 
> Why? What does actually prevent a guest to see an in-between value?
> 
> To give a concrete example, if the original value is 0xabc and you want to
> write 0xdef. Why would the guest never see 0xabf or 0xaec?

That is a good question, but the minimum granularity is 1 byte, so if

  current: 0xaabbcc
  new: 0xddeeff

Can the guest see one of the following?

  0xaabbff
  0xaaeecc

Yes, it can. I don't think it is a problem in this case because we only
change 1 byte, so to continue with the example:

  current: 0xaabbcc
  new: 0xffbbcc

So the only values the VM can see are either 0xaabbcc or 0xffbbcc.


> > If this hadn't been the case, then yes, the existing code
> > would also be buggy.
> > 
> > So if we did the write with a memcpy, it would be fine, no need for
> > atomics:
> > 
> >    memcpy(&guest_runstate->state_entry_time,
> >           &v->runstate.state_entry_time,
> >           XXX);
> > 
> > 
> > The |= case is different: GCC could implement it in any way it likes,
> > including going through a zero-write to any of the bytes in the word, or
> > doing an addition then a subtraction. GCC doesn't make any guarantees.
> > If we want guarantees we need to use atomics.
> 
> Yes GCC can generate assembly for |= in any way it likes. But so does for
> memcpy(). So I still don't understand why one would be fine for you and not
> the other...

It is down to the implementation of memcpy to guarantee that the only
thing memcpy does is memory copies. memcpy doesn't specify whether it is
going to use byte-copies or word-copies, but it should only do copies.
If we have to write memcpy in assembly to make it so, so be it :-)
Stefano Stabellini June 13, 2020, 12:24 a.m. UTC | #15
On Fri, 12 Jun 2020, Bertrand Marquis wrote:
> > On 12 Jun 2020, at 02:09, Stefano Stabellini <sstabellini@kernel.org> wrote:
> > 
> > On Thu, 11 Jun 2020, Julien Grall wrote:
> >> Hi Stefano,
> >> 
> >> On 11/06/2020 19:50, Stefano Stabellini wrote:
> >>> On Thu, 11 Jun 2020, Julien Grall wrote:
> >>>>>> +        return -EINVAL;
> >>>>>>      }
> >>>>>> 
> >>>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> >>>>>> +    v->arch.runstate_guest.page = page;
> >>>>>> +    v->arch.runstate_guest.offset = offset;
> >>>>>> +
> >>>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
> >>>>>> +
> >>>>>> +    return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +
> >>>>>> +/* Update per-VCPU guest runstate shared memory area (if registered).
> >>>>>> */
> >>>>>> +static void update_runstate_area(struct vcpu *v)
> >>>>>> +{
> >>>>>> +    struct vcpu_runstate_info *guest_runstate;
> >>>>>> +    void *p;
> >>>>>> +
> >>>>>> +    spin_lock(&v->arch.runstate_guest.lock);
> >>>>>> 
> >>>>>> -    if ( guest_handle )
> >>>>>> +    if ( v->arch.runstate_guest.page )
> >>>>>>      {
> >>>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> >>>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
> >>>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
> >>>>>> +
> >>>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> >>>>>> +        {
> >>>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> >>>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> >>>>> 
> >>>>> I think that this write to guest_runstate should use write_atomic or
> >>>>> another atomic write operation.
> >>>> 
> >>>> I thought about suggesting the same, but  guest_copy_* helpers may not
> >>>> do a single memory write to state_entry_time.
> >>>> What are you trying to prevent with the write_atomic()?
> >>> 
> >>> I am thinking that without using an atomic write, it would be (at least
> >>> theoretically) possible for a guest to see a partial write to
> >>> state_entry_time, which is not good. 
> >> 
> >> It is already the case with existing implementation as Xen may write byte by
> >> byte. So are you suggesting the existing code is also buggy?
> > 
> > Writing byte by byte is a different case. That is OK. In that case, the
> > guest could see the state after 3 bytes written and it would be fine and
> > consistent. If this hadn't been the case, then yes, the existing code
> > would also be buggy.
> > 
> > So if we did the write with a memcpy, it would be fine, no need for
> > atomics:
> > 
> >  memcpy(&guest_runstate->state_entry_time,
> >         &v->runstate.state_entry_time,
> >         XXX);
> > 
> > 
> > The |= case is different: GCC could implement it in any way it likes,
> > including going through a zero-write to any of the bytes in the word, or
> > doing an addition then a subtraction. GCC doesn't make any guarantees.
> > If we want guarantees we need to use atomics.
> 
> Wouldn’t that require all accesses to state_entry_time to use also atomic operations ?
> In this case we could not propagate the changes to a guest without changing the interface itself.
> 
> As the copy time needs to be protected, the write barriers are there to make sure that during the copy the bit is set and that when we unset it, the copy is done.
> I added for this purpose a barrier after the memcpy to make sure that when/if we unset the bit the copy has already been done.

As you say, we have a flag to mark a transitioning period, the flag is
XEN_RUNSTATE_UPDATE. So, I think it is OK if we don't use atomics during
the transitioning period. But we need to make sure to use atomics for the
update of the XEN_RUNSTATE_UPDATE flag itself.

Does it make sense? Or maybe I misunderstood some of the things you
wrote?
Bertrand Marquis June 15, 2020, 2:01 p.m. UTC | #16
> On 12 Jun 2020, at 21:31, Julien Grall <julien@xen.org> wrote:
> 
> 
> 
> On 12/06/2020 17:51, Bertrand Marquis wrote:
> 
> Hi,
> 
>>>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>>>> index 4e2f582006..3a7f53e13d 100644
>>>> --- a/xen/include/asm-arm/domain.h
>>>> +++ b/xen/include/asm-arm/domain.h
>>>> @@ -11,6 +11,7 @@
>>>>  #include <asm/vgic.h>
>>>>  #include <asm/vpl011.h>
>>>>  #include <public/hvm/params.h>
>>>> +#include <public/vcpu.h>
>>> 
>>> Why do you need to add this new include?
>> Sorry I forgot to answer to this one.
>> This is needed to have the definition of vcpu_register_runstate_memory_area.
> 
> I might be missing something because nothing you have added this header seem to require vcpu_register_runstate_memory_area. So should the include be done in a different header?

Right.
This was required before, when I had per-arch definitions of the interface to allow a static inline for x86.
This is not needed anymore and I will remove that include.

Cheers
Bertrand
Bertrand Marquis June 15, 2020, 2:09 p.m. UTC | #17
> On 13 Jun 2020, at 01:24, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Fri, 12 Jun 2020, Bertrand Marquis wrote:
>>> On 12 Jun 2020, at 02:09, Stefano Stabellini <sstabellini@kernel.org> wrote:
>>> 
>>> On Thu, 11 Jun 2020, Julien Grall wrote:
>>>> Hi Stefano,
>>>> 
>>>> On 11/06/2020 19:50, Stefano Stabellini wrote:
>>>>> On Thu, 11 Jun 2020, Julien Grall wrote:
>>>>>>>> +        return -EINVAL;
>>>>>>>>     }
>>>>>>>> 
>>>>>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>>>>>>>> +    v->arch.runstate_guest.page = page;
>>>>>>>> +    v->arch.runstate_guest.offset = offset;
>>>>>>>> +
>>>>>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +
>>>>>>>> +/* Update per-VCPU guest runstate shared memory area (if registered).
>>>>>>>> */
>>>>>>>> +static void update_runstate_area(struct vcpu *v)
>>>>>>>> +{
>>>>>>>> +    struct vcpu_runstate_info *guest_runstate;
>>>>>>>> +    void *p;
>>>>>>>> +
>>>>>>>> +    spin_lock(&v->arch.runstate_guest.lock);
>>>>>>>> 
>>>>>>>> -    if ( guest_handle )
>>>>>>>> +    if ( v->arch.runstate_guest.page )
>>>>>>>>     {
>>>>>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>>>>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
>>>>>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
>>>>>>>> +
>>>>>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>>>>>>> +        {
>>>>>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>>>> 
>>>>>>> I think that this write to guest_runstate should use write_atomic or
>>>>>>> another atomic write operation.
>>>>>> 
>>>>>> I thought about suggesting the same, but  guest_copy_* helpers may not
>>>>>> do a single memory write to state_entry_time.
>>>>>> What are you trying to prevent with the write_atomic()?
>>>>> 
>>>>> I am thinking that without using an atomic write, it would be (at least
>>>>> theoretically) possible for a guest to see a partial write to
>>>>> state_entry_time, which is not good. 
>>>> 
>>>> It is already the case with existing implementation as Xen may write byte by
>>>> byte. So are you suggesting the existing code is also buggy?
>>> 
>>> Writing byte by byte is a different case. That is OK. In that case, the
>>> guest could see the state after 3 bytes written and it would be fine and
>>> consistent. If this hadn't been the case, then yes, the existing code
>>> would also be buggy.
>>> 
>>> So if we did the write with a memcpy, it would be fine, no need for
>>> atomics:
>>> 
>>> memcpy(&guest_runstate->state_entry_time,
>>>        &v->runstate.state_entry_time,
>>>        XXX);
>>> 
>>> 
>>> The |= case is different: GCC could implement it in any way it likes,
>>> including going through a zero-write to any of the bytes in the word, or
>>> doing an addition then a subtraction. GCC doesn't make any guarantees.
>>> If we want guarantees we need to use atomics.
>> 
>> Wouldn’t that require all accesses to state_entry_time to use also atomic operations ?
>> In this case we could not propagate the changes to a guest without changing the interface itself.
>> 
>> As the copy time needs to be protected, the write barriers are there to make sure that during the copy the bit is set and that when we unset it, the copy is done.
>> I added for this purpose a barrier after the memcpy to make sure that when/if we unset the bit the copy has already been done.
> 
> As you say, we have a flag to mark a transitiong period, the flag is
> XEN_RUNSTATE_UPDATE. So, I think it is OK if we don't use atomics during
> the transitioning period. But we need to make sure to use atomics for the
> update of the XEN_RUNSTATE_UPDATE flag itself.
> 
> Does it make sense? Or maybe I misunderstood some of the things you
> wrote?

To achieve this you would do an atomic operation on state_entry_time to set/unset the XEN_RUNSTATE_UPDATE bit.
This field holds a flag in the upper bit but also a value, so all operations on state_entry_time would need to be changed to use atomic operations.

This state_entry_time might also be accessed by the guest on other cores at the same time (to retrieve the time part).

To prevent something being used both atomically and non-atomically, specific types are usually used (atomic_t), and this structure is also used by guests so modifying it will not be easy.

Or did I misunderstand something here?

Regards
Bertrand
Stefano Stabellini June 15, 2020, 8:30 p.m. UTC | #18
On Mon, 15 Jun 2020, Bertrand Marquis wrote:
> > On 13 Jun 2020, at 01:24, Stefano Stabellini <sstabellini@kernel.org> wrote:
> > 
> > On Fri, 12 Jun 2020, Bertrand Marquis wrote:
> >>> On 12 Jun 2020, at 02:09, Stefano Stabellini <sstabellini@kernel.org> wrote:
> >>> 
> >>> On Thu, 11 Jun 2020, Julien Grall wrote:
> >>>> Hi Stefano,
> >>>> 
> >>>> On 11/06/2020 19:50, Stefano Stabellini wrote:
> >>>>> On Thu, 11 Jun 2020, Julien Grall wrote:
> >>>>>>>> +        return -EINVAL;
> >>>>>>>>     }
> >>>>>>>> 
> >>>>>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> >>>>>>>> +    v->arch.runstate_guest.page = page;
> >>>>>>>> +    v->arch.runstate_guest.offset = offset;
> >>>>>>>> +
> >>>>>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
> >>>>>>>> +
> >>>>>>>> +    return 0;
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +
> >>>>>>>> +/* Update per-VCPU guest runstate shared memory area (if registered).
> >>>>>>>> */
> >>>>>>>> +static void update_runstate_area(struct vcpu *v)
> >>>>>>>> +{
> >>>>>>>> +    struct vcpu_runstate_info *guest_runstate;
> >>>>>>>> +    void *p;
> >>>>>>>> +
> >>>>>>>> +    spin_lock(&v->arch.runstate_guest.lock);
> >>>>>>>> 
> >>>>>>>> -    if ( guest_handle )
> >>>>>>>> +    if ( v->arch.runstate_guest.page )
> >>>>>>>>     {
> >>>>>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> >>>>>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
> >>>>>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
> >>>>>>>> +
> >>>>>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> >>>>>>>> +        {
> >>>>>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> >>>>>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> >>>>>>> 
> >>>>>>> I think that this write to guest_runstate should use write_atomic or
> >>>>>>> another atomic write operation.
> >>>>>> 
> >>>>>> I thought about suggesting the same, but  guest_copy_* helpers may not
> >>>>>> do a single memory write to state_entry_time.
> >>>>>> What are you trying to prevent with the write_atomic()?
> >>>>> 
> >>>>> I am thinking that without using an atomic write, it would be (at least
> >>>>> theoretically) possible for a guest to see a partial write to
> >>>>> state_entry_time, which is not good. 
> >>>> 
> >>>> It is already the case with existing implementation as Xen may write byte by
> >>>> byte. So are you suggesting the existing code is also buggy?
> >>> 
> >>> Writing byte by byte is a different case. That is OK. In that case, the
> >>> guest could see the state after 3 bytes written and it would be fine and
> >>> consistent. If this hadn't been the case, then yes, the existing code
> >>> would also be buggy.
> >>> 
> >>> So if we did the write with a memcpy, it would be fine, no need for
> >>> atomics:
> >>> 
> >>> memcpy(&guest_runstate->state_entry_time,
> >>>        &v->runstate.state_entry_time,
> >>>        XXX);
> >>> 
> >>> 
> >>> The |= case is different: GCC could implement it in any way it likes,
> >>> including going through a zero-write to any of the bytes in the word, or
> >>> doing an addition then a subtraction. GCC doesn't make any guarantees.
> >>> If we want guarantees we need to use atomics.
> >> 
> >> Wouldn’t that require all accesses to state_entry_time to use also atomic operations ?
> >> In this case we could not propagate the changes to a guest without changing the interface itself.
> >> 
> >> As the copy time needs to be protected, the write barriers are there to make sure that during the copy the bit is set and that when we unset it, the copy is done.
> >> I added for this purpose a barrier after the memcpy to make sure that when/if we unset the bit the copy has already been done.
> > 
> > As you say, we have a flag to mark a transitiong period, the flag is
> > XEN_RUNSTATE_UPDATE. So, I think it is OK if we don't use atomics during
> > the transitioning period. But we need to make sure to use atomics for the
> > update of the XEN_RUNSTATE_UPDATE flag itself.
> > 
> > Does it make sense? Or maybe I misunderstood some of the things you
> > wrote?
> 
> To achieve this you would do an atomic operation on state_entry_time to set/unset the XEN_RUNSTATE_UPDATE bit.
> This field is holding a flag in the upper bit but also a value, so all operations on state_entry_time would need to be changed to use atomic operations.

I don't think that all operations on state_entry_time need to be atomic.
Only the bit write to XEN_RUNSTATE_UPDATE. More on this below.


> Also this state_entry_time might also be accessed by the guest on other cores at the same time (to retrieve the time part).

Yes but they are all just readers, right?


> To prevent something being used as atomic and non atomic, specific types are usually used (atomic_t) and this structure is also used by guests so modifying it will not be easy.
> 
> Or did I missunderstood something here ?

I was not suggesting to use an atomic_t type. I was only suggesting to
use an atomic operation, i.e. calling write_u32_atomic directly (or
something like that). I would not change the type of state_entry_time.
Also using memcpy would be acceptable due to the fact that we only need
to update one byte.
Julien Grall June 15, 2020, 8:44 p.m. UTC | #19
On Mon, 15 Jun 2020 at 21:30, Stefano Stabellini <sstabellini@kernel.org> wrote:
>
> On Mon, 15 Jun 2020, Bertrand Marquis wrote:
> > > On 13 Jun 2020, at 01:24, Stefano Stabellini <sstabellini@kernel.org> wrote:
> > >
> > > On Fri, 12 Jun 2020, Bertrand Marquis wrote:
> > >>> On 12 Jun 2020, at 02:09, Stefano Stabellini <sstabellini@kernel.org> wrote:
> > >>>
> > >>> On Thu, 11 Jun 2020, Julien Grall wrote:
> > >>>> Hi Stefano,
> > >>>>
> > >>>> On 11/06/2020 19:50, Stefano Stabellini wrote:
> > >>>>> On Thu, 11 Jun 2020, Julien Grall wrote:
> > >>>>>>>> +        return -EINVAL;
> > >>>>>>>>     }
> > >>>>>>>>
> > >>>>>>>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> > >>>>>>>> +    v->arch.runstate_guest.page = page;
> > >>>>>>>> +    v->arch.runstate_guest.offset = offset;
> > >>>>>>>> +
> > >>>>>>>> +    spin_unlock(&v->arch.runstate_guest.lock);
> > >>>>>>>> +
> > >>>>>>>> +    return 0;
> > >>>>>>>> +}
> > >>>>>>>> +
> > >>>>>>>> +
> > >>>>>>>> +/* Update per-VCPU guest runstate shared memory area (if registered).
> > >>>>>>>> */
> > >>>>>>>> +static void update_runstate_area(struct vcpu *v)
> > >>>>>>>> +{
> > >>>>>>>> +    struct vcpu_runstate_info *guest_runstate;
> > >>>>>>>> +    void *p;
> > >>>>>>>> +
> > >>>>>>>> +    spin_lock(&v->arch.runstate_guest.lock);
> > >>>>>>>>
> > >>>>>>>> -    if ( guest_handle )
> > >>>>>>>> +    if ( v->arch.runstate_guest.page )
> > >>>>>>>>     {
> > >>>>>>>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> > >>>>>>>> +        p = __map_domain_page(v->arch.runstate_guest.page);
> > >>>>>>>> +        guest_runstate = p + v->arch.runstate_guest.offset;
> > >>>>>>>> +
> > >>>>>>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> > >>>>>>>> +        {
> > >>>>>>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> > >>>>>>>> +            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> > >>>>>>>
> > >>>>>>> I think that this write to guest_runstate should use write_atomic or
> > >>>>>>> another atomic write operation.
> > >>>>>>
> > >>>>>> I thought about suggesting the same, but  guest_copy_* helpers may not
> > >>>>>> do a single memory write to state_entry_time.
> > >>>>>> What are you trying to prevent with the write_atomic()?
> > >>>>>
> > >>>>> I am thinking that without using an atomic write, it would be (at least
> > >>>>> theoretically) possible for a guest to see a partial write to
> > >>>>> state_entry_time, which is not good.
> > >>>>
> > >>>> It is already the case with existing implementation as Xen may write byte by
> > >>>> byte. So are you suggesting the existing code is also buggy?
> > >>>
> > >>> Writing byte by byte is a different case. That is OK. In that case, the
> > >>> guest could see the state after 3 bytes written and it would be fine and
> > >>> consistent. If this hadn't been the case, then yes, the existing code
> > >>> would also be buggy.
> > >>>
> > >>> So if we did the write with a memcpy, it would be fine, no need for
> > >>> atomics:
> > >>>
> > >>> memcpy(&guest_runstate->state_entry_time,
> > >>>        &v->runstate.state_entry_time,
> > >>>        XXX);
> > >>>
> > >>>
> > >>> The |= case is different: GCC could implement it in any way it likes,
> > >>> including going through a zero-write to any of the bytes in the word, or
> > >>> doing an addition then a subtraction. GCC doesn't make any guarantees.
> > >>> If we want guarantees we need to use atomics.
> > >>
> > >> Wouldn’t that require all accesses to state_entry_time to use also atomic operations ?
> > >> In this case we could not propagate the changes to a guest without changing the interface itself.
> > >>
> > >> As the copy time needs to be protected, the write barriers are there to make sure that during the copy the bit is set and that when we unset it, the copy is done.
> > >> I added for this purpose a barrier after the memcpy to make sure that when/if we unset the bit the copy has already been done.
> > >
> > > As you say, we have a flag to mark a transitiong period, the flag is
> > > XEN_RUNSTATE_UPDATE. So, I think it is OK if we don't use atomics during
> > > the transitioning period. But we need to make sure to use atomics for the
> > > update of the XEN_RUNSTATE_UPDATE flag itself.
> > >
> > > Does it make sense? Or maybe I misunderstood some of the things you
> > > wrote?
> >
> > To achieve this you would do an atomic operation on state_entry_time to set/unset the XEN_RUNSTATE_UPDATE bit.
> > This field is holding a flag in the upper bit but also a value, so all operations on state_entry_time would need to be changed to use atomic operations.
>
> I don't think that all operations on state_entry_time need to be atomic.
> Only the bit write to XEN_RUNSTATE_UPDATE. More on this below.
>
>
> > Also this state_entry_time might also be accessed by the guest on other cores at the same time (to retrieve the time part).
>
> Yes but they are all just readers, right?

This is only one part of the equation. The second part is that the readers
*must not* process the rest of the content while XEN_RUNSTATE_UPDATE
is set.
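
For reference, a hypothetical guest-side reader following that rule would 
look roughly like this (just a sketch, not code from this patch or from 
Linux; 'shared' is assumed to be a volatile pointer to the registered 
area):

    struct vcpu_runstate_info snap;
    uint64_t t;

    do {
        t = shared->state_entry_time;
        smp_rmb();
        snap = *shared;              /* copy out the whole structure */
        smp_rmb();
    } while ( (t & XEN_RUNSTATE_UPDATE) ||
              shared->state_entry_time != t );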

>
> I was not suggesting to use an atomic_t type. I was only suggesting to
> use an atomic operation, i.e. calling write_u32_atomic directly (or
> something like that.) I would not change the type of state_entry_time.
> Also using memcpy would be acceptable due to the fact that we only need
> to update one byte.

Please don't use memcpy. It is an abuse to assume it can make an atomic
copy (even if it may happen to be correct for a single byte).
At the same time, let's avoid introducing more atomic bit operations on
guest memory that can be read-write (see XSA-295).

In this case a write_atomic() is sufficient as it would do a single
write for sizes smaller than 64 bits.
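
For illustration, one possible shape of the flag handling in 
update_runstate_area() using write_atomic() (an untested sketch based on 
the code in this patch; it assumes write_atomic() can be used on the 
64-bit state_entry_time field on the target architecture):

    if ( VM_ASSIST(v->domain, runstate_update_flag) )
    {
        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
        /* single store of the flagged value, visible to the guest */
        write_atomic(&guest_runstate->state_entry_time,
                     v->runstate.state_entry_time);
        smp_wmb();
    }

    memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
    smp_wmb();

    if ( VM_ASSIST(v->domain, runstate_update_flag) )
    {
        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
        /* single store clearing the flag once the copy is complete */
        write_atomic(&guest_runstate->state_entry_time,
                     v->runstate.state_entry_time);
    }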

Cheers,
diff mbox series

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 31169326b2..739059234f 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -19,6 +19,7 @@ 
 #include <xen/sched.h>
 #include <xen/softirq.h>
 #include <xen/wait.h>
+#include <xen/domain_page.h>
 
 #include <asm/alternative.h>
 #include <asm/cpuerrata.h>
@@ -275,36 +276,104 @@  static void ctxt_switch_to(struct vcpu *n)
     virt_timer_restore(n);
 }
 
-/* Update per-VCPU guest runstate shared memory area (if registered). */
-static void update_runstate_area(struct vcpu *v)
+void arch_cleanup_runstate_guest(struct vcpu *v)
 {
-    void __user *guest_handle = NULL;
-    struct vcpu_runstate_info runstate;
+    spin_lock(&v->arch.runstate_guest.lock);
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return;
+    /* cleanup previous page if any */
+    if ( v->arch.runstate_guest.page )
+    {
+        put_page_and_type(v->arch.runstate_guest.page);
+        v->arch.runstate_guest.page = NULL;
+        v->arch.runstate_guest.offset = 0;
+    }
 
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
+    spin_unlock(&v->arch.runstate_guest.lock);
+}
 
-    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+int arch_setup_runstate_guest(struct vcpu *v,
+                            struct vcpu_register_runstate_memory_area area)
+{
+    struct page_info *page;
+    unsigned offset;
+
+    spin_lock(&v->arch.runstate_guest.lock);
+
+    /* cleanup previous page if any */
+    if ( v->arch.runstate_guest.page )
     {
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
-        guest_handle--;
-        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
-        smp_wmb();
+        put_page_and_type(v->arch.runstate_guest.page);
+        v->arch.runstate_guest.page = NULL;
+        v->arch.runstate_guest.offset = 0;
+    }
+
+    offset = ((vaddr_t)area.addr.v) & ~PAGE_MASK;
+    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
+    {
+        spin_unlock(&v->arch.runstate_guest.lock);
+        gprintk(XENLOG_DEBUG, "Runstate is crossing pages\n");
+        return -EINVAL;
+    }
+
+    /* provided address must be aligned to a 64bit */
+    if ( offset % alignof(struct vcpu_runstate_info) )
+    {
+        spin_unlock(&v->arch.runstate_guest.lock);
+        gprintk(XENLOG_DEBUG, "Runstate pointer is not aligned\n");
+        return -EINVAL;
+    }
+
+    page = get_page_from_gva(v, (vaddr_t)area.addr.v, GV2M_WRITE);
+    if ( !page )
+    {
+        spin_unlock(&v->arch.runstate_guest.lock);
+        gprintk(XENLOG_DEBUG, "Runstate pointer is not mapped\n");
+        return -EINVAL;
     }
 
-    __copy_to_guest(runstate_guest(v), &runstate, 1);
+    v->arch.runstate_guest.page = page;
+    v->arch.runstate_guest.offset = offset;
+
+    spin_unlock(&v->arch.runstate_guest.lock);
+
+    return 0;
+}
+
+
+/* Update per-VCPU guest runstate shared memory area (if registered). */
+static void update_runstate_area(struct vcpu *v)
+{
+    struct vcpu_runstate_info *guest_runstate;
+    void *p;
+
+    spin_lock(&v->arch.runstate_guest.lock);
 
-    if ( guest_handle )
+    if ( v->arch.runstate_guest.page )
     {
-        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        p = __map_domain_page(v->arch.runstate_guest.page);
+        guest_runstate = p + v->arch.runstate_guest.offset;
+
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+            guest_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+        }
+
+        memcpy(guest_runstate, &v->runstate, sizeof(v->runstate));
         smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            guest_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+        }
+
+        unmap_domain_page(p);
     }
+
+    spin_unlock(&v->arch.runstate_guest.lock);
 }
 
 static void schedule_tail(struct vcpu *prev)
@@ -560,6 +629,8 @@  int arch_vcpu_create(struct vcpu *v)
     v->arch.saved_context.sp = (register_t)v->arch.cpu_info;
     v->arch.saved_context.pc = (register_t)continue_new_vcpu;
 
+    spin_lock_init(&v->arch.runstate_guest.lock);
+
     /* Idle VCPUs don't need the rest of this setup */
     if ( is_idle_vcpu(v) )
         return rc;
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index fee6c3931a..32bbed87d5 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1642,6 +1642,30 @@  void paravirt_ctxt_switch_to(struct vcpu *v)
         wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
 }
 
+int arch_setup_runstate_guest(struct vcpu *v,
+                             struct vcpu_register_runstate_memory_area area)
+{
+    struct vcpu_runstate_info runstate;
+
+    runstate_guest(v) = area.addr.h;
+
+    if ( v == current )
+    {
+        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
+    }
+    else
+    {
+        vcpu_runstate_get(v, &runstate);
+        __copy_to_guest(runstate_guest(v), &runstate, 1);
+    }
+    return 0;
+}
+
+void arch_cleanup_runstate_guest(struct vcpu *v)
+{
+    set_xen_guest_handle(runstate_guest(v), NULL);
+}
+
 /* Update per-VCPU guest runstate shared memory area (if registered). */
 bool update_runstate_area(struct vcpu *v)
 {
@@ -1660,8 +1684,8 @@  bool update_runstate_area(struct vcpu *v)
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
         guest_handle = has_32bit_shinfo(v->domain)
-            ? &v->runstate_guest.compat.p->state_entry_time + 1
-            : &v->runstate_guest.native.p->state_entry_time + 1;
+            ? &v->arch.runstate_guest.compat.p->state_entry_time + 1
+            : &v->arch.runstate_guest.native.p->state_entry_time + 1;
         guest_handle--;
         runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
         __raw_copy_to_guest(guest_handle,
@@ -1674,7 +1698,7 @@  bool update_runstate_area(struct vcpu *v)
         struct compat_vcpu_runstate_info info;
 
         XLAT_vcpu_runstate_info(&info, &runstate);
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
         rc = true;
     }
     else
diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c
index c46dccc25a..b879e8dd2c 100644
--- a/xen/arch/x86/x86_64/domain.c
+++ b/xen/arch/x86/x86_64/domain.c
@@ -36,7 +36,7 @@  arch_compat_vcpu_op(
             break;
 
         rc = 0;
-        guest_from_compat_handle(v->runstate_guest.compat, area.addr.h);
+        guest_from_compat_handle(v->arch.runstate_guest.compat, area.addr.h);
 
         if ( v == current )
         {
@@ -49,7 +49,7 @@  arch_compat_vcpu_op(
             vcpu_runstate_get(v, &runstate);
             XLAT_vcpu_runstate_info(&info, &runstate);
         }
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
 
         break;
     }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 7cc9526139..0ca6bf4dbc 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -727,7 +727,10 @@  int domain_kill(struct domain *d)
         if ( cpupool_move_domain(d, cpupool0) )
             return -ERESTART;
         for_each_vcpu ( d, v )
+        {
+            arch_cleanup_runstate_guest(v);
             unmap_vcpu_info(v);
+        }
         d->is_dying = DOMDYING_dead;
         /* Mem event cleanup has to go here because the rings 
          * have to be put before we call put_domain. */
@@ -1167,7 +1170,7 @@  int domain_soft_reset(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        set_xen_guest_handle(runstate_guest(v), NULL);
+        arch_cleanup_runstate_guest(v);
         unmap_vcpu_info(v);
     }
 
@@ -1484,7 +1487,6 @@  long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
     case VCPUOP_register_runstate_memory_area:
     {
         struct vcpu_register_runstate_memory_area area;
-        struct vcpu_runstate_info runstate;
 
         rc = -EFAULT;
         if ( copy_from_guest(&area, arg, 1) )
@@ -1493,18 +1495,7 @@  long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
         if ( !guest_handle_okay(area.addr.h, 1) )
             break;
 
-        rc = 0;
-        runstate_guest(v) = area.addr.h;
-
-        if ( v == current )
-        {
-            __copy_to_guest(runstate_guest(v), &v->runstate, 1);
-        }
-        else
-        {
-            vcpu_runstate_get(v, &runstate);
-            __copy_to_guest(runstate_guest(v), &runstate, 1);
-        }
+        rc = arch_setup_runstate_guest(v, area);
 
         break;
     }
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 4e2f582006..3a7f53e13d 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -11,6 +11,7 @@ 
 #include <asm/vgic.h>
 #include <asm/vpl011.h>
 #include <public/hvm/params.h>
+#include <public/vcpu.h>
 #include <xen/serial.h>
 #include <xen/rbtree.h>
 
@@ -206,6 +207,13 @@  struct arch_vcpu
      */
     bool need_flush_to_ram;
 
+    /* runstate guest info */
+    struct {
+        struct page_info *page;  /* guest page */
+        unsigned         offset; /* offset in page */
+        spinlock_t       lock;   /* lock to access page */
+    } runstate_guest;
+
 }  __cacheline_aligned;
 
 void vcpu_show_execution_state(struct vcpu *);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index e8cee3d5e5..f917b48cb8 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -11,6 +11,11 @@ 
 #include <asm/x86_emulate.h>
 #include <public/vcpu.h>
 #include <public/hvm/hvm_info_table.h>
+#ifdef CONFIG_COMPAT
+#include <compat/vcpu.h>
+DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
+#endif
+
 
 #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
 
@@ -626,6 +631,17 @@  struct arch_vcpu
     struct {
         bool next_interrupt_enabled;
     } monitor;
+
+#ifndef CONFIG_COMPAT
+# define runstate_guest(v) ((v)->arch.runstate_guest)
+    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
+#else
+# define runstate_guest(v) ((v)->arch.runstate_guest.native)
+    union {
+        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
+        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
+    } runstate_guest;
+#endif
 };
 
 struct guest_memory_policy
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index 7e51d361de..d5d73ce997 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -63,6 +63,10 @@  void arch_vcpu_destroy(struct vcpu *v);
 int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset);
 void unmap_vcpu_info(struct vcpu *v);
 
+int arch_setup_runstate_guest(struct vcpu *v,
+                            struct vcpu_register_runstate_memory_area area);
+void arch_cleanup_runstate_guest(struct vcpu *v);
+
 int arch_domain_create(struct domain *d,
                        struct xen_domctl_createdomain *config);
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index ac53519d7f..fac030fb83 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -29,11 +29,6 @@ 
 #include <public/vcpu.h>
 #include <public/event_channel.h>
 
-#ifdef CONFIG_COMPAT
-#include <compat/vcpu.h>
-DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
-#endif
-
 /*
  * Stats
  *
@@ -166,16 +161,7 @@  struct vcpu
     struct sched_unit *sched_unit;
 
     struct vcpu_runstate_info runstate;
-#ifndef CONFIG_COMPAT
-# define runstate_guest(v) ((v)->runstate_guest)
-    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
-#else
-# define runstate_guest(v) ((v)->runstate_guest.native)
-    union {
-        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
-        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
-    } runstate_guest; /* guest address */
-#endif
+
     unsigned int     new_state;
 
     /* Has the FPU been initialised? */