
[v2,2/2] xen: implement VCPUOP_register_runstate_phys_memory_area

Message ID 1556007026-31057-3-git-send-email-andrii.anisov@gmail.com (mailing list archive)
State New, archived
Series Introduce runstate area registration with phys address

Commit Message

Andrii Anisov April 23, 2019, 8:10 a.m. UTC
From: Andrii Anisov <andrii_anisov@epam.com>

VCPUOP_register_runstate_phys_memory_area is implemented via runstate
area mapping.

Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
---
 xen/arch/arm/domain.c        |  62 +++++++++++++++++--------
 xen/arch/x86/domain.c        | 105 +++++++++++++++++++++++++++++++------------
 xen/common/domain.c          |  80 ++++++++++++++++++++++++++++++++-
 xen/include/asm-arm/domain.h |   2 +
 xen/include/xen/domain.h     |   2 +
 xen/include/xen/sched.h      |   8 ++++
 6 files changed, 210 insertions(+), 49 deletions(-)

Comments

Julien Grall May 8, 2019, 3:40 p.m. UTC | #1
Hi,

On 23/04/2019 09:10, Andrii Anisov wrote:
> From: Andrii Anisov <andrii_anisov@epam.com>
> 
> VCPUOP_register_runstate_phys_memory_area is implemented via runstate
> area mapping.
>
> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
> ---
>   xen/arch/arm/domain.c        |  62 +++++++++++++++++--------
>   xen/arch/x86/domain.c        | 105 +++++++++++++++++++++++++++++++------------
>   xen/common/domain.c          |  80 ++++++++++++++++++++++++++++++++-
>   xen/include/asm-arm/domain.h |   2 +
>   xen/include/xen/domain.h     |   2 +
>   xen/include/xen/sched.h      |   8 ++++
>   6 files changed, 210 insertions(+), 49 deletions(-)

This patch is quite hard to read because you are reworking the code and at the 
same time implementing the new VCPUOP. How about moving the rework into a separate 
patch? The implementation can then be folded into the previous patch as suggested 
by George.

> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 6dc633e..8e24e63 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -275,32 +275,55 @@ static void ctxt_switch_to(struct vcpu *n)
>   }
>   
>   /* Update per-VCPU guest runstate shared memory area (if registered). */
> -static void update_runstate_area(struct vcpu *v)
> +void update_runstate_area(struct vcpu *v)

Why do you export update_runstate_area? The function does not seem to be called 
outside.

>   {
> -    void __user *guest_handle = NULL;
> +    if ( !guest_handle_is_null(runstate_guest(v)) )
> +    {
> +        void __user *guest_handle = NULL;
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> +            guest_handle--;
> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            __raw_copy_to_guest(guest_handle,
> +                                (void *)(&v->runstate.state_entry_time + 1) - 1,
> +                                1);
> +            smp_wmb();
> +        }
>   
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> -        return;
> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>   
> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
> -    {
> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> -        guest_handle--;
> -        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
> -        smp_wmb();
> +        if ( guest_handle )
> +        {
> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            smp_wmb();
> +            __raw_copy_to_guest(guest_handle,
> +                                (void *)(&v->runstate.state_entry_time + 1) - 1,
> +                                1);
> +        }
>       }
>   
> -    __copy_to_guest(runstate_guest(v), &v->runstate, 1);
> -
> -    if ( guest_handle )
> +    spin_lock(&v->mapped_runstate_lock);
> +    if ( v->mapped_runstate )

The code looks a bit odd to me: you seem to allow a guest to provide 2 runstate 
areas, one using a guest virtual address and the other using a guest physical address.

It would be best if we prevented a guest from mixing and matching them. IOW, if the 
guest provides a physical address first, then *all* the calls should use a physical 
address. Alternatively, this could be a per-vCPU decision.

>       {
> -        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> -        smp_wmb();
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->mapped_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            smp_wmb();
> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> +        }
> +
> +        memcpy(v->mapped_runstate, &v->runstate, sizeof(v->runstate));
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->mapped_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            smp_wmb();
> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        }
>       }
> +    spin_unlock(&v->mapped_runstate_lock);
> +

NIT: The newline is not necessary here.

>   }
>   
>   static void schedule_tail(struct vcpu *prev)
> @@ -998,6 +1021,7 @@ long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a
>       {
>           case VCPUOP_register_vcpu_info:
>           case VCPUOP_register_runstate_memory_area:
> +        case VCPUOP_register_runstate_phys_memory_area:
>               return do_vcpu_op(cmd, vcpuid, arg);
>           default:
>               return -EINVAL;


[...]

> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index ae22049..6df76c6 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -149,6 +149,7 @@ struct vcpu *vcpu_create(
>       v->dirty_cpu = VCPU_CPU_CLEAN;
>   
>       spin_lock_init(&v->virq_lock);
> +    spin_lock_init(&v->mapped_runstate_lock);
>   
>       tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
>   
> @@ -699,6 +700,69 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>       return 0;
>   }
>   
> +static void _unmap_runstate_area(struct vcpu *v)
A better name would be unmap_runstate_area_locked(), so you avoid the reserved 
name and make the intended use clear.

> +{
> +    mfn_t mfn;
> +
> +    if ( !v->mapped_runstate )
> +        return;
> +
> +    mfn = _mfn(virt_to_mfn(runstate_guest(v).p));

As pointed out by Jan in the previous version:

The pointer is the result of __map_domain_page_global(), so I don't think you
can legitimately use virt_to_mfn() on it, at least not on x86;
domain_page_map_to_mfn() is what you want to use here.

> +
> +    unmap_domain_page_global((void *)
> +                             ((unsigned long)v->mapped_runstate &
> +                              PAGE_MASK));
> +
> +    v->mapped_runstate = NULL;
> +    put_page_and_type(mfn_to_page(mfn));
> +}

We seem to have this pattern in a few places now (see unmap_guest_page). It 
would be good to introduce helpers that can be used everywhere (probably lifted 
from common/event_fifo.c).
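
A minimal sketch of the locked unmap helper with that change applied (derived
from the helper in this patch and renamed as suggested above; an illustration,
not the final code):

static void unmap_runstate_area_locked(struct vcpu *v)
{
    mfn_t mfn;

    if ( !v->mapped_runstate )
        return;

    /* The mapping came from __map_domain_page_global(), so translate it back
     * with domain_page_map_to_mfn() rather than virt_to_mfn(). */
    mfn = domain_page_map_to_mfn(v->mapped_runstate);

    unmap_domain_page_global((void *)
                             ((unsigned long)v->mapped_runstate & PAGE_MASK));

    v->mapped_runstate = NULL;
    put_page_and_type(mfn_to_page(mfn));
}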

> +
> +static int map_runstate_area(struct vcpu *v,
> +                      struct vcpu_register_runstate_memory_area *area)
> +{
> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
> +    gfn_t gfn = gaddr_to_gfn(area->addr.p);
> +    struct domain *d = v->domain;
> +    void *mapping;
> +    struct page_info *page;
> +    size_t size = sizeof (struct vcpu_runstate_info );

space is not necessary before ).

But is the variable really necessary?

> +
> +    if ( offset > (PAGE_SIZE - size) )
> +        return -EINVAL;
> +
> +    page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC);
> +    if ( !page )
> +        return -EINVAL;
> +
> +    if ( !get_page_type(page, PGT_writable_page) )
> +    {
> +        put_page(page);
> +        return -EINVAL;
> +    }
> +
> +    mapping = __map_domain_page_global(page);
> +
> +    if ( mapping == NULL )
> +    {
> +        put_page_and_type(page);
> +        return -ENOMEM;
> +    }
> +
> +    spin_lock(&v->mapped_runstate_lock);
> +    _unmap_runstate_area(v);
> +    v->mapped_runstate = mapping + offset;
> +    spin_unlock(&v->mapped_runstate_lock);
> +
> +    return 0;
> +}
> +
> +static void unmap_runstate_area(struct vcpu *v)
> +{
> +    spin_lock(&v->mapped_runstate_lock);
> +    _unmap_runstate_area(v);
> +    spin_unlock(&v->mapped_runstate_lock);
> +}
> +
>   int domain_kill(struct domain *d)
>   {
>       int rc = 0;
> @@ -737,7 +801,11 @@ int domain_kill(struct domain *d)
>           if ( cpupool_move_domain(d, cpupool0) )
>               return -ERESTART;
>           for_each_vcpu ( d, v )
> +        {
> +            set_xen_guest_handle(runstate_guest(v), NULL);
> +            unmap_runstate_area(v);
>               unmap_vcpu_info(v);
> +        }
>           d->is_dying = DOMDYING_dead;
>           /* Mem event cleanup has to go here because the rings
>            * have to be put before we call put_domain. */
> @@ -1192,6 +1260,7 @@ int domain_soft_reset(struct domain *d)
>       for_each_vcpu ( d, v )
>       {
>           set_xen_guest_handle(runstate_guest(v), NULL);
> +        unmap_runstate_area(v);
>           unmap_vcpu_info(v);
>       }
>   
> @@ -1536,8 +1605,17 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
>       }
>   
>       case VCPUOP_register_runstate_phys_memory_area:
> -        rc = -EOPNOTSUPP;
> +    {
> +        struct vcpu_register_runstate_memory_area area;
> +
> +        rc = -EFAULT;
> +        if ( copy_from_guest(&area, arg, 1) )
> +            break;
> +
> +        rc = map_runstate_area(v, &area);
> +
>           break;
> +    }
>   
>   #ifdef VCPU_TRAP_NMI
>       case VCPUOP_send_nmi:
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 312fec8..3fb6ea2 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *);
>   void vcpu_show_registers(const struct vcpu *);
>   void vcpu_switch_to_aarch64_mode(struct vcpu *);
>   
> +void update_runstate_area(struct vcpu *);
> +
>   /*
>    * Due to the restriction of GICv3, the number of vCPUs in AFF0 is
>    * limited to 16, thus only the first 4 bits of AFF0 are legal. We will
> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
> index d1bfc82..ecddcfe 100644
> --- a/xen/include/xen/domain.h
> +++ b/xen/include/xen/domain.h
> @@ -118,4 +118,6 @@ struct vnuma_info {
>   
>   void vnuma_destroy(struct vnuma_info *vnuma);
>   
> +struct vcpu_register_runstate_memory_area;
> +
>   #endif /* __XEN_DOMAIN_H__ */
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index 748bb0f..2afe31c 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -163,15 +163,23 @@ struct vcpu
>       void            *sched_priv;    /* scheduler-specific data */
>   
>       struct vcpu_runstate_info runstate;
> +
> +    spinlock_t      mapped_runstate_lock;
> +
>   #ifndef CONFIG_COMPAT
>   # define runstate_guest(v) ((v)->runstate_guest)
>       XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
> +    vcpu_runstate_info_t *mapped_runstate;
>   #else
>   # define runstate_guest(v) ((v)->runstate_guest.native)
>       union {
>           XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>           XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>       } runstate_guest; /* guest address */
> +    union {
> +        vcpu_runstate_info_t* native;
> +        vcpu_runstate_info_compat_t* compat;
> +    } mapped_runstate; /* guest address */

The combination of mapped_runstate and runstate_guest is a bit confusing. I 
think you want to rework the interface to show that only one is possible at a 
time and make clear which one is used by whom. Maybe:

union
{
    /* Legacy interface to be used when the guest provides a virtual address */
    union {
       XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
       ...
    } virt;

    /* Interface used when the guest provides a physical address */
    union {
    } phys;
} runstate_guest.

runstate_guest_type /* could be a bool or enum */
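
Spelled out as it might appear in struct vcpu (purely illustrative: the
enumerator names and the exact layout below are not part of this patch):

enum runstate_guest_type {
    RUNSTATE_NONE,   /* nothing registered */
    RUNSTATE_VADDR,  /* legacy interface: guest virtual address */
    RUNSTATE_PADDR,  /* new interface: guest physical address mapped by Xen */
};

    /* inside struct vcpu */
    enum runstate_guest_type runstate_guest_type;
    union {
        /* Legacy interface, used when the guest provides a virtual address. */
        union {
            XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
            XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
        } virt;
        /* Used when the guest provides a physical address: Xen's own mapping. */
        union {
            vcpu_runstate_info_t *native;
            vcpu_runstate_info_compat_t *compat;
        } phys;
    } runstate_guest;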

Jan what do you think?

Cheers,
Jan Beulich May 9, 2019, 9:27 a.m. UTC | #2
>>> On 08.05.19 at 17:40, <julien.grall@arm.com> wrote:
> On 23/04/2019 09:10, Andrii Anisov wrote:
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -163,15 +163,23 @@ struct vcpu
>>       void            *sched_priv;    /* scheduler-specific data */
>>   
>>       struct vcpu_runstate_info runstate;
>> +
>> +    spinlock_t      mapped_runstate_lock;
>> +
>>   #ifndef CONFIG_COMPAT
>>   # define runstate_guest(v) ((v)->runstate_guest)
>>       XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
>> +    vcpu_runstate_info_t *mapped_runstate;
>>   #else
>>   # define runstate_guest(v) ((v)->runstate_guest.native)
>>       union {
>>           XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>>           XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>       } runstate_guest; /* guest address */
>> +    union {
>> +        vcpu_runstate_info_t* native;
>> +        vcpu_runstate_info_compat_t* compat;
>> +    } mapped_runstate; /* guest address */
> 
> The combination of mapped_runstate and runstate_guest is a bit confusing. I 
> think you want to rework the interface to show that only one is possible at the 
> time and make clear which one is used by who. Maybe:
> 
> union
> {
>     /* Legacy interface to be used when the guest provides a virtual address */
>     union {
>        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>        ...
>     } virt;
> 
>     /* Interface used when the guest provides a physical address */
>     union {
>     } phys;
> } runstate_guest.
> 
> runstate_guest_type /* could be a bool or enum */
> 
> Jan what do you think?

I fully agree - no mixing of approaches here, if possible. However,
care needs to be taken that after a domain reset the new kernel
can choose the opposite model. Question is whether there are even
other cases where independent components (say boot loader and
OS) may need to be permitted to choose models independently of
one another.

As a side note, Andrii - would you please avoid sending double mail
to xen-devel (addresses differing just by domain used)?

Jan
Andrii Anisov May 13, 2019, 12:30 p.m. UTC | #3
On 08.05.19 18:40, Julien Grall wrote:
> This patch is quite hard to read because you are reworking the code and at the same time implementing the new VCPUOP. How about moving the rework in a separate patch? The implementation can then be fold in the previous patch as suggested by George.

OK.

> 
>>
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 6dc633e..8e24e63 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -275,32 +275,55 @@ static void ctxt_switch_to(struct vcpu *n)
>>   }
>>   /* Update per-VCPU guest runstate shared memory area (if registered). */
>> -static void update_runstate_area(struct vcpu *v)
>> +void update_runstate_area(struct vcpu *v)
> 
> Why do you export update_runstate_area? The function does not seem to be called outside.

Ouch, this is left over from one of the previous versions.

> 
>>   {
>> -    void __user *guest_handle = NULL;
>> +    if ( !guest_handle_is_null(runstate_guest(v)) )
>> +    {
>> +        void __user *guest_handle = NULL;
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> +            guest_handle--;
>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            __raw_copy_to_guest(guest_handle,
>> +                                (void *)(&v->runstate.state_entry_time + 1) - 1,
>> +                                1);
>> +            smp_wmb();
>> +        }
>> -    if ( guest_handle_is_null(runstate_guest(v)) )
>> -        return;
>> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> -    {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
>> -        smp_wmb();
>> +        if ( guest_handle )
>> +        {
>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
>> +            __raw_copy_to_guest(guest_handle,
>> +                                (void *)(&v->runstate.state_entry_time + 1) - 1,
>> +                                1);
>> +        }
>>       }
>> -    __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>> -
>> -    if ( guest_handle )
>> +    spin_lock(&v->mapped_runstate_lock);
>> +    if ( v->mapped_runstate )
> 
> The code looks a bit odd to me, you seem to allow a guest to provide 2 runstate areas: one using guest virtual address the other using guest physical address.
> 
> It would be best if we prevent a guest to mix match them. 

At first I tried implementing it that way, but the locking and decision code became really ugly and complex while trying to cover 'guest misbehavior' scenarios.

> IOW, if the guest provide a physical address first, then *all* the call should be physical address. Alternatively this could be a per vCPU decision.

I guess we should agree on what to implement first.

> 
>>       {
>> -        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> -        smp_wmb();
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->mapped_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +        }
>> +
>> +        memcpy(v->mapped_runstate, &v->runstate, sizeof(v->runstate));
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->mapped_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            smp_wmb();
>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        }
>>       }
>> +    spin_unlock(&v->mapped_runstate_lock);
>> +
> 
> NIT: The newline is not necessary here.

OK.

> 
>>   }
>>   static void schedule_tail(struct vcpu *prev)
>> @@ -998,6 +1021,7 @@ long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a
>>       {
>>           case VCPUOP_register_vcpu_info:
>>           case VCPUOP_register_runstate_memory_area:
>> +        case VCPUOP_register_runstate_phys_memory_area:
>>               return do_vcpu_op(cmd, vcpuid, arg);
>>           default:
>>               return -EINVAL;
> 
> 
> [...]
> 
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index ae22049..6df76c6 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -149,6 +149,7 @@ struct vcpu *vcpu_create(
>>       v->dirty_cpu = VCPU_CPU_CLEAN;
>>       spin_lock_init(&v->virq_lock);
>> +    spin_lock_init(&v->mapped_runstate_lock);
>>       tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
>> @@ -699,6 +700,69 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>>       return 0;
>>   }
>> +static void _unmap_runstate_area(struct vcpu *v)
> A better name would be unamep_runstate_area_locked() so you avoid the reserved name and make clear of the use.

OK.

> 
>> +{
>> +    mfn_t mfn;
>> +
>> +    if ( !v->mapped_runstate )
>> +        return;
>> +
>> +    mfn = _mfn(virt_to_mfn(runstate_guest(v).p));
> 
> As pointed out by Jan in the previous version:
> 
> The pointer is the result of __map_domain_page_global(), so I don't think
> you can legitimately use virt_to_mfn() on it, at
> least not on x86; domain_page_map_to_mfn() is what you
> want to use here.

Yep.

> 
>> +
>> +    unmap_domain_page_global((void *)
>> +                             ((unsigned long)v->mapped_runstate &
>> +                              PAGE_MASK));
>> +
>> +    v->mapped_runstate = NULL;
>> +    put_page_and_type(mfn_to_page(mfn));
>> +}
> 
> We seem to have this pattern in a few places now (see unmap_guest_page). It would be good to introduce helpers that can be used everywhere (probably lifted from common/event_fifo.c.

I'll check.

> 
>> +
>> +static int map_runstate_area(struct vcpu *v,
>> +                      struct vcpu_register_runstate_memory_area *area)
>> +{
>> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
>> +    gfn_t gfn = gaddr_to_gfn(area->addr.p);
>> +    struct domain *d = v->domain;
>> +    void *mapping;
>> +    struct page_info *page;
>> +    size_t size = sizeof (struct vcpu_runstate_info );
> 
> space is not necessary before ).
> 
> But is the variable really necessary?

Well, I think it could be dropped.
> 
>> +
>> +    if ( offset > (PAGE_SIZE - size) )
>> +        return -EINVAL;
>> +
>> +    page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC);
>> +    if ( !page )
>> +        return -EINVAL;
>> +
>> +    if ( !get_page_type(page, PGT_writable_page) )
>> +    {
>> +        put_page(page);
>> +        return -EINVAL;
>> +    }
>> +
>> +    mapping = __map_domain_page_global(page);
>> +
>> +    if ( mapping == NULL )
>> +    {
>> +        put_page_and_type(page);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    spin_lock(&v->mapped_runstate_lock);
>> +    _unmap_runstate_area(v);
>> +    v->mapped_runstate = mapping + offset;
>> +    spin_unlock(&v->mapped_runstate_lock);
>> +
>> +    return 0;
>> +}
>> +
>> +static void unmap_runstate_area(struct vcpu *v)
>> +{
>> +    spin_lock(&v->mapped_runstate_lock);
>> +    _unmap_runstate_area(v);
>> +    spin_unlock(&v->mapped_runstate_lock);
>> +}
>> +
>>   int domain_kill(struct domain *d)
>>   {
>>       int rc = 0;
>> @@ -737,7 +801,11 @@ int domain_kill(struct domain *d)
>>           if ( cpupool_move_domain(d, cpupool0) )
>>               return -ERESTART;
>>           for_each_vcpu ( d, v )
>> +        {
>> +            set_xen_guest_handle(runstate_guest(v), NULL);
>> +            unmap_runstate_area(v);
>>               unmap_vcpu_info(v);
>> +        }
>>           d->is_dying = DOMDYING_dead;
>>           /* Mem event cleanup has to go here because the rings
>>            * have to be put before we call put_domain. */
>> @@ -1192,6 +1260,7 @@ int domain_soft_reset(struct domain *d)
>>       for_each_vcpu ( d, v )
>>       {
>>           set_xen_guest_handle(runstate_guest(v), NULL);
>> +        unmap_runstate_area(v);
>>           unmap_vcpu_info(v);
>>       }
>> @@ -1536,8 +1605,17 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
>>       }
>>       case VCPUOP_register_runstate_phys_memory_area:
>> -        rc = -EOPNOTSUPP;
>> +    {
>> +        struct vcpu_register_runstate_memory_area area;
>> +
>> +        rc = -EFAULT;
>> +        if ( copy_from_guest(&area, arg, 1) )
>> +            break;
>> +
>> +        rc = map_runstate_area(v, &area);
>> +
>>           break;
>> +    }
>>   #ifdef VCPU_TRAP_NMI
>>       case VCPUOP_send_nmi:
>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>> index 312fec8..3fb6ea2 100644
>> --- a/xen/include/asm-arm/domain.h
>> +++ b/xen/include/asm-arm/domain.h
>> @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *);
>>   void vcpu_show_registers(const struct vcpu *);
>>   void vcpu_switch_to_aarch64_mode(struct vcpu *);
>> +void update_runstate_area(struct vcpu *);
>> +
>>   /*
>>    * Due to the restriction of GICv3, the number of vCPUs in AFF0 is
>>    * limited to 16, thus only the first 4 bits of AFF0 are legal. We will
>> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
>> index d1bfc82..ecddcfe 100644
>> --- a/xen/include/xen/domain.h
>> +++ b/xen/include/xen/domain.h
>> @@ -118,4 +118,6 @@ struct vnuma_info {
>>   void vnuma_destroy(struct vnuma_info *vnuma);
>> +struct vcpu_register_runstate_memory_area;
>> +
>>   #endif /* __XEN_DOMAIN_H__ */
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index 748bb0f..2afe31c 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -163,15 +163,23 @@ struct vcpu
>>       void            *sched_priv;    /* scheduler-specific data */
>>       struct vcpu_runstate_info runstate;
>> +
>> +    spinlock_t      mapped_runstate_lock;
>> +
>>   #ifndef CONFIG_COMPAT
>>   # define runstate_guest(v) ((v)->runstate_guest)
>>       XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
>> +    vcpu_runstate_info_t *mapped_runstate;
>>   #else
>>   # define runstate_guest(v) ((v)->runstate_guest.native)
>>       union {
>>           XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>>           XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>       } runstate_guest; /* guest address */
>> +    union {
>> +        vcpu_runstate_info_t* native;
>> +        vcpu_runstate_info_compat_t* compat;
>> +    } mapped_runstate; /* guest address */
> > The combination of mapped_runstate and runstate_guest is a bit confusing. I think you want to rework the interface to show that only one is possible at the time and make clear which one is used by who. Maybe:

As I said before, IMO coupling those interfaces makes the code complicated and ugly.

> 
> union
> {
>     /* Legacy interface to be used when the guest provides a virtual address */
>     union {
>        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>        ...
>     } virt;
> 
>     /* Interface used when the guest provides a physical address */
>     union {
>     } phys;
> } runstate_guest.
>
> runstate_guest_type /* could be a bool or enum */
> 
> Jan what do you think?
> 
> Cheers,
>
Julien Grall May 14, 2019, 9:35 a.m. UTC | #4
Hi Jan,

On 09/05/2019 10:27, Jan Beulich wrote:
>>>> On 08.05.19 at 17:40, <julien.grall@arm.com> wrote:
>> On 23/04/2019 09:10, Andrii Anisov wrote:
>>> --- a/xen/include/xen/sched.h
>>> +++ b/xen/include/xen/sched.h
>>> @@ -163,15 +163,23 @@ struct vcpu
>>>        void            *sched_priv;    /* scheduler-specific data */
>>>    
>>>        struct vcpu_runstate_info runstate;
>>> +
>>> +    spinlock_t      mapped_runstate_lock;
>>> +
>>>    #ifndef CONFIG_COMPAT
>>>    # define runstate_guest(v) ((v)->runstate_guest)
>>>        XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
>>> +    vcpu_runstate_info_t *mapped_runstate;
>>>    #else
>>>    # define runstate_guest(v) ((v)->runstate_guest.native)
>>>        union {
>>>            XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>>>            XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>>        } runstate_guest; /* guest address */
>>> +    union {
>>> +        vcpu_runstate_info_t* native;
>>> +        vcpu_runstate_info_compat_t* compat;
>>> +    } mapped_runstate; /* guest address */
>>
>> The combination of mapped_runstate and runstate_guest is a bit confusing. I
>> think you want to rework the interface to show that only one is possible at the
>> time and make clear which one is used by who. Maybe:
>>
>> union
>> {
>>      /* Legacy interface to be used when the guest provides a virtual address */
>>      union {
>>         XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>>         ...
>>      } virt;
>>
>>      /* Interface used when the guest provides a physical address */
>>      union {
>>      } phys;
>> } runstate_guest.
>>
>> runstate_guest_type /* could be a bool or enum */
>>
>> Jan what do you think?
> 
> I fully agree - no mixing of approaches here, if possible. However,
> care needs to be taken that after a domain reset the new kernel
> can chose the opposite model. Question is whether there are even
> other cases where independent components (say boot loader and
> OS) may need to be permitted to chose models independently of
> one another.
Good point. On a similar topic, how does Kexec work on Xen? Do we reset the 
domain as well?

Cheers,
Jan Beulich May 14, 2019, 9:48 a.m. UTC | #5
>>> On 14.05.19 at 11:35, <julien.grall@arm.com> wrote:
> On a similar topic, how does Kexec works on Xen? Do we reset the 
> domain as well?

No, how could we? What gets invoked isn't Xen in the common case,
but some other (typically bare metal) OS like Linux.

Jan
Julien Grall May 14, 2019, 9:58 a.m. UTC | #6
On 13/05/2019 13:30, Andrii Anisov wrote:
> 
> 
> On 08.05.19 18:40, Julien Grall wrote:
>>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>>> index 6dc633e..8e24e63 100644
>>
>>>   {
>>> -    void __user *guest_handle = NULL;
>>> +    if ( !guest_handle_is_null(runstate_guest(v)) )
>>> +    {
>>> +        void __user *guest_handle = NULL;
>>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>> +        {
>>> +            guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>>> +            guest_handle--;
>>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>> +            __raw_copy_to_guest(guest_handle,
>>> +                                (void *)(&v->runstate.state_entry_time + 1) 
>>> - 1,
>>> +                                1);
>>> +            smp_wmb();
>>> +        }
>>> -    if ( guest_handle_is_null(runstate_guest(v)) )
>>> -        return;
>>> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>>> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>> -    {
>>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>>> -        guest_handle--;
>>> -        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>> -        __raw_copy_to_guest(guest_handle,
>>> -                            (void *)(&v->runstate.state_entry_time + 1) - 1, 
>>> 1);
>>> -        smp_wmb();
>>> +        if ( guest_handle )
>>> +        {
>>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>> +            smp_wmb();
>>> +            __raw_copy_to_guest(guest_handle,
>>> +                                (void *)(&v->runstate.state_entry_time + 1) 
>>> - 1,
>>> +                                1);
>>> +        }
>>>       }
>>> -    __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>>> -
>>> -    if ( guest_handle )
>>> +    spin_lock(&v->mapped_runstate_lock);
>>> +    if ( v->mapped_runstate )
>>
>> The code looks a bit odd to me, you seem to allow a guest to provide 2 
>> runstate areas: one using guest virtual address the other using guest physical 
>> address.
>>
>> It would be best if we prevent a guest to mix match them. 
> 
> Firstly I turned to implementing in that way, but the locking and decissions 
> code become really ugly and complex while trying to cover 'guest's misbehavior' 
> scenarios.

I think it is possible to have a simple version that decides which method to 
use. You can either use the spin_lock to protect everything or use 
something like:

update_runstate_area():

if ( xchg(&v->runstate_in_use, 1) )
   return;

switch ( v->runstate_type )
{
GVADDR:
    update_runstate_by_gvaddr();
GPADDR:
    update_runstate_by_gpaddr();
}

xchg(&v->runstate_in_use, 0);

registering an area

while ( xchg(&v->runstate_in_use, 1) );
/* Clean-up and registering the area */
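
Fleshed out slightly (a sketch only: the per-vCPU `runstate_in_use` flag,
`runstate_guest_type` and the two update helpers are hypothetical names, not
something this patch defines):

void update_runstate_area(struct vcpu *v)
{
    /* A registration is in progress: skip this update, the registering
     * path will publish fresh data once it has finished. */
    if ( xchg(&v->runstate_in_use, 1) )
        return;

    switch ( v->runstate_guest_type )
    {
    case RUNSTATE_VADDR:
        update_runstate_by_gvaddr(v);
        break;
    case RUNSTATE_PADDR:
        update_runstate_by_gpaddr(v);
        break;
    default:
        break;
    }

    xchg(&v->runstate_in_use, 0);
}

static int map_runstate_area(struct vcpu *v,
                             struct vcpu_register_runstate_memory_area *area)
{
    /* Wait until no update is in flight, then replace whatever interface
     * was registered before with the new one. */
    while ( xchg(&v->runstate_in_use, 1) )
        cpu_relax();

    /* ... tear down the old registration and set up the new one ... */

    xchg(&v->runstate_in_use, 0);
    return 0;
}

This way the context-switch path never blocks: only the (rare) registration
path ever spins.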

> 
>> IOW, if the guest provide a physical address first, then *all* the call should 
>> be physical address. Alternatively this could be a per vCPU decision.
> 
> I guess we should agree what to implement first.

I think there is an agreement that the two methods should not be used together.

[..]

>>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>>> index 312fec8..3fb6ea2 100644
>>> --- a/xen/include/asm-arm/domain.h
>>> +++ b/xen/include/asm-arm/domain.h
>>> @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *);
>>>   void vcpu_show_registers(const struct vcpu *);
>>>   void vcpu_switch_to_aarch64_mode(struct vcpu *);
>>> +void update_runstate_area(struct vcpu *);
>>> +
>>>   /*
>>>    * Due to the restriction of GICv3, the number of vCPUs in AFF0 is
>>>    * limited to 16, thus only the first 4 bits of AFF0 are legal. We will
>>> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
>>> index d1bfc82..ecddcfe 100644
>>> --- a/xen/include/xen/domain.h
>>> +++ b/xen/include/xen/domain.h
>>> @@ -118,4 +118,6 @@ struct vnuma_info {
>>>   void vnuma_destroy(struct vnuma_info *vnuma);
>>> +struct vcpu_register_runstate_memory_area;
>>> +
>>>   #endif /* __XEN_DOMAIN_H__ */
>>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>>> index 748bb0f..2afe31c 100644
>>> --- a/xen/include/xen/sched.h
>>> +++ b/xen/include/xen/sched.h
>>> @@ -163,15 +163,23 @@ struct vcpu
>>>       void            *sched_priv;    /* scheduler-specific data */
>>>       struct vcpu_runstate_info runstate;
>>> +
>>> +    spinlock_t      mapped_runstate_lock;
>>> +
>>>   #ifndef CONFIG_COMPAT
>>>   # define runstate_guest(v) ((v)->runstate_guest)
>>>       XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
>>> +    vcpu_runstate_info_t *mapped_runstate;
>>>   #else
>>>   # define runstate_guest(v) ((v)->runstate_guest.native)
>>>       union {
>>>           XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
>>>           XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>>       } runstate_guest; /* guest address */
>>> +    union {
>>> +        vcpu_runstate_info_t* native;
>>> +        vcpu_runstate_info_compat_t* compat;
>>> +    } mapped_runstate; /* guest address */
>> > The combination of mapped_runstate and runstate_guest is a bit confusing. I 
>> think you want to rework the interface to show that only one is possible at 
>> the time and make clear which one is used by who. Maybe:
> 
> As I said before, IMO coupling those interfaces makes the code complicated and 
> ugly.

Well, I can't see how it can be ugly (see my example above).

Cheers,
Andrii Anisov May 14, 2019, 10:08 a.m. UTC | #7
Hello Julien,

On 14.05.19 12:58, Julien Grall wrote:
>> I guess we should agree what to implement first.
> 
> I think there are an agreement that the two methods should not be used together.

For a domain or for a particular VCPU?
How should we respond to a request to register a paddr when a vaddr is already registered, and vice versa?
Julien Grall May 14, 2019, 11:23 a.m. UTC | #8
On 14/05/2019 10:48, Jan Beulich wrote:
>>>> On 14.05.19 at 11:35, <julien.grall@arm.com> wrote:
>> On a similar topic, how does Kexec works on Xen? Do we reset the
>> domain as well?
> 
> No, how could we? What gets invoked isn't Xen in the common case,
> but some other (typically bare metal) OS like Linux.

It is not what I asked. What I asked is whether Xen is involved when a guest 
kernel is kexecing to another kernel.

I don't know enough Kexec to be able to answer that question myself.

Cheers,
Julien Grall May 14, 2019, 11:24 a.m. UTC | #9
On 14/05/2019 11:08, Andrii Anisov wrote:
> Hello Julien,
> 
> On 14.05.19 12:58, Julien Grall wrote:
>>> I guess we should agree what to implement first.
>>
>> I think there are an agreement that the two methods should not be used together.
> 
> For a domain or for a particular VCPU?
> How should we response on the request to register paddr when we already have 
> registered vaddr and vice versa?

From the discussion with Jan, you would tear down the previous registration and then
register the new version. So this allows cases like a bootloader and a kernel 
using different versions of the interface.

Cheers,
Jan Beulich May 14, 2019, 11:29 a.m. UTC | #10
>>> On 14.05.19 at 13:23, <julien.grall@arm.com> wrote:
> On 14/05/2019 10:48, Jan Beulich wrote:
>>>>> On 14.05.19 at 11:35, <julien.grall@arm.com> wrote:
>>> On a similar topic, how does Kexec works on Xen? Do we reset the
>>> domain as well?
>> 
>> No, how could we? What gets invoked isn't Xen in the common case,
>> but some other (typically bare metal) OS like Linux.
> 
> It is not what I asked. What I asked is whether Xen is involved when a guest 
> kernel is kexecing to another kernel.

I don't think I've ever heard of such a thing (outside of perhaps
using domain reset), so I don't think I can answer the (originally
ambiguous) question then.

Jan
Andrii Anisov May 14, 2019, 11:45 a.m. UTC | #11
On 14.05.19 14:24, Julien Grall wrote:
>>> I think there are an agreement that the two methods should not be used together.
>>
>> For a domain or for a particular VCPU?
>> How should we response on the request to register paddr when we already have registered vaddr and vice versa?
> 
>  From the discussion with Jan, you would tear down the previous solution and then
> register the new version. So this allows cases like a bootloader and a kernel using different version of the interface.

I'm not sure Jan stated that; he rather questioned it.

Jan, could you please confirm you agree that an already registered runstate area should be dropped and (potentially) the interface version changed in case of a new register request?
Jan Beulich May 14, 2019, 12:02 p.m. UTC | #12
>>> On 14.05.19 at 13:45, <andrii.anisov@gmail.com> wrote:
> On 14.05.19 14:24, Julien Grall wrote:
>>>> I think there are an agreement that the two methods should not be used together.
>>>
>>> For a domain or for a particular VCPU?
>>> How should we response on the request to register paddr when we already have registered vaddr and vice versa?
>> 
>>  From the discussion with Jan, you would tear down the previous solution and then
>> register the new version. So this allows cases like a bootloader and a 
> kernel using different version of the interface.
> 
> I'm not sure Jan stated that, he rather questioned that.
> 
> Jan, could you please confirm you agree that it should be dropped already 
> registered runstate and (potentially) changed version in case of the new 
> register request?

Well, I think Julien's implication was that we can't support in particular
the boot loader -> kernel handover case without extra measures (if
at all), and hence he added things together and said what results
from this. Of course ideally we'd reject mixed requests, but unless
someone can come up with a clever means of how to determine entity
boundaries I'm afraid this is not going to be possible without breaking
certain setups.

Jan
Andrii Anisov May 14, 2019, 1:05 p.m. UTC | #13
On 14.05.19 15:02, Jan Beulich wrote:
> Well, I think Julian's implication was that we can't support in particular
> the boot loader -> kernel handover case without extra measures (if
> at all), and hence he added things together and said what results
> from this. Of course ideally we'd reject mixed requests, but unless
> someone can come up with a clever means of how to determine entity
> boundaries I'm afraid this is not going to be possible without breaking
> certain setups.

From my understanding, if we are speaking of different entities in a domain and their boundaries, we have to define an unregister interface as well, so that those entities would be able to take care of themselves explicitly.
Julien Grall May 14, 2019, 1:49 p.m. UTC | #14
On 14/05/2019 14:05, Andrii Anisov wrote:
> 
> 
> On 14.05.19 15:02, Jan Beulich wrote:
>> Well, I think Julian's implication was that we can't support in particular
>> the boot loader -> kernel handover case without extra measures (if
>> at all), and hence he added things together and said what results
>> from this. Of course ideally we'd reject mixed requests, but unless
>> someone can come up with a clever means of how to determine entity
>> boundaries I'm afraid this is not going to be possible without breaking
>> certain setups.
> 
>  From my understanding, if we are speaking of different entities in a domain and 
> their boundaries, we have to define unregister interface as well. So that those 
> entities would be able to take care of themselves explicitly.

You have to keep in mind that existing OSes have to run on a newer Xen without any 
modification.

The existing hypercall allows you to:
    1) De-register the interface using the value 0.
    2) Replace the currently registered interface.

You probably can't use 2) for a bootloader -> kernel handover because we are 
dealing with guest virtual addresses. There is a high chance the virtual address 
space layout is going to be different, or the MMU may even be turned off for a bit 
(done on Arm). So you have to use 1), otherwise you might write to a random place in memory.

I am not entirely sure whether there is actual value in 2). The only reason I 
can think of is if you want to move around the runstate in your virtual address 
space. But that sounds a bit weird, at least on Arm.

For the new hypercall, I think we at least want 1) (with a magic value TBD). 2) 
might be helpful in the case where the bootloader didn't do the right thing or we are 
using Kexec to boot a new kernel. This would also be safer as physical addresses 
could be excluded more easily.

2) should not be too difficult to implement. It is just a matter of cleaning up 
whatever was used previously before registering the new interface.
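
As a point of reference, a guest-side sketch of how the existing interface is
registered and de-registered (using the public
struct vcpu_register_runstate_memory_area; the include paths and the
HYPERVISOR_vcpu_op wrapper depend on the guest environment):

/* Register the runstate area at a guest virtual address (legacy interface). */
static int register_runstate_vaddr(unsigned int vcpu,
                                   struct vcpu_runstate_info *info)
{
    struct vcpu_register_runstate_memory_area area = {};

    area.addr.v = info;
    return HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, vcpu, &area);
}

/* De-register by passing 0/NULL, e.g. in a bootloader before handing over. */
static int unregister_runstate_vaddr(unsigned int vcpu)
{
    struct vcpu_register_runstate_memory_area area = {};

    area.addr.v = NULL;
    return HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, vcpu, &area);
}

The new physical-address variant would follow the same pattern with area.addr.p;
whether it also gets an explicit de-registration value is exactly what is being
discussed here.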

Cheers,
Jan Beulich May 14, 2019, 1:49 p.m. UTC | #15
>>> On 14.05.19 at 15:05, <andrii.anisov@gmail.com> wrote:
> On 14.05.19 15:02, Jan Beulich wrote:
>> Well, I think Julian's implication was that we can't support in particular
>> the boot loader -> kernel handover case without extra measures (if
>> at all), and hence he added things together and said what results
>> from this. Of course ideally we'd reject mixed requests, but unless
>> someone can come up with a clever means of how to determine entity
>> boundaries I'm afraid this is not going to be possible without breaking
>> certain setups.
> 
>  From my understanding, if we are speaking of different entities in a domain 
> and their boundaries, we have to define unregister interface as well. So that 
> those entities would be able to take care of themselves explicitly.

If this was a concern only for newly written code, this would be fine.
But you need to make sure all existing code also continues to work
with whatever new interface you implement. Just because a kernel
uses your new physical address based mechanism doesn't mean the
boot loader knows to unregister what it has registered.

Jan
Andrii Anisov May 15, 2019, 8:44 a.m. UTC | #16
Hello Jan,

On 14.05.19 16:49, Jan Beulich wrote:
> If this was a concern only for newly written code, this would be fine.
> But you need to make sure all existing code also continues to work
> with whatever new interface you implement.

And that is one more reason why I tend to introduce the new interface in parallel, fully independent from the old one,
and not do a mixed implementation as you and Julien suggest.

> Just because a kernel
> uses your new physical address based mechanism doesn't mean the
> boot loader knows to unregister what it has registered.

As Julien already said, the current interface has an implicit mechanism to unregister the runstate area.
Also, if the bootloader misses unregistering the runstate area with the current interface, we already have a broken system.
So it is really up to the guest system to take care of its possible transitions.
Andrii Anisov May 15, 2019, 9:04 a.m. UTC | #17
On 14.05.19 16:49, Julien Grall wrote:
> You have to keep in mind that existing OS have to run on newer Xen without any modification.

As I just wrote to Jan, it is one more reason to keep those interfaces living in parallel and not mix their implementations.

> The existing hypercall allows you to:
>     1) De-register an interface using the value 0.

My current implementation can easily be updated with the same behavior.

>     2) Replacing a current existing interface

> You probably can't use 2) for a bootloader -> kernel handover because we are dealing with guest virtual address. There is an high chance the virtual address space layout is going to be different or even turning off MMU for a bit (done on Arm). So you have to use 1) otherwise you might write in a random place in memory.

This is definitely not the way to handle transitions between systems in a guest domain.

> I am not entirely sure whether there are actual value for 2). The only reason I can think of is if you want to move around the runstate in your virtual address space. But that's sounds a bit weird at least on Arm.
> For the new hypercall, I think we at least want 1) (with a magic value TBD). 

The magic value 0x0 can easily be introduced.

>  2) might be helpful in the case the bootloader didn't do the right thing or we are using Kexec to boot a new kernel. This would also be safer as physical address could be excluded more easily.

But the new system has to know that the previous phys addr is reserved (used by the hypervisor), and must not use it prior to registering a new runstate area.
Providing such knowledge is something the bootloader (for example) should take care of. But, IMO, it is better to require the bootloader (for example) to unregister its runstate area prior to switching to the new system.
Julien Grall May 15, 2019, 10:31 a.m. UTC | #18
Hi Andrii,

On 15/05/2019 10:04, Andrii Anisov wrote:
> 
> 
> On 14.05.19 16:49, Julien Grall wrote:
>> You have to keep in mind that existing OS have to run on newer Xen without any 
>> modification.
> 
> As I just written to Jan, it is one more reason to keep those interfaces living 
> in parallel and do not mix their implementation.
There is actually no good reason for a guest to register via the two interfaces 
at the same time. All the more we want to encourage OS developers to switch to 
the new interface.

I also provided in my previous e-mails a way to make the two work together 
without much trouble.

> 
>> The existing hypercall allows you to:
>>     1) De-register an interface using the value 0.
> 
> My current implementation can easily be updated with the same behavior.
> 
>>     2) Replacing a current existing interface
> 
>> You probably can't use 2) for a bootloader -> kernel handover because we are 
>> dealing with guest virtual address. There is an high chance the virtual 
>> address space layout is going to be different or even turning off MMU for a 
>> bit (done on Arm). So you have to use 1) otherwise you might write in a random 
>> place in memory.
> 
> This definitely not the way to handle transitions between systems in a guest 
> domain.
> 
>> I am not entirely sure whether there are actual value for 2). The only reason 
>> I can think of is if you want to move around the runstate in your virtual 
>> address space. But that's sounds a bit weird at least on Arm.
>> For the new hypercall, I think we at least want 1) (with a magic value TBD). 
> 
> The magic value 0x0 can easily be introduced.

0x0 is not an option. It could be a valid physical address. We need a value that 
cannot be used by anyone.
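
For example (the sentinel name and value below are purely hypothetical; the
actual magic value is what remains to be decided), the handler from this patch
could grow a check along these lines:

/* Hypothetical sentinel meaning "no runstate area"; must not be a physical
 * address a guest could legitimately use. Value TBD. */
#define RUNSTATE_AREA_DEREGISTER (~(uint64_t)0)

    case VCPUOP_register_runstate_phys_memory_area:
    {
        struct vcpu_register_runstate_memory_area area;

        rc = -EFAULT;
        if ( copy_from_guest(&area, arg, 1) )
            break;

        if ( area.addr.p == RUNSTATE_AREA_DEREGISTER )
        {
            unmap_runstate_area(v);  /* explicit de-registration */
            rc = 0;
            break;
        }

        rc = map_runstate_area(v, &area);
        break;
    }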

> 
>>  2) might be helpful in the case the bootloader didn't do the right thing or 
>> we are using Kexec to boot a new kernel. This would also be safer as physical 
>> address could be excluded more easily.
> 
> But the new system have to get some knowledge about the previous phys addr is 
> reserved (used by hypervisor), and do not use it prior to registering new 
> runstate area.
> Providing such a knowledge is something (e.g.) the bootloader should take care 
> of. But, IMO, it is better to require from (e.g.) the bootloader to unregister 
> its runstate area prior to switching to the new system.

Well, if a bootloader keeps some part of itself in memory (such as for handling runtime 
services), it will usually mark those pages as reserved, so they can't be used by 
the kernel.

But here, the point is that it would not be difficult to handle 2). So why would you 
try to forbid it?

Cheers,
Jan Beulich May 15, 2019, 11:59 a.m. UTC | #19
>>> On 15.05.19 at 10:44, <andrii.anisov@gmail.com> wrote:
> On 14.05.19 16:49, Jan Beulich wrote:
>> If this was a concern only for newly written code, this would be fine.
>> But you need to make sure all existing code also continues to work
>> with whatever new interface you implement.
> 
> And that is one more reason why I tend to introduce the new interface in 
> parallel to be fully independent from the old one.
> But not do a mixed implementation as you and Julien suggest.

What behavior guests see and how it is implemented in the hypervisor
are two largely independent things. That is, we could choose to forbid
mixing of registration methods while still having some level of code
sharing between how both hypercall variants get actually processed.

Jan
Jan Beulich May 16, 2019, 12:09 p.m. UTC | #20
>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote:
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -163,15 +163,23 @@ struct vcpu
>      void            *sched_priv;    /* scheduler-specific data */
>  
>      struct vcpu_runstate_info runstate;
> +
> +    spinlock_t      mapped_runstate_lock;

Besides other comments given elsewhere already - does this
really need to be a per-vCPU lock? Guests aren't expected to
invoke the API frequently, so quite likely a per-domain lock
would suffice. Quite possibly domain_{,un}lock() could
actually be (re-)used.

Jan
Andrii Anisov May 16, 2019, 1:30 p.m. UTC | #22
Hello Jan,

On 16.05.19 15:09, Jan Beulich wrote:
>>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote:
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -163,15 +163,23 @@ struct vcpu
>>       void            *sched_priv;    /* scheduler-specific data */
>>   
>>       struct vcpu_runstate_info runstate;
>> +
>> +    spinlock_t      mapped_runstate_lock;
> 
> Besides other comments given elsewhere already - does this
> really need to be a per-vCPU lock? Guests aren't expected to
> invoke the API frequently, so quite likely a per-domain lock
> would suffice. Quite possibly domain_{,un}lock() could
> actually be (re-)used.

I'd not use a per-domain lock here. This lock will be taken on every runstate area update, which happens on every context switch. And simultaneous switching of several vcpus from the same domain is quite likely.
Julien Grall May 16, 2019, 1:48 p.m. UTC | #23
On 16/05/2019 14:30, Andrii Anisov wrote:
> Hello Jan,
> 
> On 16.05.19 15:09, Jan Beulich wrote:
>>>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote:
>>> --- a/xen/include/xen/sched.h
>>> +++ b/xen/include/xen/sched.h
>>> @@ -163,15 +163,23 @@ struct vcpu
>>>       void            *sched_priv;    /* scheduler-specific data */
>>>       struct vcpu_runstate_info runstate;
>>> +
>>> +    spinlock_t      mapped_runstate_lock;
>>
>> Besides other comments given elsewhere already - does this
>> really need to be a per-vCPU lock? Guests aren't expected to
>> invoke the API frequently, so quite likely a per-domain lock
>> would suffice. Quite possibly domain_{,un}lock() could
>> actually be (re-)used.
> 
> I'd not use a per-domain lock here. This lock will be taken on every runstate 
> area update, which happens on every context switch. And simultaneous switching 
> of several vcpus from the same domain is quite likely.

The lock can be completely removed anyway. See my previous comments.

Cheers,
Andrii Anisov May 16, 2019, 2:25 p.m. UTC | #24
Hello Julien,

On 16.05.19 16:48, Julien Grall wrote:
> The lock can be completely removed anyway. See my previous comments.

You suggested a kind of simplified try_lock, with the runstate update being skipped in case of failure.
The question here is whether we are OK with not updating the runstate under some circumstances.
Though even in this case, the question might be whether we need `runstate_in_use` per vCPU or per domain. My answer is per vCPU.
Julien Grall May 16, 2019, 2:28 p.m. UTC | #25
Hi Andrii,

On 16/05/2019 15:25, Andrii Anisov wrote:
> Hello Julien,
> 
> On 16.05.19 16:48, Julien Grall wrote:
>> The lock can be completely removed anyway. See my previous comments.
> 
> You suggested kinda simplified try_lock with runstate update skipping in case of 
> fail.
> The question here is if we are OK with not updating runstate under circumstances?

Well, if you fail the check then it means someone was modifying it behind your 
back. That someone can update the runstate with the new information once it is 
done. So I can't see an issue with not updating the runstate in the context switch.

Cheers,
Andrii Anisov May 16, 2019, 2:29 p.m. UTC | #26
On 16.05.19 17:28, Julien Grall wrote:
> Well, if you fail the check then it means someone was modifying it behind your back. That someone could update the runstate with the new information once it is modified. So I can't see issue with not updating the runstate in the context switch.

That's fair enough.

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 6dc633e..8e24e63 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -275,32 +275,55 @@  static void ctxt_switch_to(struct vcpu *n)
 }
 
 /* Update per-VCPU guest runstate shared memory area (if registered). */
-static void update_runstate_area(struct vcpu *v)
+void update_runstate_area(struct vcpu *v)
 {
-    void __user *guest_handle = NULL;
+    if ( !guest_handle_is_null(runstate_guest(v)) )
+    {
+        void __user *guest_handle = NULL;
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            guest_handle = &v->runstate_guest.p->state_entry_time + 1;
+            guest_handle--;
+            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+            __raw_copy_to_guest(guest_handle,
+                                (void *)(&v->runstate.state_entry_time + 1) - 1,
+                                1);
+            smp_wmb();
+        }
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return;
+        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
 
-    if ( VM_ASSIST(v->domain, runstate_update_flag) )
-    {
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
-        guest_handle--;
-        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
-        smp_wmb();
+        if ( guest_handle )
+        {
+            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+            __raw_copy_to_guest(guest_handle,
+                                (void *)(&v->runstate.state_entry_time + 1) - 1,
+                                1);
+        }
     }
 
-    __copy_to_guest(runstate_guest(v), &v->runstate, 1);
-
-    if ( guest_handle )
+    spin_lock(&v->mapped_runstate_lock);
+    if ( v->mapped_runstate )
     {
-        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-        smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            v->mapped_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+        }
+
+        memcpy(v->mapped_runstate, &v->runstate, sizeof(v->runstate));
+
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            v->mapped_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        }
     }
+    spin_unlock(&v->mapped_runstate_lock);
+
 }
 
 static void schedule_tail(struct vcpu *prev)
@@ -998,6 +1021,7 @@  long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a
     {
         case VCPUOP_register_vcpu_info:
         case VCPUOP_register_runstate_memory_area:
+        case VCPUOP_register_runstate_phys_memory_area:
             return do_vcpu_op(cmd, vcpuid, arg);
         default:
             return -EINVAL;
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9eaa978..46c2219 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1558,51 +1558,98 @@  void paravirt_ctxt_switch_to(struct vcpu *v)
         wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
 }
 
-/* Update per-VCPU guest runstate shared memory area (if registered). */
-bool update_runstate_area(struct vcpu *v)
+static void update_mapped_runstate_area_native(struct vcpu *v)
 {
-    bool rc;
-    struct guest_memory_policy policy = { .nested_guest_mode = false };
-    void __user *guest_handle = NULL;
-
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return true;
-
-    update_guest_memory_policy(v, &policy);
-
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        guest_handle = has_32bit_shinfo(v->domain)
-            ? &v->runstate_guest.compat.p->state_entry_time + 1
-            : &v->runstate_guest.native.p->state_entry_time + 1;
-        guest_handle--;
         v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
+        v->mapped_runstate.native->state_entry_time |= XEN_RUNSTATE_UPDATE;
         smp_wmb();
     }
 
-    if ( has_32bit_shinfo(v->domain) )
+    memcpy(v->mapped_runstate.native, &v->runstate, sizeof(v->runstate));
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        struct compat_vcpu_runstate_info info;
+        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        v->mapped_runstate.native->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+    }
+}
 
-        XLAT_vcpu_runstate_info(&info, &v->runstate);
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
-        rc = true;
+static void update_mapped_runstate_area_compat(struct vcpu *v)
+{
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+        v->mapped_runstate.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
+        smp_wmb();
     }
-    else
-        rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
-             sizeof(v->runstate);
 
-    if ( guest_handle )
+    memcpy(v->mapped_runstate.compat, &v->runstate, sizeof(v->runstate));
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
         v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        v->mapped_runstate.compat->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
         smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
     }
+}
 
-    update_guest_memory_policy(v, &policy);
+/* Update per-VCPU guest runstate shared memory area (if registered). */
+bool update_runstate_area(struct vcpu *v)
+{
+    bool rc = true;
+
+    if ( !guest_handle_is_null(runstate_guest(v)) )
+    {
+        struct guest_memory_policy policy = { .nested_guest_mode = false };
+        void __user *guest_handle = NULL;
+
+        update_guest_memory_policy(v, &policy);
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            guest_handle = has_32bit_shinfo(v->domain)
+                ? &v->runstate_guest.compat.p->state_entry_time + 1
+                : &v->runstate_guest.native.p->state_entry_time + 1;
+            guest_handle--;
+            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+            __raw_copy_to_guest(guest_handle,
+                                (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
+            smp_wmb();
+        }
+
+        if ( has_32bit_shinfo(v->domain) )
+        {
+            struct compat_vcpu_runstate_info info;
+
+            XLAT_vcpu_runstate_info(&info, &v->runstate);
+            __copy_to_guest(v->runstate_guest.compat, &info, 1);
+            rc = true;
+        }
+        else
+            rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
+                 sizeof(v->runstate);
+
+        if ( guest_handle )
+        {
+            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+            __raw_copy_to_guest(guest_handle,
+                                (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
+        }
+        update_guest_memory_policy(v, &policy);
+    }
+
+    spin_lock(v->mapped_runstate_lock);
+    if ( v->mapped_runstate )
+    {
+        if ( has_32bit_shinfo((v)->domain) )
+            update_mapped_runstate_area_compat(v);
+        else
+            update_mapped_runstate_area_native(v);
+    }
+    spin_unlock(v->mapped_runstate_lock);
 
     return rc;
 }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index ae22049..6df76c6 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -149,6 +149,7 @@  struct vcpu *vcpu_create(
     v->dirty_cpu = VCPU_CPU_CLEAN;
 
     spin_lock_init(&v->virq_lock);
+    spin_lock_init(&v->mapped_runstate_lock);
 
     tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
 
@@ -699,6 +700,69 @@  int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
     return 0;
 }
 
+static void _unmap_runstate_area(struct vcpu *v)
+{
+    mfn_t mfn;
+
+    if ( !v->mapped_runstate )
+        return;
+
+    mfn = _mfn(virt_to_mfn(runstate_guest(v).p));
+
+    unmap_domain_page_global((void *)
+                             ((unsigned long)v->mapped_runstate &
+                              PAGE_MASK));
+
+    v->mapped_runstate = NULL;
+    put_page_and_type(mfn_to_page(mfn));
+}
+
+static int map_runstate_area(struct vcpu *v,
+                      struct vcpu_register_runstate_memory_area *area)
+{
+    unsigned long offset = area->addr.p & ~PAGE_MASK;
+    gfn_t gfn = gaddr_to_gfn(area->addr.p);
+    struct domain *d = v->domain;
+    void *mapping;
+    struct page_info *page;
+    size_t size = sizeof (struct vcpu_runstate_info );
+
+    if ( offset > (PAGE_SIZE - size) )
+        return -EINVAL;
+
+    page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC);
+    if ( !page )
+        return -EINVAL;
+
+    if ( !get_page_type(page, PGT_writable_page) )
+    {
+        put_page(page);
+        return -EINVAL;
+    }
+
+    mapping = __map_domain_page_global(page);
+
+    if ( mapping == NULL )
+    {
+        put_page_and_type(page);
+        return -ENOMEM;
+    }
+
+    spin_lock(&v->mapped_runstate_lock);
+    _unmap_runstate_area(v);
+    v->mapped_runstate = mapping + offset;
+    spin_unlock(&v->mapped_runstate_lock);
+
+    return 0;
+}
+
+static void unmap_runstate_area(struct vcpu *v)
+{
+    spin_lock(&v->mapped_runstate_lock);
+    _unmap_runstate_area(v);
+    spin_unlock(&v->mapped_runstate_lock);
+}
+
 int domain_kill(struct domain *d)
 {
     int rc = 0;
@@ -737,7 +801,11 @@  int domain_kill(struct domain *d)
         if ( cpupool_move_domain(d, cpupool0) )
             return -ERESTART;
         for_each_vcpu ( d, v )
+        {
+            set_xen_guest_handle(runstate_guest(v), NULL);
+            unmap_runstate_area(v);
             unmap_vcpu_info(v);
+        }
         d->is_dying = DOMDYING_dead;
         /* Mem event cleanup has to go here because the rings 
          * have to be put before we call put_domain. */
@@ -1192,6 +1260,7 @@  int domain_soft_reset(struct domain *d)
     for_each_vcpu ( d, v )
     {
         set_xen_guest_handle(runstate_guest(v), NULL);
+        unmap_runstate_area(v);
         unmap_vcpu_info(v);
     }
 
@@ -1536,8 +1605,17 @@  long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
     }
 
     case VCPUOP_register_runstate_phys_memory_area:
-        rc = -EOPNOTSUPP;
+    {
+        struct vcpu_register_runstate_memory_area area;
+
+        rc = -EFAULT;
+        if ( copy_from_guest(&area, arg, 1) )
+            break;
+
+        rc = map_runstate_area(v, &area);
+
         break;
+    }
 
 #ifdef VCPU_TRAP_NMI
     case VCPUOP_send_nmi:
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 312fec8..3fb6ea2 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -217,6 +217,8 @@  void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
 void vcpu_switch_to_aarch64_mode(struct vcpu *);
 
+void update_runstate_area(struct vcpu *);
+
 /*
  * Due to the restriction of GICv3, the number of vCPUs in AFF0 is
  * limited to 16, thus only the first 4 bits of AFF0 are legal. We will
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index d1bfc82..ecddcfe 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -118,4 +118,6 @@  struct vnuma_info {
 
 void vnuma_destroy(struct vnuma_info *vnuma);
 
+struct vcpu_register_runstate_memory_area;
+
 #endif /* __XEN_DOMAIN_H__ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 748bb0f..2afe31c 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -163,15 +163,23 @@  struct vcpu
     void            *sched_priv;    /* scheduler-specific data */
 
     struct vcpu_runstate_info runstate;
+
+    spinlock_t      mapped_runstate_lock;
+
 #ifndef CONFIG_COMPAT
 # define runstate_guest(v) ((v)->runstate_guest)
     XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
+    vcpu_runstate_info_t *mapped_runstate;
 #else
 # define runstate_guest(v) ((v)->runstate_guest.native)
     union {
         XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
         XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
     } runstate_guest; /* guest address */
+    union {
+        vcpu_runstate_info_t* native;
+        vcpu_runstate_info_compat_t* compat;
+    } mapped_runstate; /* guest address */
 #endif
 
     /* last time when vCPU is scheduled out */