Message ID | 1556007026-31057-3-git-send-email-andrii.anisov@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce runstate area registration with phys address | expand |
Hi, On 23/04/2019 09:10, Andrii Anisov wrote: > From: Andrii Anisov <andrii_anisov@epam.com> > > VCPUOP_register_runstate_phys_memory_area is implemented via runstate > area mapping. > > Signed-off-by: Andrii Anisov <andrii_anisov@epam.com> > --- > xen/arch/arm/domain.c | 62 +++++++++++++++++-------- > xen/arch/x86/domain.c | 105 +++++++++++++++++++++++++++++++------------ > xen/common/domain.c | 80 ++++++++++++++++++++++++++++++++- > xen/include/asm-arm/domain.h | 2 + > xen/include/xen/domain.h | 2 + > xen/include/xen/sched.h | 8 ++++ > 6 files changed, 210 insertions(+), 49 deletions(-) This patch is quite hard to read because you are reworking the code and at the same time implementing the new VCPUOP. How about moving the rework in a separate patch? The implementation can then be fold in the previous patch as suggested by George. > > diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c > index 6dc633e..8e24e63 100644 > --- a/xen/arch/arm/domain.c > +++ b/xen/arch/arm/domain.c > @@ -275,32 +275,55 @@ static void ctxt_switch_to(struct vcpu *n) > } > > /* Update per-VCPU guest runstate shared memory area (if registered). */ > -static void update_runstate_area(struct vcpu *v) > +void update_runstate_area(struct vcpu *v) Why do you export update_runstate_area? The function does not seem to be called outside. 
> { > - void __user *guest_handle = NULL; > + if ( !guest_handle_is_null(runstate_guest(v)) ) > + { > + void __user *guest_handle = NULL; > + if ( VM_ASSIST(v->domain, runstate_update_flag) ) > + { > + guest_handle = &v->runstate_guest.p->state_entry_time + 1; > + guest_handle--; > + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; > + __raw_copy_to_guest(guest_handle, > + (void *)(&v->runstate.state_entry_time + 1) - 1, > + 1); > + smp_wmb(); > + } > > - if ( guest_handle_is_null(runstate_guest(v)) ) > - return; > + __copy_to_guest(runstate_guest(v), &v->runstate, 1); > > - if ( VM_ASSIST(v->domain, runstate_update_flag) ) > - { > - guest_handle = &v->runstate_guest.p->state_entry_time + 1; > - guest_handle--; > - v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; > - __raw_copy_to_guest(guest_handle, > - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); > - smp_wmb(); > + if ( guest_handle ) > + { > + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; > + smp_wmb(); > + __raw_copy_to_guest(guest_handle, > + (void *)(&v->runstate.state_entry_time + 1) - 1, > + 1); > + } > } > > - __copy_to_guest(runstate_guest(v), &v->runstate, 1); > - > - if ( guest_handle ) > + spin_lock(&v->mapped_runstate_lock); > + if ( v->mapped_runstate ) The code looks a bit odd to me, you seem to allow a guest to provide 2 runstate areas: one using guest virtual address the other using guest physical address. It would be best if we prevent a guest to mix match them. IOW, if the guest provide a physical address first, then *all* the call should be physical address. Alternatively this could be a per vCPU decision. 
> { > - v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; > - smp_wmb(); > - __raw_copy_to_guest(guest_handle, > - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); > + if ( VM_ASSIST(v->domain, runstate_update_flag) ) > + { > + v->mapped_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE; > + smp_wmb(); > + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; > + } > + > + memcpy(v->mapped_runstate, &v->runstate, sizeof(v->runstate)); > + > + if ( VM_ASSIST(v->domain, runstate_update_flag) ) > + { > + v->mapped_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE; > + smp_wmb(); > + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; > + } > } > + spin_unlock(&v->mapped_runstate_lock); > + NIT: The newline is not necessary here. > } > > static void schedule_tail(struct vcpu *prev) > @@ -998,6 +1021,7 @@ long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a > { > case VCPUOP_register_vcpu_info: > case VCPUOP_register_runstate_memory_area: > + case VCPUOP_register_runstate_phys_memory_area: > return do_vcpu_op(cmd, vcpuid, arg); > default: > return -EINVAL; [...] > diff --git a/xen/common/domain.c b/xen/common/domain.c > index ae22049..6df76c6 100644 > --- a/xen/common/domain.c > +++ b/xen/common/domain.c > @@ -149,6 +149,7 @@ struct vcpu *vcpu_create( > v->dirty_cpu = VCPU_CPU_CLEAN; > > spin_lock_init(&v->virq_lock); > + spin_lock_init(&v->mapped_runstate_lock); > > tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); > > @@ -699,6 +700,69 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d) > return 0; > } > > +static void _unmap_runstate_area(struct vcpu *v) A better name would be unamep_runstate_area_locked() so you avoid the reserved name and make clear of the use. > +{ > + mfn_t mfn; > + > + if ( !v->mapped_runstate ) > + return; > + > + mfn = _mfn(virt_to_mfn(runstate_guest(v).p)); As pointed out by Jan in the previous version: The pointer is the result of __map_domain_page_global(). 
So I don't think you don't think you can legitimately use virt_to_mfn() on it, at least not on x86; domain_page_map_to_mfn() is what you want to use here. > + > + unmap_domain_page_global((void *) > + ((unsigned long)v->mapped_runstate & > + PAGE_MASK)); > + > + v->mapped_runstate = NULL; > + put_page_and_type(mfn_to_page(mfn)); > +} We seem to have this pattern in a few places now (see unmap_guest_page). It would be good to introduce helpers that can be used everywhere (probably lifted from common/event_fifo.c. > + > +static int map_runstate_area(struct vcpu *v, > + struct vcpu_register_runstate_memory_area *area) > +{ > + unsigned long offset = area->addr.p & ~PAGE_MASK; > + gfn_t gfn = gaddr_to_gfn(area->addr.p); > + struct domain *d = v->domain; > + void *mapping; > + struct page_info *page; > + size_t size = sizeof (struct vcpu_runstate_info ); space is not necessary before ). But is the variable really necessary? > + > + if ( offset > (PAGE_SIZE - size) ) > + return -EINVAL; > + > + page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC); > + if ( !page ) > + return -EINVAL; > + > + if ( !get_page_type(page, PGT_writable_page) ) > + { > + put_page(page); > + return -EINVAL; > + } > + > + mapping = __map_domain_page_global(page); > + > + if ( mapping == NULL ) > + { > + put_page_and_type(page); > + return -ENOMEM; > + } > + > + spin_lock(&v->mapped_runstate_lock); > + _unmap_runstate_area(v); > + v->mapped_runstate = mapping + offset; > + spin_unlock(&v->mapped_runstate_lock); > + > + return 0; > +} > + > +static void unmap_runstate_area(struct vcpu *v) > +{ > + spin_lock(&v->mapped_runstate_lock); > + _unmap_runstate_area(v); > + spin_unlock(&v->mapped_runstate_lock); > +} > + > int domain_kill(struct domain *d) > { > int rc = 0; > @@ -737,7 +801,11 @@ int domain_kill(struct domain *d) > if ( cpupool_move_domain(d, cpupool0) ) > return -ERESTART; > for_each_vcpu ( d, v ) > + { > + set_xen_guest_handle(runstate_guest(v), NULL); > + unmap_runstate_area(v); > 
unmap_vcpu_info(v); > + } > d->is_dying = DOMDYING_dead; > /* Mem event cleanup has to go here because the rings > * have to be put before we call put_domain. */ > @@ -1192,6 +1260,7 @@ int domain_soft_reset(struct domain *d) > for_each_vcpu ( d, v ) > { > set_xen_guest_handle(runstate_guest(v), NULL); > + unmap_runstate_area(v); > unmap_vcpu_info(v); > } > > @@ -1536,8 +1605,17 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) > } > > case VCPUOP_register_runstate_phys_memory_area: > - rc = -EOPNOTSUPP; > + { > + struct vcpu_register_runstate_memory_area area; > + > + rc = -EFAULT; > + if ( copy_from_guest(&area, arg, 1) ) > + break; > + > + rc = map_runstate_area(v, &area); > + > break; > + } > > #ifdef VCPU_TRAP_NMI > case VCPUOP_send_nmi: > diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h > index 312fec8..3fb6ea2 100644 > --- a/xen/include/asm-arm/domain.h > +++ b/xen/include/asm-arm/domain.h > @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *); > void vcpu_show_registers(const struct vcpu *); > void vcpu_switch_to_aarch64_mode(struct vcpu *); > > +void update_runstate_area(struct vcpu *); > + > /* > * Due to the restriction of GICv3, the number of vCPUs in AFF0 is > * limited to 16, thus only the first 4 bits of AFF0 are legal. 
We will > diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h > index d1bfc82..ecddcfe 100644 > --- a/xen/include/xen/domain.h > +++ b/xen/include/xen/domain.h > @@ -118,4 +118,6 @@ struct vnuma_info { > > void vnuma_destroy(struct vnuma_info *vnuma); > > +struct vcpu_register_runstate_memory_area; > + > #endif /* __XEN_DOMAIN_H__ */ > diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h > index 748bb0f..2afe31c 100644 > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -163,15 +163,23 @@ struct vcpu > void *sched_priv; /* scheduler-specific data */ > > struct vcpu_runstate_info runstate; > + > + spinlock_t mapped_runstate_lock; > + > #ifndef CONFIG_COMPAT > # define runstate_guest(v) ((v)->runstate_guest) > XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ > + vcpu_runstate_info_t *mapped_runstate; > #else > # define runstate_guest(v) ((v)->runstate_guest.native) > union { > XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; > XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; > } runstate_guest; /* guest address */ > + union { > + vcpu_runstate_info_t* native; > + vcpu_runstate_info_compat_t* compat; > + } mapped_runstate; /* guest address */ The combination of mapped_runstate and runstate_guest is a bit confusing. I think you want to rework the interface to show that only one is possible at the time and make clear which one is used by who. Maybe: union { /* Legacy interface to be used when the guest provides a virtual address */ union { XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; ... } virt; /* Interface used when the guest provides a physical address */ union { } phys; } runstate_guest. runstate_guest_type /* could be a bool or enum */ Jan what do you think? Cheers,
>>> On 08.05.19 at 17:40, <julien.grall@arm.com> wrote: > On 23/04/2019 09:10, Andrii Anisov wrote: >> --- a/xen/include/xen/sched.h >> +++ b/xen/include/xen/sched.h >> @@ -163,15 +163,23 @@ struct vcpu >> void *sched_priv; /* scheduler-specific data */ >> >> struct vcpu_runstate_info runstate; >> + >> + spinlock_t mapped_runstate_lock; >> + >> #ifndef CONFIG_COMPAT >> # define runstate_guest(v) ((v)->runstate_guest) >> XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ >> + vcpu_runstate_info_t *mapped_runstate; >> #else >> # define runstate_guest(v) ((v)->runstate_guest.native) >> union { >> XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; >> XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; >> } runstate_guest; /* guest address */ >> + union { >> + vcpu_runstate_info_t* native; >> + vcpu_runstate_info_compat_t* compat; >> + } mapped_runstate; /* guest address */ > > The combination of mapped_runstate and runstate_guest is a bit confusing. I > think you want to rework the interface to show that only one is possible at the > time and make clear which one is used by who. Maybe: > > union > { > /* Legacy interface to be used when the guest provides a virtual address */ > union { > XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; > ... > } virt; > > /* Interface used when the guest provides a physical address */ > union { > } phys; > } runstate_guest. > > runstate_guest_type /* could be a bool or enum */ > > Jan what do you think? I fully agree - no mixing of approaches here, if possible. However, care needs to be taken that after a domain reset the new kernel can chose the opposite model. Question is whether there are even other cases where independent components (say boot loader and OS) may need to be permitted to chose models independently of one another. As a side note, Andrii - would you please avoid sending double mail to xen-devel (addresses differing just by domain used)? Jan
On 08.05.19 18:40, Julien Grall wrote: > This patch is quite hard to read because you are reworking the code and at the same time implementing the new VCPUOP. How about moving the rework in a separate patch? The implementation can then be fold in the previous patch as suggested by George. OK. > >> >> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c >> index 6dc633e..8e24e63 100644 >> --- a/xen/arch/arm/domain.c >> +++ b/xen/arch/arm/domain.c >> @@ -275,32 +275,55 @@ static void ctxt_switch_to(struct vcpu *n) >> } >> /* Update per-VCPU guest runstate shared memory area (if registered). */ >> -static void update_runstate_area(struct vcpu *v) >> +void update_runstate_area(struct vcpu *v) > > Why do you export update_runstate_area? The function does not seem to be called outside. Ouch, this left from one of the previous versions. > >> { >> - void __user *guest_handle = NULL; >> + if ( !guest_handle_is_null(runstate_guest(v)) ) >> + { >> + void __user *guest_handle = NULL; >> + if ( VM_ASSIST(v->domain, runstate_update_flag) ) >> + { >> + guest_handle = &v->runstate_guest.p->state_entry_time + 1; >> + guest_handle--; >> + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; >> + __raw_copy_to_guest(guest_handle, >> + (void *)(&v->runstate.state_entry_time + 1) - 1, >> + 1); >> + smp_wmb(); >> + } >> - if ( guest_handle_is_null(runstate_guest(v)) ) >> - return; >> + __copy_to_guest(runstate_guest(v), &v->runstate, 1); >> - if ( VM_ASSIST(v->domain, runstate_update_flag) ) >> - { >> - guest_handle = &v->runstate_guest.p->state_entry_time + 1; >> - guest_handle--; >> - v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; >> - __raw_copy_to_guest(guest_handle, >> - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); >> - smp_wmb(); >> + if ( guest_handle ) >> + { >> + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; >> + smp_wmb(); >> + __raw_copy_to_guest(guest_handle, >> + (void *)(&v->runstate.state_entry_time + 1) - 1, >> + 1); >> + } >> } >> - 
__copy_to_guest(runstate_guest(v), &v->runstate, 1); >> - >> - if ( guest_handle ) >> + spin_lock(&v->mapped_runstate_lock); >> + if ( v->mapped_runstate ) > > The code looks a bit odd to me, you seem to allow a guest to provide 2 runstate areas: one using guest virtual address the other using guest physical address. > > It would be best if we prevent a guest to mix match them. Firstly I turned to implementing in that way, but the locking and decissions code become really ugly and complex while trying to cover 'guest's misbehavior' scenarios. > IOW, if the guest provide a physical address first, then *all* the call should be physical address. Alternatively this could be a per vCPU decision. I guess we should agree what to implement first. > >> { >> - v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; >> - smp_wmb(); >> - __raw_copy_to_guest(guest_handle, >> - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); >> + if ( VM_ASSIST(v->domain, runstate_update_flag) ) >> + { >> + v->mapped_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE; >> + smp_wmb(); >> + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; >> + } >> + >> + memcpy(v->mapped_runstate, &v->runstate, sizeof(v->runstate)); >> + >> + if ( VM_ASSIST(v->domain, runstate_update_flag) ) >> + { >> + v->mapped_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE; >> + smp_wmb(); >> + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; >> + } >> } >> + spin_unlock(&v->mapped_runstate_lock); >> + > > NIT: The newline is not necessary here. OK. > >> } >> static void schedule_tail(struct vcpu *prev) >> @@ -998,6 +1021,7 @@ long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a >> { >> case VCPUOP_register_vcpu_info: >> case VCPUOP_register_runstate_memory_area: >> + case VCPUOP_register_runstate_phys_memory_area: >> return do_vcpu_op(cmd, vcpuid, arg); >> default: >> return -EINVAL; > > > [...] 
> >> diff --git a/xen/common/domain.c b/xen/common/domain.c >> index ae22049..6df76c6 100644 >> --- a/xen/common/domain.c >> +++ b/xen/common/domain.c >> @@ -149,6 +149,7 @@ struct vcpu *vcpu_create( >> v->dirty_cpu = VCPU_CPU_CLEAN; >> spin_lock_init(&v->virq_lock); >> + spin_lock_init(&v->mapped_runstate_lock); >> tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); >> @@ -699,6 +700,69 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d) >> return 0; >> } >> +static void _unmap_runstate_area(struct vcpu *v) > A better name would be unamep_runstate_area_locked() so you avoid the reserved name and make clear of the use. OK. > >> +{ >> + mfn_t mfn; >> + >> + if ( !v->mapped_runstate ) >> + return; >> + >> + mfn = _mfn(virt_to_mfn(runstate_guest(v).p)); > > As pointed out by Jan in the previous version: > > The pointer is the result of __map_domain_page_global(). So I don't think you > don't think you can legitimately use virt_to_mfn() on it, at > least not on x86; domain_page_map_to_mfn() is what you > want to use here. Yep. > >> + >> + unmap_domain_page_global((void *) >> + ((unsigned long)v->mapped_runstate & >> + PAGE_MASK)); >> + >> + v->mapped_runstate = NULL; >> + put_page_and_type(mfn_to_page(mfn)); >> +} > > We seem to have this pattern in a few places now (see unmap_guest_page). It would be good to introduce helpers that can be used everywhere (probably lifted from common/event_fifo.c. I'll check. > >> + >> +static int map_runstate_area(struct vcpu *v, >> + struct vcpu_register_runstate_memory_area *area) >> +{ >> + unsigned long offset = area->addr.p & ~PAGE_MASK; >> + gfn_t gfn = gaddr_to_gfn(area->addr.p); >> + struct domain *d = v->domain; >> + void *mapping; >> + struct page_info *page; >> + size_t size = sizeof (struct vcpu_runstate_info ); > > space is not necessary before ). > > But is the variable really necessary? Well, I think it could be dropped. 
> >> + >> + if ( offset > (PAGE_SIZE - size) ) >> + return -EINVAL; >> + >> + page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC); >> + if ( !page ) >> + return -EINVAL; >> + >> + if ( !get_page_type(page, PGT_writable_page) ) >> + { >> + put_page(page); >> + return -EINVAL; >> + } >> + >> + mapping = __map_domain_page_global(page); >> + >> + if ( mapping == NULL ) >> + { >> + put_page_and_type(page); >> + return -ENOMEM; >> + } >> + >> + spin_lock(&v->mapped_runstate_lock); >> + _unmap_runstate_area(v); >> + v->mapped_runstate = mapping + offset; >> + spin_unlock(&v->mapped_runstate_lock); >> + >> + return 0; >> +} >> + >> +static void unmap_runstate_area(struct vcpu *v) >> +{ >> + spin_lock(&v->mapped_runstate_lock); >> + _unmap_runstate_area(v); >> + spin_unlock(&v->mapped_runstate_lock); >> +} >> + >> int domain_kill(struct domain *d) >> { >> int rc = 0; >> @@ -737,7 +801,11 @@ int domain_kill(struct domain *d) >> if ( cpupool_move_domain(d, cpupool0) ) >> return -ERESTART; >> for_each_vcpu ( d, v ) >> + { >> + set_xen_guest_handle(runstate_guest(v), NULL); >> + unmap_runstate_area(v); >> unmap_vcpu_info(v); >> + } >> d->is_dying = DOMDYING_dead; >> /* Mem event cleanup has to go here because the rings >> * have to be put before we call put_domain. 
*/ >> @@ -1192,6 +1260,7 @@ int domain_soft_reset(struct domain *d) >> for_each_vcpu ( d, v ) >> { >> set_xen_guest_handle(runstate_guest(v), NULL); >> + unmap_runstate_area(v); >> unmap_vcpu_info(v); >> } >> @@ -1536,8 +1605,17 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) >> } >> case VCPUOP_register_runstate_phys_memory_area: >> - rc = -EOPNOTSUPP; >> + { >> + struct vcpu_register_runstate_memory_area area; >> + >> + rc = -EFAULT; >> + if ( copy_from_guest(&area, arg, 1) ) >> + break; >> + >> + rc = map_runstate_area(v, &area); >> + >> break; >> + } >> #ifdef VCPU_TRAP_NMI >> case VCPUOP_send_nmi: >> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h >> index 312fec8..3fb6ea2 100644 >> --- a/xen/include/asm-arm/domain.h >> +++ b/xen/include/asm-arm/domain.h >> @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *); >> void vcpu_show_registers(const struct vcpu *); >> void vcpu_switch_to_aarch64_mode(struct vcpu *); >> +void update_runstate_area(struct vcpu *); >> + >> /* >> * Due to the restriction of GICv3, the number of vCPUs in AFF0 is >> * limited to 16, thus only the first 4 bits of AFF0 are legal. 
We will >> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h >> index d1bfc82..ecddcfe 100644 >> --- a/xen/include/xen/domain.h >> +++ b/xen/include/xen/domain.h >> @@ -118,4 +118,6 @@ struct vnuma_info { >> void vnuma_destroy(struct vnuma_info *vnuma); >> +struct vcpu_register_runstate_memory_area; >> + >> #endif /* __XEN_DOMAIN_H__ */ >> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h >> index 748bb0f..2afe31c 100644 >> --- a/xen/include/xen/sched.h >> +++ b/xen/include/xen/sched.h >> @@ -163,15 +163,23 @@ struct vcpu >> void *sched_priv; /* scheduler-specific data */ >> struct vcpu_runstate_info runstate; >> + >> + spinlock_t mapped_runstate_lock; >> + >> #ifndef CONFIG_COMPAT >> # define runstate_guest(v) ((v)->runstate_guest) >> XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ >> + vcpu_runstate_info_t *mapped_runstate; >> #else >> # define runstate_guest(v) ((v)->runstate_guest.native) >> union { >> XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; >> XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; >> } runstate_guest; /* guest address */ >> + union { >> + vcpu_runstate_info_t* native; >> + vcpu_runstate_info_compat_t* compat; >> + } mapped_runstate; /* guest address */ > > The combination of mapped_runstate and runstate_guest is a bit confusing. I think you want to rework the interface to show that only one is possible at the time and make clear which one is used by who. Maybe: As I said before, IMO coupling those interfaces makes the code complicated and ugly. > > union > { > /* Legacy interface to be used when the guest provides a virtual address */ > union { > XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; > ... > } virt; > > /* Interface used when the guest provides a physical address */ > union { > } phys; > } runstate_guest.> > runstate_guest_type /* could be a bool or enum */ > > Jan what do you think? > > Cheers, >
Hi Jan, On 09/05/2019 10:27, Jan Beulich wrote: >>>> On 08.05.19 at 17:40, <julien.grall@arm.com> wrote: >> On 23/04/2019 09:10, Andrii Anisov wrote: >>> --- a/xen/include/xen/sched.h >>> +++ b/xen/include/xen/sched.h >>> @@ -163,15 +163,23 @@ struct vcpu >>> void *sched_priv; /* scheduler-specific data */ >>> >>> struct vcpu_runstate_info runstate; >>> + >>> + spinlock_t mapped_runstate_lock; >>> + >>> #ifndef CONFIG_COMPAT >>> # define runstate_guest(v) ((v)->runstate_guest) >>> XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ >>> + vcpu_runstate_info_t *mapped_runstate; >>> #else >>> # define runstate_guest(v) ((v)->runstate_guest.native) >>> union { >>> XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; >>> XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; >>> } runstate_guest; /* guest address */ >>> + union { >>> + vcpu_runstate_info_t* native; >>> + vcpu_runstate_info_compat_t* compat; >>> + } mapped_runstate; /* guest address */ >> >> The combination of mapped_runstate and runstate_guest is a bit confusing. I >> think you want to rework the interface to show that only one is possible at the >> time and make clear which one is used by who. Maybe: >> >> union >> { >> /* Legacy interface to be used when the guest provides a virtual address */ >> union { >> XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; >> ... >> } virt; >> >> /* Interface used when the guest provides a physical address */ >> union { >> } phys; >> } runstate_guest. >> >> runstate_guest_type /* could be a bool or enum */ >> >> Jan what do you think? > > I fully agree - no mixing of approaches here, if possible. However, > care needs to be taken that after a domain reset the new kernel > can chose the opposite model. Question is whether there are even > other cases where independent components (say boot loader and > OS) may need to be permitted to chose models independently of > one another. Good point. On a similar topic, how does Kexec works on Xen? 
Do we reset the domain as well? Cheers,
>>> On 14.05.19 at 11:35, <julien.grall@arm.com> wrote: > On a similar topic, how does Kexec works on Xen? Do we reset the > domain as well? No, how could we? What gets invoked isn't Xen in the common case, but some other (typically bare metal) OS like Linux. Jan
On 13/05/2019 13:30, Andrii Anisov wrote: > > > On 08.05.19 18:40, Julien Grall wrote: >>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c >>> index 6dc633e..8e24e63 100644 >> >>> { >>> - void __user *guest_handle = NULL; >>> + if ( !guest_handle_is_null(runstate_guest(v)) ) >>> + { >>> + void __user *guest_handle = NULL; >>> + if ( VM_ASSIST(v->domain, runstate_update_flag) ) >>> + { >>> + guest_handle = &v->runstate_guest.p->state_entry_time + 1; >>> + guest_handle--; >>> + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; >>> + __raw_copy_to_guest(guest_handle, >>> + (void *)(&v->runstate.state_entry_time + 1) >>> - 1, >>> + 1); >>> + smp_wmb(); >>> + } >>> - if ( guest_handle_is_null(runstate_guest(v)) ) >>> - return; >>> + __copy_to_guest(runstate_guest(v), &v->runstate, 1); >>> - if ( VM_ASSIST(v->domain, runstate_update_flag) ) >>> - { >>> - guest_handle = &v->runstate_guest.p->state_entry_time + 1; >>> - guest_handle--; >>> - v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; >>> - __raw_copy_to_guest(guest_handle, >>> - (void *)(&v->runstate.state_entry_time + 1) - 1, >>> 1); >>> - smp_wmb(); >>> + if ( guest_handle ) >>> + { >>> + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; >>> + smp_wmb(); >>> + __raw_copy_to_guest(guest_handle, >>> + (void *)(&v->runstate.state_entry_time + 1) >>> - 1, >>> + 1); >>> + } >>> } >>> - __copy_to_guest(runstate_guest(v), &v->runstate, 1); >>> - >>> - if ( guest_handle ) >>> + spin_lock(&v->mapped_runstate_lock); >>> + if ( v->mapped_runstate ) >> >> The code looks a bit odd to me, you seem to allow a guest to provide 2 >> runstate areas: one using guest virtual address the other using guest physical >> address. >> >> It would be best if we prevent a guest to mix match them. > > Firstly I turned to implementing in that way, but the locking and decissions > code become really ugly and complex while trying to cover 'guest's misbehavior' > scenarios. 
I think it is possible to have a simple version taking the decision on which method to use. You can either use the spin_lock to protect everything or use something like: update_runstate_area(): if ( xchg(&v->runstate_in_use, 1) ) return; switch ( v->runstate_type ) { GVADDR: update_runstate_by_gvaddr(); GPADDR: update_runstate_by_gpaddr(); } xchg(&v->runstate_in_use, 0); registering an area while ( xchg(&v->runstate_in_use, 1) ); /* Clean-up and registering the area */ > >> IOW, if the guest provide a physical address first, then *all* the call should >> be physical address. Alternatively this could be a per vCPU decision. > > I guess we should agree what to implement first. I think there are an agreement that the two methods should not be used together. [..] >>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h >>> index 312fec8..3fb6ea2 100644 >>> --- a/xen/include/asm-arm/domain.h >>> +++ b/xen/include/asm-arm/domain.h >>> @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *); >>> void vcpu_show_registers(const struct vcpu *); >>> void vcpu_switch_to_aarch64_mode(struct vcpu *); >>> +void update_runstate_area(struct vcpu *); >>> + >>> /* >>> * Due to the restriction of GICv3, the number of vCPUs in AFF0 is >>> * limited to 16, thus only the first 4 bits of AFF0 are legal. 
We will >>> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h >>> index d1bfc82..ecddcfe 100644 >>> --- a/xen/include/xen/domain.h >>> +++ b/xen/include/xen/domain.h >>> @@ -118,4 +118,6 @@ struct vnuma_info { >>> void vnuma_destroy(struct vnuma_info *vnuma); >>> +struct vcpu_register_runstate_memory_area; >>> + >>> #endif /* __XEN_DOMAIN_H__ */ >>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h >>> index 748bb0f..2afe31c 100644 >>> --- a/xen/include/xen/sched.h >>> +++ b/xen/include/xen/sched.h >>> @@ -163,15 +163,23 @@ struct vcpu >>> void *sched_priv; /* scheduler-specific data */ >>> struct vcpu_runstate_info runstate; >>> + >>> + spinlock_t mapped_runstate_lock; >>> + >>> #ifndef CONFIG_COMPAT >>> # define runstate_guest(v) ((v)->runstate_guest) >>> XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ >>> + vcpu_runstate_info_t *mapped_runstate; >>> #else >>> # define runstate_guest(v) ((v)->runstate_guest.native) >>> union { >>> XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; >>> XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; >>> } runstate_guest; /* guest address */ >>> + union { >>> + vcpu_runstate_info_t* native; >>> + vcpu_runstate_info_compat_t* compat; >>> + } mapped_runstate; /* guest address */ >> > The combination of mapped_runstate and runstate_guest is a bit confusing. I >> think you want to rework the interface to show that only one is possible at >> the time and make clear which one is used by who. Maybe: > > As I said before, IMO coupling those interfaces makes the code complicated and > ugly. Well, I can't see how it can be ugly (see my example above). Cheers,
Hello Julien, On 14.05.19 12:58, Julien Grall wrote: >> I guess we should agree what to implement first. > > I think there are an agreement that the two methods should not be used together. For a domain or for a particular VCPU? How should we respond to the request to register paddr when we already have registered vaddr and vice versa?
On 14/05/2019 10:48, Jan Beulich wrote: >>>> On 14.05.19 at 11:35, <julien.grall@arm.com> wrote: >> On a similar topic, how does Kexec works on Xen? Do we reset the >> domain as well? > > No, how could we? What gets invoked isn't Xen in the common case, > but some other (typically bare metal) OS like Linux. That is not what I asked. What I asked is whether Xen is involved when a guest kernel is kexecing to another kernel. I don't know enough about Kexec to be able to answer that question myself. Cheers,
On 14/05/2019 11:08, Andrii Anisov wrote: > Hello Julien, > > On 14.05.19 12:58, Julien Grall wrote: >>> I guess we should agree what to implement first. >> >> I think there are an agreement that the two methods should not be used together. > > For a domain or for a particular VCPU? > How should we response on the request to register paddr when we already have > registered vaddr and vice versa? From the discussion with Jan, you would tear down the previous solution and then register the new version. So this allows cases like a bootloader and a kernel using different versions of the interface. Cheers,
>>> On 14.05.19 at 13:23, <julien.grall@arm.com> wrote: > On 14/05/2019 10:48, Jan Beulich wrote: >>>>> On 14.05.19 at 11:35, <julien.grall@arm.com> wrote: >>> On a similar topic, how does Kexec works on Xen? Do we reset the >>> domain as well? >> >> No, how could we? What gets invoked isn't Xen in the common case, >> but some other (typically bare metal) OS like Linux. > > It is not what I asked. What I asked is whether Xen is involved when a guest > kernel is kexecing to another kernel. I don't think I've ever heard of such a thing (outside of perhaps using domain reset), so I don't think I can answer the (originally ambiguous) question then. Jan
On 14.05.19 14:24, Julien Grall wrote: >>> I think there are an agreement that the two methods should not be used together. >> >> For a domain or for a particular VCPU? >> How should we response on the request to register paddr when we already have registered vaddr and vice versa? > > From the discussion with Jan, you would tear down the previous solution and then > register the new version. So this allows cases like a bootloader and a kernel using different version of the interface. I'm not sure Jan stated that; he rather questioned that. Jan, could you please confirm you agree that the already registered runstate should be dropped and (potentially) the version changed in case of a new register request?
>>> On 14.05.19 at 13:45, <andrii.anisov@gmail.com> wrote: > On 14.05.19 14:24, Julien Grall wrote: >>>> I think there are an agreement that the two methods should not be used together. >>> >>> For a domain or for a particular VCPU? >>> How should we response on the request to register paddr when we already have registered vaddr and vice versa? >> >> From the discussion with Jan, you would tear down the previous solution and then >> register the new version. So this allows cases like a bootloader and a > kernel using different version of the interface. > > I'm not sure Jan stated that, he rather questioned that. > > Jan, could you please confirm you agree that it should be dropped already > registered runstate and (potentially) changed version in case of the new > register request? Well, I think Julian's implication was that we can't support in particular the boot loader -> kernel handover case without extra measures (if at all), and hence he added things together and said what results from this. Of course ideally we'd reject mixed requests, but unless someone can come up with a clever means of how to determine entity boundaries I'm afraid this is not going to be possible without breaking certain setups. Jan
On 14.05.19 15:02, Jan Beulich wrote: > Well, I think Julian's implication was that we can't support in particular > the boot loader -> kernel handover case without extra measures (if > at all), and hence he added things together and said what results > from this. Of course ideally we'd reject mixed requests, but unless > someone can come up with a clever means of how to determine entity > boundaries I'm afraid this is not going to be possible without breaking > certain setups. From my understanding, if we are speaking of different entities in a domain and their boundaries, we have to define unregister interface as well. So that those entities would be able to take care of themselves explicitly.
On 14/05/2019 14:05, Andrii Anisov wrote: > > > On 14.05.19 15:02, Jan Beulich wrote: >> Well, I think Julian's implication was that we can't support in particular >> the boot loader -> kernel handover case without extra measures (if >> at all), and hence he added things together and said what results >> from this. Of course ideally we'd reject mixed requests, but unless >> someone can come up with a clever means of how to determine entity >> boundaries I'm afraid this is not going to be possible without breaking >> certain setups. > > From my understanding, if we are speaking of different entities in a domain and > their boundaries, we have to define unregister interface as well. So that those > entities would be able to take care of themselves explicitly. You have to keep in mind that existing OS have to run on newer Xen without any modification. The existing hypercall allows you to: 1) De-register an interface using the value 0. 2) Replacing a current existing interface You probably can't use 2) for a bootloader -> kernel handover because we are dealing with guest virtual address. There is an high chance the virtual address space layout is going to be different or even turning off MMU for a bit (done on Arm). So you have to use 1) otherwise you might write in a random place in memory. I am not entirely sure whether there are actual value for 2). The only reason I can think of is if you want to move around the runstate in your virtual address space. But that's sounds a bit weird at least on Arm. For the new hypercall, I think we at least want 1) (with a magic value TBD). 2) might be helpful in the case the bootloader didn't do the right thing or we are using Kexec to boot a new kernel. This would also be safer as physical address could be excluded more easily. 2) should not be too difficult to implement. It is just a matter of clean-up whatever was used previous before registering the new interface. Cheers,
>>> On 14.05.19 at 15:05, <andrii.anisov@gmail.com> wrote: > On 14.05.19 15:02, Jan Beulich wrote: >> Well, I think Julian's implication was that we can't support in particular >> the boot loader -> kernel handover case without extra measures (if >> at all), and hence he added things together and said what results >> from this. Of course ideally we'd reject mixed requests, but unless >> someone can come up with a clever means of how to determine entity >> boundaries I'm afraid this is not going to be possible without breaking >> certain setups. > > From my understanding, if we are speaking of different entities in a domain > and their boundaries, we have to define unregister interface as well. So that > those entities would be able to take care of themselves explicitly. If this was a concern only for newly written code, this would be fine. But you need to make sure all existing code also continues to work with whatever new interface you implement. Just because a kernel uses your new physical address based mechanism doesn't mean the boot loader knows to unregister what it has registered. Jan
Hello Jan, On 14.05.19 16:49, Jan Beulich wrote: > If this was a concern only for newly written code, this would be fine. > But you need to make sure all existing code also continues to work > with whatever new interface you implement. And that is one more reason why I tend to introduce the new interface in parallel to be fully independent from the old one. But not do a mixed implementation as you and Julien suggest. > Just because a kernel > uses your new physical address based mechanism doesn't mean the > boot loader knows to unregister what it has registered. As Julien already said, the current interface has an implicit mechanism to unregister runstate area. Also if the bootloader misses unregistering runstate area with the current interface, we already have the broken system. So it is really up to the guest system to take care about its possible transitions.
On 14.05.19 16:49, Julien Grall wrote: > You have to keep in mind that existing OS have to run on newer Xen without any modification. As I just written to Jan, it is one more reason to keep those interfaces living in parallel and do not mix their implementation. > The existing hypercall allows you to: > 1) De-register an interface using the value 0. My current implementation can easily be updated with the same behavior. > 2) Replacing a current existing interface > You probably can't use 2) for a bootloader -> kernel handover because we are dealing with guest virtual address. There is an high chance the virtual address space layout is going to be different or even turning off MMU for a bit (done on Arm). So you have to use 1) otherwise you might write in a random place in memory. This definitely not the way to handle transitions between systems in a guest domain. > I am not entirely sure whether there are actual value for 2). The only reason I can think of is if you want to move around the runstate in your virtual address space. But that's sounds a bit weird at least on Arm. > For the new hypercall, I think we at least want 1) (with a magic value TBD). The magic value 0x0 can easily be introduced. > 2) might be helpful in the case the bootloader didn't do the right thing or we are using Kexec to boot a new kernel. This would also be safer as physical address could be excluded more easily. But the new system have to get some knowledge about the previous phys addr is reserved (used by hypervisor), and do not use it prior to registering new runstate area. Providing such a knowledge is something (e.g.) the bootloader should take care of. But, IMO, it is better to require from (e.g.) the bootloader to unregister its runstate area prior to switching to the new system.
Hi Andrii, On 15/05/2019 10:04, Andrii Anisov wrote: > > > On 14.05.19 16:49, Julien Grall wrote: >> You have to keep in mind that existing OS have to run on newer Xen without any >> modification. > > As I just written to Jan, it is one more reason to keep those interfaces living > in parallel and do not mix their implementation. There are actually no good reason for a guest to register via the two interfaces at the same time. The more we want to encourage the OS developer to switch to the new interface. I also provided in my previous e-mails way to make the two working together without much trouble. > >> The existing hypercall allows you to: >> 1) De-register an interface using the value 0. > > My current implementation can easily be updated with the same behavior. > >> 2) Replacing a current existing interface > >> You probably can't use 2) for a bootloader -> kernel handover because we are >> dealing with guest virtual address. There is an high chance the virtual >> address space layout is going to be different or even turning off MMU for a >> bit (done on Arm). So you have to use 1) otherwise you might write in a random >> place in memory. > > This definitely not the way to handle transitions between systems in a guest > domain. > >> I am not entirely sure whether there are actual value for 2). The only reason >> I can think of is if you want to move around the runstate in your virtual >> address space. But that's sounds a bit weird at least on Arm. >> For the new hypercall, I think we at least want 1) (with a magic value TBD). > > The magic value 0x0 can easily be introduced. 0x0 is not an option. It could be a valid physical address. We need a value that cannot be used by anyone. > >> 2) might be helpful in the case the bootloader didn't do the right thing or >> we are using Kexec to boot a new kernel. This would also be safer as physical >> address could be excluded more easily. 
> > But the new system have to get some knowledge about the previous phys addr is > reserved (used by hypervisor), and do not use it prior to registering new > runstate area. > Providing such a knowledge is something (e.g.) the bootloader should take care > of. But, IMO, it is better to require from (e.g.) the bootloader to unregister > its runstate area prior to switching to the new system. Well, if a bootloader keep some part in memory (such as for handling runtime services), it will usually mark those pages are reserved. So it can't be used by the kernel. But here, the point is it would not be difficult to handle 2). So why would you try to forbid it? Cheers,
>>> On 15.05.19 at 10:44, <andrii.anisov@gmail.com> wrote: > On 14.05.19 16:49, Jan Beulich wrote: >> If this was a concern only for newly written code, this would be fine. >> But you need to make sure all existing code also continues to work >> with whatever new interface you implement. > > And that is one more reason why I tend to introduce the new interface in > parallel to be fully independent from the old one. > But not do a mixed implementation as you and Julien suggest. What behavior guests see and how it is implemented in the hypervisor are two largely independent things. That is, we could choose to forbid mixing of registration methods while still having some level of code sharing between how both hypercall variants get actually processed. Jan
>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote: > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -163,15 +163,23 @@ struct vcpu > void *sched_priv; /* scheduler-specific data */ > > struct vcpu_runstate_info runstate; > + > + spinlock_t mapped_runstate_lock; Besides other comments given elsewhere already - does this really need to be a per-vCPU lock? Guests aren't expected to invoke the API frequently, so quite likely a per-domain lock would suffice. Quite possibly domain_{,un}lock() could actually be (re-)used. Jan
Hello Jan, On 16.05.19 15:09, Jan Beulich wrote: >>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote: >> --- a/xen/include/xen/sched.h >> +++ b/xen/include/xen/sched.h >> @@ -163,15 +163,23 @@ struct vcpu >> void *sched_priv; /* scheduler-specific data */ >> >> struct vcpu_runstate_info runstate; >> + >> + spinlock_t mapped_runstate_lock; > > Besides other comments given elsewhere already - does this > really need to be a per-vCPU lock? Guests aren't expected to > invoke the API frequently, so quite likely a per-domain lock > would suffice. Quite possibly domain_{,un}lock() could > actually be (re-)used. I'd not use a per-domain lock here. This lock will be locked on every runstate area update, what's happening on every context switch. And the event of simultaneous switching of vcpus from the same domain has quite high probability.
Hello Jan, On 16.05.19 15:09, Jan Beulich wrote: >>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote: >> --- a/xen/include/xen/sched.h >> +++ b/xen/include/xen/sched.h >> @@ -163,15 +163,23 @@ struct vcpu >> void *sched_priv; /* scheduler-specific data */ >> >> struct vcpu_runstate_info runstate; >> + >> + spinlock_t mapped_runstate_lock; > > Besides other comments given elsewhere already - does this > really need to be a per-vCPU lock? Guests aren't expected to > invoke the API frequently, so quite likely a per-domain lock > would suffice. Quite possibly domain_{,un}lock() could > actually be (re-)used. I'd not use a per-domain lock here. This lock will be locked on every runstate area update, what's happening on every context switch. And the event of simultaneous switching of several vcpus from the same domain has quite high probability.
On 16/05/2019 14:30, Andrii Anisov wrote: > Hello Jan, > > On 16.05.19 15:09, Jan Beulich wrote: >>>>> On 23.04.19 at 10:10, <andrii.anisov@gmail.com> wrote: >>> --- a/xen/include/xen/sched.h >>> +++ b/xen/include/xen/sched.h >>> @@ -163,15 +163,23 @@ struct vcpu >>> void *sched_priv; /* scheduler-specific data */ >>> struct vcpu_runstate_info runstate; >>> + >>> + spinlock_t mapped_runstate_lock; >> >> Besides other comments given elsewhere already - does this >> really need to be a per-vCPU lock? Guests aren't expected to >> invoke the API frequently, so quite likely a per-domain lock >> would suffice. Quite possibly domain_{,un}lock() could >> actually be (re-)used. > > I'd not use a per-domain lock here. This lock will be locked on every runstate > area update, what's happening on every context switch. And the event of > simultaneous switching of several vcpus from the same domain has quite high > probability. The lock can be completely removed anyway. See my previous comments. Cheers,
Hello Julien,
On 16.05.19 16:48, Julien Grall wrote:
> The lock can be completely removed anyway. See my previous comments.
You suggested a kind of simplified try_lock, with the runstate update skipped in case of failure.
The question here is whether we are OK with not updating the runstate under some circumstances.
Though even in this case, the question might be whether we need `runstate_in_use` per VCPU or per domain. My answer is per VCPU.
Hi Andrii, On 16/05/2019 15:25, Andrii Anisov wrote: > Hello Julien, > > On 16.05.19 16:48, Julien Grall wrote: >> The lock can be completely removed anyway. See my previous comments. > > You suggested kinda simplified try_lock with runstate update skipping in case of > fail. > The question here is if we are OK with not updating runstate under circumstances? Well, if you fail the check then it means someone was modifying it behind your back. That someone could update the runstate with the new information once it is modified. So I can't see issue with not updating the runstate in the context switch. Cheers,
On 16.05.19 17:28, Julien Grall wrote:
> Well, if you fail the check then it means someone was modifying it behind your back. That someone could update the runstate with the new information once it is modified. So I can't see issue with not updating the runstate in the context switch.
That's fair enough.
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c index 6dc633e..8e24e63 100644 --- a/xen/arch/arm/domain.c +++ b/xen/arch/arm/domain.c @@ -275,32 +275,55 @@ static void ctxt_switch_to(struct vcpu *n) } /* Update per-VCPU guest runstate shared memory area (if registered). */ -static void update_runstate_area(struct vcpu *v) +void update_runstate_area(struct vcpu *v) { - void __user *guest_handle = NULL; + if ( !guest_handle_is_null(runstate_guest(v)) ) + { + void __user *guest_handle = NULL; + if ( VM_ASSIST(v->domain, runstate_update_flag) ) + { + guest_handle = &v->runstate_guest.p->state_entry_time + 1; + guest_handle--; + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; + __raw_copy_to_guest(guest_handle, + (void *)(&v->runstate.state_entry_time + 1) - 1, + 1); + smp_wmb(); + } - if ( guest_handle_is_null(runstate_guest(v)) ) - return; + __copy_to_guest(runstate_guest(v), &v->runstate, 1); - if ( VM_ASSIST(v->domain, runstate_update_flag) ) - { - guest_handle = &v->runstate_guest.p->state_entry_time + 1; - guest_handle--; - v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; - __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); - smp_wmb(); + if ( guest_handle ) + { + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + smp_wmb(); + __raw_copy_to_guest(guest_handle, + (void *)(&v->runstate.state_entry_time + 1) - 1, + 1); + } } - __copy_to_guest(runstate_guest(v), &v->runstate, 1); - - if ( guest_handle ) + spin_lock(&v->mapped_runstate_lock); + if ( v->mapped_runstate ) { - v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; - smp_wmb(); - __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + if ( VM_ASSIST(v->domain, runstate_update_flag) ) + { + v->mapped_runstate->state_entry_time |= XEN_RUNSTATE_UPDATE; + smp_wmb(); + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; + } + + memcpy(v->mapped_runstate, &v->runstate, sizeof(v->runstate)); + + if ( 
VM_ASSIST(v->domain, runstate_update_flag) ) + { + v->mapped_runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE; + smp_wmb(); + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + } } + spin_unlock(&v->mapped_runstate_lock); + } static void schedule_tail(struct vcpu *prev) @@ -998,6 +1021,7 @@ long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a { case VCPUOP_register_vcpu_info: case VCPUOP_register_runstate_memory_area: + case VCPUOP_register_runstate_phys_memory_area: return do_vcpu_op(cmd, vcpuid, arg); default: return -EINVAL; diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 9eaa978..46c2219 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1558,51 +1558,98 @@ void paravirt_ctxt_switch_to(struct vcpu *v) wrmsr_tsc_aux(v->arch.msrs->tsc_aux); } -/* Update per-VCPU guest runstate shared memory area (if registered). */ -bool update_runstate_area(struct vcpu *v) +static void update_mapped_runstate_area_native(struct vcpu *v) { - bool rc; - struct guest_memory_policy policy = { .nested_guest_mode = false }; - void __user *guest_handle = NULL; - - if ( guest_handle_is_null(runstate_guest(v)) ) - return true; - - update_guest_memory_policy(v, &policy); - if ( VM_ASSIST(v->domain, runstate_update_flag) ) { - guest_handle = has_32bit_shinfo(v->domain) - ? 
&v->runstate_guest.compat.p->state_entry_time + 1 - : &v->runstate_guest.native.p->state_entry_time + 1; - guest_handle--; v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; - __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + v->mapped_runstate.native->state_entry_time |= XEN_RUNSTATE_UPDATE; smp_wmb(); } - if ( has_32bit_shinfo(v->domain) ) + memcpy(v->mapped_runstate.native, &v->runstate, sizeof(v->runstate)); + + if ( VM_ASSIST(v->domain, runstate_update_flag) ) { - struct compat_vcpu_runstate_info info; + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + v->mapped_runstate.native->state_entry_time &= ~XEN_RUNSTATE_UPDATE; + smp_wmb(); + } +} - XLAT_vcpu_runstate_info(&info, &v->runstate); - __copy_to_guest(v->runstate_guest.compat, &info, 1); - rc = true; +static void update_mapped_runstate_area_compat(struct vcpu *v) +{ + if ( VM_ASSIST(v->domain, runstate_update_flag) ) + { + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; + v->mapped_runstate.compat->state_entry_time |= XEN_RUNSTATE_UPDATE; + smp_wmb(); } - else - rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) != - sizeof(v->runstate); - if ( guest_handle ) + memcpy(v->mapped_runstate.compat, &v->runstate, sizeof(v->runstate)); + + if ( VM_ASSIST(v->domain, runstate_update_flag) ) { v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + v->mapped_runstate.compat->state_entry_time &= ~XEN_RUNSTATE_UPDATE; smp_wmb(); - __raw_copy_to_guest(guest_handle, - (void *)(&v->runstate.state_entry_time + 1) - 1, 1); } +} - update_guest_memory_policy(v, &policy); +/* Update per-VCPU guest runstate shared memory area (if registered). 
*/ +bool update_runstate_area(struct vcpu *v) +{ + bool rc = true; + + if ( !guest_handle_is_null(runstate_guest(v)) ) + { + struct guest_memory_policy policy = { .nested_guest_mode = false }; + void __user *guest_handle = NULL; + + update_guest_memory_policy(v, &policy); + if ( VM_ASSIST(v->domain, runstate_update_flag) ) + { + guest_handle = has_32bit_shinfo(v->domain) + ? &v->runstate_guest.compat.p->state_entry_time + 1 + : &v->runstate_guest.native.p->state_entry_time + 1; + guest_handle--; + v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; + __raw_copy_to_guest(guest_handle, + (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + smp_wmb(); + } + + if ( has_32bit_shinfo(v->domain) ) + { + struct compat_vcpu_runstate_info info; + + XLAT_vcpu_runstate_info(&info, &v->runstate); + __copy_to_guest(v->runstate_guest.compat, &info, 1); + rc = true; + } + else + rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) != + sizeof(v->runstate); + + if ( guest_handle ) + { + v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; + smp_wmb(); + __raw_copy_to_guest(guest_handle, + (void *)(&v->runstate.state_entry_time + 1) - 1, 1); + } + update_guest_memory_policy(v, &policy); + } + + spin_lock(v->mapped_runstate_lock); + if ( v->mapped_runstate ) + { + if ( has_32bit_shinfo((v)->domain) ) + update_mapped_runstate_area_compat(v); + else + update_mapped_runstate_area_native(v); + } + spin_unlock(v->mapped_runstate_lock); return rc; } diff --git a/xen/common/domain.c b/xen/common/domain.c index ae22049..6df76c6 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -149,6 +149,7 @@ struct vcpu *vcpu_create( v->dirty_cpu = VCPU_CPU_CLEAN; spin_lock_init(&v->virq_lock); + spin_lock_init(&v->mapped_runstate_lock); tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); @@ -699,6 +700,69 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d) return 0; } +static void _unmap_runstate_area(struct vcpu *v) +{ + mfn_t mfn; + + if ( !v->mapped_runstate 
) + return; + + mfn = _mfn(virt_to_mfn(runstate_guest(v).p)); + + unmap_domain_page_global((void *) + ((unsigned long)v->mapped_runstate & + PAGE_MASK)); + + v->mapped_runstate = NULL; + put_page_and_type(mfn_to_page(mfn)); +} + +static int map_runstate_area(struct vcpu *v, + struct vcpu_register_runstate_memory_area *area) +{ + unsigned long offset = area->addr.p & ~PAGE_MASK; + gfn_t gfn = gaddr_to_gfn(area->addr.p); + struct domain *d = v->domain; + void *mapping; + struct page_info *page; + size_t size = sizeof (struct vcpu_runstate_info ); + + if ( offset > (PAGE_SIZE - size) ) + return -EINVAL; + + page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC); + if ( !page ) + return -EINVAL; + + if ( !get_page_type(page, PGT_writable_page) ) + { + put_page(page); + return -EINVAL; + } + + mapping = __map_domain_page_global(page); + + if ( mapping == NULL ) + { + put_page_and_type(page); + return -ENOMEM; + } + + spin_lock(&v->mapped_runstate_lock); + _unmap_runstate_area(v); + v->mapped_runstate = mapping + offset; + spin_unlock(&v->mapped_runstate_lock); + + return 0; +} + +static void unmap_runstate_area(struct vcpu *v) +{ + spin_lock(&v->mapped_runstate_lock); + _unmap_runstate_area(v); + spin_unlock(&v->mapped_runstate_lock); +} + int domain_kill(struct domain *d) { int rc = 0; @@ -737,7 +801,11 @@ int domain_kill(struct domain *d) if ( cpupool_move_domain(d, cpupool0) ) return -ERESTART; for_each_vcpu ( d, v ) + { + set_xen_guest_handle(runstate_guest(v), NULL); + unmap_runstate_area(v); unmap_vcpu_info(v); + } d->is_dying = DOMDYING_dead; /* Mem event cleanup has to go here because the rings * have to be put before we call put_domain. 
*/ @@ -1192,6 +1260,7 @@ int domain_soft_reset(struct domain *d) for_each_vcpu ( d, v ) { set_xen_guest_handle(runstate_guest(v), NULL); + unmap_runstate_area(v); unmap_vcpu_info(v); } @@ -1536,8 +1605,17 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) } case VCPUOP_register_runstate_phys_memory_area: - rc = -EOPNOTSUPP; + { + struct vcpu_register_runstate_memory_area area; + + rc = -EFAULT; + if ( copy_from_guest(&area, arg, 1) ) + break; + + rc = map_runstate_area(v, &area); + break; + } #ifdef VCPU_TRAP_NMI case VCPUOP_send_nmi: diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h index 312fec8..3fb6ea2 100644 --- a/xen/include/asm-arm/domain.h +++ b/xen/include/asm-arm/domain.h @@ -217,6 +217,8 @@ void vcpu_show_execution_state(struct vcpu *); void vcpu_show_registers(const struct vcpu *); void vcpu_switch_to_aarch64_mode(struct vcpu *); +void update_runstate_area(struct vcpu *); + /* * Due to the restriction of GICv3, the number of vCPUs in AFF0 is * limited to 16, thus only the first 4 bits of AFF0 are legal. 
We will diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h index d1bfc82..ecddcfe 100644 --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -118,4 +118,6 @@ struct vnuma_info { void vnuma_destroy(struct vnuma_info *vnuma); +struct vcpu_register_runstate_memory_area; + #endif /* __XEN_DOMAIN_H__ */ diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 748bb0f..2afe31c 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -163,15 +163,23 @@ struct vcpu void *sched_priv; /* scheduler-specific data */ struct vcpu_runstate_info runstate; + + spinlock_t mapped_runstate_lock; + #ifndef CONFIG_COMPAT # define runstate_guest(v) ((v)->runstate_guest) XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ + vcpu_runstate_info_t *mapped_runstate; #else # define runstate_guest(v) ((v)->runstate_guest.native) union { XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; } runstate_guest; /* guest address */ + union { + vcpu_runstate_info_t* native; + vcpu_runstate_info_compat_t* compat; + } mapped_runstate; /* guest address */ #endif /* last time when vCPU is scheduled out */