Message ID | 1467312332-3098-1-git-send-email-czuzu@bitdefender.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 06/30/16 21:45, Corneliu ZUZU wrote: > The arch_vm_event structure is dynamically allocated and freed @ > vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user > disables domain monitoring (xc_monitor_disable), which in turn effectively > discards any information that was in arch_vm_event.write_data. > > But this can yield unexpected behavior since if a CR-write was awaiting to be > committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data) > before xc_monitor_disable is called, then the domain CR write is wrongfully > ignored, which of course, in these cases, can easily render a domain crash. > > To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically > allocated instead of the whole arch_vm_event structure. With this we can avoid > invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up > only emul_read_data and emulate_flags @ vm_event_cleanup_domain. > > Small note: arch_vm_event structure definition needed to be moved from > asm-x86/vm_event.h to asm-x86/domain.h in the process. > > Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com> > --- > xen/arch/x86/domain.c | 5 ++-- > xen/arch/x86/hvm/emulate.c | 8 +++--- > xen/arch/x86/hvm/hvm.c | 62 ++++++++++++++++++------------------------ > xen/arch/x86/mm/p2m.c | 4 +-- > xen/arch/x86/monitor.c | 7 +---- > xen/arch/x86/vm_event.c | 16 +++++------ > xen/include/asm-x86/domain.h | 42 +++++++++++++++++----------- > xen/include/asm-x86/monitor.h | 3 +- > xen/include/asm-x86/vm_event.h | 10 ------- > 9 files changed, 73 insertions(+), 84 deletions(-) > > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c > index bb59247..06e68ae 100644 > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v) > > void vcpu_destroy(struct vcpu *v) > { > - xfree(v->arch.vm_event); > - v->arch.vm_event = NULL; > + v->arch.vm_event.emulate_flags = 0; > + xfree(v->arch.vm_event.emul_read_data); > + v->arch.vm_event.emul_read_data = NULL; > > if ( is_pv_32bit_vcpu(v) ) > { > diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c > index 855af4d..68f5515 100644 > --- a/xen/arch/x86/hvm/emulate.c > +++ b/xen/arch/x86/hvm/emulate.c > @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int size) > { > struct vcpu *curr = current; > > - if ( curr->arch.vm_event ) > + if ( curr->arch.vm_event.emul_read_data ) > { > unsigned int safe_size = > - min(size, curr->arch.vm_event->emul_read_data.size); > + min(size, curr->arch.vm_event.emul_read_data->size); > > - memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size); > + memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size); > memset(buffer + safe_size, 0, size - safe_size); > return X86EMUL_OKAY; > } > @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear( > * vm_event being triggered for repeated writes to a whole page. > */ > if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) && > - current->arch.vm_event->emulate_flags != 0 ) > + current->arch.vm_event.emulate_flags != 0 ) > max_reps = 1; > > /* > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > index 884ae40..03dffb8 100644 > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) > if ( !handle_hvm_io_completion(v) ) > return; > > - if ( unlikely(v->arch.vm_event) ) > + if ( unlikely(v->arch.vm_event.emulate_flags) ) > { > - if ( v->arch.vm_event->emulate_flags ) > - { > - enum emul_kind kind = EMUL_KIND_NORMAL; > + enum emul_kind kind; > > - if ( v->arch.vm_event->emulate_flags & > - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) > - kind = EMUL_KIND_SET_CONTEXT; > - else if ( v->arch.vm_event->emulate_flags & > - VM_EVENT_FLAG_EMULATE_NOWRITE ) > - kind = EMUL_KIND_NOWRITE; > + ASSERT(v->arch.vm_event.emul_read_data); > > - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, > - HVM_DELIVER_NO_ERROR_CODE); > + kind = EMUL_KIND_NORMAL; Why do the "kind = EMUL_KIND_NORMAL;" assignment separately after the ASSERT()? Could it not be left the same way as before ("enum emul_kind kind = EMUL_KIND_NORMAL;") above the ASSERT()? It's not a big change and I won't hold the patch over it, but small changes add up in the review process so unnecessary changes are best either avoided, or done in a standalone cleanup patch. > - v->arch.vm_event->emulate_flags = 0; > - } > + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) > + kind = EMUL_KIND_SET_CONTEXT; > + else if ( v->arch.vm_event.emulate_flags & > + VM_EVENT_FLAG_EMULATE_NOWRITE ) > + kind = EMUL_KIND_NOWRITE; > + > + hvm_mem_access_emulate_one(kind, TRAP_invalid_op, > + HVM_DELIVER_NO_ERROR_CODE); > + > + v->arch.vm_event.emulate_flags = 0; > } > > arch_monitor_write_data(v); > @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer) > if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & > monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) ) > { > - ASSERT(v->arch.vm_event); > - > if ( hvm_monitor_crX(CR0, value, old_value) ) > { > /* > * The actual write will occur in arch_monitor_write_data(), if > * permitted. > */ > - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); > - v->arch.vm_event->write_data.status = MWS_CR0; > - v->arch.vm_event->write_data.value = value; > + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); > + v->arch.vm_event.write_data.status = MWS_CR0; > + v->arch.vm_event.write_data.value = value; > > return X86EMUL_OKAY; > } > @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer) > if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & > monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) ) > { > - ASSERT(v->arch.vm_event); > - > if ( hvm_monitor_crX(CR3, value, old) ) > { > /* > * The actual write will occur in arch_monitor_write_data(), if > * permitted. > */ > - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); > - v->arch.vm_event->write_data.status = MWS_CR3; > - v->arch.vm_event->write_data.value = value; > + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); > + v->arch.vm_event.write_data.status = MWS_CR3; > + v->arch.vm_event.write_data.value = value; > > return X86EMUL_OKAY; > } > @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer) > if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & > monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ) > { > - ASSERT(v->arch.vm_event); > - > if ( hvm_monitor_crX(CR4, value, old_cr) ) > { > /* > * The actual write will occur in arch_monitor_write_data(), if > * permitted. > */ > - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); > - v->arch.vm_event->write_data.status = MWS_CR4; > - v->arch.vm_event->write_data.value = value; > + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); > + v->arch.vm_event.write_data.status = MWS_CR4; > + v->arch.vm_event.write_data.value = value; > > return X86EMUL_OKAY; > } > @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content, > > if ( may_defer && unlikely(monitored_msr(v->domain, msr)) ) > { > - ASSERT(v->arch.vm_event); > - > /* > * The actual write will occur in arch_monitor_write_data(), if > * permitted. > */ > - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); > - v->arch.vm_event->write_data.status = MWS_MSR; > - v->arch.vm_event->write_data.msr = msr; > - v->arch.vm_event->write_data.value = msr_content; > + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); > + v->arch.vm_event.write_data.status = MWS_MSR; > + v->arch.vm_event.write_data.msr = msr; > + v->arch.vm_event.write_data.value = msr_content; > > hvm_monitor_msr(msr, msr_content); > return X86EMUL_OKAY; > diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c > index 16733a4..9bcaa8a 100644 > --- a/xen/arch/x86/mm/p2m.c > +++ b/xen/arch/x86/mm/p2m.c > @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v, > } > } > > - v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0; > + v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0; > > if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) ) > - v->arch.vm_event->emul_read_data = rsp->data.emul_read_data; > + *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data; > } > } > > diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c > index 5c8d4da..88d14ae 100644 > --- a/xen/arch/x86/monitor.c > +++ b/xen/arch/x86/monitor.c > @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d) > > void arch_monitor_write_data(struct vcpu *v) > { > - struct monitor_write_data *w; > - > - if ( likely(!v->arch.vm_event) ) > - return; > - > - w = &v->arch.vm_event->write_data; > + struct monitor_write_data *w = &v->arch.vm_event.write_data; > > if ( likely(MWS_NOWRITE == w->status) ) > return; > diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c > index 825da48..f21ff10 100644 > --- a/xen/arch/x86/vm_event.c > +++ b/xen/arch/x86/vm_event.c > @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d) > > for_each_vcpu ( d, v ) > { > - if ( v->arch.vm_event ) > + if ( v->arch.vm_event.emul_read_data ) > continue; > > - v->arch.vm_event = xzalloc(struct arch_vm_event); > + v->arch.vm_event.emul_read_data = > + xzalloc(struct vm_event_emul_read_data); > > - if ( !v->arch.vm_event ) > + if ( !v->arch.vm_event.emul_read_data ) > return -ENOMEM; > } > > @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d) > > for_each_vcpu ( d, v ) > { > - xfree(v->arch.vm_event); > - v->arch.vm_event = NULL; > + v->arch.vm_event.emulate_flags = 0; > + xfree(v->arch.vm_event.emul_read_data); > + v->arch.vm_event.emul_read_data = NULL; > } > > d->arch.mem_access_emulate_each_rep = 0; > @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp) > { > if ( rsp->flags & VM_EVENT_FLAG_DENY ) > { > - ASSERT(v->arch.vm_event); > - > /* deny flag requires the vCPU to be paused */ > if ( !atomic_read(&v->vm_event_pause_count) ) > return; > > - v->arch.vm_event->write_data.status = MWS_NOWRITE; > + v->arch.vm_event.write_data.status = MWS_NOWRITE; > } > } > > diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h > index a22ee6b..7ea5c8f 100644 > --- a/xen/include/asm-x86/domain.h > +++ b/xen/include/asm-x86/domain.h > @@ -259,21 +259,6 @@ struct pv_domain > struct cpuidmasks *cpuidmasks; > }; > > -enum monitor_write_status > -{ > - MWS_NOWRITE = 0, > - MWS_MSR, > - MWS_CR0, > - MWS_CR3, > - MWS_CR4, > -}; > - > -struct monitor_write_data { > - enum monitor_write_status status; > - uint32_t msr; > - uint64_t value; > -}; > - > struct arch_domain > { > struct page_info *perdomain_l3_pg; > @@ -496,6 +481,31 @@ typedef enum __packed { > SMAP_CHECK_DISABLED, /* disable the check */ > } smap_check_policy_t; > > +enum monitor_write_status > +{ > + MWS_NOWRITE = 0, > + MWS_MSR, > + MWS_CR0, > + MWS_CR3, > + MWS_CR4, > +}; > + > +struct monitor_write_data { > + enum monitor_write_status status; > + uint32_t msr; > + uint64_t value; > +}; > + > +/* > + * Should we emulate the next matching instruction on VCPU resume > + * after a vm_event? > + */ > +struct arch_vm_event { > + uint32_t emulate_flags; > + struct vm_event_emul_read_data *emul_read_data; > + struct monitor_write_data write_data; > +}; > + > struct arch_vcpu > { > /* > @@ -569,7 +579,7 @@ struct arch_vcpu > /* A secondary copy of the vcpu time info. */ > XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest; > > - struct arch_vm_event *vm_event; > + struct arch_vm_event vm_event; > }; > > smap_check_policy_t smap_policy_change(struct vcpu *v, > diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h > index 1068376..984ac4c 100644 > --- a/xen/include/asm-x86/monitor.h > +++ b/xen/include/asm-x86/monitor.h > @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop) > * Enabling mem_access_emulate_each_rep without a vm_event subscriber > * is meaningless. > */ > - if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event ) > + if ( d->max_vcpus && d->vcpu[0] && > + d->vcpu[0]->arch.vm_event.emul_read_data ) Again, I won't hold the patch over this, but if there are additional reviews that require changes and cause another version of it, please add a small line to the comment above the if, stating that emul_read_data only gets allocated when vm_event gets enabled, otherwise (especially for newcomers) that check might look confusing. Otherwise: Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com> Thanks, Razvan
On 7/1/2016 9:47 AM, Razvan Cojocaru wrote: > On 06/30/16 21:45, Corneliu ZUZU wrote: >> The arch_vm_event structure is dynamically allocated and freed @ >> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user >> disables domain monitoring (xc_monitor_disable), which in turn effectively >> discards any information that was in arch_vm_event.write_data. >> >> But this can yield unexpected behavior since if a CR-write was awaiting to be >> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data) >> before xc_monitor_disable is called, then the domain CR write is wrongfully >> ignored, which of course, in these cases, can easily render a domain crash. >> >> To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically >> allocated instead of the whole arch_vm_event structure. With this we can avoid >> invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up >> only emul_read_data and emulate_flags @ vm_event_cleanup_domain. >> >> Small note: arch_vm_event structure definition needed to be moved from >> asm-x86/vm_event.h to asm-x86/domain.h in the process. >> >> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com> >> --- >> xen/arch/x86/domain.c | 5 ++-- >> xen/arch/x86/hvm/emulate.c | 8 +++--- >> xen/arch/x86/hvm/hvm.c | 62 ++++++++++++++++++------------------------ >> xen/arch/x86/mm/p2m.c | 4 +-- >> xen/arch/x86/monitor.c | 7 +---- >> xen/arch/x86/vm_event.c | 16 +++++------ >> xen/include/asm-x86/domain.h | 42 +++++++++++++++++----------- >> xen/include/asm-x86/monitor.h | 3 +- >> xen/include/asm-x86/vm_event.h | 10 ------- >> 9 files changed, 73 insertions(+), 84 deletions(-) >> >> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c >> index bb59247..06e68ae 100644 >> --- a/xen/arch/x86/domain.c >> +++ b/xen/arch/x86/domain.c >> @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v) >> >> void vcpu_destroy(struct vcpu *v) >> { >> - xfree(v->arch.vm_event); >> - v->arch.vm_event = NULL; >> + v->arch.vm_event.emulate_flags = 0; >> + xfree(v->arch.vm_event.emul_read_data); >> + v->arch.vm_event.emul_read_data = NULL; >> >> if ( is_pv_32bit_vcpu(v) ) >> { >> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c >> index 855af4d..68f5515 100644 >> --- a/xen/arch/x86/hvm/emulate.c >> +++ b/xen/arch/x86/hvm/emulate.c >> @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int size) >> { >> struct vcpu *curr = current; >> >> - if ( curr->arch.vm_event ) >> + if ( curr->arch.vm_event.emul_read_data ) >> { >> unsigned int safe_size = >> - min(size, curr->arch.vm_event->emul_read_data.size); >> + min(size, curr->arch.vm_event.emul_read_data->size); >> >> - memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size); >> + memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size); >> memset(buffer + safe_size, 0, size - safe_size); >> return X86EMUL_OKAY; >> } >> @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear( >> * vm_event being triggered for repeated writes to a whole page. >> */ >> if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) && >> - current->arch.vm_event->emulate_flags != 0 ) >> + current->arch.vm_event.emulate_flags != 0 ) >> max_reps = 1; >> >> /* >> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c >> index 884ae40..03dffb8 100644 >> --- a/xen/arch/x86/hvm/hvm.c >> +++ b/xen/arch/x86/hvm/hvm.c >> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >> if ( !handle_hvm_io_completion(v) ) >> return; >> >> - if ( unlikely(v->arch.vm_event) ) >> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >> { >> - if ( v->arch.vm_event->emulate_flags ) >> - { >> - enum emul_kind kind = EMUL_KIND_NORMAL; >> + enum emul_kind kind; >> >> - if ( v->arch.vm_event->emulate_flags & >> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >> - kind = EMUL_KIND_SET_CONTEXT; >> - else if ( v->arch.vm_event->emulate_flags & >> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >> - kind = EMUL_KIND_NOWRITE; >> + ASSERT(v->arch.vm_event.emul_read_data); >> >> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >> - HVM_DELIVER_NO_ERROR_CODE); >> + kind = EMUL_KIND_NORMAL; > Why do the "kind = EMUL_KIND_NORMAL;" assignment separately after the > ASSERT()? Could it not be left the same way as before ("enum emul_kind > kind = EMUL_KIND_NORMAL;") above the ASSERT()? > > It's not a big change and I won't hold the patch over it, but small > changes add up in the review process so unnecessary changes are best > either avoided, or done in a standalone cleanup patch. > >> - v->arch.vm_event->emulate_flags = 0; >> - } >> + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >> + kind = EMUL_KIND_SET_CONTEXT; >> + else if ( v->arch.vm_event.emulate_flags & >> + VM_EVENT_FLAG_EMULATE_NOWRITE ) >> + kind = EMUL_KIND_NOWRITE; >> + >> + hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >> + HVM_DELIVER_NO_ERROR_CODE); >> + >> + v->arch.vm_event.emulate_flags = 0; >> } >> >> arch_monitor_write_data(v); >> @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer) >> if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & >> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) ) >> { >> - ASSERT(v->arch.vm_event); >> - >> if ( hvm_monitor_crX(CR0, value, old_value) ) >> { >> /* >> * The actual write will occur in arch_monitor_write_data(), if >> * permitted. >> */ >> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >> - v->arch.vm_event->write_data.status = MWS_CR0; >> - v->arch.vm_event->write_data.value = value; >> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >> + v->arch.vm_event.write_data.status = MWS_CR0; >> + v->arch.vm_event.write_data.value = value; >> >> return X86EMUL_OKAY; >> } >> @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer) >> if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & >> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) ) >> { >> - ASSERT(v->arch.vm_event); >> - >> if ( hvm_monitor_crX(CR3, value, old) ) >> { >> /* >> * The actual write will occur in arch_monitor_write_data(), if >> * permitted. >> */ >> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >> - v->arch.vm_event->write_data.status = MWS_CR3; >> - v->arch.vm_event->write_data.value = value; >> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >> + v->arch.vm_event.write_data.status = MWS_CR3; >> + v->arch.vm_event.write_data.value = value; >> >> return X86EMUL_OKAY; >> } >> @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer) >> if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & >> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ) >> { >> - ASSERT(v->arch.vm_event); >> - >> if ( hvm_monitor_crX(CR4, value, old_cr) ) >> { >> /* >> * The actual write will occur in arch_monitor_write_data(), if >> * permitted. >> */ >> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >> - v->arch.vm_event->write_data.status = MWS_CR4; >> - v->arch.vm_event->write_data.value = value; >> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >> + v->arch.vm_event.write_data.status = MWS_CR4; >> + v->arch.vm_event.write_data.value = value; >> >> return X86EMUL_OKAY; >> } >> @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content, >> >> if ( may_defer && unlikely(monitored_msr(v->domain, msr)) ) >> { >> - ASSERT(v->arch.vm_event); >> - >> /* >> * The actual write will occur in arch_monitor_write_data(), if >> * permitted. >> */ >> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >> - v->arch.vm_event->write_data.status = MWS_MSR; >> - v->arch.vm_event->write_data.msr = msr; >> - v->arch.vm_event->write_data.value = msr_content; >> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >> + v->arch.vm_event.write_data.status = MWS_MSR; >> + v->arch.vm_event.write_data.msr = msr; >> + v->arch.vm_event.write_data.value = msr_content; >> >> hvm_monitor_msr(msr, msr_content); >> return X86EMUL_OKAY; >> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c >> index 16733a4..9bcaa8a 100644 >> --- a/xen/arch/x86/mm/p2m.c >> +++ b/xen/arch/x86/mm/p2m.c >> @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v, >> } >> } >> >> - v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0; >> + v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0; >> >> if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) ) >> - v->arch.vm_event->emul_read_data = rsp->data.emul_read_data; >> + *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data; >> } >> } >> >> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c >> index 5c8d4da..88d14ae 100644 >> --- a/xen/arch/x86/monitor.c >> +++ b/xen/arch/x86/monitor.c >> @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d) >> >> void arch_monitor_write_data(struct vcpu *v) >> { >> - struct monitor_write_data *w; >> - >> - if ( likely(!v->arch.vm_event) ) >> - return; >> - >> - w = &v->arch.vm_event->write_data; >> + struct monitor_write_data *w = &v->arch.vm_event.write_data; >> >> if ( likely(MWS_NOWRITE == w->status) ) >> return; >> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c >> index 825da48..f21ff10 100644 >> --- a/xen/arch/x86/vm_event.c >> +++ b/xen/arch/x86/vm_event.c >> @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d) >> >> for_each_vcpu ( d, v ) >> { >> - if ( v->arch.vm_event ) >> + if ( v->arch.vm_event.emul_read_data ) >> continue; >> >> - v->arch.vm_event = xzalloc(struct arch_vm_event); >> + v->arch.vm_event.emul_read_data = >> + xzalloc(struct vm_event_emul_read_data); >> >> - if ( !v->arch.vm_event ) >> + if ( !v->arch.vm_event.emul_read_data ) >> return -ENOMEM; >> } >> >> @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d) >> >> for_each_vcpu ( d, v ) >> { >> - xfree(v->arch.vm_event); >> - v->arch.vm_event = NULL; >> + v->arch.vm_event.emulate_flags = 0; >> + xfree(v->arch.vm_event.emul_read_data); >> + v->arch.vm_event.emul_read_data = NULL; >> } >> >> d->arch.mem_access_emulate_each_rep = 0; >> @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp) >> { >> if ( rsp->flags & VM_EVENT_FLAG_DENY ) >> { >> - ASSERT(v->arch.vm_event); >> - >> /* deny flag requires the vCPU to be paused */ >> if ( !atomic_read(&v->vm_event_pause_count) ) >> return; >> >> - v->arch.vm_event->write_data.status = MWS_NOWRITE; >> + v->arch.vm_event.write_data.status = MWS_NOWRITE; >> } >> } >> >> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h >> index a22ee6b..7ea5c8f 100644 >> --- a/xen/include/asm-x86/domain.h >> +++ b/xen/include/asm-x86/domain.h >> @@ -259,21 +259,6 @@ struct pv_domain >> struct cpuidmasks *cpuidmasks; >> }; >> >> -enum monitor_write_status >> -{ >> - MWS_NOWRITE = 0, >> - MWS_MSR, >> - MWS_CR0, >> - MWS_CR3, >> - MWS_CR4, >> -}; >> - >> -struct monitor_write_data { >> - enum monitor_write_status status; >> - uint32_t msr; >> - uint64_t value; >> -}; >> - >> struct arch_domain >> { >> struct page_info *perdomain_l3_pg; >> @@ -496,6 +481,31 @@ typedef enum __packed { >> SMAP_CHECK_DISABLED, /* disable the check */ >> } smap_check_policy_t; >> >> +enum monitor_write_status >> +{ >> + MWS_NOWRITE = 0, >> + MWS_MSR, >> + MWS_CR0, >> + MWS_CR3, >> + MWS_CR4, >> +}; >> + >> +struct monitor_write_data { >> + enum monitor_write_status status; >> + uint32_t msr; >> + uint64_t value; >> +}; >> + >> +/* >> + * Should we emulate the next matching instruction on VCPU resume >> + * after a vm_event? >> + */ >> +struct arch_vm_event { >> + uint32_t emulate_flags; >> + struct vm_event_emul_read_data *emul_read_data; >> + struct monitor_write_data write_data; >> +}; >> + >> struct arch_vcpu >> { >> /* >> @@ -569,7 +579,7 @@ struct arch_vcpu >> /* A secondary copy of the vcpu time info. */ >> XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest; >> >> - struct arch_vm_event *vm_event; >> + struct arch_vm_event vm_event; >> }; >> >> smap_check_policy_t smap_policy_change(struct vcpu *v, >> diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h >> index 1068376..984ac4c 100644 >> --- a/xen/include/asm-x86/monitor.h >> +++ b/xen/include/asm-x86/monitor.h >> @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop) >> * Enabling mem_access_emulate_each_rep without a vm_event subscriber >> * is meaningless. >> */ >> - if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event ) >> + if ( d->max_vcpus && d->vcpu[0] && >> + d->vcpu[0]->arch.vm_event.emul_read_data ) > Again, I won't hold the patch over this, but if there are additional > reviews that require changes and cause another version of it, please add > a small line to the comment above the if, stating that emul_read_data > only gets allocated when vm_event gets enabled, otherwise (especially > for newcomers) that check might look confusing. > > Otherwise: > > Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com> > > > Thanks, > Razvan > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel Since that came up wouldn't it be even nicer if we add a: #define vm_event_initialized_on_vcpu(v) (NULL != (v)->arch.vm_event.emul_read_data) in asm-x86/vm_event.h above vm_event_init_domain and use that everywhere instead? Corneliu.
On 07/01/16 09:56, Corneliu ZUZU wrote: > On 7/1/2016 9:47 AM, Razvan Cojocaru wrote: >> On 06/30/16 21:45, Corneliu ZUZU wrote: >>> The arch_vm_event structure is dynamically allocated and freed @ >>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the >>> toolstack user >>> disables domain monitoring (xc_monitor_disable), which in turn >>> effectively >>> discards any information that was in arch_vm_event.write_data. >>> >>> But this can yield unexpected behavior since if a CR-write was >>> awaiting to be >>> committed on the scheduling tail >>> (hvm_do_resume->arch_monitor_write_data) >>> before xc_monitor_disable is called, then the domain CR write is >>> wrongfully >>> ignored, which of course, in these cases, can easily render a domain >>> crash. >>> >>> To fix the issue, this patch makes only arch_vm_event.emul_read_data >>> dynamically >>> allocated instead of the whole arch_vm_event structure. With this we >>> can avoid >>> invalidation of an awaiting arch_vm_event.write_data by selectively >>> cleaning up >>> only emul_read_data and emulate_flags @ vm_event_cleanup_domain. >>> >>> Small note: arch_vm_event structure definition needed to be moved from >>> asm-x86/vm_event.h to asm-x86/domain.h in the process. >>> >>> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com> >>> --- >>> xen/arch/x86/domain.c | 5 ++-- >>> xen/arch/x86/hvm/emulate.c | 8 +++--- >>> xen/arch/x86/hvm/hvm.c | 62 >>> ++++++++++++++++++------------------------ >>> xen/arch/x86/mm/p2m.c | 4 +-- >>> xen/arch/x86/monitor.c | 7 +---- >>> xen/arch/x86/vm_event.c | 16 +++++------ >>> xen/include/asm-x86/domain.h | 42 +++++++++++++++++----------- >>> xen/include/asm-x86/monitor.h | 3 +- >>> xen/include/asm-x86/vm_event.h | 10 ------- >>> 9 files changed, 73 insertions(+), 84 deletions(-) >>> >>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c >>> index bb59247..06e68ae 100644 >>> --- a/xen/arch/x86/domain.c >>> +++ b/xen/arch/x86/domain.c >>> @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v) >>> void vcpu_destroy(struct vcpu *v) >>> { >>> - xfree(v->arch.vm_event); >>> - v->arch.vm_event = NULL; >>> + v->arch.vm_event.emulate_flags = 0; >>> + xfree(v->arch.vm_event.emul_read_data); >>> + v->arch.vm_event.emul_read_data = NULL; >>> if ( is_pv_32bit_vcpu(v) ) >>> { >>> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c >>> index 855af4d..68f5515 100644 >>> --- a/xen/arch/x86/hvm/emulate.c >>> +++ b/xen/arch/x86/hvm/emulate.c >>> @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, >>> unsigned int size) >>> { >>> struct vcpu *curr = current; >>> - if ( curr->arch.vm_event ) >>> + if ( curr->arch.vm_event.emul_read_data ) >>> { >>> unsigned int safe_size = >>> - min(size, curr->arch.vm_event->emul_read_data.size); >>> + min(size, curr->arch.vm_event.emul_read_data->size); >>> - memcpy(buffer, curr->arch.vm_event->emul_read_data.data, >>> safe_size); >>> + memcpy(buffer, curr->arch.vm_event.emul_read_data->data, >>> safe_size); >>> memset(buffer + safe_size, 0, size - safe_size); >>> return X86EMUL_OKAY; >>> } >>> @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear( >>> * vm_event being triggered for repeated writes to a whole page. >>> */ >>> if ( >>> unlikely(current->domain->arch.mem_access_emulate_each_rep) && >>> - current->arch.vm_event->emulate_flags != 0 ) >>> + current->arch.vm_event.emulate_flags != 0 ) >>> max_reps = 1; >>> /* >>> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c >>> index 884ae40..03dffb8 100644 >>> --- a/xen/arch/x86/hvm/hvm.c >>> +++ b/xen/arch/x86/hvm/hvm.c >>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >>> if ( !handle_hvm_io_completion(v) ) >>> return; >>> - if ( unlikely(v->arch.vm_event) ) >>> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >>> { >>> - if ( v->arch.vm_event->emulate_flags ) >>> - { >>> - enum emul_kind kind = EMUL_KIND_NORMAL; >>> + enum emul_kind kind; >>> - if ( v->arch.vm_event->emulate_flags & >>> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>> - kind = EMUL_KIND_SET_CONTEXT; >>> - else if ( v->arch.vm_event->emulate_flags & >>> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >>> - kind = EMUL_KIND_NOWRITE; >>> + ASSERT(v->arch.vm_event.emul_read_data); >>> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >>> - HVM_DELIVER_NO_ERROR_CODE); >>> + kind = EMUL_KIND_NORMAL; >> Why do the "kind = EMUL_KIND_NORMAL;" assignment separately after the >> ASSERT()? Could it not be left the same way as before ("enum emul_kind >> kind = EMUL_KIND_NORMAL;") above the ASSERT()? >> >> It's not a big change and I won't hold the patch over it, but small >> changes add up in the review process so unnecessary changes are best >> either avoided, or done in a standalone cleanup patch. >> >>> - v->arch.vm_event->emulate_flags = 0; >>> - } >>> + if ( v->arch.vm_event.emulate_flags & >>> VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>> + kind = EMUL_KIND_SET_CONTEXT; >>> + else if ( v->arch.vm_event.emulate_flags & >>> + VM_EVENT_FLAG_EMULATE_NOWRITE ) >>> + kind = EMUL_KIND_NOWRITE; >>> + >>> + hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >>> + HVM_DELIVER_NO_ERROR_CODE); >>> + >>> + v->arch.vm_event.emulate_flags = 0; >>> } >>> arch_monitor_write_data(v); >>> @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t >>> may_defer) >>> if ( may_defer && >>> unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & >>> >>> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) ) >>> { >>> - ASSERT(v->arch.vm_event); >>> - >>> if ( hvm_monitor_crX(CR0, value, old_value) ) >>> { >>> /* >>> * The actual write will occur in >>> arch_monitor_write_data(), if >>> * permitted. >>> */ >>> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >>> - v->arch.vm_event->write_data.status = MWS_CR0; >>> - v->arch.vm_event->write_data.value = value; >>> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >>> + v->arch.vm_event.write_data.status = MWS_CR0; >>> + v->arch.vm_event.write_data.value = value; >>> return X86EMUL_OKAY; >>> } >>> @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t >>> may_defer) >>> if ( may_defer && >>> unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & >>> >>> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) ) >>> { >>> - ASSERT(v->arch.vm_event); >>> - >>> if ( hvm_monitor_crX(CR3, value, old) ) >>> { >>> /* >>> * The actual write will occur in >>> arch_monitor_write_data(), if >>> * permitted. >>> */ >>> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >>> - v->arch.vm_event->write_data.status = MWS_CR3; >>> - v->arch.vm_event->write_data.value = value; >>> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >>> + v->arch.vm_event.write_data.status = MWS_CR3; >>> + v->arch.vm_event.write_data.value = value; >>> return X86EMUL_OKAY; >>> } >>> @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t >>> may_defer) >>> if ( may_defer && >>> unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & >>> >>> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ) >>> { >>> - ASSERT(v->arch.vm_event); >>> - >>> if ( hvm_monitor_crX(CR4, value, old_cr) ) >>> { >>> /* >>> * The actual write will occur in >>> arch_monitor_write_data(), if >>> * permitted. >>> */ >>> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >>> - v->arch.vm_event->write_data.status = MWS_CR4; >>> - v->arch.vm_event->write_data.value = value; >>> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >>> + v->arch.vm_event.write_data.status = MWS_CR4; >>> + v->arch.vm_event.write_data.value = value; >>> return X86EMUL_OKAY; >>> } >>> @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, >>> uint64_t msr_content, >>> if ( may_defer && unlikely(monitored_msr(v->domain, msr)) ) >>> { >>> - ASSERT(v->arch.vm_event); >>> - >>> /* >>> * The actual write will occur in >>> arch_monitor_write_data(), if >>> * permitted. >>> */ >>> - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); >>> - v->arch.vm_event->write_data.status = MWS_MSR; >>> - v->arch.vm_event->write_data.msr = msr; >>> - v->arch.vm_event->write_data.value = msr_content; >>> + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); >>> + v->arch.vm_event.write_data.status = MWS_MSR; >>> + v->arch.vm_event.write_data.msr = msr; >>> + v->arch.vm_event.write_data.value = msr_content; >>> hvm_monitor_msr(msr, msr_content); >>> return X86EMUL_OKAY; >>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c >>> index 16733a4..9bcaa8a 100644 >>> --- a/xen/arch/x86/mm/p2m.c >>> +++ b/xen/arch/x86/mm/p2m.c >>> @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu >>> *v, >>> } >>> } >>> - v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0; >>> + v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0; >>> if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) ) >>> - v->arch.vm_event->emul_read_data = >>> rsp->data.emul_read_data; >>> + *v->arch.vm_event.emul_read_data = >>> rsp->data.emul_read_data; >>> } >>> } >>> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c >>> index 5c8d4da..88d14ae 100644 >>> --- a/xen/arch/x86/monitor.c >>> +++ b/xen/arch/x86/monitor.c >>> @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d) >>> void arch_monitor_write_data(struct vcpu *v) >>> { >>> - struct monitor_write_data *w; >>> - >>> - if ( likely(!v->arch.vm_event) ) >>> - return; >>> - >>> - w = &v->arch.vm_event->write_data; >>> + struct monitor_write_data *w = &v->arch.vm_event.write_data; >>> if ( likely(MWS_NOWRITE == w->status) ) >>> return; >>> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c >>> index 825da48..f21ff10 100644 >>> --- a/xen/arch/x86/vm_event.c >>> +++ b/xen/arch/x86/vm_event.c >>> @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d) >>> for_each_vcpu ( d, v ) >>> { >>> - if ( v->arch.vm_event ) >>> + if ( v->arch.vm_event.emul_read_data ) >>> continue; >>> - v->arch.vm_event = xzalloc(struct arch_vm_event); >>> + v->arch.vm_event.emul_read_data = >>> + xzalloc(struct vm_event_emul_read_data); >>> - if ( !v->arch.vm_event ) >>> + if ( !v->arch.vm_event.emul_read_data ) >>> return -ENOMEM; >>> } >>> @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d) >>> for_each_vcpu ( d, v ) >>> { >>> - xfree(v->arch.vm_event); >>> - v->arch.vm_event = NULL; >>> + v->arch.vm_event.emulate_flags = 0; >>> + xfree(v->arch.vm_event.emul_read_data); >>> + v->arch.vm_event.emul_read_data = NULL; >>> } >>> d->arch.mem_access_emulate_each_rep = 0; >>> @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu >>> *v, vm_event_response_t *rsp) >>> { >>> if ( rsp->flags & VM_EVENT_FLAG_DENY ) >>> { >>> - ASSERT(v->arch.vm_event); >>> - >>> /* deny flag requires the vCPU to be paused */ >>> if ( !atomic_read(&v->vm_event_pause_count) ) >>> return; >>> - v->arch.vm_event->write_data.status = MWS_NOWRITE; >>> + v->arch.vm_event.write_data.status = MWS_NOWRITE; >>> } >>> } >>> diff --git a/xen/include/asm-x86/domain.h >>> b/xen/include/asm-x86/domain.h >>> index a22ee6b..7ea5c8f 100644 >>> --- a/xen/include/asm-x86/domain.h >>> +++ b/xen/include/asm-x86/domain.h >>> @@ -259,21 +259,6 @@ struct pv_domain >>> struct cpuidmasks *cpuidmasks; >>> }; >>> -enum monitor_write_status >>> -{ >>> - MWS_NOWRITE = 0, >>> - MWS_MSR, >>> - MWS_CR0, >>> - MWS_CR3, >>> - MWS_CR4, >>> -}; >>> - >>> -struct monitor_write_data { >>> - enum monitor_write_status status; >>> - uint32_t msr; >>> - uint64_t value; >>> -}; >>> - >>> struct arch_domain >>> { >>> struct page_info *perdomain_l3_pg; >>> @@ -496,6 +481,31 @@ typedef enum __packed { >>> SMAP_CHECK_DISABLED, /* disable the check */ >>> } smap_check_policy_t; >>> +enum monitor_write_status >>> +{ >>> + MWS_NOWRITE = 0, >>> + MWS_MSR, >>> + MWS_CR0, >>> + MWS_CR3, >>> + MWS_CR4, >>> +}; >>> + >>> +struct monitor_write_data { >>> + enum monitor_write_status status; >>> + uint32_t msr; >>> + uint64_t value; >>> +}; >>> + >>> +/* >>> + * Should we emulate the next matching instruction on VCPU resume >>> + * after a vm_event? >>> + */ >>> +struct arch_vm_event { >>> + uint32_t emulate_flags; >>> + struct vm_event_emul_read_data *emul_read_data; >>> + struct monitor_write_data write_data; >>> +}; >>> + >>> struct arch_vcpu >>> { >>> /* >>> @@ -569,7 +579,7 @@ struct arch_vcpu >>> /* A secondary copy of the vcpu time info. */ >>> XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest; >>> - struct arch_vm_event *vm_event; >>> + struct arch_vm_event vm_event; >>> }; >>> smap_check_policy_t smap_policy_change(struct vcpu *v, >>> diff --git a/xen/include/asm-x86/monitor.h >>> b/xen/include/asm-x86/monitor.h >>> index 1068376..984ac4c 100644 >>> --- a/xen/include/asm-x86/monitor.h >>> +++ b/xen/include/asm-x86/monitor.h >>> @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct >>> xen_domctl_monitor_op *mop) >>> * Enabling mem_access_emulate_each_rep without a vm_event >>> subscriber >>> * is meaningless. >>> */ >>> - if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event ) >>> + if ( d->max_vcpus && d->vcpu[0] && >>> + d->vcpu[0]->arch.vm_event.emul_read_data ) >> Again, I won't hold the patch over this, but if there are additional >> reviews that require changes and cause another version of it, please add >> a small line to the comment above the if, stating that emul_read_data >> only gets allocated when vm_event gets enabled, otherwise (especially >> for newcomers) that check might look confusing. >> >> Otherwise: >> >> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com> >> >> >> Thanks, >> Razvan >> >> _______________________________________________ >> Xen-devel mailing list >> Xen-devel@lists.xen.org >> http://lists.xen.org/xen-devel > > Since that came up wouldn't it be even nicer if we add a: > > #define vm_event_initialized_on_vcpu(v) (NULL != > (v)->arch.vm_event.emul_read_data) > > in asm-x86/vm_event.h above vm_event_init_domain and use that everywhere > instead? Yes, I think that's the best way to go about that. Thanks, Razvan
>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: > The arch_vm_event structure is dynamically allocated and freed @ > vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user > disables domain monitoring (xc_monitor_disable), which in turn effectively > discards any information that was in arch_vm_event.write_data. Isn't that rather a toolstack user bug, not warranting a relatively extensive (even if mostly mechanical) hypervisor change like this one? Sane monitor behavior, after all, is required anyway for the monitored guest to survive. > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) > if ( !handle_hvm_io_completion(v) ) > return; > > - if ( unlikely(v->arch.vm_event) ) > + if ( unlikely(v->arch.vm_event.emulate_flags) ) > { > - if ( v->arch.vm_event->emulate_flags ) > - { > - enum emul_kind kind = EMUL_KIND_NORMAL; > + enum emul_kind kind; > > - if ( v->arch.vm_event->emulate_flags & > - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) > - kind = EMUL_KIND_SET_CONTEXT; > - else if ( v->arch.vm_event->emulate_flags & > - VM_EVENT_FLAG_EMULATE_NOWRITE ) > - kind = EMUL_KIND_NOWRITE; > + ASSERT(v->arch.vm_event.emul_read_data); > > - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, > - HVM_DELIVER_NO_ERROR_CODE); > + kind = EMUL_KIND_NORMAL; Please keep this being the initializer of the variable. > > - v->arch.vm_event->emulate_flags = 0; > - } > + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) Long line. > --- a/xen/include/asm-x86/domain.h > +++ b/xen/include/asm-x86/domain.h > @@ -259,21 +259,6 @@ struct pv_domain > struct cpuidmasks *cpuidmasks; > }; > > -enum monitor_write_status > -{ > - MWS_NOWRITE = 0, > - MWS_MSR, > - MWS_CR0, > - MWS_CR3, > - MWS_CR4, > -}; > - > -struct monitor_write_data { > - enum monitor_write_status status; > - uint32_t msr; > - uint64_t value; > -}; > - > struct arch_domain > { > struct page_info *perdomain_l3_pg; > @@ -496,6 +481,31 @@ typedef enum __packed { > SMAP_CHECK_DISABLED, /* disable the check */ > } smap_check_policy_t; > > +enum monitor_write_status > +{ > + MWS_NOWRITE = 0, > + MWS_MSR, > + MWS_CR0, > + MWS_CR3, > + MWS_CR4, > +}; > + > +struct monitor_write_data { > + enum monitor_write_status status; > + uint32_t msr; > + uint64_t value; > +}; Instead of moving these around now, may I suggest you put them into their final place right away in the previous patch? Jan
On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >> The arch_vm_event structure is dynamically allocated and freed @ >> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user >> disables domain monitoring (xc_monitor_disable), which in turn effectively >> discards any information that was in arch_vm_event.write_data. > Isn't that rather a toolstack user bug, not warranting a relatively > extensive (even if mostly mechanical) hypervisor change like this > one? Sane monitor behavior, after all, is required anyway for the > monitored guest to survive. Sorry but could you please rephrase this, I don't quite understand what you're saying. The write_data field in arch_vm_event should _not ever_ be invalidated as a direct result of a toolstack user's action. >> --- a/xen/arch/x86/hvm/hvm.c >> +++ b/xen/arch/x86/hvm/hvm.c >> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >> if ( !handle_hvm_io_completion(v) ) >> return; >> >> - if ( unlikely(v->arch.vm_event) ) >> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >> { >> - if ( v->arch.vm_event->emulate_flags ) >> - { >> - enum emul_kind kind = EMUL_KIND_NORMAL; >> + enum emul_kind kind; >> >> - if ( v->arch.vm_event->emulate_flags & >> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >> - kind = EMUL_KIND_SET_CONTEXT; >> - else if ( v->arch.vm_event->emulate_flags & >> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >> - kind = EMUL_KIND_NOWRITE; >> + ASSERT(v->arch.vm_event.emul_read_data); >> >> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >> - HVM_DELIVER_NO_ERROR_CODE); >> + kind = EMUL_KIND_NORMAL; > Please keep this being the initializer of the variable. I put it there because of the ASSERT (to do that before anything else), but I will undo if you prefer. > >> >> - v->arch.vm_event->emulate_flags = 0; >> - } >> + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) > Long line. Long but under 80 columns, isn't that the rule? :-) > >> --- a/xen/include/asm-x86/domain.h >> +++ b/xen/include/asm-x86/domain.h >> @@ -259,21 +259,6 @@ struct pv_domain >> struct cpuidmasks *cpuidmasks; >> }; >> >> -enum monitor_write_status >> -{ >> - MWS_NOWRITE = 0, >> - MWS_MSR, >> - MWS_CR0, >> - MWS_CR3, >> - MWS_CR4, >> -}; >> - >> -struct monitor_write_data { >> - enum monitor_write_status status; >> - uint32_t msr; >> - uint64_t value; >> -}; >> - >> struct arch_domain >> { >> struct page_info *perdomain_l3_pg; >> @@ -496,6 +481,31 @@ typedef enum __packed { >> SMAP_CHECK_DISABLED, /* disable the check */ >> } smap_check_policy_t; >> >> +enum monitor_write_status >> +{ >> + MWS_NOWRITE = 0, >> + MWS_MSR, >> + MWS_CR0, >> + MWS_CR3, >> + MWS_CR4, >> +}; >> + >> +struct monitor_write_data { >> + enum monitor_write_status status; >> + uint32_t msr; >> + uint64_t value; >> +}; > Instead of moving these around now, may I suggest you put them > into their final place right away in the previous patch? > > Jan > > Sounds good, will do. Corneliu.
>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: > On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>> The arch_vm_event structure is dynamically allocated and freed @ >>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user >>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>> discards any information that was in arch_vm_event.write_data. >> Isn't that rather a toolstack user bug, not warranting a relatively >> extensive (even if mostly mechanical) hypervisor change like this >> one? Sane monitor behavior, after all, is required anyway for the >> monitored guest to survive. > > Sorry but could you please rephrase this, I don't quite understand what > you're saying. > The write_data field in arch_vm_event should _not ever_ be invalidated > as a direct result of a toolstack user's action. The monitoring app can cause all kinds of problems to the guest it monitors. Why would this specific one need taking care of in the hypervisor, instead of demanding that the app not disable monitoring at the wrong time? >>> --- a/xen/arch/x86/hvm/hvm.c >>> +++ b/xen/arch/x86/hvm/hvm.c >>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >>> if ( !handle_hvm_io_completion(v) ) >>> return; >>> >>> - if ( unlikely(v->arch.vm_event) ) >>> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >>> { >>> - if ( v->arch.vm_event->emulate_flags ) >>> - { >>> - enum emul_kind kind = EMUL_KIND_NORMAL; >>> + enum emul_kind kind; >>> >>> - if ( v->arch.vm_event->emulate_flags & >>> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>> - kind = EMUL_KIND_SET_CONTEXT; >>> - else if ( v->arch.vm_event->emulate_flags & >>> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >>> - kind = EMUL_KIND_NOWRITE; >>> + ASSERT(v->arch.vm_event.emul_read_data); >>> >>> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >>> - HVM_DELIVER_NO_ERROR_CODE); >>> + kind = EMUL_KIND_NORMAL; >> Please keep this being the initializer of the variable. > > I put it there because of the ASSERT (to do that before anything else), > but I will undo if you prefer. Since the initializer is (very obviously) independent of the condition the ASSERT() checks, I indeed would prefer it to remain the way it is before this change. >>> - v->arch.vm_event->emulate_flags = 0; >>> - } >>> + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >> Long line. > > Long but under 80 columns, isn't that the rule? :-) I've counted 81 here. Jan
On 7/4/2016 4:11 PM, Jan Beulich wrote: >>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>> The arch_vm_event structure is dynamically allocated and freed @ >>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user >>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>> discards any information that was in arch_vm_event.write_data. >>> Isn't that rather a toolstack user bug, not warranting a relatively >>> extensive (even if mostly mechanical) hypervisor change like this >>> one? Sane monitor behavior, after all, is required anyway for the >>> monitored guest to survive. >> Sorry but could you please rephrase this, I don't quite understand what >> you're saying. >> The write_data field in arch_vm_event should _not ever_ be invalidated >> as a direct result of a toolstack user's action. > The monitoring app can cause all kinds of problems to the guest it > monitors. Why would this specific one need taking care of in the > hypervisor, instead of demanding that the app not disable monitoring > at the wrong time? Because it wouldn't be the wrong time to disable monitoring. This is not a case of wrong toolstack usage, but rather a case of xc_monitor_disable doing the wrong thing along the way. > >>>> --- a/xen/arch/x86/hvm/hvm.c >>>> +++ b/xen/arch/x86/hvm/hvm.c >>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >>>> if ( !handle_hvm_io_completion(v) ) >>>> return; >>>> >>>> - if ( unlikely(v->arch.vm_event) ) >>>> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >>>> { >>>> - if ( v->arch.vm_event->emulate_flags ) >>>> - { >>>> - enum emul_kind kind = EMUL_KIND_NORMAL; >>>> + enum emul_kind kind; >>>> >>>> - if ( v->arch.vm_event->emulate_flags & >>>> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>>> - kind = EMUL_KIND_SET_CONTEXT; >>>> - else if ( v->arch.vm_event->emulate_flags & >>>> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >>>> - kind = EMUL_KIND_NOWRITE; >>>> + ASSERT(v->arch.vm_event.emul_read_data); >>>> >>>> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >>>> - HVM_DELIVER_NO_ERROR_CODE); >>>> + kind = EMUL_KIND_NORMAL; >>> Please keep this being the initializer of the variable. >> I put it there because of the ASSERT (to do that before anything else), >> but I will undo if you prefer. > Since the initializer is (very obviously) independent of the > condition the ASSERT() checks, I indeed would prefer it to remain > the way it is before this change. > >>>> - v->arch.vm_event->emulate_flags = 0; >>>> - } >>>> + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>> Long line. >> Long but under 80 columns, isn't that the rule? :-) > I've counted 81 here. > > Jan You may have counted the beginning '+' as well. Is the rule "<= 80 columns in the source file" (in which case you're wrong) or is it "<= 80 columns in the resulting diff" (in which case I'm wrong)? Corneliu.
On 07/04/16 16:11, Jan Beulich wrote: >>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>> The arch_vm_event structure is dynamically allocated and freed @ >>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user >>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>> discards any information that was in arch_vm_event.write_data. >>> Isn't that rather a toolstack user bug, not warranting a relatively >>> extensive (even if mostly mechanical) hypervisor change like this >>> one? Sane monitor behavior, after all, is required anyway for the >>> monitored guest to survive. >> >> Sorry but could you please rephrase this, I don't quite understand what >> you're saying. >> The write_data field in arch_vm_event should _not ever_ be invalidated >> as a direct result of a toolstack user's action. > > The monitoring app can cause all kinds of problems to the guest it > monitors. Why would this specific one need taking care of in the > hypervisor, instead of demanding that the app not disable monitoring > at the wrong time? I'm not sure there's a right time here. The problem is that, if I understand this correctly, a race is possible between the moment the userspace application responds to the vm_event _and_ call xc_monitor_disable() and the time hvm_do_resume() gets called. If xc_monitor_disable() happened before hvm_do_resume() springs into action, we lose a register write. There's no guaranteed way for this not to happen as far as I can see, although it's true that the race should pretty much never happen in practice - at least we've never come across such a case so far. Thanks, Razvan
On 7/4/2016 4:50 PM, Razvan Cojocaru wrote: > On 07/04/16 16:11, Jan Beulich wrote: >>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user >>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>> discards any information that was in arch_vm_event.write_data. >>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>> extensive (even if mostly mechanical) hypervisor change like this >>>> one? Sane monitor behavior, after all, is required anyway for the >>>> monitored guest to survive. >>> Sorry but could you please rephrase this, I don't quite understand what >>> you're saying. >>> The write_data field in arch_vm_event should _not ever_ be invalidated >>> as a direct result of a toolstack user's action. >> The monitoring app can cause all kinds of problems to the guest it >> monitors. Why would this specific one need taking care of in the >> hypervisor, instead of demanding that the app not disable monitoring >> at the wrong time? > I'm not sure there's a right time here. The problem is that, if I > understand this correctly, a race is possible between the moment the > userspace application responds to the vm_event _and_ call > xc_monitor_disable() and the time hvm_do_resume() gets called. > > If xc_monitor_disable() happened before hvm_do_resume() springs into > action, we lose a register write. There's no guaranteed way for this not > to happen as far as I can see, although it's true that the race should > pretty much never happen in practice - at least we've never come across > such a case so far. > > > Thanks, > Razvan > Perfectly pointed, thanks. Note that xc_monitor_disable() may happen before hvm_do_resume() because the latter only happens _when the scheduler reschedules the target vCPU_, which _may not happen between_ the moment the toolstack user _handles the vm-event_ and the moment when he _calls xc_monitor_disable()_, but rather after xc_monitor_disable() is called. Corneliu.
>>> On 04.07.16 at 15:28, <czuzu@bitdefender.com> wrote: > On 7/4/2016 4:11 PM, Jan Beulich wrote: >>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack > user >>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>> discards any information that was in arch_vm_event.write_data. >>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>> extensive (even if mostly mechanical) hypervisor change like this >>>> one? Sane monitor behavior, after all, is required anyway for the >>>> monitored guest to survive. >>> Sorry but could you please rephrase this, I don't quite understand what >>> you're saying. >>> The write_data field in arch_vm_event should _not ever_ be invalidated >>> as a direct result of a toolstack user's action. >> The monitoring app can cause all kinds of problems to the guest it >> monitors. Why would this specific one need taking care of in the >> hypervisor, instead of demanding that the app not disable monitoring >> at the wrong time? > > Because it wouldn't be the wrong time to disable monitoring. > This is not a case of wrong toolstack usage, but rather a case of > xc_monitor_disable doing the wrong thing along the way. > >> >>>>> --- a/xen/arch/x86/hvm/hvm.c >>>>> +++ b/xen/arch/x86/hvm/hvm.c >>>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >>>>> if ( !handle_hvm_io_completion(v) ) >>>>> return; >>>>> >>>>> - if ( unlikely(v->arch.vm_event) ) >>>>> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >>>>> { >>>>> - if ( v->arch.vm_event->emulate_flags ) >>>>> - { >>>>> - enum emul_kind kind = EMUL_KIND_NORMAL; >>>>> + enum emul_kind kind; >>>>> >>>>> - if ( v->arch.vm_event->emulate_flags & >>>>> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>>>> - kind = EMUL_KIND_SET_CONTEXT; >>>>> - else if ( v->arch.vm_event->emulate_flags & >>>>> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >>>>> - kind = EMUL_KIND_NOWRITE; >>>>> + ASSERT(v->arch.vm_event.emul_read_data); >>>>> >>>>> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >>>>> - HVM_DELIVER_NO_ERROR_CODE); >>>>> + kind = EMUL_KIND_NORMAL; >>>> Please keep this being the initializer of the variable. >>> I put it there because of the ASSERT (to do that before anything else), >>> but I will undo if you prefer. >> Since the initializer is (very obviously) independent of the >> condition the ASSERT() checks, I indeed would prefer it to remain >> the way it is before this change. >> >>>>> - v->arch.vm_event->emulate_flags = 0; >>>>> - } >>>>> + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA > ) >>>> Long line. >>> Long but under 80 columns, isn't that the rule? :-) >> I've counted 81 here. > > You may have counted the beginning '+' as well. Is the rule "<= 80 > columns in the source file" (in which case you're wrong) or is it "<= 80 > columns in the resulting diff" (in which case I'm wrong)? Ah - you first said "under 80" but now say "<= 80". The former is what ./CODING_STYLE asks for. (And ftr the 81 included the newline, so indeed I was off by one too.) Jan
>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: > On 07/04/16 16:11, Jan Beulich wrote: >>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack > user >>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>> discards any information that was in arch_vm_event.write_data. >>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>> extensive (even if mostly mechanical) hypervisor change like this >>>> one? Sane monitor behavior, after all, is required anyway for the >>>> monitored guest to survive. >>> >>> Sorry but could you please rephrase this, I don't quite understand what >>> you're saying. >>> The write_data field in arch_vm_event should _not ever_ be invalidated >>> as a direct result of a toolstack user's action. >> >> The monitoring app can cause all kinds of problems to the guest it >> monitors. Why would this specific one need taking care of in the >> hypervisor, instead of demanding that the app not disable monitoring >> at the wrong time? > > I'm not sure there's a right time here. The problem is that, if I > understand this correctly, a race is possible between the moment the > userspace application responds to the vm_event _and_ call > xc_monitor_disable() and the time hvm_do_resume() gets called. It's that _and_ in your reply that I put under question, but I admit that questioning may be due to my limited (or should I say non- existent) knowledge on the user space parts here: Why would the app be _required_ to "responds to the vm_event _and_ call xc_monitor_disable()", rather than delaying the latter for long enough? Jan
On 07/04/16 17:17, Jan Beulich wrote: >>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: >> On 07/04/16 16:11, Jan Beulich wrote: >>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >> user >>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>> discards any information that was in arch_vm_event.write_data. >>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>> monitored guest to survive. >>>> >>>> Sorry but could you please rephrase this, I don't quite understand what >>>> you're saying. >>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>> as a direct result of a toolstack user's action. >>> >>> The monitoring app can cause all kinds of problems to the guest it >>> monitors. Why would this specific one need taking care of in the >>> hypervisor, instead of demanding that the app not disable monitoring >>> at the wrong time? >> >> I'm not sure there's a right time here. The problem is that, if I >> understand this correctly, a race is possible between the moment the >> userspace application responds to the vm_event _and_ call >> xc_monitor_disable() and the time hvm_do_resume() gets called. > > It's that _and_ in your reply that I put under question, but I admit > that questioning may be due to my limited (or should I say non- > existent) knowledge on the user space parts here: Why would the > app be _required_ to "responds to the vm_event _and_ call > xc_monitor_disable()", rather than delaying the latter for long > enough? It's not required to do that, it's just that we don't really know what "long enough means". I suppose a second should do be more than enough, but obviously we'd prefer a fool-proof solution with better guarantees and no lag - I thought that the hypervisor change is trivial enough to not make the tradeoff into much of an issue. Thanks, Razvan
On 7/4/2016 5:13 PM, Jan Beulich wrote: >>>> On 04.07.16 at 15:28, <czuzu@bitdefender.com> wrote: >> On 7/4/2016 4:11 PM, Jan Beulich wrote: >>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >> user >>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>> discards any information that was in arch_vm_event.write_data. >>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>> monitored guest to survive. >>>> Sorry but could you please rephrase this, I don't quite understand what >>>> you're saying. >>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>> as a direct result of a toolstack user's action. >>> The monitoring app can cause all kinds of problems to the guest it >>> monitors. Why would this specific one need taking care of in the >>> hypervisor, instead of demanding that the app not disable monitoring >>> at the wrong time? >> Because it wouldn't be the wrong time to disable monitoring. >> This is not a case of wrong toolstack usage, but rather a case of >> xc_monitor_disable doing the wrong thing along the way. >> >>>>>> --- a/xen/arch/x86/hvm/hvm.c >>>>>> +++ b/xen/arch/x86/hvm/hvm.c >>>>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) >>>>>> if ( !handle_hvm_io_completion(v) ) >>>>>> return; >>>>>> >>>>>> - if ( unlikely(v->arch.vm_event) ) >>>>>> + if ( unlikely(v->arch.vm_event.emulate_flags) ) >>>>>> { >>>>>> - if ( v->arch.vm_event->emulate_flags ) >>>>>> - { >>>>>> - enum emul_kind kind = EMUL_KIND_NORMAL; >>>>>> + enum emul_kind kind; >>>>>> >>>>>> - if ( v->arch.vm_event->emulate_flags & >>>>>> - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) >>>>>> - kind = EMUL_KIND_SET_CONTEXT; >>>>>> - else if ( v->arch.vm_event->emulate_flags & >>>>>> - VM_EVENT_FLAG_EMULATE_NOWRITE ) >>>>>> - kind = EMUL_KIND_NOWRITE; >>>>>> + ASSERT(v->arch.vm_event.emul_read_data); >>>>>> >>>>>> - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, >>>>>> - HVM_DELIVER_NO_ERROR_CODE); >>>>>> + kind = EMUL_KIND_NORMAL; >>>>> Please keep this being the initializer of the variable. >>>> I put it there because of the ASSERT (to do that before anything else), >>>> but I will undo if you prefer. >>> Since the initializer is (very obviously) independent of the >>> condition the ASSERT() checks, I indeed would prefer it to remain >>> the way it is before this change. >>> >>>>>> - v->arch.vm_event->emulate_flags = 0; >>>>>> - } >>>>>> + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA >> ) >>>>> Long line. >>>> Long but under 80 columns, isn't that the rule? :-) >>> I've counted 81 here. >> You may have counted the beginning '+' as well. Is the rule "<= 80 >> columns in the source file" (in which case you're wrong) or is it "<= 80 >> columns in the resulting diff" (in which case I'm wrong)? > Ah - you first said "under 80" but now say "<= 80". The former > is what ./CODING_STYLE asks for. (And ftr the 81 included the > newline, so indeed I was off by one too.) > > Jan Hmm, I meant "<= 80" the first time too, I was under the impression that that's what CODING_STYLE asks for too, but, quoting: 'Lines should be less than 80 characters in length.' Why not <= 80? Shouldn't the limit (i.e. "<=") be 80 columns, including the 80-th? (this points me otherwise http://programmers.stackexchange.com/questions/148677/why-is-80-characters-the-standard-limit-for-code-width ) Corneliu.
On 7/4/2016 5:17 PM, Jan Beulich wrote: >>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: >> On 07/04/16 16:11, Jan Beulich wrote: >>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >> user >>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>> discards any information that was in arch_vm_event.write_data. >>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>> monitored guest to survive. >>>> Sorry but could you please rephrase this, I don't quite understand what >>>> you're saying. >>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>> as a direct result of a toolstack user's action. >>> The monitoring app can cause all kinds of problems to the guest it >>> monitors. Why would this specific one need taking care of in the >>> hypervisor, instead of demanding that the app not disable monitoring >>> at the wrong time? >> I'm not sure there's a right time here. The problem is that, if I >> understand this correctly, a race is possible between the moment the >> userspace application responds to the vm_event _and_ call >> xc_monitor_disable() and the time hvm_do_resume() gets called. > It's that _and_ in your reply that I put under question, but I admit > that questioning may be due to my limited (or should I say non- > existent) knowledge on the user space parts here: Why would the > app be _required_ to "responds to the vm_event _and_ call > xc_monitor_disable()", rather than delaying the latter for long > enough? > > Jan > > Because it's a limitation that can easily be avoided and also a rule which the user has to keep in mind (why should we impose this effort on the toolstack user if not necessary?). Corneliu.
>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote: > On 07/04/16 17:17, Jan Beulich wrote: >>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: >>> On 07/04/16 16:11, Jan Beulich wrote: >>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >>> user >>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>>> discards any information that was in arch_vm_event.write_data. >>>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>>> monitored guest to survive. >>>>> >>>>> Sorry but could you please rephrase this, I don't quite understand what >>>>> you're saying. >>>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>>> as a direct result of a toolstack user's action. >>>> >>>> The monitoring app can cause all kinds of problems to the guest it >>>> monitors. Why would this specific one need taking care of in the >>>> hypervisor, instead of demanding that the app not disable monitoring >>>> at the wrong time? >>> >>> I'm not sure there's a right time here. The problem is that, if I >>> understand this correctly, a race is possible between the moment the >>> userspace application responds to the vm_event _and_ call >>> xc_monitor_disable() and the time hvm_do_resume() gets called. >> >> It's that _and_ in your reply that I put under question, but I admit >> that questioning may be due to my limited (or should I say non- >> existent) knowledge on the user space parts here: Why would the >> app be _required_ to "responds to the vm_event _and_ call >> xc_monitor_disable()", rather than delaying the latter for long >> enough? > > It's not required to do that, it's just that we don't really know what > "long enough means". I suppose a second should do be more than enough, > but obviously we'd prefer a fool-proof solution with better guarantees > and no lag - I thought that the hypervisor change is trivial enough to > not make the tradeoff into much of an issue. It's mostly mechanical indeed, but iirc it grows struct vcpu (or was it struct domain?), which is never really desirable (but admittedly often unavoidable). I'm therefore simply trying to understand what alternatives there are. Jan
>>> On 04.07.16 at 16:31, <czuzu@bitdefender.com> wrote: > Hmm, I meant "<= 80" the first time too, I was under the impression that > that's what CODING_STYLE asks for too, but, quoting: > 'Lines should be less than 80 characters in length.' > Why not <= 80? Shouldn't the limit (i.e. "<=") be 80 columns, including > the 80-th? (this points me otherwise > http://programmers.stackexchange.com/questions/148677/why-is-80-characters-t > he-standard-limit-for-code-width > ) I have no idea what (editing) tools have limitation at precisely which line length. Hence I can't answer the "why". I can only refer to the documented requirement, which aiui is pretty clear. Jan
On 7/4/2016 6:07 PM, Jan Beulich wrote: >>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote: >> On 07/04/16 17:17, Jan Beulich wrote: >>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: >>>> On 07/04/16 16:11, Jan Beulich wrote: >>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >>>> user >>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>>>> discards any information that was in arch_vm_event.write_data. >>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>>>> monitored guest to survive. >>>>>> Sorry but could you please rephrase this, I don't quite understand what >>>>>> you're saying. >>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>>>> as a direct result of a toolstack user's action. >>>>> The monitoring app can cause all kinds of problems to the guest it >>>>> monitors. Why would this specific one need taking care of in the >>>>> hypervisor, instead of demanding that the app not disable monitoring >>>>> at the wrong time? >>>> I'm not sure there's a right time here. The problem is that, if I >>>> understand this correctly, a race is possible between the moment the >>>> userspace application responds to the vm_event _and_ call >>>> xc_monitor_disable() and the time hvm_do_resume() gets called. >>> It's that _and_ in your reply that I put under question, but I admit >>> that questioning may be due to my limited (or should I say non- >>> existent) knowledge on the user space parts here: Why would the >>> app be _required_ to "responds to the vm_event _and_ call >>> xc_monitor_disable()", rather than delaying the latter for long >>> enough? >> It's not required to do that, it's just that we don't really know what >> "long enough means". I suppose a second should do be more than enough, >> but obviously we'd prefer a fool-proof solution with better guarantees >> and no lag - I thought that the hypervisor change is trivial enough to >> not make the tradeoff into much of an issue. > It's mostly mechanical indeed, but iirc it grows struct vcpu (or was > it struct domain?), which is never really desirable (but admittedly > often unavoidable). I'm therefore simply trying to understand what > alternatives there are. > > Jan The change adds 20 bytes to the structure, that's less than 3 uint64_t's, not that much if you ask me. But given the discussion we're having over patch 4/8, monitor_write_data would indeed get larger if I revert that, so what I propose instead is going for something in-between what was before and what I did after, i.e. don't dynamically allocate arch_vm_event entirely, but don't statically allocate monitor_write_data either. I propose turning arch_vm_event into: struct arch_vm_event { uint32_t emulate_flags; struct vm_event_emul_read_data *emul_read_data; struct monitor_write_data *write_data; }; This way we can still selectively invalidate write_data from arch_vm_event. Sounds good? Corneliu.
On 7/4/2016 6:09 PM, Jan Beulich wrote: >>>> On 04.07.16 at 16:31, <czuzu@bitdefender.com> wrote: >> Hmm, I meant "<= 80" the first time too, I was under the impression that >> that's what CODING_STYLE asks for too, but, quoting: >> 'Lines should be less than 80 characters in length.' >> Why not <= 80? Shouldn't the limit (i.e. "<=") be 80 columns, including >> the 80-th? (this points me otherwise >> http://programmers.stackexchange.com/questions/148677/why-is-80-characters-t >> he-standard-limit-for-code-width >> ) > I have no idea what (editing) tools have limitation at precisely which > line length. Hence I can't answer the "why". I can only refer to the > documented requirement, which aiui is pretty clear. > > Jan Without understanding the "why" we can't be sure we're doing the right thing. I was hoping someone else would provide feedback on this, as I'm almost sure who wrote CODING_STYLE actually meant "less than or equal" instead of "less than". Corneliu.
>>> On 04.07.16 at 17:21, <czuzu@bitdefender.com> wrote: > On 7/4/2016 6:07 PM, Jan Beulich wrote: >>>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote: >>> On 07/04/16 17:17, Jan Beulich wrote: >>>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: >>>>> On 07/04/16 16:11, Jan Beulich wrote: >>>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >>>>> user >>>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>>>>> discards any information that was in arch_vm_event.write_data. >>>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>>>>> monitored guest to survive. >>>>>>> Sorry but could you please rephrase this, I don't quite understand what >>>>>>> you're saying. >>>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>>>>> as a direct result of a toolstack user's action. >>>>>> The monitoring app can cause all kinds of problems to the guest it >>>>>> monitors. Why would this specific one need taking care of in the >>>>>> hypervisor, instead of demanding that the app not disable monitoring >>>>>> at the wrong time? >>>>> I'm not sure there's a right time here. The problem is that, if I >>>>> understand this correctly, a race is possible between the moment the >>>>> userspace application responds to the vm_event _and_ call >>>>> xc_monitor_disable() and the time hvm_do_resume() gets called. >>>> It's that _and_ in your reply that I put under question, but I admit >>>> that questioning may be due to my limited (or should I say non- >>>> existent) knowledge on the user space parts here: Why would the >>>> app be _required_ to "responds to the vm_event _and_ call >>>> xc_monitor_disable()", rather than delaying the latter for long >>>> enough? >>> It's not required to do that, it's just that we don't really know what >>> "long enough means". I suppose a second should do be more than enough, >>> but obviously we'd prefer a fool-proof solution with better guarantees >>> and no lag - I thought that the hypervisor change is trivial enough to >>> not make the tradeoff into much of an issue. >> It's mostly mechanical indeed, but iirc it grows struct vcpu (or was >> it struct domain?), which is never really desirable (but admittedly >> often unavoidable). I'm therefore simply trying to understand what >> alternatives there are. > > The change adds 20 bytes to the structure, that's less than 3 > uint64_t's, not that much if you ask me. > But given the discussion we're having over patch 4/8, monitor_write_data > would indeed get larger if I revert that, so what I propose instead is > going for something in-between what was before and what I did after, > i.e. don't dynamically allocate arch_vm_event entirely, but don't > statically allocate monitor_write_data either. I propose turning > arch_vm_event into: > > struct arch_vm_event { > uint32_t emulate_flags; > struct vm_event_emul_read_data *emul_read_data; > struct monitor_write_data *write_data; > }; > > This way we can still selectively invalidate write_data from arch_vm_event. > Sounds good? Well, ideally I'd like there to remain a single pointer in struct vcpu. What this points to and when that backing memory gets allocated would then of less interest (i.e. perhaps you could allocate the part that needs to remain active after disabling monitoring independent from what can go away at that point). Systems without any monitoring in use would then still pay no larger penalty than they do today. Jan
On 7/4/2016 6:57 PM, Jan Beulich wrote: >>>> On 04.07.16 at 17:21, <czuzu@bitdefender.com> wrote: >> On 7/4/2016 6:07 PM, Jan Beulich wrote: >>>>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote: >>>> On 07/04/16 17:17, Jan Beulich wrote: >>>>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote: >>>>>> On 07/04/16 16:11, Jan Beulich wrote: >>>>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote: >>>>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote: >>>>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote: >>>>>>>>>> The arch_vm_event structure is dynamically allocated and freed @ >>>>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack >>>>>> user >>>>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively >>>>>>>>>> discards any information that was in arch_vm_event.write_data. >>>>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively >>>>>>>>> extensive (even if mostly mechanical) hypervisor change like this >>>>>>>>> one? Sane monitor behavior, after all, is required anyway for the >>>>>>>>> monitored guest to survive. >>>>>>>> Sorry but could you please rephrase this, I don't quite understand what >>>>>>>> you're saying. >>>>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated >>>>>>>> as a direct result of a toolstack user's action. >>>>>>> The monitoring app can cause all kinds of problems to the guest it >>>>>>> monitors. Why would this specific one need taking care of in the >>>>>>> hypervisor, instead of demanding that the app not disable monitoring >>>>>>> at the wrong time? >>>>>> I'm not sure there's a right time here. The problem is that, if I >>>>>> understand this correctly, a race is possible between the moment the >>>>>> userspace application responds to the vm_event _and_ call >>>>>> xc_monitor_disable() and the time hvm_do_resume() gets called. >>>>> It's that _and_ in your reply that I put under question, but I admit >>>>> that questioning may be due to my limited (or should I say non- >>>>> existent) knowledge on the user space parts here: Why would the >>>>> app be _required_ to "responds to the vm_event _and_ call >>>>> xc_monitor_disable()", rather than delaying the latter for long >>>>> enough? >>>> It's not required to do that, it's just that we don't really know what >>>> "long enough means". I suppose a second should do be more than enough, >>>> but obviously we'd prefer a fool-proof solution with better guarantees >>>> and no lag - I thought that the hypervisor change is trivial enough to >>>> not make the tradeoff into much of an issue. >>> It's mostly mechanical indeed, but iirc it grows struct vcpu (or was >>> it struct domain?), which is never really desirable (but admittedly >>> often unavoidable). I'm therefore simply trying to understand what >>> alternatives there are. >> The change adds 20 bytes to the structure, that's less than 3 >> uint64_t's, not that much if you ask me. >> But given the discussion we're having over patch 4/8, monitor_write_data >> would indeed get larger if I revert that, so what I propose instead is >> going for something in-between what was before and what I did after, >> i.e. don't dynamically allocate arch_vm_event entirely, but don't >> statically allocate monitor_write_data either. I propose turning >> arch_vm_event into: >> >> struct arch_vm_event { >> uint32_t emulate_flags; >> struct vm_event_emul_read_data *emul_read_data; >> struct monitor_write_data *write_data; >> }; >> >> This way we can still selectively invalidate write_data from arch_vm_event. >> Sounds good? > Well, ideally I'd like there to remain a single pointer in struct vcpu. > What this points to and when that backing memory gets allocated > would then of less interest (i.e. perhaps you could allocate the > part that needs to remain active after disabling monitoring > independent from what can go away at that point). Systems > without any monitoring in use would then still pay no larger > penalty than they do today. > > Jan Now that I think about it, that's feasible too. So then, make arch_vm_event be dynamically allocated as it was, but slightly change its definition to: struct arch_vm_event { uint32_t emulate_flags; struct vm_event_emul_read_data *emul_read_data; struct monitor_write_data *write_data; }; , allocate it when it was previously allocated along emul_read_data and write_data but don't ever deallocate it entirely, instead only deallocate and reallocate selectively (emul_read_data) as needed, correct? This way indeed there's no memory overhead in arch_vcpu compared to how it was before. Corneliu.
>>> On 04.07.16 at 18:09, <czuzu@bitdefender.com> wrote: > Now that I think about it, that's feasible too. So then, make > arch_vm_event be dynamically allocated as it was, but slightly change > its definition to: > > struct arch_vm_event { > uint32_t emulate_flags; > struct vm_event_emul_read_data *emul_read_data; > struct monitor_write_data *write_data; > }; > > , allocate it when it was previously allocated along emul_read_data and > write_data but don't ever deallocate it entirely, instead only > deallocate and reallocate selectively (emul_read_data) as needed, correct? Yes, if that also fits you. Jan
On 7/4/2016 7:16 PM, Jan Beulich wrote: >>>> On 04.07.16 at 18:09, <czuzu@bitdefender.com> wrote: >> Now that I think about it, that's feasible too. So then, make >> arch_vm_event be dynamically allocated as it was, but slightly change >> its definition to: >> >> struct arch_vm_event { >> uint32_t emulate_flags; >> struct vm_event_emul_read_data *emul_read_data; >> struct monitor_write_data *write_data; >> }; >> >> , allocate it when it was previously allocated along emul_read_data and >> write_data but don't ever deallocate it entirely, instead only >> deallocate and reallocate selectively (emul_read_data) as needed, correct? > Yes, if that also fits you. > > Jan Heh, within the above scheme I only now realized it would make no sense to have write_data dynamically allocated. With that settled, thanks for the great feedback. Corneliu.
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index bb59247..06e68ae 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v) void vcpu_destroy(struct vcpu *v) { - xfree(v->arch.vm_event); - v->arch.vm_event = NULL; + v->arch.vm_event.emulate_flags = 0; + xfree(v->arch.vm_event.emul_read_data); + v->arch.vm_event.emul_read_data = NULL; if ( is_pv_32bit_vcpu(v) ) { diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c index 855af4d..68f5515 100644 --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int size) { struct vcpu *curr = current; - if ( curr->arch.vm_event ) + if ( curr->arch.vm_event.emul_read_data ) { unsigned int safe_size = - min(size, curr->arch.vm_event->emul_read_data.size); + min(size, curr->arch.vm_event.emul_read_data->size); - memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size); + memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size); memset(buffer + safe_size, 0, size - safe_size); return X86EMUL_OKAY; } @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear( * vm_event being triggered for repeated writes to a whole page. */ if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) && - current->arch.vm_event->emulate_flags != 0 ) + current->arch.vm_event.emulate_flags != 0 ) max_reps = 1; /* diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 884ae40..03dffb8 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v) if ( !handle_hvm_io_completion(v) ) return; - if ( unlikely(v->arch.vm_event) ) + if ( unlikely(v->arch.vm_event.emulate_flags) ) { - if ( v->arch.vm_event->emulate_flags ) - { - enum emul_kind kind = EMUL_KIND_NORMAL; + enum emul_kind kind; - if ( v->arch.vm_event->emulate_flags & - VM_EVENT_FLAG_SET_EMUL_READ_DATA ) - kind = EMUL_KIND_SET_CONTEXT; - else if ( v->arch.vm_event->emulate_flags & - VM_EVENT_FLAG_EMULATE_NOWRITE ) - kind = EMUL_KIND_NOWRITE; + ASSERT(v->arch.vm_event.emul_read_data); - hvm_mem_access_emulate_one(kind, TRAP_invalid_op, - HVM_DELIVER_NO_ERROR_CODE); + kind = EMUL_KIND_NORMAL; - v->arch.vm_event->emulate_flags = 0; - } + if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA ) + kind = EMUL_KIND_SET_CONTEXT; + else if ( v->arch.vm_event.emulate_flags & + VM_EVENT_FLAG_EMULATE_NOWRITE ) + kind = EMUL_KIND_NOWRITE; + + hvm_mem_access_emulate_one(kind, TRAP_invalid_op, + HVM_DELIVER_NO_ERROR_CODE); + + v->arch.vm_event.emulate_flags = 0; } arch_monitor_write_data(v); @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer) if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) ) { - ASSERT(v->arch.vm_event); - if ( hvm_monitor_crX(CR0, value, old_value) ) { /* * The actual write will occur in arch_monitor_write_data(), if * permitted. */ - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); - v->arch.vm_event->write_data.status = MWS_CR0; - v->arch.vm_event->write_data.value = value; + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); + v->arch.vm_event.write_data.status = MWS_CR0; + v->arch.vm_event.write_data.value = value; return X86EMUL_OKAY; } @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer) if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) ) { - ASSERT(v->arch.vm_event); - if ( hvm_monitor_crX(CR3, value, old) ) { /* * The actual write will occur in arch_monitor_write_data(), if * permitted. */ - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); - v->arch.vm_event->write_data.status = MWS_CR3; - v->arch.vm_event->write_data.value = value; + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); + v->arch.vm_event.write_data.status = MWS_CR3; + v->arch.vm_event.write_data.value = value; return X86EMUL_OKAY; } @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer) if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ) { - ASSERT(v->arch.vm_event); - if ( hvm_monitor_crX(CR4, value, old_cr) ) { /* * The actual write will occur in arch_monitor_write_data(), if * permitted. */ - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); - v->arch.vm_event->write_data.status = MWS_CR4; - v->arch.vm_event->write_data.value = value; + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); + v->arch.vm_event.write_data.status = MWS_CR4; + v->arch.vm_event.write_data.value = value; return X86EMUL_OKAY; } @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content, if ( may_defer && unlikely(monitored_msr(v->domain, msr)) ) { - ASSERT(v->arch.vm_event); - /* * The actual write will occur in arch_monitor_write_data(), if * permitted. */ - ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status); - v->arch.vm_event->write_data.status = MWS_MSR; - v->arch.vm_event->write_data.msr = msr; - v->arch.vm_event->write_data.value = msr_content; + ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status); + v->arch.vm_event.write_data.status = MWS_MSR; + v->arch.vm_event.write_data.msr = msr; + v->arch.vm_event.write_data.value = msr_content; hvm_monitor_msr(msr, msr_content); return X86EMUL_OKAY; diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index 16733a4..9bcaa8a 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v, } } - v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0; + v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0; if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) ) - v->arch.vm_event->emul_read_data = rsp->data.emul_read_data; + *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data; } } diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c index 5c8d4da..88d14ae 100644 --- a/xen/arch/x86/monitor.c +++ b/xen/arch/x86/monitor.c @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d) void arch_monitor_write_data(struct vcpu *v) { - struct monitor_write_data *w; - - if ( likely(!v->arch.vm_event) ) - return; - - w = &v->arch.vm_event->write_data; + struct monitor_write_data *w = &v->arch.vm_event.write_data; if ( likely(MWS_NOWRITE == w->status) ) return; diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c index 825da48..f21ff10 100644 --- a/xen/arch/x86/vm_event.c +++ b/xen/arch/x86/vm_event.c @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d) for_each_vcpu ( d, v ) { - if ( v->arch.vm_event ) + if ( v->arch.vm_event.emul_read_data ) continue; - v->arch.vm_event = xzalloc(struct arch_vm_event); + v->arch.vm_event.emul_read_data = + xzalloc(struct vm_event_emul_read_data); - if ( !v->arch.vm_event ) + if ( !v->arch.vm_event.emul_read_data ) return -ENOMEM; } @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d) for_each_vcpu ( d, v ) { - xfree(v->arch.vm_event); - v->arch.vm_event = NULL; + v->arch.vm_event.emulate_flags = 0; + xfree(v->arch.vm_event.emul_read_data); + v->arch.vm_event.emul_read_data = NULL; } d->arch.mem_access_emulate_each_rep = 0; @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp) { if ( rsp->flags & VM_EVENT_FLAG_DENY ) { - ASSERT(v->arch.vm_event); - /* deny flag requires the vCPU to be paused */ if ( !atomic_read(&v->vm_event_pause_count) ) return; - v->arch.vm_event->write_data.status = MWS_NOWRITE; + v->arch.vm_event.write_data.status = MWS_NOWRITE; } } diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index a22ee6b..7ea5c8f 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -259,21 +259,6 @@ struct pv_domain struct cpuidmasks *cpuidmasks; }; -enum monitor_write_status -{ - MWS_NOWRITE = 0, - MWS_MSR, - MWS_CR0, - MWS_CR3, - MWS_CR4, -}; - -struct monitor_write_data { - enum monitor_write_status status; - uint32_t msr; - uint64_t value; -}; - struct arch_domain { struct page_info *perdomain_l3_pg; @@ -496,6 +481,31 @@ typedef enum __packed { SMAP_CHECK_DISABLED, /* disable the check */ } smap_check_policy_t; +enum monitor_write_status +{ + MWS_NOWRITE = 0, + MWS_MSR, + MWS_CR0, + MWS_CR3, + MWS_CR4, +}; + +struct monitor_write_data { + enum monitor_write_status status; + uint32_t msr; + uint64_t value; +}; + +/* + * Should we emulate the next matching instruction on VCPU resume + * after a vm_event? + */ +struct arch_vm_event { + uint32_t emulate_flags; + struct vm_event_emul_read_data *emul_read_data; + struct monitor_write_data write_data; +}; + struct arch_vcpu { /* @@ -569,7 +579,7 @@ struct arch_vcpu /* A secondary copy of the vcpu time info. */ XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest; - struct arch_vm_event *vm_event; + struct arch_vm_event vm_event; }; smap_check_policy_t smap_policy_change(struct vcpu *v, diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h index 1068376..984ac4c 100644 --- a/xen/include/asm-x86/monitor.h +++ b/xen/include/asm-x86/monitor.h @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop) * Enabling mem_access_emulate_each_rep without a vm_event subscriber * is meaningless. */ - if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event ) + if ( d->max_vcpus && d->vcpu[0] && + d->vcpu[0]->arch.vm_event.emul_read_data ) d->arch.mem_access_emulate_each_rep = !!mop->event; else rc = -EINVAL; diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h index 026f42e..c83583d 100644 --- a/xen/include/asm-x86/vm_event.h +++ b/xen/include/asm-x86/vm_event.h @@ -22,16 +22,6 @@ #include <xen/sched.h> #include <xen/vm_event.h> -/* - * Should we emulate the next matching instruction on VCPU resume - * after a vm_event? - */ -struct arch_vm_event { - uint32_t emulate_flags; - struct vm_event_emul_read_data emul_read_data; - struct monitor_write_data write_data; -}; - int vm_event_init_domain(struct domain *d); void vm_event_cleanup_domain(struct domain *d);
The arch_vm_event structure is dynamically allocated and freed @ vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user disables domain monitoring (xc_monitor_disable), which in turn effectively discards any information that was in arch_vm_event.write_data. But this can yield unexpected behavior since if a CR-write was awaiting to be committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data) before xc_monitor_disable is called, then the domain CR write is wrongfully ignored, which of course, in these cases, can easily render a domain crash. To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically allocated instead of the whole arch_vm_event structure. With this we can avoid invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up only emul_read_data and emulate_flags @ vm_event_cleanup_domain. Small note: arch_vm_event structure definition needed to be moved from asm-x86/vm_event.h to asm-x86/domain.h in the process. Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com> --- xen/arch/x86/domain.c | 5 ++-- xen/arch/x86/hvm/emulate.c | 8 +++--- xen/arch/x86/hvm/hvm.c | 62 ++++++++++++++++++------------------------ xen/arch/x86/mm/p2m.c | 4 +-- xen/arch/x86/monitor.c | 7 +---- xen/arch/x86/vm_event.c | 16 +++++------ xen/include/asm-x86/domain.h | 42 +++++++++++++++++----------- xen/include/asm-x86/monitor.h | 3 +- xen/include/asm-x86/vm_event.h | 10 ------- 9 files changed, 73 insertions(+), 84 deletions(-)