diff mbox

[5/8] x86/vm-event/monitor: don't compromise monitor_write_data on domain cleanup

Message ID 1467312332-3098-1-git-send-email-czuzu@bitdefender.com (mailing list archive)
State New, archived
Headers show

Commit Message

Corneliu ZUZU June 30, 2016, 6:45 p.m. UTC
The arch_vm_event structure is dynamically allocated and freed @
vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
disables domain monitoring (xc_monitor_disable), which in turn effectively
discards any information that was in arch_vm_event.write_data.

But this can yield unexpected behavior since if a CR-write was awaiting to be
committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
before xc_monitor_disable is called, then the domain CR write is wrongfully
ignored, which of course, in these cases, can easily render a domain crash.

To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically
allocated instead of the whole arch_vm_event structure. With this we can avoid
invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up
only emul_read_data and emulate_flags @ vm_event_cleanup_domain.

Small note: arch_vm_event structure definition needed to be moved from
asm-x86/vm_event.h to asm-x86/domain.h in the process.

Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
---
 xen/arch/x86/domain.c          |  5 ++--
 xen/arch/x86/hvm/emulate.c     |  8 +++---
 xen/arch/x86/hvm/hvm.c         | 62 ++++++++++++++++++------------------------
 xen/arch/x86/mm/p2m.c          |  4 +--
 xen/arch/x86/monitor.c         |  7 +----
 xen/arch/x86/vm_event.c        | 16 +++++------
 xen/include/asm-x86/domain.h   | 42 +++++++++++++++++-----------
 xen/include/asm-x86/monitor.h  |  3 +-
 xen/include/asm-x86/vm_event.h | 10 -------
 9 files changed, 73 insertions(+), 84 deletions(-)

Comments

Razvan Cojocaru July 1, 2016, 6:47 a.m. UTC | #1
On 06/30/16 21:45, Corneliu ZUZU wrote:
> The arch_vm_event structure is dynamically allocated and freed @
> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
> disables domain monitoring (xc_monitor_disable), which in turn effectively
> discards any information that was in arch_vm_event.write_data.
> 
> But this can yield unexpected behavior since if a CR-write was awaiting to be
> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
> before xc_monitor_disable is called, then the domain CR write is wrongfully
> ignored, which of course, in these cases, can easily render a domain crash.
> 
> To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically
> allocated instead of the whole arch_vm_event structure. With this we can avoid
> invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up
> only emul_read_data and emulate_flags @ vm_event_cleanup_domain.
> 
> Small note: arch_vm_event structure definition needed to be moved from
> asm-x86/vm_event.h to asm-x86/domain.h in the process.
> 
> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
> ---
>  xen/arch/x86/domain.c          |  5 ++--
>  xen/arch/x86/hvm/emulate.c     |  8 +++---
>  xen/arch/x86/hvm/hvm.c         | 62 ++++++++++++++++++------------------------
>  xen/arch/x86/mm/p2m.c          |  4 +--
>  xen/arch/x86/monitor.c         |  7 +----
>  xen/arch/x86/vm_event.c        | 16 +++++------
>  xen/include/asm-x86/domain.h   | 42 +++++++++++++++++-----------
>  xen/include/asm-x86/monitor.h  |  3 +-
>  xen/include/asm-x86/vm_event.h | 10 -------
>  9 files changed, 73 insertions(+), 84 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index bb59247..06e68ae 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v)
>  
>  void vcpu_destroy(struct vcpu *v)
>  {
> -    xfree(v->arch.vm_event);
> -    v->arch.vm_event = NULL;
> +    v->arch.vm_event.emulate_flags = 0;
> +    xfree(v->arch.vm_event.emul_read_data);
> +    v->arch.vm_event.emul_read_data = NULL;
>  
>      if ( is_pv_32bit_vcpu(v) )
>      {
> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
> index 855af4d..68f5515 100644
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int size)
>  {
>      struct vcpu *curr = current;
>  
> -    if ( curr->arch.vm_event )
> +    if ( curr->arch.vm_event.emul_read_data )
>      {
>          unsigned int safe_size =
> -            min(size, curr->arch.vm_event->emul_read_data.size);
> +            min(size, curr->arch.vm_event.emul_read_data->size);
>  
> -        memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size);
> +        memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size);
>          memset(buffer + safe_size, 0, size - safe_size);
>          return X86EMUL_OKAY;
>      }
> @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear(
>       * vm_event being triggered for repeated writes to a whole page.
>       */
>      if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) &&
> -         current->arch.vm_event->emulate_flags != 0 )
> +         current->arch.vm_event.emulate_flags != 0 )
>         max_reps = 1;
>  
>      /*
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 884ae40..03dffb8 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>      if ( !handle_hvm_io_completion(v) )
>          return;
>  
> -    if ( unlikely(v->arch.vm_event) )
> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>      {
> -        if ( v->arch.vm_event->emulate_flags )
> -        {
> -            enum emul_kind kind = EMUL_KIND_NORMAL;
> +        enum emul_kind kind;
>  
> -            if ( v->arch.vm_event->emulate_flags &
> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
> -                kind = EMUL_KIND_SET_CONTEXT;
> -            else if ( v->arch.vm_event->emulate_flags &
> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
> -                kind = EMUL_KIND_NOWRITE;
> +        ASSERT(v->arch.vm_event.emul_read_data);
>  
> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
> -                                       HVM_DELIVER_NO_ERROR_CODE);
> +        kind = EMUL_KIND_NORMAL;

Why do the "kind = EMUL_KIND_NORMAL;" assignment separately after the
ASSERT()? Could it not be left the same way as before ("enum emul_kind
kind = EMUL_KIND_NORMAL;") above the ASSERT()?

It's not a big change and I won't hold the patch over it, but small
changes add up in the review process so unnecessary changes are best
either avoided, or done in a standalone cleanup patch.

> -            v->arch.vm_event->emulate_flags = 0;
> -        }
> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
> +            kind = EMUL_KIND_SET_CONTEXT;
> +        else if ( v->arch.vm_event.emulate_flags &
> +                  VM_EVENT_FLAG_EMULATE_NOWRITE )
> +            kind = EMUL_KIND_NOWRITE;
> +
> +        hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
> +                                   HVM_DELIVER_NO_ERROR_CODE);
> +
> +        v->arch.vm_event.emulate_flags = 0;
>      }
>  
>      arch_monitor_write_data(v);
> @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer)
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
>      {
> -        ASSERT(v->arch.vm_event);
> -
>          if ( hvm_monitor_crX(CR0, value, old_value) )
>          {
>              /*
>               * The actual write will occur in arch_monitor_write_data(), if
>               * permitted.
>               */
> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> -            v->arch.vm_event->write_data.status = MWS_CR0;
> -            v->arch.vm_event->write_data.value = value;
> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> +            v->arch.vm_event.write_data.status = MWS_CR0;
> +            v->arch.vm_event.write_data.value = value;
>  
>              return X86EMUL_OKAY;
>          }
> @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
>      {
> -        ASSERT(v->arch.vm_event);
> -
>          if ( hvm_monitor_crX(CR3, value, old) )
>          {
>              /*
>               * The actual write will occur in arch_monitor_write_data(), if
>               * permitted.
>               */
> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> -            v->arch.vm_event->write_data.status = MWS_CR3;
> -            v->arch.vm_event->write_data.value = value;
> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> +            v->arch.vm_event.write_data.status = MWS_CR3;
> +            v->arch.vm_event.write_data.value = value;
>  
>              return X86EMUL_OKAY;
>          }
> @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer)
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
>      {
> -        ASSERT(v->arch.vm_event);
> -
>          if ( hvm_monitor_crX(CR4, value, old_cr) )
>          {
>              /*
>               * The actual write will occur in arch_monitor_write_data(), if
>               * permitted.
>               */
> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> -            v->arch.vm_event->write_data.status = MWS_CR4;
> -            v->arch.vm_event->write_data.value = value;
> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> +            v->arch.vm_event.write_data.status = MWS_CR4;
> +            v->arch.vm_event.write_data.value = value;
>  
>              return X86EMUL_OKAY;
>          }
> @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
>  
>      if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
>      {
> -        ASSERT(v->arch.vm_event);
> -
>          /*
>           * The actual write will occur in arch_monitor_write_data(), if
>           * permitted.
>           */
> -        ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
> -        v->arch.vm_event->write_data.status = MWS_MSR;
> -        v->arch.vm_event->write_data.msr = msr;
> -        v->arch.vm_event->write_data.value = msr_content;
> +        ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
> +        v->arch.vm_event.write_data.status = MWS_MSR;
> +        v->arch.vm_event.write_data.msr = msr;
> +        v->arch.vm_event.write_data.value = msr_content;
>  
>          hvm_monitor_msr(msr, msr_content);
>          return X86EMUL_OKAY;
> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 16733a4..9bcaa8a 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v,
>              }
>          }
>  
> -        v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
> +        v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
>  
>          if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
> -            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
> +            *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data;
>      }
>  }
>  
> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
> index 5c8d4da..88d14ae 100644
> --- a/xen/arch/x86/monitor.c
> +++ b/xen/arch/x86/monitor.c
> @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d)
>  
>  void arch_monitor_write_data(struct vcpu *v)
>  {
> -    struct monitor_write_data *w;
> -
> -    if ( likely(!v->arch.vm_event) )
> -        return;
> -
> -    w = &v->arch.vm_event->write_data;
> +    struct monitor_write_data *w = &v->arch.vm_event.write_data;
>  
>      if ( likely(MWS_NOWRITE == w->status) )
>          return;
> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
> index 825da48..f21ff10 100644
> --- a/xen/arch/x86/vm_event.c
> +++ b/xen/arch/x86/vm_event.c
> @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d)
>  
>      for_each_vcpu ( d, v )
>      {
> -        if ( v->arch.vm_event )
> +        if ( v->arch.vm_event.emul_read_data )
>              continue;
>  
> -        v->arch.vm_event = xzalloc(struct arch_vm_event);
> +        v->arch.vm_event.emul_read_data =
> +                xzalloc(struct vm_event_emul_read_data);
>  
> -        if ( !v->arch.vm_event )
> +        if ( !v->arch.vm_event.emul_read_data )
>              return -ENOMEM;
>      }
>  
> @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d)
>  
>      for_each_vcpu ( d, v )
>      {
> -        xfree(v->arch.vm_event);
> -        v->arch.vm_event = NULL;
> +        v->arch.vm_event.emulate_flags = 0;
> +        xfree(v->arch.vm_event.emul_read_data);
> +        v->arch.vm_event.emul_read_data = NULL;
>      }
>  
>      d->arch.mem_access_emulate_each_rep = 0;
> @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
>  {
>      if ( rsp->flags & VM_EVENT_FLAG_DENY )
>      {
> -        ASSERT(v->arch.vm_event);
> -
>          /* deny flag requires the vCPU to be paused */
>          if ( !atomic_read(&v->vm_event_pause_count) )
>              return;
>  
> -        v->arch.vm_event->write_data.status = MWS_NOWRITE;
> +        v->arch.vm_event.write_data.status = MWS_NOWRITE;
>      }
>  }
>  
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index a22ee6b..7ea5c8f 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -259,21 +259,6 @@ struct pv_domain
>      struct cpuidmasks *cpuidmasks;
>  };
>  
> -enum monitor_write_status
> -{
> -    MWS_NOWRITE = 0,
> -    MWS_MSR,
> -    MWS_CR0,
> -    MWS_CR3,
> -    MWS_CR4,
> -};
> -
> -struct monitor_write_data {
> -    enum monitor_write_status status;
> -    uint32_t msr;
> -    uint64_t value;
> -};
> -
>  struct arch_domain
>  {
>      struct page_info *perdomain_l3_pg;
> @@ -496,6 +481,31 @@ typedef enum __packed {
>      SMAP_CHECK_DISABLED,        /* disable the check */
>  } smap_check_policy_t;
>  
> +enum monitor_write_status
> +{
> +    MWS_NOWRITE = 0,
> +    MWS_MSR,
> +    MWS_CR0,
> +    MWS_CR3,
> +    MWS_CR4,
> +};
> +
> +struct monitor_write_data {
> +    enum monitor_write_status status;
> +    uint32_t msr;
> +    uint64_t value;
> +};
> +
> +/*
> + * Should we emulate the next matching instruction on VCPU resume
> + * after a vm_event?
> + */
> +struct arch_vm_event {
> +    uint32_t emulate_flags;
> +    struct vm_event_emul_read_data *emul_read_data;
> +    struct monitor_write_data write_data;
> +};
> +
>  struct arch_vcpu
>  {
>      /*
> @@ -569,7 +579,7 @@ struct arch_vcpu
>      /* A secondary copy of the vcpu time info. */
>      XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>  
> -    struct arch_vm_event *vm_event;
> +    struct arch_vm_event vm_event;
>  };
>  
>  smap_check_policy_t smap_policy_change(struct vcpu *v,
> diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
> index 1068376..984ac4c 100644
> --- a/xen/include/asm-x86/monitor.h
> +++ b/xen/include/asm-x86/monitor.h
> @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop)
>           * Enabling mem_access_emulate_each_rep without a vm_event subscriber
>           * is meaningless.
>           */
> -        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
> +        if ( d->max_vcpus && d->vcpu[0] &&
> +             d->vcpu[0]->arch.vm_event.emul_read_data )

Again, I won't hold the patch over this, but if there are additional
reviews that require changes and cause another version of it, please add
a small line to the comment above the if, stating that emul_read_data
only gets allocated when vm_event gets enabled, otherwise (especially
for newcomers) that check might look confusing.

Otherwise:

Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>


Thanks,
Razvan
Corneliu ZUZU July 1, 2016, 6:56 a.m. UTC | #2
On 7/1/2016 9:47 AM, Razvan Cojocaru wrote:
> On 06/30/16 21:45, Corneliu ZUZU wrote:
>> The arch_vm_event structure is dynamically allocated and freed @
>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>> discards any information that was in arch_vm_event.write_data.
>>
>> But this can yield unexpected behavior since if a CR-write was awaiting to be
>> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
>> before xc_monitor_disable is called, then the domain CR write is wrongfully
>> ignored, which of course, in these cases, can easily render a domain crash.
>>
>> To fix the issue, this patch makes only arch_vm_event.emul_read_data dynamically
>> allocated instead of the whole arch_vm_event structure. With this we can avoid
>> invalidation of an awaiting arch_vm_event.write_data by selectively cleaning up
>> only emul_read_data and emulate_flags @ vm_event_cleanup_domain.
>>
>> Small note: arch_vm_event structure definition needed to be moved from
>> asm-x86/vm_event.h to asm-x86/domain.h in the process.
>>
>> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
>> ---
>>   xen/arch/x86/domain.c          |  5 ++--
>>   xen/arch/x86/hvm/emulate.c     |  8 +++---
>>   xen/arch/x86/hvm/hvm.c         | 62 ++++++++++++++++++------------------------
>>   xen/arch/x86/mm/p2m.c          |  4 +--
>>   xen/arch/x86/monitor.c         |  7 +----
>>   xen/arch/x86/vm_event.c        | 16 +++++------
>>   xen/include/asm-x86/domain.h   | 42 +++++++++++++++++-----------
>>   xen/include/asm-x86/monitor.h  |  3 +-
>>   xen/include/asm-x86/vm_event.h | 10 -------
>>   9 files changed, 73 insertions(+), 84 deletions(-)
>>
>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>> index bb59247..06e68ae 100644
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v)
>>   
>>   void vcpu_destroy(struct vcpu *v)
>>   {
>> -    xfree(v->arch.vm_event);
>> -    v->arch.vm_event = NULL;
>> +    v->arch.vm_event.emulate_flags = 0;
>> +    xfree(v->arch.vm_event.emul_read_data);
>> +    v->arch.vm_event.emul_read_data = NULL;
>>   
>>       if ( is_pv_32bit_vcpu(v) )
>>       {
>> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
>> index 855af4d..68f5515 100644
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -73,12 +73,12 @@ static int set_context_data(void *buffer, unsigned int size)
>>   {
>>       struct vcpu *curr = current;
>>   
>> -    if ( curr->arch.vm_event )
>> +    if ( curr->arch.vm_event.emul_read_data )
>>       {
>>           unsigned int safe_size =
>> -            min(size, curr->arch.vm_event->emul_read_data.size);
>> +            min(size, curr->arch.vm_event.emul_read_data->size);
>>   
>> -        memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size);
>> +        memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size);
>>           memset(buffer + safe_size, 0, size - safe_size);
>>           return X86EMUL_OKAY;
>>       }
>> @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear(
>>        * vm_event being triggered for repeated writes to a whole page.
>>        */
>>       if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) &&
>> -         current->arch.vm_event->emulate_flags != 0 )
>> +         current->arch.vm_event.emulate_flags != 0 )
>>          max_reps = 1;
>>   
>>       /*
>> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
>> index 884ae40..03dffb8 100644
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>       if ( !handle_hvm_io_completion(v) )
>>           return;
>>   
>> -    if ( unlikely(v->arch.vm_event) )
>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>       {
>> -        if ( v->arch.vm_event->emulate_flags )
>> -        {
>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>> +        enum emul_kind kind;
>>   
>> -            if ( v->arch.vm_event->emulate_flags &
>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>> -                kind = EMUL_KIND_SET_CONTEXT;
>> -            else if ( v->arch.vm_event->emulate_flags &
>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>> -                kind = EMUL_KIND_NOWRITE;
>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>   
>> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>> +        kind = EMUL_KIND_NORMAL;
> Why do the "kind = EMUL_KIND_NORMAL;" assignment separately after the
> ASSERT()? Could it not be left the same way as before ("enum emul_kind
> kind = EMUL_KIND_NORMAL;") above the ASSERT()?
>
> It's not a big change and I won't hold the patch over it, but small
> changes add up in the review process so unnecessary changes are best
> either avoided, or done in a standalone cleanup patch.
>
>> -            v->arch.vm_event->emulate_flags = 0;
>> -        }
>> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>> +            kind = EMUL_KIND_SET_CONTEXT;
>> +        else if ( v->arch.vm_event.emulate_flags &
>> +                  VM_EVENT_FLAG_EMULATE_NOWRITE )
>> +            kind = EMUL_KIND_NOWRITE;
>> +
>> +        hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>> +                                   HVM_DELIVER_NO_ERROR_CODE);
>> +
>> +        v->arch.vm_event.emulate_flags = 0;
>>       }
>>   
>>       arch_monitor_write_data(v);
>> @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t may_defer)
>>       if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>                                  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
>>       {
>> -        ASSERT(v->arch.vm_event);
>> -
>>           if ( hvm_monitor_crX(CR0, value, old_value) )
>>           {
>>               /*
>>                * The actual write will occur in arch_monitor_write_data(), if
>>                * permitted.
>>                */
>> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>> -            v->arch.vm_event->write_data.status = MWS_CR0;
>> -            v->arch.vm_event->write_data.value = value;
>> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>> +            v->arch.vm_event.write_data.status = MWS_CR0;
>> +            v->arch.vm_event.write_data.value = value;
>>   
>>               return X86EMUL_OKAY;
>>           }
>> @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
>>       if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>                                  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
>>       {
>> -        ASSERT(v->arch.vm_event);
>> -
>>           if ( hvm_monitor_crX(CR3, value, old) )
>>           {
>>               /*
>>                * The actual write will occur in arch_monitor_write_data(), if
>>                * permitted.
>>                */
>> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>> -            v->arch.vm_event->write_data.status = MWS_CR3;
>> -            v->arch.vm_event->write_data.value = value;
>> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>> +            v->arch.vm_event.write_data.status = MWS_CR3;
>> +            v->arch.vm_event.write_data.value = value;
>>   
>>               return X86EMUL_OKAY;
>>           }
>> @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t may_defer)
>>       if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>                                  monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
>>       {
>> -        ASSERT(v->arch.vm_event);
>> -
>>           if ( hvm_monitor_crX(CR4, value, old_cr) )
>>           {
>>               /*
>>                * The actual write will occur in arch_monitor_write_data(), if
>>                * permitted.
>>                */
>> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>> -            v->arch.vm_event->write_data.status = MWS_CR4;
>> -            v->arch.vm_event->write_data.value = value;
>> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>> +            v->arch.vm_event.write_data.status = MWS_CR4;
>> +            v->arch.vm_event.write_data.value = value;
>>   
>>               return X86EMUL_OKAY;
>>           }
>> @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
>>   
>>       if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
>>       {
>> -        ASSERT(v->arch.vm_event);
>> -
>>           /*
>>            * The actual write will occur in arch_monitor_write_data(), if
>>            * permitted.
>>            */
>> -        ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>> -        v->arch.vm_event->write_data.status = MWS_MSR;
>> -        v->arch.vm_event->write_data.msr = msr;
>> -        v->arch.vm_event->write_data.value = msr_content;
>> +        ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>> +        v->arch.vm_event.write_data.status = MWS_MSR;
>> +        v->arch.vm_event.write_data.msr = msr;
>> +        v->arch.vm_event.write_data.value = msr_content;
>>   
>>           hvm_monitor_msr(msr, msr_content);
>>           return X86EMUL_OKAY;
>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
>> index 16733a4..9bcaa8a 100644
>> --- a/xen/arch/x86/mm/p2m.c
>> +++ b/xen/arch/x86/mm/p2m.c
>> @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu *v,
>>               }
>>           }
>>   
>> -        v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
>> +        v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
>>   
>>           if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
>> -            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
>> +            *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data;
>>       }
>>   }
>>   
>> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
>> index 5c8d4da..88d14ae 100644
>> --- a/xen/arch/x86/monitor.c
>> +++ b/xen/arch/x86/monitor.c
>> @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d)
>>   
>>   void arch_monitor_write_data(struct vcpu *v)
>>   {
>> -    struct monitor_write_data *w;
>> -
>> -    if ( likely(!v->arch.vm_event) )
>> -        return;
>> -
>> -    w = &v->arch.vm_event->write_data;
>> +    struct monitor_write_data *w = &v->arch.vm_event.write_data;
>>   
>>       if ( likely(MWS_NOWRITE == w->status) )
>>           return;
>> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
>> index 825da48..f21ff10 100644
>> --- a/xen/arch/x86/vm_event.c
>> +++ b/xen/arch/x86/vm_event.c
>> @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d)
>>   
>>       for_each_vcpu ( d, v )
>>       {
>> -        if ( v->arch.vm_event )
>> +        if ( v->arch.vm_event.emul_read_data )
>>               continue;
>>   
>> -        v->arch.vm_event = xzalloc(struct arch_vm_event);
>> +        v->arch.vm_event.emul_read_data =
>> +                xzalloc(struct vm_event_emul_read_data);
>>   
>> -        if ( !v->arch.vm_event )
>> +        if ( !v->arch.vm_event.emul_read_data )
>>               return -ENOMEM;
>>       }
>>   
>> @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d)
>>   
>>       for_each_vcpu ( d, v )
>>       {
>> -        xfree(v->arch.vm_event);
>> -        v->arch.vm_event = NULL;
>> +        v->arch.vm_event.emulate_flags = 0;
>> +        xfree(v->arch.vm_event.emul_read_data);
>> +        v->arch.vm_event.emul_read_data = NULL;
>>       }
>>   
>>       d->arch.mem_access_emulate_each_rep = 0;
>> @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
>>   {
>>       if ( rsp->flags & VM_EVENT_FLAG_DENY )
>>       {
>> -        ASSERT(v->arch.vm_event);
>> -
>>           /* deny flag requires the vCPU to be paused */
>>           if ( !atomic_read(&v->vm_event_pause_count) )
>>               return;
>>   
>> -        v->arch.vm_event->write_data.status = MWS_NOWRITE;
>> +        v->arch.vm_event.write_data.status = MWS_NOWRITE;
>>       }
>>   }
>>   
>> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
>> index a22ee6b..7ea5c8f 100644
>> --- a/xen/include/asm-x86/domain.h
>> +++ b/xen/include/asm-x86/domain.h
>> @@ -259,21 +259,6 @@ struct pv_domain
>>       struct cpuidmasks *cpuidmasks;
>>   };
>>   
>> -enum monitor_write_status
>> -{
>> -    MWS_NOWRITE = 0,
>> -    MWS_MSR,
>> -    MWS_CR0,
>> -    MWS_CR3,
>> -    MWS_CR4,
>> -};
>> -
>> -struct monitor_write_data {
>> -    enum monitor_write_status status;
>> -    uint32_t msr;
>> -    uint64_t value;
>> -};
>> -
>>   struct arch_domain
>>   {
>>       struct page_info *perdomain_l3_pg;
>> @@ -496,6 +481,31 @@ typedef enum __packed {
>>       SMAP_CHECK_DISABLED,        /* disable the check */
>>   } smap_check_policy_t;
>>   
>> +enum monitor_write_status
>> +{
>> +    MWS_NOWRITE = 0,
>> +    MWS_MSR,
>> +    MWS_CR0,
>> +    MWS_CR3,
>> +    MWS_CR4,
>> +};
>> +
>> +struct monitor_write_data {
>> +    enum monitor_write_status status;
>> +    uint32_t msr;
>> +    uint64_t value;
>> +};
>> +
>> +/*
>> + * Should we emulate the next matching instruction on VCPU resume
>> + * after a vm_event?
>> + */
>> +struct arch_vm_event {
>> +    uint32_t emulate_flags;
>> +    struct vm_event_emul_read_data *emul_read_data;
>> +    struct monitor_write_data write_data;
>> +};
>> +
>>   struct arch_vcpu
>>   {
>>       /*
>> @@ -569,7 +579,7 @@ struct arch_vcpu
>>       /* A secondary copy of the vcpu time info. */
>>       XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>>   
>> -    struct arch_vm_event *vm_event;
>> +    struct arch_vm_event vm_event;
>>   };
>>   
>>   smap_check_policy_t smap_policy_change(struct vcpu *v,
>> diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
>> index 1068376..984ac4c 100644
>> --- a/xen/include/asm-x86/monitor.h
>> +++ b/xen/include/asm-x86/monitor.h
>> @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop)
>>            * Enabling mem_access_emulate_each_rep without a vm_event subscriber
>>            * is meaningless.
>>            */
>> -        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
>> +        if ( d->max_vcpus && d->vcpu[0] &&
>> +             d->vcpu[0]->arch.vm_event.emul_read_data )
> Again, I won't hold the patch over this, but if there are additional
> reviews that require changes and cause another version of it, please add
> a small line to the comment above the if, stating that emul_read_data
> only gets allocated when vm_event gets enabled, otherwise (especially
> for newcomers) that check might look confusing.
>
> Otherwise:
>
> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
>
>
> Thanks,
> Razvan
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

Since that came up wouldn't it be even nicer if we add a:

#define vm_event_initialized_on_vcpu(v)     (NULL != 
(v)->arch.vm_event.emul_read_data)

in asm-x86/vm_event.h above vm_event_init_domain and use that everywhere 
instead?

Corneliu.
Razvan Cojocaru July 1, 2016, 6:59 a.m. UTC | #3
On 07/01/16 09:56, Corneliu ZUZU wrote:
> On 7/1/2016 9:47 AM, Razvan Cojocaru wrote:
>> On 06/30/16 21:45, Corneliu ZUZU wrote:
>>> The arch_vm_event structure is dynamically allocated and freed @
>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the
>>> toolstack user
>>> disables domain monitoring (xc_monitor_disable), which in turn
>>> effectively
>>> discards any information that was in arch_vm_event.write_data.
>>>
>>> But this can yield unexpected behavior since if a CR-write was
>>> awaiting to be
>>> committed on the scheduling tail
>>> (hvm_do_resume->arch_monitor_write_data)
>>> before xc_monitor_disable is called, then the domain CR write is
>>> wrongfully
>>> ignored, which of course, in these cases, can easily render a domain
>>> crash.
>>>
>>> To fix the issue, this patch makes only arch_vm_event.emul_read_data
>>> dynamically
>>> allocated instead of the whole arch_vm_event structure. With this we
>>> can avoid
>>> invalidation of an awaiting arch_vm_event.write_data by selectively
>>> cleaning up
>>> only emul_read_data and emulate_flags @ vm_event_cleanup_domain.
>>>
>>> Small note: arch_vm_event structure definition needed to be moved from
>>> asm-x86/vm_event.h to asm-x86/domain.h in the process.
>>>
>>> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
>>> ---
>>>   xen/arch/x86/domain.c          |  5 ++--
>>>   xen/arch/x86/hvm/emulate.c     |  8 +++---
>>>   xen/arch/x86/hvm/hvm.c         | 62
>>> ++++++++++++++++++------------------------
>>>   xen/arch/x86/mm/p2m.c          |  4 +--
>>>   xen/arch/x86/monitor.c         |  7 +----
>>>   xen/arch/x86/vm_event.c        | 16 +++++------
>>>   xen/include/asm-x86/domain.h   | 42 +++++++++++++++++-----------
>>>   xen/include/asm-x86/monitor.h  |  3 +-
>>>   xen/include/asm-x86/vm_event.h | 10 -------
>>>   9 files changed, 73 insertions(+), 84 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>>> index bb59247..06e68ae 100644
>>> --- a/xen/arch/x86/domain.c
>>> +++ b/xen/arch/x86/domain.c
>>> @@ -492,8 +492,9 @@ int vcpu_initialise(struct vcpu *v)
>>>     void vcpu_destroy(struct vcpu *v)
>>>   {
>>> -    xfree(v->arch.vm_event);
>>> -    v->arch.vm_event = NULL;
>>> +    v->arch.vm_event.emulate_flags = 0;
>>> +    xfree(v->arch.vm_event.emul_read_data);
>>> +    v->arch.vm_event.emul_read_data = NULL;
>>>         if ( is_pv_32bit_vcpu(v) )
>>>       {
>>> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
>>> index 855af4d..68f5515 100644
>>> --- a/xen/arch/x86/hvm/emulate.c
>>> +++ b/xen/arch/x86/hvm/emulate.c
>>> @@ -73,12 +73,12 @@ static int set_context_data(void *buffer,
>>> unsigned int size)
>>>   {
>>>       struct vcpu *curr = current;
>>>   -    if ( curr->arch.vm_event )
>>> +    if ( curr->arch.vm_event.emul_read_data )
>>>       {
>>>           unsigned int safe_size =
>>> -            min(size, curr->arch.vm_event->emul_read_data.size);
>>> +            min(size, curr->arch.vm_event.emul_read_data->size);
>>>   -        memcpy(buffer, curr->arch.vm_event->emul_read_data.data,
>>> safe_size);
>>> +        memcpy(buffer, curr->arch.vm_event.emul_read_data->data,
>>> safe_size);
>>>           memset(buffer + safe_size, 0, size - safe_size);
>>>           return X86EMUL_OKAY;
>>>       }
>>> @@ -523,7 +523,7 @@ static int hvmemul_virtual_to_linear(
>>>        * vm_event being triggered for repeated writes to a whole page.
>>>        */
>>>       if (
>>> unlikely(current->domain->arch.mem_access_emulate_each_rep) &&
>>> -         current->arch.vm_event->emulate_flags != 0 )
>>> +         current->arch.vm_event.emulate_flags != 0 )
>>>          max_reps = 1;
>>>         /*
>>> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
>>> index 884ae40..03dffb8 100644
>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>>       if ( !handle_hvm_io_completion(v) )
>>>           return;
>>>   -    if ( unlikely(v->arch.vm_event) )
>>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>>       {
>>> -        if ( v->arch.vm_event->emulate_flags )
>>> -        {
>>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>>> +        enum emul_kind kind;
>>>   -            if ( v->arch.vm_event->emulate_flags &
>>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>> -                kind = EMUL_KIND_SET_CONTEXT;
>>> -            else if ( v->arch.vm_event->emulate_flags &
>>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>>> -                kind = EMUL_KIND_NOWRITE;
>>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>>   -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>>> +        kind = EMUL_KIND_NORMAL;
>> Why do the "kind = EMUL_KIND_NORMAL;" assignment separately after the
>> ASSERT()? Could it not be left the same way as before ("enum emul_kind
>> kind = EMUL_KIND_NORMAL;") above the ASSERT()?
>>
>> It's not a big change and I won't hold the patch over it, but small
>> changes add up in the review process so unnecessary changes are best
>> either avoided, or done in a standalone cleanup patch.
>>
>>> -            v->arch.vm_event->emulate_flags = 0;
>>> -        }
>>> +        if ( v->arch.vm_event.emulate_flags &
>>> VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>> +            kind = EMUL_KIND_SET_CONTEXT;
>>> +        else if ( v->arch.vm_event.emulate_flags &
>>> +                  VM_EVENT_FLAG_EMULATE_NOWRITE )
>>> +            kind = EMUL_KIND_NOWRITE;
>>> +
>>> +        hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>>> +                                   HVM_DELIVER_NO_ERROR_CODE);
>>> +
>>> +        v->arch.vm_event.emulate_flags = 0;
>>>       }
>>>         arch_monitor_write_data(v);
>>> @@ -2178,17 +2178,15 @@ int hvm_set_cr0(unsigned long value, bool_t
>>> may_defer)
>>>       if ( may_defer &&
>>> unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>>                                 
>>> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
>>>       {
>>> -        ASSERT(v->arch.vm_event);
>>> -
>>>           if ( hvm_monitor_crX(CR0, value, old_value) )
>>>           {
>>>               /*
>>>                * The actual write will occur in
>>> arch_monitor_write_data(), if
>>>                * permitted.
>>>                */
>>> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>>> -            v->arch.vm_event->write_data.status = MWS_CR0;
>>> -            v->arch.vm_event->write_data.value = value;
>>> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>>> +            v->arch.vm_event.write_data.status = MWS_CR0;
>>> +            v->arch.vm_event.write_data.value = value;
>>>                 return X86EMUL_OKAY;
>>>           }
>>> @@ -2284,17 +2282,15 @@ int hvm_set_cr3(unsigned long value, bool_t
>>> may_defer)
>>>       if ( may_defer &&
>>> unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>>                                 
>>> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
>>>       {
>>> -        ASSERT(v->arch.vm_event);
>>> -
>>>           if ( hvm_monitor_crX(CR3, value, old) )
>>>           {
>>>               /*
>>>                * The actual write will occur in
>>> arch_monitor_write_data(), if
>>>                * permitted.
>>>                */
>>> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>>> -            v->arch.vm_event->write_data.status = MWS_CR3;
>>> -            v->arch.vm_event->write_data.value = value;
>>> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>>> +            v->arch.vm_event.write_data.status = MWS_CR3;
>>> +            v->arch.vm_event.write_data.value = value;
>>>                 return X86EMUL_OKAY;
>>>           }
>>> @@ -2368,17 +2364,15 @@ int hvm_set_cr4(unsigned long value, bool_t
>>> may_defer)
>>>       if ( may_defer &&
>>> unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>>                                 
>>> monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
>>>       {
>>> -        ASSERT(v->arch.vm_event);
>>> -
>>>           if ( hvm_monitor_crX(CR4, value, old_cr) )
>>>           {
>>>               /*
>>>                * The actual write will occur in
>>> arch_monitor_write_data(), if
>>>                * permitted.
>>>                */
>>> -            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>>> -            v->arch.vm_event->write_data.status = MWS_CR4;
>>> -            v->arch.vm_event->write_data.value = value;
>>> +            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>>> +            v->arch.vm_event.write_data.status = MWS_CR4;
>>> +            v->arch.vm_event.write_data.value = value;
>>>                 return X86EMUL_OKAY;
>>>           }
>>> @@ -3753,16 +3747,14 @@ int hvm_msr_write_intercept(unsigned int msr,
>>> uint64_t msr_content,
>>>         if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
>>>       {
>>> -        ASSERT(v->arch.vm_event);
>>> -
>>>           /*
>>>            * The actual write will occur in
>>> arch_monitor_write_data(), if
>>>            * permitted.
>>>            */
>>> -        ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
>>> -        v->arch.vm_event->write_data.status = MWS_MSR;
>>> -        v->arch.vm_event->write_data.msr = msr;
>>> -        v->arch.vm_event->write_data.value = msr_content;
>>> +        ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
>>> +        v->arch.vm_event.write_data.status = MWS_MSR;
>>> +        v->arch.vm_event.write_data.msr = msr;
>>> +        v->arch.vm_event.write_data.value = msr_content;
>>>             hvm_monitor_msr(msr, msr_content);
>>>           return X86EMUL_OKAY;
>>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
>>> index 16733a4..9bcaa8a 100644
>>> --- a/xen/arch/x86/mm/p2m.c
>>> +++ b/xen/arch/x86/mm/p2m.c
>>> @@ -1639,10 +1639,10 @@ void p2m_mem_access_emulate_check(struct vcpu
>>> *v,
>>>               }
>>>           }
>>>   -        v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
>>> +        v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
>>>             if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
>>> -            v->arch.vm_event->emul_read_data =
>>> rsp->data.emul_read_data;
>>> +            *v->arch.vm_event.emul_read_data =
>>> rsp->data.emul_read_data;
>>>       }
>>>   }
>>>   diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
>>> index 5c8d4da..88d14ae 100644
>>> --- a/xen/arch/x86/monitor.c
>>> +++ b/xen/arch/x86/monitor.c
>>> @@ -46,12 +46,7 @@ void arch_monitor_cleanup_domain(struct domain *d)
>>>     void arch_monitor_write_data(struct vcpu *v)
>>>   {
>>> -    struct monitor_write_data *w;
>>> -
>>> -    if ( likely(!v->arch.vm_event) )
>>> -        return;
>>> -
>>> -    w = &v->arch.vm_event->write_data;
>>> +    struct monitor_write_data *w = &v->arch.vm_event.write_data;
>>>         if ( likely(MWS_NOWRITE == w->status) )
>>>           return;
>>> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
>>> index 825da48..f21ff10 100644
>>> --- a/xen/arch/x86/vm_event.c
>>> +++ b/xen/arch/x86/vm_event.c
>>> @@ -30,12 +30,13 @@ int vm_event_init_domain(struct domain *d)
>>>         for_each_vcpu ( d, v )
>>>       {
>>> -        if ( v->arch.vm_event )
>>> +        if ( v->arch.vm_event.emul_read_data )
>>>               continue;
>>>   -        v->arch.vm_event = xzalloc(struct arch_vm_event);
>>> +        v->arch.vm_event.emul_read_data =
>>> +                xzalloc(struct vm_event_emul_read_data);
>>>   -        if ( !v->arch.vm_event )
>>> +        if ( !v->arch.vm_event.emul_read_data )
>>>               return -ENOMEM;
>>>       }
>>>   @@ -52,8 +53,9 @@ void vm_event_cleanup_domain(struct domain *d)
>>>         for_each_vcpu ( d, v )
>>>       {
>>> -        xfree(v->arch.vm_event);
>>> -        v->arch.vm_event = NULL;
>>> +        v->arch.vm_event.emulate_flags = 0;
>>> +        xfree(v->arch.vm_event.emul_read_data);
>>> +        v->arch.vm_event.emul_read_data = NULL;
>>>       }
>>>         d->arch.mem_access_emulate_each_rep = 0;
>>> @@ -73,13 +75,11 @@ void vm_event_register_write_resume(struct vcpu
>>> *v, vm_event_response_t *rsp)
>>>   {
>>>       if ( rsp->flags & VM_EVENT_FLAG_DENY )
>>>       {
>>> -        ASSERT(v->arch.vm_event);
>>> -
>>>           /* deny flag requires the vCPU to be paused */
>>>           if ( !atomic_read(&v->vm_event_pause_count) )
>>>               return;
>>>   -        v->arch.vm_event->write_data.status = MWS_NOWRITE;
>>> +        v->arch.vm_event.write_data.status = MWS_NOWRITE;
>>>       }
>>>   }
>>>   diff --git a/xen/include/asm-x86/domain.h
>>> b/xen/include/asm-x86/domain.h
>>> index a22ee6b..7ea5c8f 100644
>>> --- a/xen/include/asm-x86/domain.h
>>> +++ b/xen/include/asm-x86/domain.h
>>> @@ -259,21 +259,6 @@ struct pv_domain
>>>       struct cpuidmasks *cpuidmasks;
>>>   };
>>>   -enum monitor_write_status
>>> -{
>>> -    MWS_NOWRITE = 0,
>>> -    MWS_MSR,
>>> -    MWS_CR0,
>>> -    MWS_CR3,
>>> -    MWS_CR4,
>>> -};
>>> -
>>> -struct monitor_write_data {
>>> -    enum monitor_write_status status;
>>> -    uint32_t msr;
>>> -    uint64_t value;
>>> -};
>>> -
>>>   struct arch_domain
>>>   {
>>>       struct page_info *perdomain_l3_pg;
>>> @@ -496,6 +481,31 @@ typedef enum __packed {
>>>       SMAP_CHECK_DISABLED,        /* disable the check */
>>>   } smap_check_policy_t;
>>>   +enum monitor_write_status
>>> +{
>>> +    MWS_NOWRITE = 0,
>>> +    MWS_MSR,
>>> +    MWS_CR0,
>>> +    MWS_CR3,
>>> +    MWS_CR4,
>>> +};
>>> +
>>> +struct monitor_write_data {
>>> +    enum monitor_write_status status;
>>> +    uint32_t msr;
>>> +    uint64_t value;
>>> +};
>>> +
>>> +/*
>>> + * Should we emulate the next matching instruction on VCPU resume
>>> + * after a vm_event?
>>> + */
>>> +struct arch_vm_event {
>>> +    uint32_t emulate_flags;
>>> +    struct vm_event_emul_read_data *emul_read_data;
>>> +    struct monitor_write_data write_data;
>>> +};
>>> +
>>>   struct arch_vcpu
>>>   {
>>>       /*
>>> @@ -569,7 +579,7 @@ struct arch_vcpu
>>>       /* A secondary copy of the vcpu time info. */
>>>       XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>>>   -    struct arch_vm_event *vm_event;
>>> +    struct arch_vm_event vm_event;
>>>   };
>>>     smap_check_policy_t smap_policy_change(struct vcpu *v,
>>> diff --git a/xen/include/asm-x86/monitor.h
>>> b/xen/include/asm-x86/monitor.h
>>> index 1068376..984ac4c 100644
>>> --- a/xen/include/asm-x86/monitor.h
>>> +++ b/xen/include/asm-x86/monitor.h
>>> @@ -48,7 +48,8 @@ int arch_monitor_domctl_op(struct domain *d, struct
>>> xen_domctl_monitor_op *mop)
>>>            * Enabling mem_access_emulate_each_rep without a vm_event
>>> subscriber
>>>            * is meaningless.
>>>            */
>>> -        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
>>> +        if ( d->max_vcpus && d->vcpu[0] &&
>>> +             d->vcpu[0]->arch.vm_event.emul_read_data )
>> Again, I won't hold the patch over this, but if there are additional
>> reviews that require changes and cause another version of it, please add
>> a small line to the comment above the if, stating that emul_read_data
>> only gets allocated when vm_event gets enabled, otherwise (especially
>> for newcomers) that check might look confusing.
>>
>> Otherwise:
>>
>> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
>>
>>
>> Thanks,
>> Razvan
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
> 
> Since that came up wouldn't it be even nicer if we add a:
> 
> #define vm_event_initialized_on_vcpu(v)     (NULL !=
> (v)->arch.vm_event.emul_read_data)
> 
> in asm-x86/vm_event.h above vm_event_init_domain and use that everywhere
> instead?

Yes, I think that's the best way to go about that.


Thanks,
Razvan
Jan Beulich July 4, 2016, 12:47 p.m. UTC | #4
>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
> The arch_vm_event structure is dynamically allocated and freed @
> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
> disables domain monitoring (xc_monitor_disable), which in turn effectively
> discards any information that was in arch_vm_event.write_data.

Isn't that rather a toolstack user bug, not warranting a relatively
extensive (even if mostly mechanical) hypervisor change like this
one? Sane monitor behavior, after all, is required anyway for the
monitored guest to survive.

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>      if ( !handle_hvm_io_completion(v) )
>          return;
>  
> -    if ( unlikely(v->arch.vm_event) )
> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>      {
> -        if ( v->arch.vm_event->emulate_flags )
> -        {
> -            enum emul_kind kind = EMUL_KIND_NORMAL;
> +        enum emul_kind kind;
>  
> -            if ( v->arch.vm_event->emulate_flags &
> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
> -                kind = EMUL_KIND_SET_CONTEXT;
> -            else if ( v->arch.vm_event->emulate_flags &
> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
> -                kind = EMUL_KIND_NOWRITE;
> +        ASSERT(v->arch.vm_event.emul_read_data);
>  
> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
> -                                       HVM_DELIVER_NO_ERROR_CODE);
> +        kind = EMUL_KIND_NORMAL;

Please keep this being the initializer of the variable.

>  
> -            v->arch.vm_event->emulate_flags = 0;
> -        }
> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )

Long line.

> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -259,21 +259,6 @@ struct pv_domain
>      struct cpuidmasks *cpuidmasks;
>  };
>  
> -enum monitor_write_status
> -{
> -    MWS_NOWRITE = 0,
> -    MWS_MSR,
> -    MWS_CR0,
> -    MWS_CR3,
> -    MWS_CR4,
> -};
> -
> -struct monitor_write_data {
> -    enum monitor_write_status status;
> -    uint32_t msr;
> -    uint64_t value;
> -};
> -
>  struct arch_domain
>  {
>      struct page_info *perdomain_l3_pg;
> @@ -496,6 +481,31 @@ typedef enum __packed {
>      SMAP_CHECK_DISABLED,        /* disable the check */
>  } smap_check_policy_t;
>  
> +enum monitor_write_status
> +{
> +    MWS_NOWRITE = 0,
> +    MWS_MSR,
> +    MWS_CR0,
> +    MWS_CR3,
> +    MWS_CR4,
> +};
> +
> +struct monitor_write_data {
> +    enum monitor_write_status status;
> +    uint32_t msr;
> +    uint64_t value;
> +};

Instead of moving these around now, may I suggest you put them
into their final place right away in the previous patch?

Jan
Corneliu ZUZU July 4, 2016, 1:03 p.m. UTC | #5
On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>> The arch_vm_event structure is dynamically allocated and freed @
>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>> discards any information that was in arch_vm_event.write_data.
> Isn't that rather a toolstack user bug, not warranting a relatively
> extensive (even if mostly mechanical) hypervisor change like this
> one? Sane monitor behavior, after all, is required anyway for the
> monitored guest to survive.

Sorry but could you please rephrase this, I don't quite understand what 
you're saying.
The write_data field in arch_vm_event should _not ever_ be invalidated 
as a direct result of a toolstack user's action.

>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>       if ( !handle_hvm_io_completion(v) )
>>           return;
>>   
>> -    if ( unlikely(v->arch.vm_event) )
>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>       {
>> -        if ( v->arch.vm_event->emulate_flags )
>> -        {
>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>> +        enum emul_kind kind;
>>   
>> -            if ( v->arch.vm_event->emulate_flags &
>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>> -                kind = EMUL_KIND_SET_CONTEXT;
>> -            else if ( v->arch.vm_event->emulate_flags &
>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>> -                kind = EMUL_KIND_NOWRITE;
>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>   
>> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>> +        kind = EMUL_KIND_NORMAL;
> Please keep this being the initializer of the variable.

I put it there because of the ASSERT (to do that before anything else), 
but I will undo if you prefer.

>
>>   
>> -            v->arch.vm_event->emulate_flags = 0;
>> -        }
>> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
> Long line.

Long but under 80 columns, isn't that the rule? :-)

>
>> --- a/xen/include/asm-x86/domain.h
>> +++ b/xen/include/asm-x86/domain.h
>> @@ -259,21 +259,6 @@ struct pv_domain
>>       struct cpuidmasks *cpuidmasks;
>>   };
>>   
>> -enum monitor_write_status
>> -{
>> -    MWS_NOWRITE = 0,
>> -    MWS_MSR,
>> -    MWS_CR0,
>> -    MWS_CR3,
>> -    MWS_CR4,
>> -};
>> -
>> -struct monitor_write_data {
>> -    enum monitor_write_status status;
>> -    uint32_t msr;
>> -    uint64_t value;
>> -};
>> -
>>   struct arch_domain
>>   {
>>       struct page_info *perdomain_l3_pg;
>> @@ -496,6 +481,31 @@ typedef enum __packed {
>>       SMAP_CHECK_DISABLED,        /* disable the check */
>>   } smap_check_policy_t;
>>   
>> +enum monitor_write_status
>> +{
>> +    MWS_NOWRITE = 0,
>> +    MWS_MSR,
>> +    MWS_CR0,
>> +    MWS_CR3,
>> +    MWS_CR4,
>> +};
>> +
>> +struct monitor_write_data {
>> +    enum monitor_write_status status;
>> +    uint32_t msr;
>> +    uint64_t value;
>> +};
> Instead of moving these around now, may I suggest you put them
> into their final place right away in the previous patch?
>
> Jan
>
>

Sounds good, will do.

Corneliu.
Jan Beulich July 4, 2016, 1:11 p.m. UTC | #6
>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>> The arch_vm_event structure is dynamically allocated and freed @
>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>> discards any information that was in arch_vm_event.write_data.
>> Isn't that rather a toolstack user bug, not warranting a relatively
>> extensive (even if mostly mechanical) hypervisor change like this
>> one? Sane monitor behavior, after all, is required anyway for the
>> monitored guest to survive.
> 
> Sorry but could you please rephrase this, I don't quite understand what 
> you're saying.
> The write_data field in arch_vm_event should _not ever_ be invalidated 
> as a direct result of a toolstack user's action.

The monitoring app can cause all kinds of problems to the guest it
monitors. Why would this specific one need taking care of in the
hypervisor, instead of demanding that the app not disable monitoring
at the wrong time?

>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>>       if ( !handle_hvm_io_completion(v) )
>>>           return;
>>>   
>>> -    if ( unlikely(v->arch.vm_event) )
>>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>>       {
>>> -        if ( v->arch.vm_event->emulate_flags )
>>> -        {
>>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>>> +        enum emul_kind kind;
>>>   
>>> -            if ( v->arch.vm_event->emulate_flags &
>>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>> -                kind = EMUL_KIND_SET_CONTEXT;
>>> -            else if ( v->arch.vm_event->emulate_flags &
>>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>>> -                kind = EMUL_KIND_NOWRITE;
>>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>>   
>>> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>>> +        kind = EMUL_KIND_NORMAL;
>> Please keep this being the initializer of the variable.
> 
> I put it there because of the ASSERT (to do that before anything else), 
> but I will undo if you prefer.

Since the initializer is (very obviously) independent of the
condition the ASSERT() checks, I indeed would prefer it to remain
the way it is before this change.

>>> -            v->arch.vm_event->emulate_flags = 0;
>>> -        }
>>> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>> Long line.
> 
> Long but under 80 columns, isn't that the rule? :-)

I've counted 81 here.

Jan
Corneliu ZUZU July 4, 2016, 1:28 p.m. UTC | #7
On 7/4/2016 4:11 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>> discards any information that was in arch_vm_event.write_data.
>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>> extensive (even if mostly mechanical) hypervisor change like this
>>> one? Sane monitor behavior, after all, is required anyway for the
>>> monitored guest to survive.
>> Sorry but could you please rephrase this, I don't quite understand what
>> you're saying.
>> The write_data field in arch_vm_event should _not ever_ be invalidated
>> as a direct result of a toolstack user's action.
> The monitoring app can cause all kinds of problems to the guest it
> monitors. Why would this specific one need taking care of in the
> hypervisor, instead of demanding that the app not disable monitoring
> at the wrong time?

Because it wouldn't be the wrong time to disable monitoring.
This is not a case of wrong toolstack usage, but rather a case of 
xc_monitor_disable doing the wrong thing along the way.

>
>>>> --- a/xen/arch/x86/hvm/hvm.c
>>>> +++ b/xen/arch/x86/hvm/hvm.c
>>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>>>        if ( !handle_hvm_io_completion(v) )
>>>>            return;
>>>>    
>>>> -    if ( unlikely(v->arch.vm_event) )
>>>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>>>        {
>>>> -        if ( v->arch.vm_event->emulate_flags )
>>>> -        {
>>>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>>>> +        enum emul_kind kind;
>>>>    
>>>> -            if ( v->arch.vm_event->emulate_flags &
>>>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>>> -                kind = EMUL_KIND_SET_CONTEXT;
>>>> -            else if ( v->arch.vm_event->emulate_flags &
>>>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>>>> -                kind = EMUL_KIND_NOWRITE;
>>>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>>>    
>>>> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>>>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>>>> +        kind = EMUL_KIND_NORMAL;
>>> Please keep this being the initializer of the variable.
>> I put it there because of the ASSERT (to do that before anything else),
>> but I will undo if you prefer.
> Since the initializer is (very obviously) independent of the
> condition the ASSERT() checks, I indeed would prefer it to remain
> the way it is before this change.
>
>>>> -            v->arch.vm_event->emulate_flags = 0;
>>>> -        }
>>>> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>> Long line.
>> Long but under 80 columns, isn't that the rule? :-)
> I've counted 81 here.
>
> Jan

You may have counted the beginning '+' as well. Is the rule "<= 80 
columns in the source file" (in which case you're wrong) or is it "<= 80 
columns in the resulting diff" (in which case I'm wrong)?

Corneliu.
Razvan Cojocaru July 4, 2016, 1:50 p.m. UTC | #8
On 07/04/16 16:11, Jan Beulich wrote:
>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>> discards any information that was in arch_vm_event.write_data.
>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>> extensive (even if mostly mechanical) hypervisor change like this
>>> one? Sane monitor behavior, after all, is required anyway for the
>>> monitored guest to survive.
>>
>> Sorry but could you please rephrase this, I don't quite understand what 
>> you're saying.
>> The write_data field in arch_vm_event should _not ever_ be invalidated 
>> as a direct result of a toolstack user's action.
> 
> The monitoring app can cause all kinds of problems to the guest it
> monitors. Why would this specific one need taking care of in the
> hypervisor, instead of demanding that the app not disable monitoring
> at the wrong time?

I'm not sure there's a right time here. The problem is that, if I
understand this correctly, a race is possible between the moment the
userspace application responds to the vm_event _and_ call
xc_monitor_disable() and the time hvm_do_resume() gets called.

If xc_monitor_disable() happened before hvm_do_resume() springs into
action, we lose a register write. There's no guaranteed way for this not
to happen as far as I can see, although it's true that the race should
pretty much never happen in practice - at least we've never come across
such a case so far.


Thanks,
Razvan
Corneliu ZUZU July 4, 2016, 2:05 p.m. UTC | #9
On 7/4/2016 4:50 PM, Razvan Cojocaru wrote:
> On 07/04/16 16:11, Jan Beulich wrote:
>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>> discards any information that was in arch_vm_event.write_data.
>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>> monitored guest to survive.
>>> Sorry but could you please rephrase this, I don't quite understand what
>>> you're saying.
>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>> as a direct result of a toolstack user's action.
>> The monitoring app can cause all kinds of problems to the guest it
>> monitors. Why would this specific one need taking care of in the
>> hypervisor, instead of demanding that the app not disable monitoring
>> at the wrong time?
> I'm not sure there's a right time here. The problem is that, if I
> understand this correctly, a race is possible between the moment the
> userspace application responds to the vm_event _and_ call
> xc_monitor_disable() and the time hvm_do_resume() gets called.
>
> If xc_monitor_disable() happened before hvm_do_resume() springs into
> action, we lose a register write. There's no guaranteed way for this not
> to happen as far as I can see, although it's true that the race should
> pretty much never happen in practice - at least we've never come across
> such a case so far.
>
>
> Thanks,
> Razvan
>

Perfectly pointed, thanks. Note that xc_monitor_disable() may happen 
before hvm_do_resume() because the latter only happens _when the 
scheduler reschedules the target vCPU_, which _may not happen between_ 
the moment the toolstack user _handles the vm-event_ and the moment when 
he _calls xc_monitor_disable()_, but rather after xc_monitor_disable() 
is called.

Corneliu.
Jan Beulich July 4, 2016, 2:13 p.m. UTC | #10
>>> On 04.07.16 at 15:28, <czuzu@bitdefender.com> wrote:
> On 7/4/2016 4:11 PM, Jan Beulich wrote:
>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack 
> user
>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>> discards any information that was in arch_vm_event.write_data.
>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>> monitored guest to survive.
>>> Sorry but could you please rephrase this, I don't quite understand what
>>> you're saying.
>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>> as a direct result of a toolstack user's action.
>> The monitoring app can cause all kinds of problems to the guest it
>> monitors. Why would this specific one need taking care of in the
>> hypervisor, instead of demanding that the app not disable monitoring
>> at the wrong time?
> 
> Because it wouldn't be the wrong time to disable monitoring.
> This is not a case of wrong toolstack usage, but rather a case of 
> xc_monitor_disable doing the wrong thing along the way.
> 
>>
>>>>> --- a/xen/arch/x86/hvm/hvm.c
>>>>> +++ b/xen/arch/x86/hvm/hvm.c
>>>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>>>>        if ( !handle_hvm_io_completion(v) )
>>>>>            return;
>>>>>    
>>>>> -    if ( unlikely(v->arch.vm_event) )
>>>>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>>>>        {
>>>>> -        if ( v->arch.vm_event->emulate_flags )
>>>>> -        {
>>>>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>>>>> +        enum emul_kind kind;
>>>>>    
>>>>> -            if ( v->arch.vm_event->emulate_flags &
>>>>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>>>> -                kind = EMUL_KIND_SET_CONTEXT;
>>>>> -            else if ( v->arch.vm_event->emulate_flags &
>>>>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>>>>> -                kind = EMUL_KIND_NOWRITE;
>>>>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>>>>    
>>>>> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>>>>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>>>>> +        kind = EMUL_KIND_NORMAL;
>>>> Please keep this being the initializer of the variable.
>>> I put it there because of the ASSERT (to do that before anything else),
>>> but I will undo if you prefer.
>> Since the initializer is (very obviously) independent of the
>> condition the ASSERT() checks, I indeed would prefer it to remain
>> the way it is before this change.
>>
>>>>> -            v->arch.vm_event->emulate_flags = 0;
>>>>> -        }
>>>>> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA 
> )
>>>> Long line.
>>> Long but under 80 columns, isn't that the rule? :-)
>> I've counted 81 here.
> 
> You may have counted the beginning '+' as well. Is the rule "<= 80 
> columns in the source file" (in which case you're wrong) or is it "<= 80 
> columns in the resulting diff" (in which case I'm wrong)?

Ah - you first said "under 80" but now say "<= 80". The former
is what ./CODING_STYLE asks for. (And ftr the 81 included the
newline, so indeed I was off by one too.)

Jan
Jan Beulich July 4, 2016, 2:17 p.m. UTC | #11
>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
> On 07/04/16 16:11, Jan Beulich wrote:
>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack 
> user
>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>> discards any information that was in arch_vm_event.write_data.
>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>> monitored guest to survive.
>>>
>>> Sorry but could you please rephrase this, I don't quite understand what 
>>> you're saying.
>>> The write_data field in arch_vm_event should _not ever_ be invalidated 
>>> as a direct result of a toolstack user's action.
>> 
>> The monitoring app can cause all kinds of problems to the guest it
>> monitors. Why would this specific one need taking care of in the
>> hypervisor, instead of demanding that the app not disable monitoring
>> at the wrong time?
> 
> I'm not sure there's a right time here. The problem is that, if I
> understand this correctly, a race is possible between the moment the
> userspace application responds to the vm_event _and_ call
> xc_monitor_disable() and the time hvm_do_resume() gets called.

It's that _and_ in your reply that I put under question, but I admit
that questioning may be due to my limited (or should I say non-
existent) knowledge on the user space parts here: Why would the
app be _required_ to "responds to the vm_event _and_ call
xc_monitor_disable()", rather than delaying the latter for long
enough?

Jan
Razvan Cojocaru July 4, 2016, 2:21 p.m. UTC | #12
On 07/04/16 17:17, Jan Beulich wrote:
>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
>> On 07/04/16 16:11, Jan Beulich wrote:
>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack 
>> user
>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>> monitored guest to survive.
>>>>
>>>> Sorry but could you please rephrase this, I don't quite understand what 
>>>> you're saying.
>>>> The write_data field in arch_vm_event should _not ever_ be invalidated 
>>>> as a direct result of a toolstack user's action.
>>>
>>> The monitoring app can cause all kinds of problems to the guest it
>>> monitors. Why would this specific one need taking care of in the
>>> hypervisor, instead of demanding that the app not disable monitoring
>>> at the wrong time?
>>
>> I'm not sure there's a right time here. The problem is that, if I
>> understand this correctly, a race is possible between the moment the
>> userspace application responds to the vm_event _and_ call
>> xc_monitor_disable() and the time hvm_do_resume() gets called.
> 
> It's that _and_ in your reply that I put under question, but I admit
> that questioning may be due to my limited (or should I say non-
> existent) knowledge on the user space parts here: Why would the
> app be _required_ to "responds to the vm_event _and_ call
> xc_monitor_disable()", rather than delaying the latter for long
> enough?

It's not required to do that, it's just that we don't really know what
"long enough means". I suppose a second should do be more than enough,
but obviously we'd prefer a fool-proof solution with better guarantees
and no lag - I thought that the hypervisor change is trivial enough to
not make the tradeoff into much of an issue.


Thanks,
Razvan
Corneliu ZUZU July 4, 2016, 2:31 p.m. UTC | #13
On 7/4/2016 5:13 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 15:28, <czuzu@bitdefender.com> wrote:
>> On 7/4/2016 4:11 PM, Jan Beulich wrote:
>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack
>> user
>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>> monitored guest to survive.
>>>> Sorry but could you please rephrase this, I don't quite understand what
>>>> you're saying.
>>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>>> as a direct result of a toolstack user's action.
>>> The monitoring app can cause all kinds of problems to the guest it
>>> monitors. Why would this specific one need taking care of in the
>>> hypervisor, instead of demanding that the app not disable monitoring
>>> at the wrong time?
>> Because it wouldn't be the wrong time to disable monitoring.
>> This is not a case of wrong toolstack usage, but rather a case of
>> xc_monitor_disable doing the wrong thing along the way.
>>
>>>>>> --- a/xen/arch/x86/hvm/hvm.c
>>>>>> +++ b/xen/arch/x86/hvm/hvm.c
>>>>>> @@ -473,24 +473,24 @@ void hvm_do_resume(struct vcpu *v)
>>>>>>         if ( !handle_hvm_io_completion(v) )
>>>>>>             return;
>>>>>>     
>>>>>> -    if ( unlikely(v->arch.vm_event) )
>>>>>> +    if ( unlikely(v->arch.vm_event.emulate_flags) )
>>>>>>         {
>>>>>> -        if ( v->arch.vm_event->emulate_flags )
>>>>>> -        {
>>>>>> -            enum emul_kind kind = EMUL_KIND_NORMAL;
>>>>>> +        enum emul_kind kind;
>>>>>>     
>>>>>> -            if ( v->arch.vm_event->emulate_flags &
>>>>>> -                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
>>>>>> -                kind = EMUL_KIND_SET_CONTEXT;
>>>>>> -            else if ( v->arch.vm_event->emulate_flags &
>>>>>> -                      VM_EVENT_FLAG_EMULATE_NOWRITE )
>>>>>> -                kind = EMUL_KIND_NOWRITE;
>>>>>> +        ASSERT(v->arch.vm_event.emul_read_data);
>>>>>>     
>>>>>> -            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
>>>>>> -                                       HVM_DELIVER_NO_ERROR_CODE);
>>>>>> +        kind = EMUL_KIND_NORMAL;
>>>>> Please keep this being the initializer of the variable.
>>>> I put it there because of the ASSERT (to do that before anything else),
>>>> but I will undo if you prefer.
>>> Since the initializer is (very obviously) independent of the
>>> condition the ASSERT() checks, I indeed would prefer it to remain
>>> the way it is before this change.
>>>
>>>>>> -            v->arch.vm_event->emulate_flags = 0;
>>>>>> -        }
>>>>>> +        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA
>> )
>>>>> Long line.
>>>> Long but under 80 columns, isn't that the rule? :-)
>>> I've counted 81 here.
>> You may have counted the beginning '+' as well. Is the rule "<= 80
>> columns in the source file" (in which case you're wrong) or is it "<= 80
>> columns in the resulting diff" (in which case I'm wrong)?
> Ah - you first said "under 80" but now say "<= 80". The former
> is what ./CODING_STYLE asks for. (And ftr the 81 included the
> newline, so indeed I was off by one too.)
>
> Jan

Hmm, I meant "<= 80" the first time too, I was under the impression that 
that's what CODING_STYLE asks for too, but, quoting:
     'Lines should be less than 80 characters in length.'
Why not <= 80? Shouldn't the limit (i.e. "<=") be 80 columns, including 
the 80-th? (this points me otherwise 
http://programmers.stackexchange.com/questions/148677/why-is-80-characters-the-standard-limit-for-code-width 
)

Corneliu.
Corneliu ZUZU July 4, 2016, 2:45 p.m. UTC | #14
On 7/4/2016 5:17 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
>> On 07/04/16 16:11, Jan Beulich wrote:
>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack
>> user
>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>> monitored guest to survive.
>>>> Sorry but could you please rephrase this, I don't quite understand what
>>>> you're saying.
>>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>>> as a direct result of a toolstack user's action.
>>> The monitoring app can cause all kinds of problems to the guest it
>>> monitors. Why would this specific one need taking care of in the
>>> hypervisor, instead of demanding that the app not disable monitoring
>>> at the wrong time?
>> I'm not sure there's a right time here. The problem is that, if I
>> understand this correctly, a race is possible between the moment the
>> userspace application responds to the vm_event _and_ call
>> xc_monitor_disable() and the time hvm_do_resume() gets called.
> It's that _and_ in your reply that I put under question, but I admit
> that questioning may be due to my limited (or should I say non-
> existent) knowledge on the user space parts here: Why would the
> app be _required_ to "responds to the vm_event _and_ call
> xc_monitor_disable()", rather than delaying the latter for long
> enough?
>
> Jan
>
>

Because it's a limitation that can easily be avoided and also a rule 
which the user has to keep in mind (why should we impose this effort on 
the toolstack user if not necessary?).

Corneliu.
Jan Beulich July 4, 2016, 3:07 p.m. UTC | #15
>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote:
> On 07/04/16 17:17, Jan Beulich wrote:
>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
>>> On 07/04/16 16:11, Jan Beulich wrote:
>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack 
>>> user
>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>>> monitored guest to survive.
>>>>>
>>>>> Sorry but could you please rephrase this, I don't quite understand what 
>>>>> you're saying.
>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated 
>>>>> as a direct result of a toolstack user's action.
>>>>
>>>> The monitoring app can cause all kinds of problems to the guest it
>>>> monitors. Why would this specific one need taking care of in the
>>>> hypervisor, instead of demanding that the app not disable monitoring
>>>> at the wrong time?
>>>
>>> I'm not sure there's a right time here. The problem is that, if I
>>> understand this correctly, a race is possible between the moment the
>>> userspace application responds to the vm_event _and_ call
>>> xc_monitor_disable() and the time hvm_do_resume() gets called.
>> 
>> It's that _and_ in your reply that I put under question, but I admit
>> that questioning may be due to my limited (or should I say non-
>> existent) knowledge on the user space parts here: Why would the
>> app be _required_ to "responds to the vm_event _and_ call
>> xc_monitor_disable()", rather than delaying the latter for long
>> enough?
> 
> It's not required to do that, it's just that we don't really know what
> "long enough means". I suppose a second should do be more than enough,
> but obviously we'd prefer a fool-proof solution with better guarantees
> and no lag - I thought that the hypervisor change is trivial enough to
> not make the tradeoff into much of an issue.

It's mostly mechanical indeed, but iirc it grows struct vcpu (or was
it struct domain?), which is never really desirable (but admittedly
often unavoidable). I'm therefore simply trying to understand what
alternatives there are.

Jan
Jan Beulich July 4, 2016, 3:09 p.m. UTC | #16
>>> On 04.07.16 at 16:31, <czuzu@bitdefender.com> wrote:
> Hmm, I meant "<= 80" the first time too, I was under the impression that 
> that's what CODING_STYLE asks for too, but, quoting:
>      'Lines should be less than 80 characters in length.'
> Why not <= 80? Shouldn't the limit (i.e. "<=") be 80 columns, including 
> the 80-th? (this points me otherwise 
> http://programmers.stackexchange.com/questions/148677/why-is-80-characters-t 
> he-standard-limit-for-code-width 
> )

I have no idea what (editing) tools have limitation at precisely which
line length. Hence I can't answer the "why". I can only refer to the
documented requirement, which aiui is pretty clear.

Jan
Corneliu ZUZU July 4, 2016, 3:21 p.m. UTC | #17
On 7/4/2016 6:07 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote:
>> On 07/04/16 17:17, Jan Beulich wrote:
>>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
>>>> On 07/04/16 16:11, Jan Beulich wrote:
>>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack
>>>> user
>>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>>>> monitored guest to survive.
>>>>>> Sorry but could you please rephrase this, I don't quite understand what
>>>>>> you're saying.
>>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>>>>> as a direct result of a toolstack user's action.
>>>>> The monitoring app can cause all kinds of problems to the guest it
>>>>> monitors. Why would this specific one need taking care of in the
>>>>> hypervisor, instead of demanding that the app not disable monitoring
>>>>> at the wrong time?
>>>> I'm not sure there's a right time here. The problem is that, if I
>>>> understand this correctly, a race is possible between the moment the
>>>> userspace application responds to the vm_event _and_ call
>>>> xc_monitor_disable() and the time hvm_do_resume() gets called.
>>> It's that _and_ in your reply that I put under question, but I admit
>>> that questioning may be due to my limited (or should I say non-
>>> existent) knowledge on the user space parts here: Why would the
>>> app be _required_ to "responds to the vm_event _and_ call
>>> xc_monitor_disable()", rather than delaying the latter for long
>>> enough?
>> It's not required to do that, it's just that we don't really know what
>> "long enough means". I suppose a second should do be more than enough,
>> but obviously we'd prefer a fool-proof solution with better guarantees
>> and no lag - I thought that the hypervisor change is trivial enough to
>> not make the tradeoff into much of an issue.
> It's mostly mechanical indeed, but iirc it grows struct vcpu (or was
> it struct domain?), which is never really desirable (but admittedly
> often unavoidable). I'm therefore simply trying to understand what
> alternatives there are.
>
> Jan

The change adds 20 bytes to the structure, that's less than 3 
uint64_t's, not that much if you ask me.
But given the discussion we're having over patch 4/8, monitor_write_data 
would indeed get larger if I revert that, so what I propose instead is 
going for something in-between what was before and what I did after, 
i.e. don't dynamically allocate arch_vm_event entirely, but don't 
statically allocate monitor_write_data either. I propose turning 
arch_vm_event into:

struct arch_vm_event {
     uint32_t emulate_flags;
     struct vm_event_emul_read_data *emul_read_data;
     struct monitor_write_data *write_data;
};

This way we can still selectively invalidate write_data from arch_vm_event.
Sounds good?

Corneliu.
Corneliu ZUZU July 4, 2016, 3:24 p.m. UTC | #18
On 7/4/2016 6:09 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 16:31, <czuzu@bitdefender.com> wrote:
>> Hmm, I meant "<= 80" the first time too, I was under the impression that
>> that's what CODING_STYLE asks for too, but, quoting:
>>       'Lines should be less than 80 characters in length.'
>> Why not <= 80? Shouldn't the limit (i.e. "<=") be 80 columns, including
>> the 80-th? (this points me otherwise
>> http://programmers.stackexchange.com/questions/148677/why-is-80-characters-t
>> he-standard-limit-for-code-width
>> )
> I have no idea what (editing) tools have limitation at precisely which
> line length. Hence I can't answer the "why". I can only refer to the
> documented requirement, which aiui is pretty clear.
>
> Jan

Without understanding the "why" we can't be sure we're doing the right 
thing.
I was hoping someone else would provide feedback on this, as I'm almost 
sure who wrote CODING_STYLE actually meant "less than or equal" instead 
of "less than".

Corneliu.
Jan Beulich July 4, 2016, 3:57 p.m. UTC | #19
>>> On 04.07.16 at 17:21, <czuzu@bitdefender.com> wrote:
> On 7/4/2016 6:07 PM, Jan Beulich wrote:
>>>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote:
>>> On 07/04/16 17:17, Jan Beulich wrote:
>>>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
>>>>> On 07/04/16 16:11, Jan Beulich wrote:
>>>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack
>>>>> user
>>>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>>>>> monitored guest to survive.
>>>>>>> Sorry but could you please rephrase this, I don't quite understand what
>>>>>>> you're saying.
>>>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>>>>>> as a direct result of a toolstack user's action.
>>>>>> The monitoring app can cause all kinds of problems to the guest it
>>>>>> monitors. Why would this specific one need taking care of in the
>>>>>> hypervisor, instead of demanding that the app not disable monitoring
>>>>>> at the wrong time?
>>>>> I'm not sure there's a right time here. The problem is that, if I
>>>>> understand this correctly, a race is possible between the moment the
>>>>> userspace application responds to the vm_event _and_ call
>>>>> xc_monitor_disable() and the time hvm_do_resume() gets called.
>>>> It's that _and_ in your reply that I put under question, but I admit
>>>> that questioning may be due to my limited (or should I say non-
>>>> existent) knowledge on the user space parts here: Why would the
>>>> app be _required_ to "responds to the vm_event _and_ call
>>>> xc_monitor_disable()", rather than delaying the latter for long
>>>> enough?
>>> It's not required to do that, it's just that we don't really know what
>>> "long enough means". I suppose a second should do be more than enough,
>>> but obviously we'd prefer a fool-proof solution with better guarantees
>>> and no lag - I thought that the hypervisor change is trivial enough to
>>> not make the tradeoff into much of an issue.
>> It's mostly mechanical indeed, but iirc it grows struct vcpu (or was
>> it struct domain?), which is never really desirable (but admittedly
>> often unavoidable). I'm therefore simply trying to understand what
>> alternatives there are.
> 
> The change adds 20 bytes to the structure, that's less than 3 
> uint64_t's, not that much if you ask me.
> But given the discussion we're having over patch 4/8, monitor_write_data 
> would indeed get larger if I revert that, so what I propose instead is 
> going for something in-between what was before and what I did after, 
> i.e. don't dynamically allocate arch_vm_event entirely, but don't 
> statically allocate monitor_write_data either. I propose turning 
> arch_vm_event into:
> 
> struct arch_vm_event {
>      uint32_t emulate_flags;
>      struct vm_event_emul_read_data *emul_read_data;
>      struct monitor_write_data *write_data;
> };
> 
> This way we can still selectively invalidate write_data from arch_vm_event.
> Sounds good?

Well, ideally I'd like there to remain a single pointer in struct vcpu.
What this points to and when that backing memory gets allocated
would then of less interest (i.e. perhaps you could allocate the
part that needs to remain active after disabling monitoring
independent from what can go away at that point). Systems
without any monitoring in use would then still pay no larger
penalty than they do today.

Jan
Corneliu ZUZU July 4, 2016, 4:09 p.m. UTC | #20
On 7/4/2016 6:57 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 17:21, <czuzu@bitdefender.com> wrote:
>> On 7/4/2016 6:07 PM, Jan Beulich wrote:
>>>>>> On 04.07.16 at 16:21, <rcojocaru@bitdefender.com> wrote:
>>>> On 07/04/16 17:17, Jan Beulich wrote:
>>>>>>>> On 04.07.16 at 15:50, <rcojocaru@bitdefender.com> wrote:
>>>>>> On 07/04/16 16:11, Jan Beulich wrote:
>>>>>>>>>> On 04.07.16 at 15:03, <czuzu@bitdefender.com> wrote:
>>>>>>>> On 7/4/2016 3:47 PM, Jan Beulich wrote:
>>>>>>>>>>>> On 30.06.16 at 20:45, <czuzu@bitdefender.com> wrote:
>>>>>>>>>> The arch_vm_event structure is dynamically allocated and freed @
>>>>>>>>>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack
>>>>>> user
>>>>>>>>>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>>>>>>>>>> discards any information that was in arch_vm_event.write_data.
>>>>>>>>> Isn't that rather a toolstack user bug, not warranting a relatively
>>>>>>>>> extensive (even if mostly mechanical) hypervisor change like this
>>>>>>>>> one? Sane monitor behavior, after all, is required anyway for the
>>>>>>>>> monitored guest to survive.
>>>>>>>> Sorry but could you please rephrase this, I don't quite understand what
>>>>>>>> you're saying.
>>>>>>>> The write_data field in arch_vm_event should _not ever_ be invalidated
>>>>>>>> as a direct result of a toolstack user's action.
>>>>>>> The monitoring app can cause all kinds of problems to the guest it
>>>>>>> monitors. Why would this specific one need taking care of in the
>>>>>>> hypervisor, instead of demanding that the app not disable monitoring
>>>>>>> at the wrong time?
>>>>>> I'm not sure there's a right time here. The problem is that, if I
>>>>>> understand this correctly, a race is possible between the moment the
>>>>>> userspace application responds to the vm_event _and_ call
>>>>>> xc_monitor_disable() and the time hvm_do_resume() gets called.
>>>>> It's that _and_ in your reply that I put under question, but I admit
>>>>> that questioning may be due to my limited (or should I say non-
>>>>> existent) knowledge on the user space parts here: Why would the
>>>>> app be _required_ to "responds to the vm_event _and_ call
>>>>> xc_monitor_disable()", rather than delaying the latter for long
>>>>> enough?
>>>> It's not required to do that, it's just that we don't really know what
>>>> "long enough means". I suppose a second should do be more than enough,
>>>> but obviously we'd prefer a fool-proof solution with better guarantees
>>>> and no lag - I thought that the hypervisor change is trivial enough to
>>>> not make the tradeoff into much of an issue.
>>> It's mostly mechanical indeed, but iirc it grows struct vcpu (or was
>>> it struct domain?), which is never really desirable (but admittedly
>>> often unavoidable). I'm therefore simply trying to understand what
>>> alternatives there are.
>> The change adds 20 bytes to the structure, that's less than 3
>> uint64_t's, not that much if you ask me.
>> But given the discussion we're having over patch 4/8, monitor_write_data
>> would indeed get larger if I revert that, so what I propose instead is
>> going for something in-between what was before and what I did after,
>> i.e. don't dynamically allocate arch_vm_event entirely, but don't
>> statically allocate monitor_write_data either. I propose turning
>> arch_vm_event into:
>>
>> struct arch_vm_event {
>>       uint32_t emulate_flags;
>>       struct vm_event_emul_read_data *emul_read_data;
>>       struct monitor_write_data *write_data;
>> };
>>
>> This way we can still selectively invalidate write_data from arch_vm_event.
>> Sounds good?
> Well, ideally I'd like there to remain a single pointer in struct vcpu.
> What this points to and when that backing memory gets allocated
> would then of less interest (i.e. perhaps you could allocate the
> part that needs to remain active after disabling monitoring
> independent from what can go away at that point). Systems
> without any monitoring in use would then still pay no larger
> penalty than they do today.
>
> Jan

Now that I think about it, that's feasible too. So then, make 
arch_vm_event be dynamically allocated as it was, but slightly change 
its definition to:

struct arch_vm_event {
      uint32_t emulate_flags;
      struct vm_event_emul_read_data *emul_read_data;
      struct monitor_write_data *write_data;
};

, allocate it when it was previously allocated along emul_read_data and 
write_data but don't ever deallocate it entirely, instead only 
deallocate  and reallocate selectively (emul_read_data) as needed, correct?
This way indeed there's no memory overhead in arch_vcpu compared to how 
it was before.

Corneliu.
Jan Beulich July 4, 2016, 4:16 p.m. UTC | #21
>>> On 04.07.16 at 18:09, <czuzu@bitdefender.com> wrote:
> Now that I think about it, that's feasible too. So then, make 
> arch_vm_event be dynamically allocated as it was, but slightly change 
> its definition to:
> 
> struct arch_vm_event {
>       uint32_t emulate_flags;
>       struct vm_event_emul_read_data *emul_read_data;
>       struct monitor_write_data *write_data;
> };
> 
> , allocate it when it was previously allocated along emul_read_data and 
> write_data but don't ever deallocate it entirely, instead only 
> deallocate  and reallocate selectively (emul_read_data) as needed, correct?

Yes, if that also fits you.

Jan
Corneliu ZUZU July 4, 2016, 4:26 p.m. UTC | #22
On 7/4/2016 7:16 PM, Jan Beulich wrote:
>>>> On 04.07.16 at 18:09, <czuzu@bitdefender.com> wrote:
>> Now that I think about it, that's feasible too. So then, make
>> arch_vm_event be dynamically allocated as it was, but slightly change
>> its definition to:
>>
>> struct arch_vm_event {
>>        uint32_t emulate_flags;
>>        struct vm_event_emul_read_data *emul_read_data;
>>        struct monitor_write_data *write_data;
>> };
>>
>> , allocate it when it was previously allocated along emul_read_data and
>> write_data but don't ever deallocate it entirely, instead only
>> deallocate  and reallocate selectively (emul_read_data) as needed, correct?
> Yes, if that also fits you.
>
> Jan

Heh, within the above scheme I only now realized it would make no sense 
to have write_data dynamically allocated.
With that settled, thanks for the great feedback.

Corneliu.
diff mbox

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index bb59247..06e68ae 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -492,8 +492,9 @@  int vcpu_initialise(struct vcpu *v)
 
 void vcpu_destroy(struct vcpu *v)
 {
-    xfree(v->arch.vm_event);
-    v->arch.vm_event = NULL;
+    v->arch.vm_event.emulate_flags = 0;
+    xfree(v->arch.vm_event.emul_read_data);
+    v->arch.vm_event.emul_read_data = NULL;
 
     if ( is_pv_32bit_vcpu(v) )
     {
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 855af4d..68f5515 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -73,12 +73,12 @@  static int set_context_data(void *buffer, unsigned int size)
 {
     struct vcpu *curr = current;
 
-    if ( curr->arch.vm_event )
+    if ( curr->arch.vm_event.emul_read_data )
     {
         unsigned int safe_size =
-            min(size, curr->arch.vm_event->emul_read_data.size);
+            min(size, curr->arch.vm_event.emul_read_data->size);
 
-        memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size);
+        memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size);
         memset(buffer + safe_size, 0, size - safe_size);
         return X86EMUL_OKAY;
     }
@@ -523,7 +523,7 @@  static int hvmemul_virtual_to_linear(
      * vm_event being triggered for repeated writes to a whole page.
      */
     if ( unlikely(current->domain->arch.mem_access_emulate_each_rep) &&
-         current->arch.vm_event->emulate_flags != 0 )
+         current->arch.vm_event.emulate_flags != 0 )
        max_reps = 1;
 
     /*
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 884ae40..03dffb8 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -473,24 +473,24 @@  void hvm_do_resume(struct vcpu *v)
     if ( !handle_hvm_io_completion(v) )
         return;
 
-    if ( unlikely(v->arch.vm_event) )
+    if ( unlikely(v->arch.vm_event.emulate_flags) )
     {
-        if ( v->arch.vm_event->emulate_flags )
-        {
-            enum emul_kind kind = EMUL_KIND_NORMAL;
+        enum emul_kind kind;
 
-            if ( v->arch.vm_event->emulate_flags &
-                 VM_EVENT_FLAG_SET_EMUL_READ_DATA )
-                kind = EMUL_KIND_SET_CONTEXT;
-            else if ( v->arch.vm_event->emulate_flags &
-                      VM_EVENT_FLAG_EMULATE_NOWRITE )
-                kind = EMUL_KIND_NOWRITE;
+        ASSERT(v->arch.vm_event.emul_read_data);
 
-            hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
-                                       HVM_DELIVER_NO_ERROR_CODE);
+        kind = EMUL_KIND_NORMAL;
 
-            v->arch.vm_event->emulate_flags = 0;
-        }
+        if ( v->arch.vm_event.emulate_flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA )
+            kind = EMUL_KIND_SET_CONTEXT;
+        else if ( v->arch.vm_event.emulate_flags &
+                  VM_EVENT_FLAG_EMULATE_NOWRITE )
+            kind = EMUL_KIND_NOWRITE;
+
+        hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
+                                   HVM_DELIVER_NO_ERROR_CODE);
+
+        v->arch.vm_event.emulate_flags = 0;
     }
 
     arch_monitor_write_data(v);
@@ -2178,17 +2178,15 @@  int hvm_set_cr0(unsigned long value, bool_t may_defer)
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
     {
-        ASSERT(v->arch.vm_event);
-
         if ( hvm_monitor_crX(CR0, value, old_value) )
         {
             /*
              * The actual write will occur in arch_monitor_write_data(), if
              * permitted.
              */
-            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-            v->arch.vm_event->write_data.status = MWS_CR0;
-            v->arch.vm_event->write_data.value = value;
+            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+            v->arch.vm_event.write_data.status = MWS_CR0;
+            v->arch.vm_event.write_data.value = value;
 
             return X86EMUL_OKAY;
         }
@@ -2284,17 +2282,15 @@  int hvm_set_cr3(unsigned long value, bool_t may_defer)
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
     {
-        ASSERT(v->arch.vm_event);
-
         if ( hvm_monitor_crX(CR3, value, old) )
         {
             /*
              * The actual write will occur in arch_monitor_write_data(), if
              * permitted.
              */
-            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-            v->arch.vm_event->write_data.status = MWS_CR3;
-            v->arch.vm_event->write_data.value = value;
+            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+            v->arch.vm_event.write_data.status = MWS_CR3;
+            v->arch.vm_event.write_data.value = value;
 
             return X86EMUL_OKAY;
         }
@@ -2368,17 +2364,15 @@  int hvm_set_cr4(unsigned long value, bool_t may_defer)
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
     {
-        ASSERT(v->arch.vm_event);
-
         if ( hvm_monitor_crX(CR4, value, old_cr) )
         {
             /*
              * The actual write will occur in arch_monitor_write_data(), if
              * permitted.
              */
-            ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-            v->arch.vm_event->write_data.status = MWS_CR4;
-            v->arch.vm_event->write_data.value = value;
+            ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+            v->arch.vm_event.write_data.status = MWS_CR4;
+            v->arch.vm_event.write_data.value = value;
 
             return X86EMUL_OKAY;
         }
@@ -3753,16 +3747,14 @@  int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
 
     if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
     {
-        ASSERT(v->arch.vm_event);
-
         /*
          * The actual write will occur in arch_monitor_write_data(), if
          * permitted.
          */
-        ASSERT(MWS_NOWRITE == v->arch.vm_event->write_data.status);
-        v->arch.vm_event->write_data.status = MWS_MSR;
-        v->arch.vm_event->write_data.msr = msr;
-        v->arch.vm_event->write_data.value = msr_content;
+        ASSERT(MWS_NOWRITE == v->arch.vm_event.write_data.status);
+        v->arch.vm_event.write_data.status = MWS_MSR;
+        v->arch.vm_event.write_data.msr = msr;
+        v->arch.vm_event.write_data.value = msr_content;
 
         hvm_monitor_msr(msr, msr_content);
         return X86EMUL_OKAY;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 16733a4..9bcaa8a 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1639,10 +1639,10 @@  void p2m_mem_access_emulate_check(struct vcpu *v,
             }
         }
 
-        v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
+        v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
 
         if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
-            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
+            *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data;
     }
 }
 
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index 5c8d4da..88d14ae 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -46,12 +46,7 @@  void arch_monitor_cleanup_domain(struct domain *d)
 
 void arch_monitor_write_data(struct vcpu *v)
 {
-    struct monitor_write_data *w;
-
-    if ( likely(!v->arch.vm_event) )
-        return;
-
-    w = &v->arch.vm_event->write_data;
+    struct monitor_write_data *w = &v->arch.vm_event.write_data;
 
     if ( likely(MWS_NOWRITE == w->status) )
         return;
diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
index 825da48..f21ff10 100644
--- a/xen/arch/x86/vm_event.c
+++ b/xen/arch/x86/vm_event.c
@@ -30,12 +30,13 @@  int vm_event_init_domain(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        if ( v->arch.vm_event )
+        if ( v->arch.vm_event.emul_read_data )
             continue;
 
-        v->arch.vm_event = xzalloc(struct arch_vm_event);
+        v->arch.vm_event.emul_read_data =
+                xzalloc(struct vm_event_emul_read_data);
 
-        if ( !v->arch.vm_event )
+        if ( !v->arch.vm_event.emul_read_data )
             return -ENOMEM;
     }
 
@@ -52,8 +53,9 @@  void vm_event_cleanup_domain(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        xfree(v->arch.vm_event);
-        v->arch.vm_event = NULL;
+        v->arch.vm_event.emulate_flags = 0;
+        xfree(v->arch.vm_event.emul_read_data);
+        v->arch.vm_event.emul_read_data = NULL;
     }
 
     d->arch.mem_access_emulate_each_rep = 0;
@@ -73,13 +75,11 @@  void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
 {
     if ( rsp->flags & VM_EVENT_FLAG_DENY )
     {
-        ASSERT(v->arch.vm_event);
-
         /* deny flag requires the vCPU to be paused */
         if ( !atomic_read(&v->vm_event_pause_count) )
             return;
 
-        v->arch.vm_event->write_data.status = MWS_NOWRITE;
+        v->arch.vm_event.write_data.status = MWS_NOWRITE;
     }
 }
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a22ee6b..7ea5c8f 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -259,21 +259,6 @@  struct pv_domain
     struct cpuidmasks *cpuidmasks;
 };
 
-enum monitor_write_status
-{
-    MWS_NOWRITE = 0,
-    MWS_MSR,
-    MWS_CR0,
-    MWS_CR3,
-    MWS_CR4,
-};
-
-struct monitor_write_data {
-    enum monitor_write_status status;
-    uint32_t msr;
-    uint64_t value;
-};
-
 struct arch_domain
 {
     struct page_info *perdomain_l3_pg;
@@ -496,6 +481,31 @@  typedef enum __packed {
     SMAP_CHECK_DISABLED,        /* disable the check */
 } smap_check_policy_t;
 
+enum monitor_write_status
+{
+    MWS_NOWRITE = 0,
+    MWS_MSR,
+    MWS_CR0,
+    MWS_CR3,
+    MWS_CR4,
+};
+
+struct monitor_write_data {
+    enum monitor_write_status status;
+    uint32_t msr;
+    uint64_t value;
+};
+
+/*
+ * Should we emulate the next matching instruction on VCPU resume
+ * after a vm_event?
+ */
+struct arch_vm_event {
+    uint32_t emulate_flags;
+    struct vm_event_emul_read_data *emul_read_data;
+    struct monitor_write_data write_data;
+};
+
 struct arch_vcpu
 {
     /*
@@ -569,7 +579,7 @@  struct arch_vcpu
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
-    struct arch_vm_event *vm_event;
+    struct arch_vm_event vm_event;
 };
 
 smap_check_policy_t smap_policy_change(struct vcpu *v,
diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
index 1068376..984ac4c 100644
--- a/xen/include/asm-x86/monitor.h
+++ b/xen/include/asm-x86/monitor.h
@@ -48,7 +48,8 @@  int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop)
          * Enabling mem_access_emulate_each_rep without a vm_event subscriber
          * is meaningless.
          */
-        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
+        if ( d->max_vcpus && d->vcpu[0] &&
+             d->vcpu[0]->arch.vm_event.emul_read_data )
             d->arch.mem_access_emulate_each_rep = !!mop->event;
         else
             rc = -EINVAL;
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 026f42e..c83583d 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -22,16 +22,6 @@ 
 #include <xen/sched.h>
 #include <xen/vm_event.h>
 
-/*
- * Should we emulate the next matching instruction on VCPU resume
- * after a vm_event?
- */
-struct arch_vm_event {
-    uint32_t emulate_flags;
-    struct vm_event_emul_read_data emul_read_data;
-    struct monitor_write_data write_data;
-};
-
 int vm_event_init_domain(struct domain *d);
 
 void vm_event_cleanup_domain(struct domain *d);