diff mbox

[v2,3/7] x86/vm-event/monitor: don't compromise monitor_write_data on domain cleanup

Message ID 1467728902-13725-1-git-send-email-czuzu@bitdefender.com (mailing list archive)
State New, archived
Headers show

Commit Message

Corneliu ZUZU July 5, 2016, 2:28 p.m. UTC
The arch_vm_event structure is dynamically allocated and freed @
vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
disables domain monitoring (xc_monitor_disable), which in turn effectively
discards any information that was in arch_vm_event.write_data.

But this can yield unexpected behavior since if a CR-write was awaiting to be
committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
before xc_monitor_disable is called, then the domain CR write is wrongfully
ignored, which of course, in these cases, can easily render a domain crash.

To fix the issue, this patch makes arch_vm_event.emul_read_data dynamically
allocated and only frees that in vm_event_cleanup_domain, instead of the whole
arch_vcpu.vm_event structure, which with this patch will only be freed on
vcpu/domain destroyal.

Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
Changed since v1:
  * arch_vcpu.vm_event made pointer again to avoid eating memory from arch_vcpu
    structure
---
 xen/arch/x86/domain.c          |  9 +++++++--
 xen/arch/x86/hvm/emulate.c     |  6 +++---
 xen/arch/x86/hvm/hvm.c         |  2 ++
 xen/arch/x86/mm/p2m.c          |  2 +-
 xen/arch/x86/vm_event.c        | 22 ++++++++++++++++------
 xen/common/vm_event.c          | 11 +++++++++++
 xen/include/asm-x86/monitor.h  |  4 +++-
 xen/include/asm-x86/vm_event.h |  2 +-
 8 files changed, 44 insertions(+), 14 deletions(-)

Comments

George Dunlap July 5, 2016, 2:45 p.m. UTC | #1
On 05/07/16 15:28, Corneliu ZUZU wrote:
> The arch_vm_event structure is dynamically allocated and freed @
> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
> disables domain monitoring (xc_monitor_disable), which in turn effectively
> discards any information that was in arch_vm_event.write_data.
> 
> But this can yield unexpected behavior since if a CR-write was awaiting to be
> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
> before xc_monitor_disable is called, then the domain CR write is wrongfully
> ignored, which of course, in these cases, can easily render a domain crash.
> 
> To fix the issue, this patch makes arch_vm_event.emul_read_data dynamically
> allocated and only frees that in vm_event_cleanup_domain, instead of the whole
> arch_vcpu.vm_event structure, which with this patch will only be freed on
> vcpu/domain destroyal.
> 
> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

[snip]

> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 16733a4..6616626 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1642,7 +1642,7 @@ void p2m_mem_access_emulate_check(struct vcpu *v,
>          v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
>  
>          if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
> -            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
> +            *v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
>      }
>  }
>  

On the basis of Razvan's ack:

Acked-by: George Dunlap <george.dunlap@citrix.com>

> diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
> index 80f84d6..0e25e4d 100644
> --- a/xen/arch/x86/vm_event.c
> +++ b/xen/arch/x86/vm_event.c
> @@ -30,12 +30,18 @@ int vm_event_init_domain(struct domain *d)
>  
>      for_each_vcpu ( d, v )
>      {
> -        if ( v->arch.vm_event )
> +        if ( likely(!v->arch.vm_event) )
> +        {
> +            v->arch.vm_event = xzalloc(struct arch_vm_event);
> +            if ( !v->arch.vm_event )
> +                return -ENOMEM;
> +        }
> +        else if ( unlikely(v->arch.vm_event->emul_read_data) )
>              continue;
>  
> -        v->arch.vm_event = xzalloc(struct arch_vm_event);
> -
> -        if ( !v->arch.vm_event )
> +        v->arch.vm_event->emul_read_data =
> +                xzalloc(struct vm_event_emul_read_data);
> +        if ( !v->arch.vm_event->emul_read_data )
>              return -ENOMEM;
>      }
>  
> @@ -52,8 +58,12 @@ void vm_event_cleanup_domain(struct domain *d)
>  
>      for_each_vcpu ( d, v )
>      {
> -        xfree(v->arch.vm_event);
> -        v->arch.vm_event = NULL;
> +        if ( likely(!v->arch.vm_event) )
> +            continue;
> +
> +        v->arch.vm_event->emulate_flags = 0;
> +        xfree(v->arch.vm_event->emul_read_data);
> +        v->arch.vm_event->emul_read_data = NULL;
>      }
>  
>      d->arch.mem_access_emulate_each_rep = 0;
> diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
> index 17d2716..75bbbab 100644
> --- a/xen/common/vm_event.c
> +++ b/xen/common/vm_event.c
> @@ -534,6 +534,8 @@ static void mem_sharing_notification(struct vcpu *v, unsigned int port)
>  /* Clean up on domain destruction */
>  void vm_event_cleanup(struct domain *d)
>  {
> +    struct vcpu *v;
> +
>  #ifdef CONFIG_HAS_MEM_PAGING
>      if ( d->vm_event->paging.ring_page )
>      {
> @@ -560,6 +562,15 @@ void vm_event_cleanup(struct domain *d)
>          (void)vm_event_disable(d, &d->vm_event->share);
>      }
>  #endif
> +
> +    for_each_vcpu ( d, v )
> +    {
> +        if ( unlikely(v->arch.vm_event) )
> +        {
> +            xfree(v->arch.vm_event);
> +            v->arch.vm_event = NULL;
> +        }
> +    }
>  }
>  
>  int vm_event_domctl(struct domain *d, xen_domctl_vm_event_op_t *vec,
> diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
> index 0611681..a094c0d 100644
> --- a/xen/include/asm-x86/monitor.h
> +++ b/xen/include/asm-x86/monitor.h
> @@ -26,6 +26,7 @@
>  #include <public/domctl.h>
>  #include <asm/cpufeature.h>
>  #include <asm/hvm/hvm.h>
> +#include <asm/vm_event.h>
>  
>  #define monitor_ctrlreg_bitmask(ctrlreg_index) (1U << (ctrlreg_index))
>  
> @@ -48,7 +49,8 @@ int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop)
>           * Enabling mem_access_emulate_each_rep without a vm_event subscriber
>           * is meaningless.
>           */
> -        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
> +        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event &&
> +             d->vcpu[0]->arch.vm_event->emul_read_data )
>              d->arch.mem_access_emulate_each_rep = !!mop->event;
>          else
>              rc = -EINVAL;
> diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
> index 026f42e..557f353 100644
> --- a/xen/include/asm-x86/vm_event.h
> +++ b/xen/include/asm-x86/vm_event.h
> @@ -28,7 +28,7 @@
>   */
>  struct arch_vm_event {
>      uint32_t emulate_flags;
> -    struct vm_event_emul_read_data emul_read_data;
> +    struct vm_event_emul_read_data *emul_read_data;
>      struct monitor_write_data write_data;
>  };
>  
>
George Dunlap July 5, 2016, 2:46 p.m. UTC | #2
On 05/07/16 15:28, Corneliu ZUZU wrote:
> The arch_vm_event structure is dynamically allocated and freed @
> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
> disables domain monitoring (xc_monitor_disable), which in turn effectively
> discards any information that was in arch_vm_event.write_data.
> 
> But this can yield unexpected behavior since if a CR-write was awaiting to be
> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
> before xc_monitor_disable is called, then the domain CR write is wrongfully
> ignored, which of course, in these cases, can easily render a domain crash.
> 
> To fix the issue, this patch makes arch_vm_event.emul_read_data dynamically
> allocated and only frees that in vm_event_cleanup_domain, instead of the whole
> arch_vcpu.vm_event structure, which with this patch will only be freed on
> vcpu/domain destroyal.
> 
> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

On the basis of one of the vm_event maintainer's acks...

> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 16733a4..6616626 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1642,7 +1642,7 @@ void p2m_mem_access_emulate_check(struct vcpu *v,
>          v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
>  
>          if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
> -            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
> +            *v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
>      }
>  }
>  

p2m bits:

Acked-by: George Dunlap <george.dunlap@citrix.com>
Razvan Cojocaru July 5, 2016, 5:16 p.m. UTC | #3
On 07/05/16 17:28, Corneliu ZUZU wrote:
> The arch_vm_event structure is dynamically allocated and freed @
> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
> disables domain monitoring (xc_monitor_disable), which in turn effectively
> discards any information that was in arch_vm_event.write_data.
> 
> But this can yield unexpected behavior since if a CR-write was awaiting to be
> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
> before xc_monitor_disable is called, then the domain CR write is wrongfully
> ignored, which of course, in these cases, can easily render a domain crash.
> 
> To fix the issue, this patch makes arch_vm_event.emul_read_data dynamically
> allocated and only frees that in vm_event_cleanup_domain, instead of the whole
> arch_vcpu.vm_event structure, which with this patch will only be freed on
> vcpu/domain destroyal.
> 
> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
> ---
> Changed since v1:
>   * arch_vcpu.vm_event made pointer again to avoid eating memory from arch_vcpu
>     structure

I believe that all acks should be presumed lost on non-trivial changes
in a new version (which I believe this qualifies as being, with all the
new logic of allocating / deallocating only part of vm_event).

Unfortunately I'm out of office until early next week so I can't
properly test / thoroughly parse this until then, but we should be extra
careful that there are several places in the code where it is assumed
that v->arch.vm_event != NULL is the same thing as monitoring being
enabled. I'm not saying that they're not treated in this patch (the
proper change has certainly been made in emulate.c), I'm just saying
that we should be careful that they are.

Having said that, I propose a special macro to make this all clearer,
something like:

#define is_monitor_enabled_for_vcpu(v) \
    ( v->arch.vm_event && v->arch.vm_event->emul_read_data )

or equivalent inline functions returning a bool_t. Just a thought.


Thanks,
Razvan
Corneliu ZUZU July 6, 2016, 7:01 a.m. UTC | #4
On 7/5/2016 8:16 PM, Razvan Cojocaru wrote:
> On 07/05/16 17:28, Corneliu ZUZU wrote:
>> The arch_vm_event structure is dynamically allocated and freed @
>> vm_event_cleanup_domain. This cleanup is triggered e.g. when the toolstack user
>> disables domain monitoring (xc_monitor_disable), which in turn effectively
>> discards any information that was in arch_vm_event.write_data.
>>
>> But this can yield unexpected behavior since if a CR-write was awaiting to be
>> committed on the scheduling tail (hvm_do_resume->arch_monitor_write_data)
>> before xc_monitor_disable is called, then the domain CR write is wrongfully
>> ignored, which of course, in these cases, can easily render a domain crash.
>>
>> To fix the issue, this patch makes arch_vm_event.emul_read_data dynamically
>> allocated and only frees that in vm_event_cleanup_domain, instead of the whole
>> arch_vcpu.vm_event structure, which with this patch will only be freed on
>> vcpu/domain destroyal.
>>
>> Signed-off-by: Corneliu ZUZU <czuzu@bitdefender.com>
>> Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
>> ---
>> Changed since v1:
>>    * arch_vcpu.vm_event made pointer again to avoid eating memory from arch_vcpu
>>      structure
> I believe that all acks should be presumed lost on non-trivial changes
> in a new version (which I believe this qualifies as being, with all the
> new logic of allocating / deallocating only part of vm_event).
>
> Unfortunately I'm out of office until early next week so I can't
> properly test / thoroughly parse this until then, but we should be extra
> careful that there are several places in the code where it is assumed
> that v->arch.vm_event != NULL is the same thing as monitoring being
> enabled. I'm not saying that they're not treated in this patch (the
> proper change has certainly been made in emulate.c), I'm just saying
> that we should be careful that they are.
>
> Having said that, I propose a special macro to make this all clearer,
> something like:
>
> #define is_monitor_enabled_for_vcpu(v) \
>      ( v->arch.vm_event && v->arch.vm_event->emul_read_data )
>
> or equivalent inline functions returning a bool_t. Just a thought.
>
>
> Thanks,
> Razvan

At some point I actually defined that exact macro while constructing 
this patch, but I decided to delete it afterwards because I thought the 
code would still be clear without it (i.e. only check emul_read_data 
when we actually need _that_ to be non-NULL) and because it seemed a bit 
strange, being possible to have _arch_vm_event allocated_ but having the 
monitor vm-events subsystem _uninitialized_ (because of .write_data 
being treated specially). Since the write_data field is also part of the 
monitor subsystem, conceptually one could say the monitor subsystem is 
at least _partially_ initialized when that's non-NULL, not uninitialized 
at all. I'll think of a resolution around this, but in the meantime I 
there's something more important to settle:

I only now notice, it seems to me that the ASSERT(v->arch.vm_event) @ 
hvm_set_cr0 & such (the one just before calling hvm_monitor_crX) might 
fail. That's because from what I see in the code the following can happen:
- v.arch.vm_event = NULL (vm-event _not_ initialized)
- toolstack user calls e.g. xc_moLRnitor_write_ctrlreg(xch, domid, 
VM_EVENT_X86_CR0, true, true, false) -> do_domctl(XEN_DOMCTL_monitor_op) 
-> monitor_domctl(XEN_DOMCTL_MONITOR_OP_ENABLE) -> 
arch_monitor_domctl_event(XEN_DOMCTL_MONITOR_EVENT_WRITE_CTREG) which in 
turn sets the CR0 bit of d.arch.monitor.write_ctrlreg_enabled _without 
ever checking if v.arch.vm_event is non-NULL_
- afterwards a CR0 write happens on one of the domain vCPUs and we 
arrive at that assert without having v.arch.vm_event allocated!

I can't test this @ the moment, am I missing something here? I remember 
taking a look on this code path at some point and idk how I didn't 
notice something like this, it seems obviously wrong..has something 
changed recently?
I think we need to take a second look over this code-path, I'm also 
seeing that the proper check _is done_ @ 
arch_monitor_domctl_op(XEN_DOMCTL_MONITOR_OP_EMULATE_EACH_REP) (though I 
think a more proper error return value there would be ENODEV instead of 
EINVAL).

Corneliu.
diff mbox

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index bb59247..0313208 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -56,6 +56,7 @@ 
 #include <asm/debugreg.h>
 #include <asm/msr.h>
 #include <asm/traps.h>
+#include <asm/vm_event.h>
 #include <asm/nmi.h>
 #include <asm/mce.h>
 #include <asm/amd.h>
@@ -492,8 +493,12 @@  int vcpu_initialise(struct vcpu *v)
 
 void vcpu_destroy(struct vcpu *v)
 {
-    xfree(v->arch.vm_event);
-    v->arch.vm_event = NULL;
+    if ( unlikely(v->arch.vm_event) )
+    {
+        xfree(v->arch.vm_event->emul_read_data);
+        xfree(v->arch.vm_event);
+        v->arch.vm_event = NULL;
+    }
 
     if ( is_pv_32bit_vcpu(v) )
     {
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 855af4d..3880df1 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -73,12 +73,12 @@  static int set_context_data(void *buffer, unsigned int size)
 {
     struct vcpu *curr = current;
 
-    if ( curr->arch.vm_event )
+    if ( curr->arch.vm_event && curr->arch.vm_event->emul_read_data )
     {
         unsigned int safe_size =
-            min(size, curr->arch.vm_event->emul_read_data.size);
+            min(size, curr->arch.vm_event->emul_read_data->size);
 
-        memcpy(buffer, curr->arch.vm_event->emul_read_data.data, safe_size);
+        memcpy(buffer, curr->arch.vm_event->emul_read_data->data, safe_size);
         memset(buffer + safe_size, 0, size - safe_size);
         return X86EMUL_OKAY;
     }
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index e3829d2..ac6d9eb 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -479,6 +479,8 @@  void hvm_do_resume(struct vcpu *v)
         {
             enum emul_kind kind = EMUL_KIND_NORMAL;
 
+            ASSERT(v->arch.vm_event->emul_read_data);
+
             if ( v->arch.vm_event->emulate_flags &
                  VM_EVENT_FLAG_SET_EMUL_READ_DATA )
                 kind = EMUL_KIND_SET_CONTEXT;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 16733a4..6616626 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1642,7 +1642,7 @@  void p2m_mem_access_emulate_check(struct vcpu *v,
         v->arch.vm_event->emulate_flags = violation ? rsp->flags : 0;
 
         if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) )
-            v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
+            *v->arch.vm_event->emul_read_data = rsp->data.emul_read_data;
     }
 }
 
diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
index 80f84d6..0e25e4d 100644
--- a/xen/arch/x86/vm_event.c
+++ b/xen/arch/x86/vm_event.c
@@ -30,12 +30,18 @@  int vm_event_init_domain(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        if ( v->arch.vm_event )
+        if ( likely(!v->arch.vm_event) )
+        {
+            v->arch.vm_event = xzalloc(struct arch_vm_event);
+            if ( !v->arch.vm_event )
+                return -ENOMEM;
+        }
+        else if ( unlikely(v->arch.vm_event->emul_read_data) )
             continue;
 
-        v->arch.vm_event = xzalloc(struct arch_vm_event);
-
-        if ( !v->arch.vm_event )
+        v->arch.vm_event->emul_read_data =
+                xzalloc(struct vm_event_emul_read_data);
+        if ( !v->arch.vm_event->emul_read_data )
             return -ENOMEM;
     }
 
@@ -52,8 +58,12 @@  void vm_event_cleanup_domain(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        xfree(v->arch.vm_event);
-        v->arch.vm_event = NULL;
+        if ( likely(!v->arch.vm_event) )
+            continue;
+
+        v->arch.vm_event->emulate_flags = 0;
+        xfree(v->arch.vm_event->emul_read_data);
+        v->arch.vm_event->emul_read_data = NULL;
     }
 
     d->arch.mem_access_emulate_each_rep = 0;
diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
index 17d2716..75bbbab 100644
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -534,6 +534,8 @@  static void mem_sharing_notification(struct vcpu *v, unsigned int port)
 /* Clean up on domain destruction */
 void vm_event_cleanup(struct domain *d)
 {
+    struct vcpu *v;
+
 #ifdef CONFIG_HAS_MEM_PAGING
     if ( d->vm_event->paging.ring_page )
     {
@@ -560,6 +562,15 @@  void vm_event_cleanup(struct domain *d)
         (void)vm_event_disable(d, &d->vm_event->share);
     }
 #endif
+
+    for_each_vcpu ( d, v )
+    {
+        if ( unlikely(v->arch.vm_event) )
+        {
+            xfree(v->arch.vm_event);
+            v->arch.vm_event = NULL;
+        }
+    }
 }
 
 int vm_event_domctl(struct domain *d, xen_domctl_vm_event_op_t *vec,
diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
index 0611681..a094c0d 100644
--- a/xen/include/asm-x86/monitor.h
+++ b/xen/include/asm-x86/monitor.h
@@ -26,6 +26,7 @@ 
 #include <public/domctl.h>
 #include <asm/cpufeature.h>
 #include <asm/hvm/hvm.h>
+#include <asm/vm_event.h>
 
 #define monitor_ctrlreg_bitmask(ctrlreg_index) (1U << (ctrlreg_index))
 
@@ -48,7 +49,8 @@  int arch_monitor_domctl_op(struct domain *d, struct xen_domctl_monitor_op *mop)
          * Enabling mem_access_emulate_each_rep without a vm_event subscriber
          * is meaningless.
          */
-        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event )
+        if ( d->max_vcpus && d->vcpu[0] && d->vcpu[0]->arch.vm_event &&
+             d->vcpu[0]->arch.vm_event->emul_read_data )
             d->arch.mem_access_emulate_each_rep = !!mop->event;
         else
             rc = -EINVAL;
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 026f42e..557f353 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -28,7 +28,7 @@ 
  */
 struct arch_vm_event {
     uint32_t emulate_flags;
-    struct vm_event_emul_read_data emul_read_data;
+    struct vm_event_emul_read_data *emul_read_data;
     struct monitor_write_data write_data;
 };