
[1/2] VMX: fix VMCS race on context-switch paths

Message ID 1487154433.3588.1.camel@citrix.com (mailing list archive)
State New, archived

Commit Message

Sergey Dyasli Feb. 15, 2017, 10:27 a.m. UTC
On Tue, 2017-02-14 at 03:28 -0700, Jan Beulich wrote:
> When __context_switch() is being bypassed during original context
> switch handling, the vCPU "owning" the VMCS partially loses control of
> it: It will appear non-running to remote CPUs, and hence their attempt
> to pause the owning vCPU will have no effect on it (as it already
> looks to be paused). At the same time the "owning" CPU will re-enable
> interrupts eventually (at the latest when entering the idle loop) and
> hence becomes subject to IPIs from other CPUs requesting access to the
> VMCS. As a result, when __context_switch() finally gets run, the CPU
> may no longer have the VMCS loaded, and hence any accesses to it would
> fail. Hence we may need to re-load the VMCS in vmx_ctxt_switch_from().
> 
> Similarly, when __context_switch() is being bypassed also on the second
> (switch-in) path, VMCS ownership may have been lost and hence needs
> re-establishing. Since there's no existing hook to put this in, add a
> new one.
> 
> Reported-by: Kevin Mayer <Kevin.Mayer@gdata.de>
> Reported-by: Anshul Makkar <anshul.makkar@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2098,11 +2098,14 @@ void context_switch(struct vcpu *prev, s
>  
>      set_current(next);
>  
> -    if ( (per_cpu(curr_vcpu, cpu) == next) ||
> -         (is_idle_domain(nextd) && cpu_online(cpu)) )
> +    if ( (per_cpu(curr_vcpu, cpu) == next) )
>      {
> +        if ( next->arch.ctxt_switch_same )
> +            next->arch.ctxt_switch_same(next);
>          local_irq_enable();
>      }
> +    else if ( is_idle_domain(nextd) && cpu_online(cpu) )
> +        local_irq_enable();
>      else
>      {
>          __context_switch();
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -552,6 +552,27 @@ static void vmx_load_vmcs(struct vcpu *v
>      local_irq_restore(flags);
>  }
>  
> +void vmx_vmcs_reload(struct vcpu *v)
> +{
> +    /*
> +     * As we're running with interrupts disabled, we can't acquire
> +     * v->arch.hvm_vmx.vmcs_lock here. However, with interrupts disabled
> +     * the VMCS can't be taken away from us anymore if we still own it.
> +     */
> +    ASSERT(!local_irq_is_enabled());
> +    if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
> +        return;
> +    ASSERT(!this_cpu(current_vmcs));
> +
> +    /*
> +     * Wait for the remote side to be done with the VMCS before loading
> +     * it here.
> +     */
> +    while ( v->arch.hvm_vmx.active_cpu != -1 )
> +        cpu_relax();
> +    vmx_load_vmcs(v);
> +}
> +
>  int vmx_cpu_up_prepare(unsigned int cpu)
>  {
>      /*
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -298,6 +298,7 @@ static int vmx_vcpu_initialise(struct vc
>      v->arch.schedule_tail    = vmx_do_resume;
>      v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
>      v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
> +    v->arch.ctxt_switch_same = vmx_vmcs_reload;
>  
>      if ( (rc = vmx_create_vmcs(v)) != 0 )
>      {
> @@ -936,6 +937,18 @@ static void vmx_ctxt_switch_from(struct
>      if ( unlikely(!this_cpu(vmxon)) )
>          return;
>  
> +    if ( !v->is_running )
> +    {
> +        /*
> +         * When this vCPU isn't marked as running anymore, a remote pCPU's
> +         * attempt to pause us (from vmx_vmcs_enter()) won't have a reason
> +         * to spin in vcpu_sleep_sync(), and hence that pCPU might have taken
> +         * away the VMCS from us. As we're running with interrupts disabled,
> +         * we also can't call vmx_vmcs_enter().
> +         */
> +        vmx_vmcs_reload(v);
> +    }
> +
>      vmx_fpu_leave(v);
>      vmx_save_guest_msrs(v);
>      vmx_restore_host_msrs();
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -514,6 +514,7 @@ struct arch_vcpu
>  
>      void (*ctxt_switch_from) (struct vcpu *);
>      void (*ctxt_switch_to) (struct vcpu *);
> +    void (*ctxt_switch_same) (struct vcpu *);
>  
>      struct vpmu_struct vpmu;
>  
> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -174,6 +174,7 @@ void vmx_destroy_vmcs(struct vcpu *v);
>  void vmx_vmcs_enter(struct vcpu *v);
>  bool_t __must_check vmx_vmcs_try_enter(struct vcpu *v);
>  void vmx_vmcs_exit(struct vcpu *v);
> +void vmx_vmcs_reload(struct vcpu *v);
>  
>  #define CPU_BASED_VIRTUAL_INTR_PENDING        0x00000004
>  #define CPU_BASED_USE_TSC_OFFSETING           0x00000008
> 
> 

Using the above patch with the following change for Xen-4.6.1:

-    if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
+    if ( v->arch.hvm_vmx.vmcs == this_cpu(current_vmcs) )

This is what I'm getting during the original test case (32 VMs reboot):

(XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
(XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
(XEN) [ 1407.803774] CPU:    12
(XEN) [ 1407.806975] RIP:    e008:[<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
(XEN) [ 1407.814926] RFLAGS: 0000000000000013   CONTEXT: hypervisor (d230v0)
(XEN) [ 1407.822486] rax: 0000000000000000   rbx: ffff830079ee7000   rcx: 0000000000000000
(XEN) [ 1407.831407] rdx: 0000006f8f72ce00   rsi: ffff8329b3efbfe8   rdi: ffff830079ee7000
(XEN) [ 1407.840326] rbp: ffff83007bab7000   rsp: ffff83400fab7dc8   r8:  000001468e9e3ccc
(XEN) [ 1407.849246] r9:  ffff83403ffe7000   r10: 00000146c91c1737   r11: ffff833a9558c310
(XEN) [ 1407.858166] r12: ffff833a9558c000   r13: 000000000000000c   r14: ffff83403ffe7000
(XEN) [ 1407.867085] r15: ffff82d080364640   cr0: 0000000080050033   cr4: 00000000003526e0
(XEN) [ 1407.876005] cr3: 000000294b074000   cr2: 000007fefd7ce150
(XEN) [ 1407.882599] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
(XEN) [ 1407.890938] Xen code around <ffff82d0801ea2a2> (vmx_vmcs_reload+0x32/0x50):
(XEN) [ 1407.899277]  84 00 00 00 00 00 f3 90 <83> bf e8 05 00 00 ff 75 f5 e9 a0 fa ff ff f3 c3
(XEN) [ 1407.908679] Xen stack trace from rsp=ffff83400fab7dc8:
(XEN) [ 1407.914982]    ffff82d08016c58d 0000000000001000 0000000000000000 0000000000000000
(XEN) [ 1407.923998]    0000000000000206 0000000000000086 0000000000000286 000000000000000c
(XEN) [ 1407.933017]    ffff83007bab7058 ffff82d080364640 ffff83007bab7000 00000146a7f26495
(XEN) [ 1407.942032]    ffff830079ee7000 ffff833a9558cf84 ffff833a9558c000 ffff82d080364640
(XEN) [ 1407.951048]    ffff82d08012fb8e ffff83400fabda98 ffff83400faba148 ffff83403ffe7000
(XEN) [ 1407.960067]    ffff83400faba160 ffff83400fabda40 ffff82d080164305 000000000000000c
(XEN) [ 1407.969083]    ffff830079ee7000 0000000001c9c380 ffff82d080136400 000000440000011d
(XEN) [ 1407.978101]    00000000ffffffff ffffffffffffffff ffff83400fab0000 ffff82d080348d00
(XEN) [ 1407.987116]    ffff833a9558c000 ffff82d080364640 ffff82d08013311c ffff830079ee7000
(XEN) [ 1407.996134]    ffff83400fab0000 ffff830079ee7000 ffff83403ffe7000 00000000ffffffff
(XEN) [ 1408.005151]    ffff82d080167d35 ffff83007bab7000 0000000000000001 fffffa80077f9700
(XEN) [ 1408.014167]    fffffa80075bf900 fffffa80077f9820 0000000000000000 0000000000000000
(XEN) [ 1408.023184]    fffffa8008889c00 0000000002fa1e78 0000003b6ed18d78 0000000000000000
(XEN) [ 1408.032202]    00000000068e7780 fffffa80075ba790 fffffa80077f9848 fffff800027f9e80
(XEN) [ 1408.041220]    0000000000000001 000000fc00000000 fffff880042499c2 0000000000000000
(XEN) [ 1408.050235]    0000000000000246 fffff80000b9cb58 0000000000000000 80248e00e008e1f0
(XEN) [ 1408.059253]    00000000ffff82d0 80248e00e008e200 00000000ffff82d0 80248e000000000c
(XEN) [ 1408.068268]    ffff830079ee7000 0000006f8f72ce00 00000000ffff82d0
(XEN) [ 1408.075638] Xen call trace:
(XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
(XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
(XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
(XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
(XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
(XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
(XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60

Currently I'm testing the following patch that fixes at least one possible scenario:

commit f76dc832c2de4950872fc27962c56c609cff1160
Author: Sergey Dyasli <sergey.dyasli@citrix.com>
Date:   Tue Feb 14 15:23:54 2017 +0000

    x86/vmx: use curr_vcpu in vmx_vmcs_exit()
    
    During context_switch() from a HVM vCPU to the idle vCPU, current is
    updated but curr_vcpu retains the HVM vCPU's value.  If, after that,
    a vmx_vmcs_enter() + vmx_vmcs_exit() pair is executed for some reason,
    the test for has_hvm_container_vcpu(current) will fail and the VMCS
    for curr_vcpu will not be loaded.
    
    This will lead to a BUG() during the next __context_switch(), when
    vmx_ctxt_switch_from() is executed and __vmwrite() fails.
    
    Fix this by testing has_hvm_container_vcpu() against curr_vcpu.
    
    Signed-off-by: Sergey Dyasli <sergey.dyasli@citrix.com>

So far no crashes observed but testing continues.
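
For reference, the core of the change being tested (the complete diff is in the
Patch section at the bottom of this page) is to load the VMCS of the pCPU's
curr_vcpu instead of current in vmx_vmcs_exit():

    unsigned int cpu = smp_processor_id();
    struct vcpu *p = per_cpu(curr_vcpu, cpu);
    ...
    /* Don't confuse vmx_do_resume (for @v or @current!) */
    vmx_clear_vmcs(v);
    if ( has_hvm_container_vcpu(p) )    /* was: has_hvm_container_vcpu(current) */
        vmx_load_vmcs(p);               /* was: vmx_load_vmcs(current) */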

-- 
Thanks,
Sergey

Comments

Jan Beulich Feb. 15, 2017, 11 a.m. UTC | #1
>>> On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> This is what I'm getting during the original test case (32 VMs reboot):
> 
> (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
> (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
> [...]
> (XEN) [ 1408.075638] Xen call trace:
> (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
> (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
> (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
> (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
> (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
> (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
> (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
> 
> Currently I'm testing the following patch that fixes at least one possible 
> scenario:
> 
> commit f76dc832c2de4950872fc27962c56c609cff1160
> Author: Sergey Dyasli <sergey.dyasli@citrix.com>
> Date:   Tue Feb 14 15:23:54 2017 +0000
> 
>     x86/vmx: use curr_vcpu in vmx_vmcs_exit()
>     
>     During context_switch() from a HVM vCPU to the idle vCPU, current is
> >     updated but curr_vcpu retains the HVM vCPU's value.  If, after that,
> >     a vmx_vmcs_enter() + vmx_vmcs_exit() pair is executed for some reason,
> >     the test for has_hvm_container_vcpu(current) will fail and the VMCS
> >     for curr_vcpu will not be loaded.

I'm not seeing the connection to the watchdog invocation above:
I take it that vmx_vmcs_reload() was caught while waiting for
active_cpu to become -1. Yet that's what vmx_clear_vmcs()
achieves, not vmx_load_vmcs().

>     This will lead to a BUG() during the next __context_switch(), when
>     vmx_ctxt_switch_from() is executed and __vmwrite() fails.

But that case is specifically (supposed to be) taken care of by
calling vmx_vmcs_reload() from vmx_ctxt_switch_from().

>     Fix this by testing has_hvm_container_vcpu() against curr_vcpu.

If at all possible, I like to avoid having to use curr_vcpu in the fixing
of this (whole) issue.

> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -742,6 +742,8 @@ void vmx_vmcs_enter(struct vcpu *v)
>  void vmx_vmcs_exit(struct vcpu *v)
>  {
>      struct foreign_vmcs *fv;
> +    unsigned int cpu = smp_processor_id();
> +    struct vcpu *p = per_cpu(curr_vcpu, cpu);
>  
>      if ( likely(v == current) )
>          return;
> @@ -754,8 +756,8 @@ void vmx_vmcs_exit(struct vcpu *v)
>      {
>          /* Don't confuse vmx_do_resume (for @v or @current!) */
>          vmx_clear_vmcs(v);
> -        if ( has_hvm_container_vcpu(current) )
> -            vmx_load_vmcs(current);
> +        if ( has_hvm_container_vcpu(p) )
> +            vmx_load_vmcs(p);

If this change (or something similar) really turned out to be
necessary, the variable declarations should be moved into this
more narrow scope (and I'd prefer the "cpu" one to be dropped
altogether).

Jan
Sergey Dyasli Feb. 15, 2017, 11:13 a.m. UTC | #2
On Wed, 2017-02-15 at 04:00 -0700, Jan Beulich wrote:
> > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> > This is what I'm getting during the original test case (32 VMs reboot):
> > 
> > (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
> > (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
> > [...]
> > (XEN) [ 1408.075638] Xen call trace:
> > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
> > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
> > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
> > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
> > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
> > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
> > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
> > 
> > Currently I'm testing the following patch that fixes at least one possible
> > scenario:
> > 
> > commit f76dc832c2de4950872fc27962c56c609cff1160
> > Author: Sergey Dyasli <sergey.dyasli@citrix.com>
> > Date:   Tue Feb 14 15:23:54 2017 +0000
> > 
> >     x86/vmx: use curr_vcpu in vmx_vmcs_exit()
> > 
> >     During context_switch() from a HVM vCPU to the idle vCPU, current is
> >     updated but curr_vcpu retains the HVM vCPU's value.  If, after that,
> >     a vmx_vmcs_enter() + vmx_vmcs_exit() pair is executed for some reason,
> >     the test for has_hvm_container_vcpu(current) will fail and the VMCS
> >     for curr_vcpu will not be loaded.

> 
> I'm not seeing the connection to the watchdog invocation above:
> I take it that vmx_vmcs_reload() was caught while waiting for
> active_cpu to become -1. Yet that's what vmx_clear_vmcs()
> achieves, not vmx_load_vmcs().


Sorry for the misunderstanding, but these 2 patches are being tested
separately, not together. My patch is an alternative approach for
the issue. It fixes at least one observed BUG() scenario with PML and
I'm currently looking for others.

> Similarly, when __context_switch() is being bypassed also on the second
> (switch-in) path, VMCS ownership may have been lost and hence needs
> re-establishing. Since there's no existing hook to put this in, add a
> new one.


BTW, what about vmx_do_resume()? It does vmx_load_vmcs() in cases
when the VMCS was cleared by someone.
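
For reference, the vmx_do_resume() check being referred to (it is also quoted
later in this thread) is:

    if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
    {
        if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
            vmx_load_vmcs(v);
    }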


-- 
Thanks,
Sergey
Jan Beulich Feb. 15, 2017, 11:24 a.m. UTC | #3
>>> On 15.02.17 at 12:13, <sergey.dyasli@citrix.com> wrote:
> On Wed, 2017-02-15 at 04:00 -0700, Jan Beulich wrote:
>> Similarly, when __context_switch() is being bypassed also on the second
>> (switch-in) path, VMCS ownership may have been lost and hence needs
>> re-establishing. Since there's no existing hook to put this in, add a
>> new one.
> 
> BTW, what about vmx_do_resume()? It does vmx_load_vmcs() in cases
> when the VMCS was cleared by someone.

Not sure I understand the question correctly - it would seem to me
that this was a half-hearted attempt to deal with the situation we
now find needs more changes, i.e. I would assume that call could
now be dropped from the function. Or if it was to stay, it should
likely become a vmx_vmcs_reload() call (with the inner conditional
dropped).

Jan
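
For illustration only (this exact change was not posted in the thread), the
suggestion above would presumably amount to something like the following in
vmx_do_resume(), with the inner current_vmcs check left to vmx_vmcs_reload()
itself:

    /* Hypothetical sketch, untested: */
    if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
        vmx_vmcs_reload(v);
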
Jan Beulich Feb. 15, 2017, 11:39 a.m. UTC | #4
>>> On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> This is what I'm getting during the original test case (32 VMs reboot):
> 
> (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
> (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
> [...]
> (XEN) [ 1408.075638] Xen call trace:
> (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
> (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
> (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
> (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
> (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
> (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
> (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60

Taking your later reply into account - were you able to figure out
what other party held onto the VMCS being waited for here?

Jan
Sergey Dyasli Feb. 15, 2017, 11:48 a.m. UTC | #5
On Wed, 2017-02-15 at 04:39 -0700, Jan Beulich wrote:
> > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> > This is what I'm getting during the original test case (32 VMs reboot):
> > 
> > (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
> > (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
> > [...]
> > (XEN) [ 1408.075638] Xen call trace:
> > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
> > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
> > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
> > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
> > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
> > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
> > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
> 
> Taking your later reply into account - were you able to figure out
> what other party held onto the VMCS being waited for here?


Unfortunately, no. It was unclear from debug logs. But judging from
the following vmx_do_resume() code:

    if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
    {
        if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
            vmx_load_vmcs(v);
    }

If both of the above conditions are true then vmx_vmcs_reload() will
probably hang.

-- 
Thanks,
Sergey
Jan Beulich Feb. 15, 2017, 11:55 a.m. UTC | #6
>>> On 15.02.17 at 12:48, <sergey.dyasli@citrix.com> wrote:
> On Wed, 2017-02-15 at 04:39 -0700, Jan Beulich wrote:
>> > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
>> > (XEN) [ 1408.075638] Xen call trace:
>> > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
>> > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
>> > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
>> > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
>> > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
>> > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
>> > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
>> 
>> Taking your later reply into account - were you able to figure out
>> what other party held onto the VMCS being waited for here?
> 
> Unfortunately, no. It was unclear from debug logs. But judging from
> the following vmx_do_resume() code:
> 
>     if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
>     {
>         if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
>             vmx_load_vmcs(v);
>     }
> 
> If both of the above conditions are true then vmx_vmcs_reload() will
> probably hang.

I don't follow (reload should run before this, not after), but I must
be missing something more general anyway, as I'm seeing the code
above being needed despite the reload additions.

Jan
Jan Beulich Feb. 15, 2017, 1:03 p.m. UTC | #7
>>> On 15.02.17 at 12:55, <JBeulich@suse.com> wrote:
>>>> On 15.02.17 at 12:48, <sergey.dyasli@citrix.com> wrote:
>> On Wed, 2017-02-15 at 04:39 -0700, Jan Beulich wrote:
>>> > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
>>> > (XEN) [ 1408.075638] Xen call trace:
>>> > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
>>> > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
>>> > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
>>> > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
>>> > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
>>> > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
>>> > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
>>> 
>>> Taking your later reply into account - were you able to figure out
>>> what other party held onto the VMCS being waited for here?
>> 
>> Unfortunately, no. It was unclear from debug logs. But judging from
>> the following vmx_do_resume() code:
>> 
>>     if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
>>     {
>>         if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
>>             vmx_load_vmcs(v);
>>     }
>> 
>> If both of the above conditions are true then vmx_vmcs_reload() will
>> probably hang.
> 
> I don't follow (reload should run before this, not after), but I must
> be missing something more general anyway, as I'm seeing the code
> above being needed despite the reload additions.

I think I've understood part of it over lunch: Surprisingly enough
vmx_ctxt_switch_to() doesn't re-establish the VMCS, so it needs
to be done here. Which I think means we don't need the new
hook at all, as that way the state is no different between going
through ->to() or bypassing it.

What I continue to not understand is why vmcs_pa would ever
not match current_vmcs when active_cpu is smp_processor_id().
So far I thought both are always updated together. Looking
further ...

Jan
Jan Beulich Feb. 15, 2017, 1:20 p.m. UTC | #8
>>> On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> This is what I'm getting during the original test case (32 VMs reboot):
> 
> (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
> (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----

Hmm, this was with a non-debug build, so the ASSERT() in
vmx_vmcs_reload() was a no-op, yet it would have been useful
to know whether active_cpu was -1 when getting stuck here.
Btw - there was no nested virt in the picture in your try, was
there?

Jan
Sergey Dyasli Feb. 15, 2017, 1:40 p.m. UTC | #9
On Wed, 2017-02-15 at 06:03 -0700, Jan Beulich wrote:
> > > > On 15.02.17 at 12:55, <JBeulich@suse.com> wrote:
> > > > > On 15.02.17 at 12:48, <sergey.dyasli@citrix.com> wrote:
> > > On Wed, 2017-02-15 at 04:39 -0700, Jan Beulich wrote:
> > > > > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> > > > > 
> > > > > (XEN) [ 1408.075638] Xen call trace:
> > > > > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
> > > > > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
> > > > > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
> > > > > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
> > > > > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
> > > > > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
> > > > > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
> > > > 
> > > > Taking your later reply into account - were you able to figure out
> > > > what other party held onto the VMCS being waited for here?
> > > 
> > > Unfortunately, no. It was unclear from debug logs. But judging from
> > > the following vmx_do_resume() code:
> > > 
> > >     if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
> > >     {
> > >         if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
> > >             vmx_load_vmcs(v);
> > >     }
> > > 
> > > If both of the above conditions are true then vmx_vmcs_reload() will
> > > probably hang.
> > 
> > I don't follow (reload should run before this, not after), but I must
> > be missing something more general anyway, as I'm seeing the code
> > above being needed despite the reload additions.
> 
> I think I've understood part of it over lunch: Surprisingly enough
> vmx_ctxt_switch_to() doesn't re-establish the VMCS, so it needs
> to be done here. Which I think means we don't need the new
> hook at all, as that way the state is no different between going
> through ->to() or bypassing it.
> 
> What I continue to not understand is why vmcs_pa would ever
> not match current_vmcs when active_cpu is smp_processor_id().
> So far I thought both are always updated together. Looking
> further ...


This is exactly what will happen should step 3.1 below occur:

    1. HVM vCPU#1 --> idle vCPU context_switch

    2. softirq --> vmx_vmcs_enter() + vmx_vmcs_exit() for a remote vCPU
       [scenario with PML]
       This will switch current_vmcs to a remote one.
       has_hvm_container_vcpu(current) will be false and vmcs will not
       be reloaded.

    3.1. idle vCPU --> HVM vCPU#1 (same) context_switch
         vmx_do_resume
            v->arch.hvm_vmx.active_cpu == smp_processor_id()
            v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs)

    3.2 idle vCPU --> HVM vCPU#2 (different)
            __context_switch()
                vmwrite
                BUG()
       This is the original BUG() scenario which my patch
       addresses.

-- 
Thanks,
Sergey
Jan Beulich Feb. 15, 2017, 2:29 p.m. UTC | #10
>>> On 15.02.17 at 14:40, <sergey.dyasli@citrix.com> wrote:
> On Wed, 2017-02-15 at 06:03 -0700, Jan Beulich wrote:
>> > > > On 15.02.17 at 12:55, <JBeulich@suse.com> wrote:
>> > > > > On 15.02.17 at 12:48, <sergey.dyasli@citrix.com> wrote:
>> > > 
>> > > On Wed, 2017-02-15 at 04:39 -0700, Jan Beulich wrote:
>> > > > > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
>> > > > > 
>> > > > > (XEN) [ 1408.075638] Xen call trace:
>> > > > > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
>> > > > > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
>> > > > > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
>> > > > > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
>> > > > > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
>> > > > > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
>> > > > > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
>> > > > 
>> > > > Taking your later reply into account - were you able to figure out
>> > > > what other party held onto the VMCS being waited for here?
>> > > 
>> > > Unfortunately, no. It was unclear from debug logs. But judging from
>> > > the following vmx_do_resume() code:
>> > > 
>> > >     if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
>> > >     {
>> > >         if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
>> > >             vmx_load_vmcs(v);
>> > >     }
>> > > 
>> > > If both of the above conditions are true then vmx_vmcs_reload() will
>> > > probably hang.
>> > 
>> > I don't follow (reload should run before this, not after), but I must
>> > be missing something more general anyway, as I'm seeing the code
>> > above being needed despite the reload additions.
>> 
>> I think I've understood part of it over lunch: Surprisingly enough
>> vmx_ctxt_switch_to() doesn't re-establish the VMCS, so it needs
>> to be done here. Which I think means we don't need the new
>> hook at all, as that way the state is no different between going
>> through ->to() or bypassing it.
>> 
>> What I continue to not understand is why vmcs_pa would ever
>> not match current_vmcs when active_cpu is smp_processor_id().
>> So far I thought both are always updated together. Looking
>> further ...
> 
> This is exactly what will happen should the 3.1 occur:
> 
>     1. HVM vCPU#1 --> idle vCPU context_switch
> 
>     2. softirq --> vmx_vmcs_enter() + vmx_vmcs_exit() for a remote vCPU
>        [scenario with PML]
>        This will switch current_vmcs to a remote one.
>        has_hvm_container_vcpu(current) will be false and vmcs will not
>        be reloaded.

Oh, right - this updates v's active_cpu, but doesn't update the
field for the vCPU the VMCS is being taken away from.

>     3.1. idle vCPU --> HVM vCPU#1 (same) context_switch
>          vmx_do_resume
>             v->arch.hvm_vmx.active_cpu == smp_processor_id()
>             v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs)
> 
>     3.2 idle vCPU --> HVM vCPU#2 (different)
>             __context_switch()
>                 vmwrite
>                 BUG()
>        This is the original BUG() scenario which my patch
>        addresses.

Which I'm not convinced of, as indicated in an earlier reply (on the
thread Anshul did start) to my own consideration of using curr_vcpu.
The problem of vcpu_pause() not spinning when v->is_running is
clear does not get addressed by your patch afaict, so to me it looks
as if you only shrink the critical time window without fully eliminating
it.

Nor can I see how the above would have been an issue in the
first place - that's precisely what vmx_do_resume() is taking care
of (unless you've found a vmwrite that happens before this
function runs, which would then get us back to the question why
vmx_ctxt_switch_to() doesn't load the VMCS). Afaics the problem
is on the switch-out and switch-same paths, but not on the
switch-in one (or if there is one there, then it's yet another one,
with a yet to be explained way of the VMCS being taken away
underneath the vCPU's feet).

Jan
Jan Beulich Feb. 15, 2017, 2:44 p.m. UTC | #11
>>> On 15.02.17 at 15:29, <JBeulich@suse.com> wrote:
>>>> On 15.02.17 at 14:40, <sergey.dyasli@citrix.com> wrote:
>> On Wed, 2017-02-15 at 06:03 -0700, Jan Beulich wrote:
>>> > > > On 15.02.17 at 12:55, <JBeulich@suse.com> wrote:
>>> > > > > On 15.02.17 at 12:48, <sergey.dyasli@citrix.com> wrote:
>>> > > 
>>> > > On Wed, 2017-02-15 at 04:39 -0700, Jan Beulich wrote:
>>> > > > > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
>>> > > > > 
>>> > > > > (XEN) [ 1408.075638] Xen call trace:
>>> > > > > (XEN) [ 1408.079322]    [<ffff82d0801ea2a2>] vmx_vmcs_reload+0x32/0x50
>>> > > > > (XEN) [ 1408.086303]    [<ffff82d08016c58d>] context_switch+0x85d/0xeb0
>>> > > > > (XEN) [ 1408.093380]    [<ffff82d08012fb8e>] schedule.c#schedule+0x46e/0x7d0
>>> > > > > (XEN) [ 1408.100942]    [<ffff82d080164305>] reprogram_timer+0x75/0xe0
>>> > > > > (XEN) [ 1408.107925]    [<ffff82d080136400>] timer.c#timer_softirq_action+0x90/0x210
>>> > > > > (XEN) [ 1408.116263]    [<ffff82d08013311c>] softirq.c#__do_softirq+0x5c/0x90
>>> > > > > (XEN) [ 1408.123921]    [<ffff82d080167d35>] domain.c#idle_loop+0x25/0x60
>>> > > > 
>>> > > > Taking your later reply into account - were you able to figure out
>>> > > > what other party held onto the VMCS being waited for here?
>>> > > 
>>> > > Unfortunately, no. It was unclear from debug logs. But judging from
>>> > > the following vmx_do_resume() code:
>>> > > 
>>> > >     if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
>>> > >     {
>>> > >         if ( v->arch.hvm_vmx.vmcs_pa != this_cpu(current_vmcs) )
>>> > >             vmx_load_vmcs(v);
>>> > >     }
>>> > > 
>>> > > If both of the above conditions are true then vmx_vmcs_reload() will
>>> > > probably hang.
>>> > 
>>> > I don't follow (reload should run before this, not after), but I must
>>> > be missing something more general anyway, as I'm seeing the code
>>> > above being needed despite the reload additions.
>>> 
>>> I think I've understood part of it over lunch: Surprisingly enough
>>> vmx_ctxt_switch_to() doesn't re-establish the VMCS, so it needs
>>> to be done here. Which I think means we don't need the new
>>> hook at all, as that way the state is no different between going
>>> through ->to() or bypassing it.
>>> 
>>> What I continue to not understand is why vmcs_pa would ever
>>> not match current_vmcs when active_cpu is smp_processor_id().
>>> So far I thought both are always updated together. Looking
>>> further ...
>> 
>> This is exactly what will happen should the 3.1 occur:
>> 
>>     1. HVM vCPU#1 --> idle vCPU context_switch
>> 
>>     2. softirq --> vmx_vmcs_enter() + vmx_vmcs_exit() for a remote vCPU
>>        [scenario with PML]
>>        This will switch current_vmcs to a remote one.
>>        has_hvm_container_vcpu(current) will be false and vmcs will not
>>        be reloaded.
> 
> Oh, right - this updates v's active_cpu, but doesn't update the
> field for the vCPU the VMCS is being taken away from.

Which then also indicates a likely reason for the endless loop you
saw with my patch - the spin loop (which I'm no longer convinced is
needed at all) would need to be skipped if the current CPU is the
active one:

void vmx_vmcs_reload(struct vcpu *v)
{
    /*
     * As we're running with interrupts disabled, we can't acquire
     * v->arch.hvm_vmx.vmcs_lock here. However, with interrupts disabled
     * the VMCS can't be taken away from us anymore if we still own it.
     */
    ASSERT(!local_irq_is_enabled());
    if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
        return;
    ASSERT(!this_cpu(current_vmcs));

    if ( v->arch.hvm_vmx.active_cpu != smp_processor_id() )
    {
        /*
         * Wait for the remote side to be done with the VMCS before
         * loading it here.
         */
        while ( v->arch.hvm_vmx.active_cpu != -1 )
            cpu_relax();
    }
    vmx_load_vmcs(v);
}

As to the spinning not being needed in the first place - the remote
CPU using the VMCS won't unpause us before being done with the
VMCS anyway. Thoughts?

Jan
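
For illustration, a sketch of the further-simplified variant mentioned above,
with the spin loop dropped altogether (untested, shown only to make the
suggestion concrete):

    void vmx_vmcs_reload(struct vcpu *v)
    {
        /*
         * Interrupts are disabled, so the VMCS can't be taken away from us
         * if we still own it; and a remote CPU that does own it won't
         * unpause us before being done with it.
         */
        ASSERT(!local_irq_is_enabled());
        if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
            return;
        ASSERT(!this_cpu(current_vmcs));

        vmx_load_vmcs(v);
    }
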
Sergey Dyasli Feb. 15, 2017, 2:55 p.m. UTC | #12
On Wed, 2017-02-15 at 06:20 -0700, Jan Beulich wrote:
> > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
> > This is what I'm getting during the original test case (32 VMs reboot):
> > 
> > (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
> > (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
> 
> Hmm, this was with a non-debug build, so the ASSERT() in
> vmx_vmcs_reload() was a no-op, yet it would have been useful
> to know whether active_cpu was -1 when getting stuck here.
> Btw - there was no nested virt in the picture in your try, was
> there?


No nested virt is involved in the test case.

Is it worth giving your patch another try with removing ctxt_switch_same()
since we figured out that vmx_do_resume() will reload vmcs either way?
And I will also update vmx_vmcs_reload() from your last email.

-- 
Thanks,
Sergey
Jan Beulich Feb. 15, 2017, 3:15 p.m. UTC | #13
>>> On 15.02.17 at 15:55, <sergey.dyasli@citrix.com> wrote:
> On Wed, 2017-02-15 at 06:20 -0700, Jan Beulich wrote:
>> > > > On 15.02.17 at 11:27, <sergey.dyasli@citrix.com> wrote:
>> > 
>> > This is what I'm getting during the original test case (32 VMs reboot):
>> > 
>> > (XEN) [ 1407.789329] Watchdog timer detects that CPU12 is stuck!
>> > (XEN) [ 1407.795726] ----[ Xen-4.6.1-xs-local  x86_64  debug=n  Not tainted ]----
>> 
>> Hmm, this was with a non-debug build, so the ASSERT() in
>> vmx_vmcs_reload() was a no-op, yet it would have been useful
>> to know whether active_cpu was -1 when getting stuck here.
>> Btw - there was no nested virt in the picture in your try, was
>> there?
> 
> No nested virt is involved in the test case.
> 
> Is it worth giving your patch another try with removing ctxt_switch_same()
> since we figured out that vmx_do_resume() will reload vmcs either way?

Yes, but that's the cosmetic part, whereas ...

> And I will also update vmx_vmcs_reload() from your last email.

... this looks to be the actual bug fix. If you agree with my
reasoning of removing the loop altogether, you may want to go
with that version instead of adding the conditional.

Jan

Patch

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 88db7ee..f0d71ae 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -742,6 +742,8 @@  void vmx_vmcs_enter(struct vcpu *v)
 void vmx_vmcs_exit(struct vcpu *v)
 {
     struct foreign_vmcs *fv;
+    unsigned int cpu = smp_processor_id();
+    struct vcpu *p = per_cpu(curr_vcpu, cpu);
 
     if ( likely(v == current) )
         return;
@@ -754,8 +756,8 @@  void vmx_vmcs_exit(struct vcpu *v)
     {
         /* Don't confuse vmx_do_resume (for @v or @current!) */
         vmx_clear_vmcs(v);
-        if ( has_hvm_container_vcpu(current) )
-            vmx_load_vmcs(current);
+        if ( has_hvm_container_vcpu(p) )
+            vmx_load_vmcs(p);
 
         spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
         vcpu_unpause(v);