diff mbox series

KVM: VMX: Fix crash due to uninitialized current_vmcs

Message ID 20230118141348.828-1-alexandru.matei@uipath.com (mailing list archive)
State New, archived
Headers show
Series KVM: VMX: Fix crash due to uninitialized current_vmcs | expand

Commit Message

Alexandru Matei Jan. 18, 2023, 2:13 p.m. UTC
KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
that the msr bitmap was changed.

vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
-> vmx_msr_bitmap_l01_changed which in the end calls this function. The
function checks whether current_vmcs is null, but the check is
insufficient because current_vmcs is not initialized. Because of this, the
code might incorrectly write to the structure pointed to by a current_vmcs
value left over from another task. Preemption is not disabled, so the
current task can also be preempted and moved to another CPU while
current_vmcs is accessed multiple times from evmcs_touch_msr_bitmap(),
which leads to a crash.

To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
initialized.

BUG: kernel NULL pointer dereference, address: 0000000000000338
PGD 4e1775067 P4D 0
Oops: 0002 [#1] PREEMPT SMP NOPTI
...
RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel]
...
Call Trace:
 vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel]
 vmx_vcpu_create+0xe6/0x540 [kvm_intel]
 ? __vmalloc_node+0x4a/0x70
 kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm]
 kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm]
 ? __handle_mm_fault+0x3cb/0x750
 kvm_vm_ioctl+0x53f/0x790 [kvm]
 ? syscall_exit_work+0x11a/0x150
 ? syscall_exit_to_user_mode+0x12/0x30
 ? do_syscall_64+0x69/0x90
 ? handle_mm_fault+0xc5/0x2a0
 __x64_sys_ioctl+0x8a/0xc0
 do_syscall_64+0x5c/0x90
 ? exc_page_fault+0x62/0x150
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

Signed-off-by: Alexandru Matei <alexandru.matei@uipath.com>
---
 arch/x86/kvm/vmx/vmx.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

Comments

Sean Christopherson Jan. 18, 2023, 3:45 p.m. UTC | #1
On Wed, Jan 18, 2023, Alexandru Matei wrote:
> KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
> a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
> evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
> that the msr bitmap was changed.
> 
> vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
> -> vmx_msr_bitmap_l01_changed which in the end calls this function. The
> function checks for current_vmcs if it is null but the check is
> insufficient because current_vmcs is not initialized. Because of this, the
> code might incorrectly write to the structure pointed by current_vmcs value
> left by another task. Preemption is not disabled so the current task can
> also be preempted and moved to another CPU while current_vmcs is accessed
> multiple times from evmcs_touch_msr_bitmap() which leads to crash.
> 
> To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
> before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
> after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
> initialized.

IMO, moving the calls is a band-aid and doesn't address the underlying bug.  I
don't see any reason why the Hyper-V code should use a per-cpu pointer in this
case.  It makes sense when replacing VMX sequences that operate on the VMCS, e.g.
VMREAD, VMWRITE, etc., but for operations that aren't direct replacements for VMX
instructions I think we should have a rule that Hyper-V isn't allowed to touch the
per-cpu pointer.

E.g. in this case it's trivial to pass down the target (completely untested).

Vitaly?


---
 arch/x86/kvm/vmx/hyperv.h | 12 +++++++-----
 arch/x86/kvm/vmx/vmx.c    |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index ab08a9b9ab7d..ad16b52766bb 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -250,13 +250,15 @@ static inline u16 evmcs_read16(unsigned long field)
 	return *(u16 *)((char *)current_evmcs + offset);
 }
 
-static inline void evmcs_touch_msr_bitmap(void)
+static inline void evmcs_touch_msr_bitmap(struct vcpu_vmx *vmx)
 {
-	if (unlikely(!current_evmcs))
+	struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+	if (WARN_ON_ONCE(!evmcs))
 		return;
 
-	if (current_evmcs->hv_enlightenments_control.msr_bitmap)
-		current_evmcs->hv_clean_fields &=
+	if (evmcs->hv_enlightenments_control.msr_bitmap)
+		evmcs->hv_clean_fields &=
 			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
 }
 
@@ -280,7 +282,7 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; }
 static inline u32 evmcs_read32(unsigned long field) { return 0; }
 static inline u16 evmcs_read16(unsigned long field) { return 0; }
 static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
+static inline void evmcs_touch_msr_bitmap(struct vcpu_vmx *vmx) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
 #define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c788aa382611..6ed6f52aad0c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3937,7 +3937,7 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
 	 * bitmap has changed.
 	 */
 	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
+		evmcs_touch_msr_bitmap(vmx);
 
 	vmx->nested.force_msr_bitmap_recalc = true;
 }

base-commit: 6e9a476ea49d43a27b42004cfd7283f128494d1d
--
Vitaly Kuznetsov Jan. 18, 2023, 4:12 p.m. UTC | #2
Alexandru Matei <alexandru.matei@uipath.com> writes:

> KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
> a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
> evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
> that the msr bitmap was changed.
>
> vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
> -> vmx_msr_bitmap_l01_changed which in the end calls this function. The
> function checks for current_vmcs if it is null but the check is
> insufficient because current_vmcs is not initialized. 

I'm failing to remember why evmcs_touch_msr_bitmap() needs
'current_vmcs' at all, can we just use 'vmx->loaded_vmcs->vmcs' (after
passing 'vmx' parameter to it) instead?

> Because of this, the
> code might incorrectly write to the structure pointed by current_vmcs value
> left by another task. Preemption is not disabled so the current task can
> also be preempted and moved to another CPU while current_vmcs is accessed
> multiple times from evmcs_touch_msr_bitmap() which leads to crash.
>
> To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
> before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
> after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
> initialized.
>
> BUG: kernel NULL pointer dereference, address: 0000000000000338
> PGD 4e1775067 P4D 0
> Oops: 0002 [#1] PREEMPT SMP NOPTI
> ...
> RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel]
> ...
> Call Trace:
>  vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel]
>  vmx_vcpu_create+0xe6/0x540 [kvm_intel]
>  ? __vmalloc_node+0x4a/0x70
>  kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm]
>  kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm]
>  ? __handle_mm_fault+0x3cb/0x750
>  kvm_vm_ioctl+0x53f/0x790 [kvm]
>  ? syscall_exit_work+0x11a/0x150
>  ? syscall_exit_to_user_mode+0x12/0x30
>  ? do_syscall_64+0x69/0x90
>  ? handle_mm_fault+0xc5/0x2a0
>  __x64_sys_ioctl+0x8a/0xc0
>  do_syscall_64+0x5c/0x90
>  ? exc_page_fault+0x62/0x150
>  entry_SYSCALL_64_after_hwframe+0x63/0xcd
>
> Signed-off-by: Alexandru Matei <alexandru.matei@uipath.com>
> ---
>  arch/x86/kvm/vmx/vmx.c | 32 ++++++++++++++++----------------
>  1 file changed, 16 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index fe5615fd8295..168138dfb0b4 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4735,6 +4735,22 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
> +#ifdef CONFIG_X86_64
> +	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +#endif
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
> +	if (kvm_cstate_in_guest(vcpu->kvm)) {
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
> +	}
> +
>  	init_vmcs(vmx);
>  
>  	if (nested)
> @@ -7363,22 +7379,6 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
>  	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

I think that vmx_disable_intercept_for_msr() you move uses
'vmx->shadow_msr_intercept.read'/'vmx->shadow_msr_intercept.write' from
above so there's a dependency here. Let's avoid the churn (if possible).

>  
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
> -#ifdef CONFIG_X86_64
> -	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> -#endif
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
> -	if (kvm_cstate_in_guest(vcpu->kvm)) {
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
> -	}
> -
>  	vmx->loaded_vmcs = &vmx->vmcs01;
>  
>  	if (cpu_need_virtualize_apic_accesses(vcpu)) {
Vitaly Kuznetsov Jan. 18, 2023, 4:16 p.m. UTC | #3
Sean Christopherson <seanjc@google.com> writes:

> On Wed, Jan 18, 2023, Alexandru Matei wrote:
>> KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
>> a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
>> evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
>> that the msr bitmap was changed.
>> 
>> vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
>> -> vmx_msr_bitmap_l01_changed which in the end calls this function. The
>> function checks for current_vmcs if it is null but the check is
>> insufficient because current_vmcs is not initialized. Because of this, the
>> code might incorrectly write to the structure pointed by current_vmcs value
>> left by another task. Preemption is not disabled so the current task can
>> also be preempted and moved to another CPU while current_vmcs is accessed
>> multiple times from evmcs_touch_msr_bitmap() which leads to crash.
>> 
>> To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
>> before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
>> after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
>> initialized.
>
> IMO, moving the calls is a band-aid and doesn't address the underlying bug.  I
> don't see any reason why the Hyper-V code should use a per-cpu pointer in this
> case.  It makes sense when replacing VMX sequences that operate on the VMCS, e.g.
> VMREAD, VMWRITE, etc., but for operations that aren't direct replacements for VMX
> instructions I think we should have a rule that Hyper-V isn't allowed to touch the
> per-cpu pointer.
>
> E.g. in this case it's trivial to pass down the target (completely untested).
>
> Vitaly?

Mid-air collision detected) I've just suggested a very similar approach
but instead of 'vmx->vmcs01.vmcs' I've suggested using
'vmx->loaded_vmcs->vmcs': in case we're running L2 and loaded VMCS is
'vmcs02', I think we still need to touch the clean field indicating that
MSR-Bitmap has changed. Equally untested :-)

>
>
> ---
>  arch/x86/kvm/vmx/hyperv.h | 12 +++++++-----
>  arch/x86/kvm/vmx/vmx.c    |  2 +-
>  2 files changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
> index ab08a9b9ab7d..ad16b52766bb 100644
> --- a/arch/x86/kvm/vmx/hyperv.h
> +++ b/arch/x86/kvm/vmx/hyperv.h
> @@ -250,13 +250,15 @@ static inline u16 evmcs_read16(unsigned long field)
>  	return *(u16 *)((char *)current_evmcs + offset);
>  }
>  
> -static inline void evmcs_touch_msr_bitmap(void)
> +static inline void evmcs_touch_msr_bitmap(struct vcpu_vmx *vmx)
>  {
> -	if (unlikely(!current_evmcs))
> +	struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
> +
> +	if (WARN_ON_ONCE(!evmcs))
>  		return;
>  
> -	if (current_evmcs->hv_enlightenments_control.msr_bitmap)
> -		current_evmcs->hv_clean_fields &=
> +	if (evmcs->hv_enlightenments_control.msr_bitmap)
> +		evmcs->hv_clean_fields &=
>  			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
>  }
>  
> @@ -280,7 +282,7 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; }
>  static inline u32 evmcs_read32(unsigned long field) { return 0; }
>  static inline u16 evmcs_read16(unsigned long field) { return 0; }
>  static inline void evmcs_load(u64 phys_addr) {}
> -static inline void evmcs_touch_msr_bitmap(void) {}
> +static inline void evmcs_touch_msr_bitmap(struct vcpu_vmx *vmx) {}
>  #endif /* IS_ENABLED(CONFIG_HYPERV) */
>  
>  #define EVMPTR_INVALID (-1ULL)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index c788aa382611..6ed6f52aad0c 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3937,7 +3937,7 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
>  	 * bitmap has changed.
>  	 */
>  	if (static_branch_unlikely(&enable_evmcs))
> -		evmcs_touch_msr_bitmap();
> +		evmcs_touch_msr_bitmap(vmx);
>  
>  	vmx->nested.force_msr_bitmap_recalc = true;
>  }
>
> base-commit: 6e9a476ea49d43a27b42004cfd7283f128494d1d
Sean Christopherson Jan. 18, 2023, 4:21 p.m. UTC | #4
On Wed, Jan 18, 2023, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Wed, Jan 18, 2023, Alexandru Matei wrote:
> >> KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
> >> a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
> >> evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
> >> that the msr bitmap was changed.
> >> 
> >> vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
> >> -> vmx_msr_bitmap_l01_changed which in the end calls this function. The
> >> function checks for current_vmcs if it is null but the check is
> >> insufficient because current_vmcs is not initialized. Because of this, the
> >> code might incorrectly write to the structure pointed by current_vmcs value
> >> left by another task. Preemption is not disabled so the current task can
> >> also be preempted and moved to another CPU while current_vmcs is accessed
> >> multiple times from evmcs_touch_msr_bitmap() which leads to crash.
> >> 
> >> To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
> >> before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
> >> after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
> >> initialized.
> >
> > IMO, moving the calls is a band-aid and doesn't address the underlying bug.  I
> > don't see any reason why the Hyper-V code should use a per-cpu pointer in this
> > case.  It makes sense when replacing VMX sequences that operate on the VMCS, e.g.
> > VMREAD, VMWRITE, etc., but for operations that aren't direct replacements for VMX
> > instructions I think we should have a rule that Hyper-V isn't allowed to touch the
> > per-cpu pointer.
> >
> > E.g. in this case it's trivial to pass down the target (completely untested).
> >
> > Vitaly?
> 
> Mid-air collision detected) I've just suggested a very similar approach
> but instead of 'vmx->vmcs01.vmcs' I've suggested using
> 'vmx->loaded_vmcs->vmcs': in case we're running L2 and loaded VMCS is
> 'vmcs02', I think we still need to touch the clean field indicating that
> MSR-Bitmap has changed. Equally untested :-)

Three reasons to use vmcs01 directly:

  1. I don't want to require loaded_vmcs to be set.  E.g. in the problematic
     flows, this 

	vmx->loaded_vmcs = &vmx->vmcs01;

     comes after the calls to vmx_disable_intercept_for_msr().

  2. KVM on Hyper-V doesn't use the bitmaps for L2 (evmcs02):

	/*
	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
	 * feature only for vmcs01, KVM currently isn't equipped to realize any
	 * performance benefits from enabling it for vmcs02.
	 */
	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		evmcs->hv_enlightenments_control.msr_bitmap = 1;
	}

  3. KVM's manipulation of MSR bitmaps typically happens _only_ for vmcs01,
     e.g. the caller is vmx_msr_bitmap_l01_changed().  The nested case is a 
     special snowflake.
Vitaly Kuznetsov Jan. 18, 2023, 4:25 p.m. UTC | #5
Sean Christopherson <seanjc@google.com> writes:

> On Wed, Jan 18, 2023, Vitaly Kuznetsov wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> 
>> > On Wed, Jan 18, 2023, Alexandru Matei wrote:
>> >> KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
>> >> a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
>> >> evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
>> >> that the msr bitmap was changed.
>> >> 
>> >> vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
>> >> -> vmx_msr_bitmap_l01_changed which in the end calls this function. The
>> >> function checks for current_vmcs if it is null but the check is
>> >> insufficient because current_vmcs is not initialized. Because of this, the
>> >> code might incorrectly write to the structure pointed by current_vmcs value
>> >> left by another task. Preemption is not disabled so the current task can
>> >> also be preempted and moved to another CPU while current_vmcs is accessed
>> >> multiple times from evmcs_touch_msr_bitmap() which leads to crash.
>> >> 
>> >> To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
>> >> before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
>> >> after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
>> >> initialized.
>> >
>> > IMO, moving the calls is a band-aid and doesn't address the underlying bug.  I
>> > don't see any reason why the Hyper-V code should use a per-cpu pointer in this
>> > case.  It makes sense when replacing VMX sequences that operate on the VMCS, e.g.
>> > VMREAD, VMWRITE, etc., but for operations that aren't direct replacements for VMX
>> > instructions I think we should have a rule that Hyper-V isn't allowed to touch the
>> > per-cpu pointer.
>> >
>> > E.g. in this case it's trivial to pass down the target (completely untested).
>> >
>> > Vitaly?
>> 
>> Mid-air collision detected) I've just suggested a very similar approach
>> but instead of 'vmx->vmcs01.vmcs' I've suggested using
>> 'vmx->loaded_vmcs->vmcs': in case we're running L2 and loaded VMCS is
>> 'vmcs02', I think we still need to touch the clean field indicating that
>> MSR-Bitmap has changed. Equally untested :-)
>
> Three reasons to use vmcs01 directly:
>
>   1. I don't want to require loaded_vmcs to be set.  E.g. in the problematic
>      flows, this 
>
> 	vmx->loaded_vmcs = &vmx->vmcs01;
>
>      comes after the calls to vmx_disable_intercept_for_msr().
>
>   2. KVM on Hyper-V doesn't use the bitmaps for L2 (evmcs02):
>
> 	/*
> 	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
> 	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
> 	 * feature only for vmcs01, KVM currently isn't equipped to realize any
> 	 * performance benefits from enabling it for vmcs02.
> 	 */
> 	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
> 	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
> 		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
>
> 		evmcs->hv_enlightenments_control.msr_bitmap = 1;
> 	}

Oh, indeed, I've forgotten this. I'm fine with 'vmx->vmcs01' then but
let's leave a comment (which I'm going to also forget about, but still)
that eMSR bitmap is an L1-only feature.

>
>   3. KVM's manipulation of MSR bitmaps typically happens _only_ for vmcs01,
>      e.g. the caller is vmx_msr_bitmap_l01_changed().  The nested case is a 
>      special snowflake.
>
Alexandru Matei Jan. 18, 2023, 9:15 p.m. UTC | #6
On 1/18/2023 6:25 PM, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
>> On Wed, Jan 18, 2023, Vitaly Kuznetsov wrote:
>>> Sean Christopherson <seanjc@google.com> writes:
>>>
>>>> On Wed, Jan 18, 2023, Alexandru Matei wrote:
>>>>> KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
>>>>> a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
>>>>> evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
>>>>> that the msr bitmap was changed.
>>>>>
>>>>> vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
>>>>> -> vmx_msr_bitmap_l01_changed which in the end calls this function. The
>>>>> function checks for current_vmcs if it is null but the check is
>>>>> insufficient because current_vmcs is not initialized. Because of this, the
>>>>> code might incorrectly write to the structure pointed by current_vmcs value
>>>>> left by another task. Preemption is not disabled so the current task can
>>>>> also be preempted and moved to another CPU while current_vmcs is accessed
>>>>> multiple times from evmcs_touch_msr_bitmap() which leads to crash.
>>>>>
>>>>> To fix this problem, this patch moves vmx_disable_intercept_for_msr calls
>>>>> before init_vmcs call in __vmx_vcpu_reset(), as ->vcpu_reset() is invoked
>>>>> after the vCPU is properly loaded via ->vcpu_load() and current_vmcs is
>>>>> initialized.
>>>>
>>>> IMO, moving the calls is a band-aid and doesn't address the underlying bug.  I
>>>> don't see any reason why the Hyper-V code should use a per-cpu pointer in this
>>>> case.  It makes sense when replacing VMX sequences that operate on the VMCS, e.g.
>>>> VMREAD, VMWRITE, etc., but for operations that aren't direct replacements for VMX
>>>> instructions I think we should have a rule that Hyper-V isn't allowed to touch the
>>>> per-cpu pointer.
>>>>
>>>> E.g. in this case it's trivial to pass down the target (completely untested).
>>>>
>>>> Vitaly?
>>>
>>> Mid-air collision detected) I've just suggested a very similar approach
>>> but instead of 'vmx->vmcs01.vmcs' I've suggested using
>>> 'vmx->loaded_vmcs->vmcs': in case we're running L2 and loaded VMCS is
>>> 'vmcs02', I think we still need to touch the clean field indicating that
>>> MSR-Bitmap has changed. Equally untested :-)
>>
>> Three reasons to use vmcs01 directly:
>>
>>   1. I don't want to require loaded_vmcs to be set.  E.g. in the problematic
>>      flows, this 
>>
>> 	vmx->loaded_vmcs = &vmx->vmcs01;
>>
>>      comes after the calls to vmx_disable_intercept_for_msr().
>>
>>   2. KVM on Hyper-V doesn't use the bitmaps for L2 (evmcs02):
>>
>> 	/*
>> 	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
>> 	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
>> 	 * feature only for vmcs01, KVM currently isn't equipped to realize any
>> 	 * performance benefits from enabling it for vmcs02.
>> 	 */
>> 	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
>> 	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
>> 		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
>>
>> 		evmcs->hv_enlightenments_control.msr_bitmap = 1;
>> 	}
> 
> Oh, indeed, I've forgotten this. I'm fine with 'vmx->vmcs01' then but
> let's leave a comment (which I've going to also forget about, but still)
> that eMSR bitmap is an L1-only feature.
> 
>>
>>   3. KVM's manipulation of MSR bitmaps typically happens _only_ for vmcs01,
>>      e.g. the caller is vmx_msr_bitmap_l01_changed().  The nested case is a 
>>      special snowflake.
>>
> 

Thanks Sean and Vitaly for your insights and suggestions. I'll redo the patch
using your code, Sean, if it's ok with you, and run the tests again.
Sean Christopherson Jan. 18, 2023, 9:35 p.m. UTC | #7
On Wed, Jan 18, 2023, Alexandru Matei wrote:
> On 1/18/2023 6:25 PM, Vitaly Kuznetsov wrote:
> > Oh, indeed, I've forgotten this. I'm fine with 'vmx->vmcs01' then but
> > let's leave a comment (which I've going to also forget about, but still)
> > that eMSR bitmap is an L1-only feature.
> > 
> >>
> >>   3. KVM's manipulation of MSR bitmaps typically happens _only_ for vmcs01,
> >>      e.g. the caller is vmx_msr_bitmap_l01_changed().  The nested case is a 
> >>      special snowflake.
> >>
> > 
> 
> Thanks Sean and Vitaly for your insights and suggestions. I'll redo the patch 
> using your code Sean if it's ok with you and run the tests again.

Yep, absolutely!  As requested by Vitaly, please also add a comment in
evmcs_touch_msr_bitmap() to call out that the eMSR bitmap is only enabled for L1,
i.e. always operates on (e)vmcs01.

Thanks!
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fe5615fd8295..168138dfb0b4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4735,6 +4735,22 @@  static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+	if (kvm_cstate_in_guest(vcpu->kvm)) {
+		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+	}
+
 	init_vmcs(vmx);
 
 	if (nested)
@@ -7363,22 +7379,6 @@  static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 
-	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
-	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
-	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
-	if (kvm_cstate_in_guest(vcpu->kvm)) {
-		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
-		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
-		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
-		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
-	}
-
 	vmx->loaded_vmcs = &vmx->vmcs01;
 
 	if (cpu_need_virtualize_apic_accesses(vcpu)) {