Message ID | 20160808181623.12132-2-rkrcmar@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 2016/8/9 2:16, Radim Krčmář wrote: > msr bitmap can be used to avoid a VM exit (interception) on guest MSR > accesses. In some configurations of VMX controls, the guest can even > directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED > APIC ACCESSES. > > L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. > To do so, L1 would first trick KVM to disable all possible interceptions > by enabling APICv features and then would turn those features off; > nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would > not intercept previously enabled MSRs even though they were not safe > with the new configuration. > > Correctly re-enabling interceptions is not enough as a second bug would > still allow L1+L2 to access host's MSRs: msr bitmap was shared for all > VMCSs, so L1 could trigger a race to get the desired combination of msr > bitmap and VMX controls. > > This fix allocates a msr bitmap for every L1 VCPU, allows only safe > x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would > have to intercept everything anyway. > > Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") > Reported-by: Jim Mattson <jmattson@google.com> > Suggested-by: Wincy Van <fanwenyi0529@gmail.com> > Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> > --- > arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++------------------------------- > 1 file changed, 44 insertions(+), 63 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index a45d8580f91e..c66ac2c70d22 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -435,6 +435,8 @@ struct nested_vmx { > bool pi_pending; > u16 posted_intr_nv; > > + unsigned long *msr_bitmap; > + > struct hrtimer preemption_timer; > bool preemption_timer_expired; > > @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; > static unsigned long *vmx_msr_bitmap_longmode; > static unsigned long *vmx_msr_bitmap_legacy_x2apic; > static unsigned long *vmx_msr_bitmap_longmode_x2apic; > -static unsigned long *vmx_msr_bitmap_nested; > static unsigned long *vmx_vmread_bitmap; > static unsigned long *vmx_vmwrite_bitmap; > > @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) > unsigned long *msr_bitmap; > > if (is_guest_mode(vcpu)) > - msr_bitmap = vmx_msr_bitmap_nested; > + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; > else if (cpu_has_secondary_exec_ctrls() && > (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & > SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { > @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) > if (!vmx_msr_bitmap_longmode_x2apic) > goto out4; > > - if (nested) { > - vmx_msr_bitmap_nested = > - (unsigned long *)__get_free_page(GFP_KERNEL); > - if (!vmx_msr_bitmap_nested) > - goto out5; > - } > - > vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); > if (!vmx_vmread_bitmap) > goto out6; > @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) > > memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); > memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); > - if (nested) > - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); > > if (setup_vmcs_config(&vmcs_config) < 0) { > r = -EIO; > @@ -6529,9 +6521,6 @@ out8: > out7: > free_page((unsigned long)vmx_vmread_bitmap); > out6: > - if (nested) > - free_page((unsigned long)vmx_msr_bitmap_nested); > -out5: > free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); > out4: > free_page((unsigned long)vmx_msr_bitmap_longmode); > @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) > free_page((unsigned long)vmx_io_bitmap_a); > free_page((unsigned long)vmx_vmwrite_bitmap); > free_page((unsigned long)vmx_vmread_bitmap); > - if (nested) > - free_page((unsigned long)vmx_msr_bitmap_nested); > > free_kvm_area(); > } > @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) > return 1; > } > > + if (cpu_has_vmx_msr_bitmap()) { > + vmx->nested.msr_bitmap = > + (unsigned long *)__get_free_page(GFP_KERNEL); > + if (!vmx->nested.msr_bitmap) > + goto out_msr_bitmap; > + } > + We export msr_bitmap to guest even it is not supported by hardware. So we need to allocate msr_bitmap for L1 unconditionally. > vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); > if (!vmx->nested.cached_vmcs12) > - return -ENOMEM; > + goto out_cached_vmcs12; > > if (enable_shadow_vmcs) { > shadow_vmcs = alloc_vmcs(); > - if (!shadow_vmcs) { > - kfree(vmx->nested.cached_vmcs12); > - return -ENOMEM; > - } > + if (!shadow_vmcs) > + goto out_shadow_vmcs; > /* mark vmcs as shadow */ > shadow_vmcs->revision_id |= (1u << 31); > /* init shadow vmcs */ > @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) > skip_emulated_instruction(vcpu); > nested_vmx_succeed(vcpu); > return 1; > + > +out_shadow_vmcs: > + kfree(vmx->nested.cached_vmcs12); > + > +out_cached_vmcs12: > + free_page((unsigned long)vmx->nested.msr_bitmap); > + > +out_msr_bitmap: > + return -ENOMEM; > } > > /* > @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) > vmx->nested.vmxon = false; > free_vpid(vmx->nested.vpid02); > nested_release_vmcs12(vmx); > + if (vmx->nested.msr_bitmap) { > + free_page((unsigned long)vmx->nested.msr_bitmap); > + vmx->nested.msr_bitmap = NULL; > + } > if (enable_shadow_vmcs) > free_vmcs(vmx->nested.current_shadow_vmcs); > kfree(vmx->nested.cached_vmcs12); > @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, > { > int msr; > struct page *page; > - unsigned long *msr_bitmap; > + unsigned long *msr_bitmap_l1; > + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; > > + /* This shortcut is ok because we support only x2APIC MSRs so far. */ > if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) > return false; > > @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, > WARN_ON(1); > return false; > } > - msr_bitmap = (unsigned long *)kmap(page); > - if (!msr_bitmap) { > + msr_bitmap_l1 = (unsigned long *)kmap(page); > + if (!msr_bitmap_l1) { > nested_release_page_clean(page); > WARN_ON(1); > return false; > } > > + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); > + > if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { > if (nested_cpu_has_apic_reg_virt(vmcs12)) > for (msr = 0x800; msr <= 0x8ff; msr++) > nested_vmx_disable_intercept_for_msr( > - msr_bitmap, > - vmx_msr_bitmap_nested, > + msr_bitmap_l1, msr_bitmap_l0, > msr, MSR_TYPE_R); > - /* TPR is allowed */ > - nested_vmx_disable_intercept_for_msr(msr_bitmap, > - vmx_msr_bitmap_nested, > + > + nested_vmx_disable_intercept_for_msr( > + msr_bitmap_l1, msr_bitmap_l0, > APIC_BASE_MSR + (APIC_TASKPRI >> 4), > MSR_TYPE_R | MSR_TYPE_W); > + > if (nested_cpu_has_vid(vmcs12)) { > - /* EOI and self-IPI are allowed */ > nested_vmx_disable_intercept_for_msr( > - msr_bitmap, > - vmx_msr_bitmap_nested, > + msr_bitmap_l1, msr_bitmap_l0, > APIC_BASE_MSR + (APIC_EOI >> 4), > MSR_TYPE_W); > nested_vmx_disable_intercept_for_msr( > - msr_bitmap, > - vmx_msr_bitmap_nested, > + msr_bitmap_l1, msr_bitmap_l0, > APIC_BASE_MSR + (APIC_SELF_IPI >> 4), > MSR_TYPE_W); > } > - } else { > - /* > - * Enable reading intercept of all the x2apic > - * MSRs. We should not rely on vmcs12 to do any > - * optimizations here, it may have been modified > - * by L1. > - */ > - for (msr = 0x800; msr <= 0x8ff; msr++) > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - msr, > - MSR_TYPE_R); > - > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - APIC_BASE_MSR + (APIC_TASKPRI >> 4), > - MSR_TYPE_W); > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - APIC_BASE_MSR + (APIC_EOI >> 4), > - MSR_TYPE_W); > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), > - MSR_TYPE_W); > } > kunmap(page); > nested_release_page_clean(page); > @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) > } > > if (cpu_has_vmx_msr_bitmap() && > - exec_control & CPU_BASED_USE_MSR_BITMAPS) { > - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); > - /* MSR_BITMAP will be set by following vmx_set_efer. */ > - } else > + exec_control & CPU_BASED_USE_MSR_BITMAPS && > + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) > + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ > + else > exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; > > /* >
On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@gmail.com> wrote: > On 2016/8/9 2:16, Radim Krčmář wrote: >> >> msr bitmap can be used to avoid a VM exit (interception) on guest MSR >> accesses. In some configurations of VMX controls, the guest can even >> directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED >> APIC ACCESSES. >> >> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. >> To do so, L1 would first trick KVM to disable all possible interceptions >> by enabling APICv features and then would turn those features off; >> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would >> not intercept previously enabled MSRs even though they were not safe >> with the new configuration. >> >> Correctly re-enabling interceptions is not enough as a second bug would >> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all >> VMCSs, so L1 could trigger a race to get the desired combination of msr >> bitmap and VMX controls. >> >> This fix allocates a msr bitmap for every L1 VCPU, allows only safe >> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would >> have to intercept everything anyway. >> >> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") >> Reported-by: Jim Mattson <jmattson@google.com> >> Suggested-by: Wincy Van <fanwenyi0529@gmail.com> >> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> >> --- >> arch/x86/kvm/vmx.c | 107 >> ++++++++++++++++++++++------------------------------- >> 1 file changed, 44 insertions(+), 63 deletions(-) >> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index a45d8580f91e..c66ac2c70d22 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -435,6 +435,8 @@ struct nested_vmx { >> bool pi_pending; >> u16 posted_intr_nv; >> >> + unsigned long *msr_bitmap; >> + >> struct hrtimer preemption_timer; >> bool preemption_timer_expired; >> >> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; >> static unsigned long *vmx_msr_bitmap_longmode; >> static unsigned long *vmx_msr_bitmap_legacy_x2apic; >> static unsigned long *vmx_msr_bitmap_longmode_x2apic; >> -static unsigned long *vmx_msr_bitmap_nested; >> static unsigned long *vmx_vmread_bitmap; >> static unsigned long *vmx_vmwrite_bitmap; >> >> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu >> *vcpu) >> unsigned long *msr_bitmap; >> >> if (is_guest_mode(vcpu)) >> - msr_bitmap = vmx_msr_bitmap_nested; >> + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; >> else if (cpu_has_secondary_exec_ctrls() && >> (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & >> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { >> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) >> if (!vmx_msr_bitmap_longmode_x2apic) >> goto out4; >> >> - if (nested) { >> - vmx_msr_bitmap_nested = >> - (unsigned long *)__get_free_page(GFP_KERNEL); >> - if (!vmx_msr_bitmap_nested) >> - goto out5; >> - } >> - >> vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); >> if (!vmx_vmread_bitmap) >> goto out6; >> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) >> >> memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); >> memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); >> - if (nested) >> - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); >> >> if (setup_vmcs_config(&vmcs_config) < 0) { >> r = -EIO; >> @@ -6529,9 +6521,6 @@ out8: >> out7: >> free_page((unsigned long)vmx_vmread_bitmap); >> out6: >> - if (nested) >> - free_page((unsigned long)vmx_msr_bitmap_nested); >> -out5: >> free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); >> out4: >> free_page((unsigned long)vmx_msr_bitmap_longmode); >> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) >> free_page((unsigned long)vmx_io_bitmap_a); >> free_page((unsigned long)vmx_vmwrite_bitmap); >> free_page((unsigned long)vmx_vmread_bitmap); >> - if (nested) >> - free_page((unsigned long)vmx_msr_bitmap_nested); >> >> free_kvm_area(); >> } >> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) >> return 1; >> } >> >> + if (cpu_has_vmx_msr_bitmap()) { >> + vmx->nested.msr_bitmap = >> + (unsigned long >> *)__get_free_page(GFP_KERNEL); >> + if (!vmx->nested.msr_bitmap) >> + goto out_msr_bitmap; >> + } >> + > > > We export msr_bitmap to guest even it is not supported by hardware. So we > need to allocate msr_bitmap for L1 unconditionally. > > Nice to see you, Yang :-) I think vmx->nested.msr_bitmap is filled by L0 and L1's settings, and used by hardware. L1's msr_bitmap is managed by itself, and L1 will write MSR_BITMAP field to vmcs12->msr_bitmap, then L0 can get L1's settings to emulate MSR_BITMAP. Am I missed something? Thanks, Wincy >> vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); >> if (!vmx->nested.cached_vmcs12) >> - return -ENOMEM; >> + goto out_cached_vmcs12; >> >> if (enable_shadow_vmcs) { >> shadow_vmcs = alloc_vmcs(); >> - if (!shadow_vmcs) { >> - kfree(vmx->nested.cached_vmcs12); >> - return -ENOMEM; >> - } >> + if (!shadow_vmcs) >> + goto out_shadow_vmcs; >> /* mark vmcs as shadow */ >> shadow_vmcs->revision_id |= (1u << 31); >> /* init shadow vmcs */ >> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) >> skip_emulated_instruction(vcpu); >> nested_vmx_succeed(vcpu); >> return 1; >> + >> +out_shadow_vmcs: >> + kfree(vmx->nested.cached_vmcs12); >> + >> +out_cached_vmcs12: >> + free_page((unsigned long)vmx->nested.msr_bitmap); >> + >> +out_msr_bitmap: >> + return -ENOMEM; >> } >> >> /* >> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) >> vmx->nested.vmxon = false; >> free_vpid(vmx->nested.vpid02); >> nested_release_vmcs12(vmx); >> + if (vmx->nested.msr_bitmap) { >> + free_page((unsigned long)vmx->nested.msr_bitmap); >> + vmx->nested.msr_bitmap = NULL; >> + } >> if (enable_shadow_vmcs) >> free_vmcs(vmx->nested.current_shadow_vmcs); >> kfree(vmx->nested.cached_vmcs12); >> @@ -9472,8 +9477,10 @@ static inline bool >> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, >> { >> int msr; >> struct page *page; >> - unsigned long *msr_bitmap; >> + unsigned long *msr_bitmap_l1; >> + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; >> >> + /* This shortcut is ok because we support only x2APIC MSRs so far. >> */ >> if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) >> return false; >> >> @@ -9482,63 +9489,37 @@ static inline bool >> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, >> WARN_ON(1); >> return false; >> } >> - msr_bitmap = (unsigned long *)kmap(page); >> - if (!msr_bitmap) { >> + msr_bitmap_l1 = (unsigned long *)kmap(page); >> + if (!msr_bitmap_l1) { >> nested_release_page_clean(page); >> WARN_ON(1); >> return false; >> } >> >> + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); >> + >> if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { >> if (nested_cpu_has_apic_reg_virt(vmcs12)) >> for (msr = 0x800; msr <= 0x8ff; msr++) >> nested_vmx_disable_intercept_for_msr( >> - msr_bitmap, >> - vmx_msr_bitmap_nested, >> + msr_bitmap_l1, msr_bitmap_l0, >> msr, MSR_TYPE_R); >> - /* TPR is allowed */ >> - nested_vmx_disable_intercept_for_msr(msr_bitmap, >> - vmx_msr_bitmap_nested, >> + >> + nested_vmx_disable_intercept_for_msr( >> + msr_bitmap_l1, msr_bitmap_l0, >> APIC_BASE_MSR + (APIC_TASKPRI >> 4), >> MSR_TYPE_R | MSR_TYPE_W); >> + >> if (nested_cpu_has_vid(vmcs12)) { >> - /* EOI and self-IPI are allowed */ >> nested_vmx_disable_intercept_for_msr( >> - msr_bitmap, >> - vmx_msr_bitmap_nested, >> + msr_bitmap_l1, msr_bitmap_l0, >> APIC_BASE_MSR + (APIC_EOI >> 4), >> MSR_TYPE_W); >> nested_vmx_disable_intercept_for_msr( >> - msr_bitmap, >> - vmx_msr_bitmap_nested, >> + msr_bitmap_l1, msr_bitmap_l0, >> APIC_BASE_MSR + (APIC_SELF_IPI >> 4), >> MSR_TYPE_W); >> } >> - } else { >> - /* >> - * Enable reading intercept of all the x2apic >> - * MSRs. We should not rely on vmcs12 to do any >> - * optimizations here, it may have been modified >> - * by L1. >> - */ >> - for (msr = 0x800; msr <= 0x8ff; msr++) >> - __vmx_enable_intercept_for_msr( >> - vmx_msr_bitmap_nested, >> - msr, >> - MSR_TYPE_R); >> - >> - __vmx_enable_intercept_for_msr( >> - vmx_msr_bitmap_nested, >> - APIC_BASE_MSR + (APIC_TASKPRI >> 4), >> - MSR_TYPE_W); >> - __vmx_enable_intercept_for_msr( >> - vmx_msr_bitmap_nested, >> - APIC_BASE_MSR + (APIC_EOI >> 4), >> - MSR_TYPE_W); >> - __vmx_enable_intercept_for_msr( >> - vmx_msr_bitmap_nested, >> - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), >> - MSR_TYPE_W); >> } >> kunmap(page); >> nested_release_page_clean(page); >> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, >> struct vmcs12 *vmcs12) >> } >> >> if (cpu_has_vmx_msr_bitmap() && >> - exec_control & CPU_BASED_USE_MSR_BITMAPS) { >> - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); >> - /* MSR_BITMAP will be set by following vmx_set_efer. */ >> - } else >> + exec_control & CPU_BASED_USE_MSR_BITMAPS && >> + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) >> + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ >> + else >> exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; >> >> /* >> > > > -- > Yang > Alibaba Cloud Computing -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2016-08-09 17:32+0800, Yang Zhang: > On 2016/8/9 2:16, Radim Krčmář wrote: >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) >> return 1; >> } >> >> + if (cpu_has_vmx_msr_bitmap()) { >> + vmx->nested.msr_bitmap = >> + (unsigned long *)__get_free_page(GFP_KERNEL); >> + if (!vmx->nested.msr_bitmap) >> + goto out_msr_bitmap; >> + } >> + > > We export msr_bitmap to guest even it is not supported by hardware. So we > need to allocate msr_bitmap for L1 unconditionally. We do emulate the feature, but the vmx->nested.msr_bitmap is used only if VMX supports it to avoid some VM exits: >> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) >> } >> >> if (cpu_has_vmx_msr_bitmap() && >> - exec_control & CPU_BASED_USE_MSR_BITMAPS) { >> - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); >> - /* MSR_BITMAP will be set by following vmx_set_efer. */ >> - } else >> + exec_control & CPU_BASED_USE_MSR_BITMAPS && >> + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) >> + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ >> + else >> exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; The else branch is taken if !cpu_has_vmx_msr_bitmap() and disables msr bitmaps. Similar check for vmx_set_msr_bitmap(), so the NULL doesn't even get written to VMCS. KVM always uses L1's msr bitmaps when emulating the feature. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2016/8/9 18:19, Wincy Van wrote: > On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@gmail.com> wrote: >> On 2016/8/9 2:16, Radim Krčmář wrote: >>> >>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR >>> accesses. In some configurations of VMX controls, the guest can even >>> directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED >>> APIC ACCESSES. >>> >>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. >>> To do so, L1 would first trick KVM to disable all possible interceptions >>> by enabling APICv features and then would turn those features off; >>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would >>> not intercept previously enabled MSRs even though they were not safe >>> with the new configuration. >>> >>> Correctly re-enabling interceptions is not enough as a second bug would >>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all >>> VMCSs, so L1 could trigger a race to get the desired combination of msr >>> bitmap and VMX controls. >>> >>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe >>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would >>> have to intercept everything anyway. >>> >>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") >>> Reported-by: Jim Mattson <jmattson@google.com> >>> Suggested-by: Wincy Van <fanwenyi0529@gmail.com> >>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> >>> --- >>> arch/x86/kvm/vmx.c | 107 >>> ++++++++++++++++++++++------------------------------- >>> 1 file changed, 44 insertions(+), 63 deletions(-) >>> >>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>> index a45d8580f91e..c66ac2c70d22 100644 >>> --- a/arch/x86/kvm/vmx.c >>> +++ b/arch/x86/kvm/vmx.c >>> @@ -435,6 +435,8 @@ struct nested_vmx { >>> bool pi_pending; >>> u16 posted_intr_nv; >>> >>> + unsigned long *msr_bitmap; >>> + >>> struct hrtimer preemption_timer; >>> bool preemption_timer_expired; >>> >>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; >>> static unsigned long *vmx_msr_bitmap_longmode; >>> static unsigned long *vmx_msr_bitmap_legacy_x2apic; >>> static unsigned long *vmx_msr_bitmap_longmode_x2apic; >>> -static unsigned long *vmx_msr_bitmap_nested; >>> static unsigned long *vmx_vmread_bitmap; >>> static unsigned long *vmx_vmwrite_bitmap; >>> >>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu >>> *vcpu) >>> unsigned long *msr_bitmap; >>> >>> if (is_guest_mode(vcpu)) >>> - msr_bitmap = vmx_msr_bitmap_nested; >>> + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; >>> else if (cpu_has_secondary_exec_ctrls() && >>> (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & >>> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { >>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) >>> if (!vmx_msr_bitmap_longmode_x2apic) >>> goto out4; >>> >>> - if (nested) { >>> - vmx_msr_bitmap_nested = >>> - (unsigned long *)__get_free_page(GFP_KERNEL); >>> - if (!vmx_msr_bitmap_nested) >>> - goto out5; >>> - } >>> - >>> vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); >>> if (!vmx_vmread_bitmap) >>> goto out6; >>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) >>> >>> memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); >>> memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); >>> - if (nested) >>> - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); >>> >>> if (setup_vmcs_config(&vmcs_config) < 0) { >>> r = -EIO; >>> @@ -6529,9 +6521,6 @@ out8: >>> out7: >>> free_page((unsigned long)vmx_vmread_bitmap); >>> out6: >>> - if (nested) >>> - free_page((unsigned long)vmx_msr_bitmap_nested); >>> -out5: >>> free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); >>> out4: >>> free_page((unsigned long)vmx_msr_bitmap_longmode); >>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) >>> free_page((unsigned long)vmx_io_bitmap_a); >>> free_page((unsigned long)vmx_vmwrite_bitmap); >>> free_page((unsigned long)vmx_vmread_bitmap); >>> - if (nested) >>> - free_page((unsigned long)vmx_msr_bitmap_nested); >>> >>> free_kvm_area(); >>> } >>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) >>> return 1; >>> } >>> >>> + if (cpu_has_vmx_msr_bitmap()) { >>> + vmx->nested.msr_bitmap = >>> + (unsigned long >>> *)__get_free_page(GFP_KERNEL); >>> + if (!vmx->nested.msr_bitmap) >>> + goto out_msr_bitmap; >>> + } >>> + >> >> >> We export msr_bitmap to guest even it is not supported by hardware. So we >> need to allocate msr_bitmap for L1 unconditionally. >> >> > > Nice to see you, Yang :-) Niec to see you too. > > I think vmx->nested.msr_bitmap is filled by L0 and L1's settings, and > used by hardware. > L1's msr_bitmap is managed by itself, and L1 will write MSR_BITMAP field to > vmcs12->msr_bitmap, then L0 can get L1's settings to emulate MSR_BITMAP. > > Am I missed something? You are correct! > > Thanks, > Wincy > >>> vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); >>> if (!vmx->nested.cached_vmcs12) >>> - return -ENOMEM; >>> + goto out_cached_vmcs12; >>> >>> if (enable_shadow_vmcs) { >>> shadow_vmcs = alloc_vmcs(); >>> - if (!shadow_vmcs) { >>> - kfree(vmx->nested.cached_vmcs12); >>> - return -ENOMEM; >>> - } >>> + if (!shadow_vmcs) >>> + goto out_shadow_vmcs; >>> /* mark vmcs as shadow */ >>> shadow_vmcs->revision_id |= (1u << 31); >>> /* init shadow vmcs */ >>> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) >>> skip_emulated_instruction(vcpu); >>> nested_vmx_succeed(vcpu); >>> return 1; >>> + >>> +out_shadow_vmcs: >>> + kfree(vmx->nested.cached_vmcs12); >>> + >>> +out_cached_vmcs12: >>> + free_page((unsigned long)vmx->nested.msr_bitmap); >>> + >>> +out_msr_bitmap: >>> + return -ENOMEM; >>> } >>> >>> /* >>> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) >>> vmx->nested.vmxon = false; >>> free_vpid(vmx->nested.vpid02); >>> nested_release_vmcs12(vmx); >>> + if (vmx->nested.msr_bitmap) { >>> + free_page((unsigned long)vmx->nested.msr_bitmap); >>> + vmx->nested.msr_bitmap = NULL; >>> + } >>> if (enable_shadow_vmcs) >>> free_vmcs(vmx->nested.current_shadow_vmcs); >>> kfree(vmx->nested.cached_vmcs12); >>> @@ -9472,8 +9477,10 @@ static inline bool >>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, >>> { >>> int msr; >>> struct page *page; >>> - unsigned long *msr_bitmap; >>> + unsigned long *msr_bitmap_l1; >>> + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; >>> >>> + /* This shortcut is ok because we support only x2APIC MSRs so far. >>> */ >>> if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) >>> return false; >>> >>> @@ -9482,63 +9489,37 @@ static inline bool >>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, >>> WARN_ON(1); >>> return false; >>> } >>> - msr_bitmap = (unsigned long *)kmap(page); >>> - if (!msr_bitmap) { >>> + msr_bitmap_l1 = (unsigned long *)kmap(page); >>> + if (!msr_bitmap_l1) { >>> nested_release_page_clean(page); >>> WARN_ON(1); >>> return false; >>> } >>> >>> + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); >>> + >>> if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { >>> if (nested_cpu_has_apic_reg_virt(vmcs12)) >>> for (msr = 0x800; msr <= 0x8ff; msr++) >>> nested_vmx_disable_intercept_for_msr( >>> - msr_bitmap, >>> - vmx_msr_bitmap_nested, >>> + msr_bitmap_l1, msr_bitmap_l0, >>> msr, MSR_TYPE_R); >>> - /* TPR is allowed */ >>> - nested_vmx_disable_intercept_for_msr(msr_bitmap, >>> - vmx_msr_bitmap_nested, >>> + >>> + nested_vmx_disable_intercept_for_msr( >>> + msr_bitmap_l1, msr_bitmap_l0, >>> APIC_BASE_MSR + (APIC_TASKPRI >> 4), >>> MSR_TYPE_R | MSR_TYPE_W); >>> + >>> if (nested_cpu_has_vid(vmcs12)) { >>> - /* EOI and self-IPI are allowed */ >>> nested_vmx_disable_intercept_for_msr( >>> - msr_bitmap, >>> - vmx_msr_bitmap_nested, >>> + msr_bitmap_l1, msr_bitmap_l0, >>> APIC_BASE_MSR + (APIC_EOI >> 4), >>> MSR_TYPE_W); >>> nested_vmx_disable_intercept_for_msr( >>> - msr_bitmap, >>> - vmx_msr_bitmap_nested, >>> + msr_bitmap_l1, msr_bitmap_l0, >>> APIC_BASE_MSR + (APIC_SELF_IPI >> 4), >>> MSR_TYPE_W); >>> } >>> - } else { >>> - /* >>> - * Enable reading intercept of all the x2apic >>> - * MSRs. We should not rely on vmcs12 to do any >>> - * optimizations here, it may have been modified >>> - * by L1. >>> - */ >>> - for (msr = 0x800; msr <= 0x8ff; msr++) >>> - __vmx_enable_intercept_for_msr( >>> - vmx_msr_bitmap_nested, >>> - msr, >>> - MSR_TYPE_R); >>> - >>> - __vmx_enable_intercept_for_msr( >>> - vmx_msr_bitmap_nested, >>> - APIC_BASE_MSR + (APIC_TASKPRI >> 4), >>> - MSR_TYPE_W); >>> - __vmx_enable_intercept_for_msr( >>> - vmx_msr_bitmap_nested, >>> - APIC_BASE_MSR + (APIC_EOI >> 4), >>> - MSR_TYPE_W); >>> - __vmx_enable_intercept_for_msr( >>> - vmx_msr_bitmap_nested, >>> - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), >>> - MSR_TYPE_W); >>> } >>> kunmap(page); >>> nested_release_page_clean(page); >>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, >>> struct vmcs12 *vmcs12) >>> } >>> >>> if (cpu_has_vmx_msr_bitmap() && >>> - exec_control & CPU_BASED_USE_MSR_BITMAPS) { >>> - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); >>> - /* MSR_BITMAP will be set by following vmx_set_efer. */ >>> - } else >>> + exec_control & CPU_BASED_USE_MSR_BITMAPS && >>> + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) >>> + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ >>> + else >>> exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; >>> >>> /* >>> >> >> >> -- >> Yang >> Alibaba Cloud Computing
On 2016/8/9 20:23, Radim Krčmář wrote: > 2016-08-09 17:32+0800, Yang Zhang: >> On 2016/8/9 2:16, Radim Krčmář wrote: >>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) >>> return 1; >>> } >>> >>> + if (cpu_has_vmx_msr_bitmap()) { >>> + vmx->nested.msr_bitmap = >>> + (unsigned long *)__get_free_page(GFP_KERNEL); >>> + if (!vmx->nested.msr_bitmap) >>> + goto out_msr_bitmap; >>> + } >>> + >> >> We export msr_bitmap to guest even it is not supported by hardware. So we >> need to allocate msr_bitmap for L1 unconditionally. > > We do emulate the feature, but the vmx->nested.msr_bitmap is used only > if VMX supports it to avoid some VM exits: > >>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) >>> } >>> >>> if (cpu_has_vmx_msr_bitmap() && >>> - exec_control & CPU_BASED_USE_MSR_BITMAPS) { >>> - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); >>> - /* MSR_BITMAP will be set by following vmx_set_efer. */ >>> - } else >>> + exec_control & CPU_BASED_USE_MSR_BITMAPS && >>> + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) >>> + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ >>> + else >>> exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; > > The else branch is taken if !cpu_has_vmx_msr_bitmap() and disables msr > bitmaps. Similar check for vmx_set_msr_bitmap(), so the NULL doesn't > even get written to VMCS. > > KVM always uses L1's msr bitmaps when emulating the feature. > Yes, you are right. I forget the bitmap only used by hardware.:(
2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>: > msr bitmap can be used to avoid a VM exit (interception) on guest MSR > accesses. In some configurations of VMX controls, the guest can even > directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED > APIC ACCESSES. > > L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. > To do so, L1 would first trick KVM to disable all possible interceptions > by enabling APICv features and then would turn those features off; > nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would > not intercept previously enabled MSRs even though they were not safe > with the new configuration. > > Correctly re-enabling interceptions is not enough as a second bug would > still allow L1+L2 to access host's MSRs: msr bitmap was shared for all > VMCSs, so L1 could trigger a race to get the desired combination of msr > bitmap and VMX controls. > > This fix allocates a msr bitmap for every L1 VCPU, allows only safe > x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would > have to intercept everything anyway. > > Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") > Reported-by: Jim Mattson <jmattson@google.com> > Suggested-by: Wincy Van <fanwenyi0529@gmail.com> > Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com> > --- > arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++------------------------------- > 1 file changed, 44 insertions(+), 63 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index a45d8580f91e..c66ac2c70d22 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -435,6 +435,8 @@ struct nested_vmx { > bool pi_pending; > u16 posted_intr_nv; > > + unsigned long *msr_bitmap; > + > struct hrtimer preemption_timer; > bool preemption_timer_expired; > > @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; > static unsigned long *vmx_msr_bitmap_longmode; > static unsigned long *vmx_msr_bitmap_legacy_x2apic; > static unsigned long *vmx_msr_bitmap_longmode_x2apic; > -static unsigned long *vmx_msr_bitmap_nested; > static unsigned long *vmx_vmread_bitmap; > static unsigned long *vmx_vmwrite_bitmap; > > @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) > unsigned long *msr_bitmap; > > if (is_guest_mode(vcpu)) > - msr_bitmap = vmx_msr_bitmap_nested; > + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; > else if (cpu_has_secondary_exec_ctrls() && > (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & > SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { > @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) > if (!vmx_msr_bitmap_longmode_x2apic) > goto out4; > > - if (nested) { > - vmx_msr_bitmap_nested = > - (unsigned long *)__get_free_page(GFP_KERNEL); > - if (!vmx_msr_bitmap_nested) > - goto out5; > - } > - > vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); > if (!vmx_vmread_bitmap) > goto out6; > @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) > > memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); > memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); > - if (nested) > - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); > > if (setup_vmcs_config(&vmcs_config) < 0) { > r = -EIO; > @@ -6529,9 +6521,6 @@ out8: > out7: > free_page((unsigned long)vmx_vmread_bitmap); > out6: > - if (nested) > - free_page((unsigned long)vmx_msr_bitmap_nested); > -out5: > free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); > out4: > free_page((unsigned long)vmx_msr_bitmap_longmode); > @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) > free_page((unsigned long)vmx_io_bitmap_a); > free_page((unsigned long)vmx_vmwrite_bitmap); > free_page((unsigned long)vmx_vmread_bitmap); > - if (nested) > - free_page((unsigned long)vmx_msr_bitmap_nested); > > free_kvm_area(); > } > @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) > return 1; > } > > + if (cpu_has_vmx_msr_bitmap()) { > + vmx->nested.msr_bitmap = > + (unsigned long *)__get_free_page(GFP_KERNEL); > + if (!vmx->nested.msr_bitmap) > + goto out_msr_bitmap; > + } > + > vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); > if (!vmx->nested.cached_vmcs12) > - return -ENOMEM; > + goto out_cached_vmcs12; > > if (enable_shadow_vmcs) { > shadow_vmcs = alloc_vmcs(); > - if (!shadow_vmcs) { > - kfree(vmx->nested.cached_vmcs12); > - return -ENOMEM; > - } > + if (!shadow_vmcs) > + goto out_shadow_vmcs; > /* mark vmcs as shadow */ > shadow_vmcs->revision_id |= (1u << 31); > /* init shadow vmcs */ > @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) > skip_emulated_instruction(vcpu); > nested_vmx_succeed(vcpu); > return 1; > + > +out_shadow_vmcs: > + kfree(vmx->nested.cached_vmcs12); > + > +out_cached_vmcs12: > + free_page((unsigned long)vmx->nested.msr_bitmap); > + > +out_msr_bitmap: > + return -ENOMEM; > } > > /* > @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) > vmx->nested.vmxon = false; > free_vpid(vmx->nested.vpid02); > nested_release_vmcs12(vmx); > + if (vmx->nested.msr_bitmap) { > + free_page((unsigned long)vmx->nested.msr_bitmap); > + vmx->nested.msr_bitmap = NULL; > + } > if (enable_shadow_vmcs) > free_vmcs(vmx->nested.current_shadow_vmcs); > kfree(vmx->nested.cached_vmcs12); > @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, > { > int msr; > struct page *page; > - unsigned long *msr_bitmap; > + unsigned long *msr_bitmap_l1; > + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; > > + /* This shortcut is ok because we support only x2APIC MSRs so far. */ > if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) > return false; > > @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, > WARN_ON(1); > return false; > } > - msr_bitmap = (unsigned long *)kmap(page); > - if (!msr_bitmap) { > + msr_bitmap_l1 = (unsigned long *)kmap(page); > + if (!msr_bitmap_l1) { > nested_release_page_clean(page); > WARN_ON(1); > return false; > } > > + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); > + > if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { > if (nested_cpu_has_apic_reg_virt(vmcs12)) > for (msr = 0x800; msr <= 0x8ff; msr++) > nested_vmx_disable_intercept_for_msr( > - msr_bitmap, > - vmx_msr_bitmap_nested, > + msr_bitmap_l1, msr_bitmap_l0, > msr, MSR_TYPE_R); > - /* TPR is allowed */ > - nested_vmx_disable_intercept_for_msr(msr_bitmap, > - vmx_msr_bitmap_nested, > + > + nested_vmx_disable_intercept_for_msr( > + msr_bitmap_l1, msr_bitmap_l0, > APIC_BASE_MSR + (APIC_TASKPRI >> 4), > MSR_TYPE_R | MSR_TYPE_W); > + > if (nested_cpu_has_vid(vmcs12)) { > - /* EOI and self-IPI are allowed */ > nested_vmx_disable_intercept_for_msr( > - msr_bitmap, > - vmx_msr_bitmap_nested, > + msr_bitmap_l1, msr_bitmap_l0, > APIC_BASE_MSR + (APIC_EOI >> 4), > MSR_TYPE_W); > nested_vmx_disable_intercept_for_msr( > - msr_bitmap, > - vmx_msr_bitmap_nested, > + msr_bitmap_l1, msr_bitmap_l0, > APIC_BASE_MSR + (APIC_SELF_IPI >> 4), > MSR_TYPE_W); > } > - } else { > - /* > - * Enable reading intercept of all the x2apic > - * MSRs. We should not rely on vmcs12 to do any > - * optimizations here, it may have been modified > - * by L1. > - */ > - for (msr = 0x800; msr <= 0x8ff; msr++) > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - msr, > - MSR_TYPE_R); > - > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - APIC_BASE_MSR + (APIC_TASKPRI >> 4), > - MSR_TYPE_W); > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - APIC_BASE_MSR + (APIC_EOI >> 4), > - MSR_TYPE_W); > - __vmx_enable_intercept_for_msr( > - vmx_msr_bitmap_nested, > - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), > - MSR_TYPE_W); > } > kunmap(page); > nested_release_page_clean(page); > @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) > } > > if (cpu_has_vmx_msr_bitmap() && > - exec_control & CPU_BASED_USE_MSR_BITMAPS) { > - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); > - /* MSR_BITMAP will be set by following vmx_set_efer. */ > - } else > + exec_control & CPU_BASED_USE_MSR_BITMAPS && > + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) > + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ > + else > exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; > > /* > -- > 2.9.2 >
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a45d8580f91e..c66ac2c70d22 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -435,6 +435,8 @@ struct nested_vmx { bool pi_pending; u16 posted_intr_nv; + unsigned long *msr_bitmap; + struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) unsigned long *msr_bitmap; if (is_guest_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_nested; + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_longmode_x2apic) goto out4; - if (nested) { - vmx_msr_bitmap_nested = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_nested) - goto out5; - } - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out6; @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (nested) - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; @@ -6529,9 +6521,6 @@ out8: out7: free_page((unsigned long)vmx_vmread_bitmap); out6: - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); -out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) - return -ENOMEM; + goto out_cached_vmcs12; if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) { - kfree(vmx->nested.cached_vmcs12); - return -ENOMEM; - } + if (!shadow_vmcs) + goto out_shadow_vmcs; /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; } /* @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); + if (vmx->nested.msr_bitmap) { + free_page((unsigned long)vmx->nested.msr_bitmap); + vmx->nested.msr_bitmap = NULL; + } if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); kfree(vmx->nested.cached_vmcs12); @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, { int msr; struct page *page; - unsigned long *msr_bitmap; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, WARN_ON(1); return false; } - msr_bitmap = (unsigned long *)kmap(page); - if (!msr_bitmap) { + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (!msr_bitmap_l1) { nested_release_page_clean(page); WARN_ON(1); return false; } + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) for (msr = 0x800; msr <= 0x8ff; msr++) nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, msr, MSR_TYPE_R); - /* TPR is allowed */ - nested_vmx_disable_intercept_for_msr(msr_bitmap, - vmx_msr_bitmap_nested, + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { - /* EOI and self-IPI are allowed */ nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_EOI >> 4), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_SELF_IPI >> 4), MSR_TYPE_W); } - } else { - /* - * Enable reading intercept of all the x2apic - * MSRs. We should not rely on vmcs12 to do any - * optimizations here, it may have been modified - * by L1. - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - msr, - MSR_TYPE_R); - - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS) { - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); - /* MSR_BITMAP will be set by following vmx_set_efer. */ - } else + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ + else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; /*
msr bitmap can be used to avoid a VM exit (interception) on guest MSR accesses. In some configurations of VMX controls, the guest can even directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED APIC ACCESSES. L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. To do so, L1 would first trick KVM to disable all possible interceptions by enabling APICv features and then would turn those features off; nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would not intercept previously enabled MSRs even though they were not safe with the new configuration. Correctly re-enabling interceptions is not enough as a second bug would still allow L1+L2 to access host's MSRs: msr bitmap was shared for all VMCSs, so L1 could trigger a race to get the desired combination of msr bitmap and VMX controls. This fix allocates a msr bitmap for every L1 VCPU, allows only safe x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would have to intercept everything anyway. Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") Reported-by: Jim Mattson <jmattson@google.com> Suggested-by: Wincy Van <fanwenyi0529@gmail.com> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> --- arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 63 deletions(-)