Message ID | 1517271028-15916-5-git-send-email-karahmed@amazon.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Jan 29, 2018 at 4:10 PM, KarimAllah Ahmed <karahmed@amazon.de> wrote: > [ Based on a patch from Ashok Raj <ashok.raj@intel.com> ] > > Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for > guests that will only mitigate Spectre V2 through IBRS+IBPB and will not > be using a retpoline+IBPB based approach. > > To avoid the overhead of atomically saving and restoring the > MSR_IA32_SPEC_CTRL for guests that do not actually use the MSR, only > add_atomic_switch_msr when a non-zero is written to it. > > No attempt is made to handle STIBP here, intentionally. Filtering STIBP > may be added in a future patch, which may require trapping all writes > if we don't want to pass it through directly to the guest. > > [dwmw2: Clean up CPUID bits, save/restore manually, handle reset] > > Cc: Asit Mallick <asit.k.mallick@intel.com> > Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> > Cc: Dave Hansen <dave.hansen@intel.com> > Cc: Andi Kleen <ak@linux.intel.com> > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Linus Torvalds <torvalds@linux-foundation.org> > Cc: Tim Chen <tim.c.chen@linux.intel.com> > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Dan Williams <dan.j.williams@intel.com> > Cc: Jun Nakajima <jun.nakajima@intel.com> > Cc: Paolo Bonzini <pbonzini@redhat.com> > Cc: David Woodhouse <dwmw@amazon.co.uk> > Cc: Greg KH <gregkh@linuxfoundation.org> > Cc: Andy Lutomirski <luto@kernel.org> > Cc: Ashok Raj <ashok.raj@intel.com> > Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> > --- > v2: > - remove 'host_spec_ctrl' in favor of only a comment (dwmw@). > - special case writing '0' in SPEC_CTRL to avoid confusing live-migration > when the instance never used the MSR (dwmw@). > - depend on X86_FEATURE_IBRS instead of X86_FEATURE_SPEC_CTRL (dwmw@). > - add MSR_IA32_SPEC_CTRL to the list of MSRs to save (dropped it by accident). > v3: > - Save/restore manually > - Fix CPUID handling > - Fix a copy & paste error in the name of SPEC_CTRL MSR in > disable_intercept. > - support !cpu_has_vmx_msr_bitmap() > --- > arch/x86/kvm/cpuid.c | 7 +++++-- > arch/x86/kvm/vmx.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > arch/x86/kvm/x86.c | 2 +- > 3 files changed, 65 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index 1909635..662d0c0 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > > /* cpuid 7.0.edx*/ > const u32 kvm_cpuid_7_0_edx_x86_features = > - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES); > + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | > + F(ARCH_CAPABILITIES); > > /* all calls to cpuid_count() should be made on the same cpu */ > get_cpu(); > @@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > g_phys_as = phys_as; > entry->eax = g_phys_as | (virt_as << 8); > entry->edx = 0; > - /* IBPB isn't necessarily present in hardware cpuid */ > + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ > if (boot_cpu_has(X86_FEATURE_IBPB)) > entry->ebx |= F(IBPB); > + if (boot_cpu_has(X86_FEATURE_IBRS)) > + entry->ebx |= F(IBRS); > entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; > cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); > break; > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 798a00b..9ac9747 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -582,6 +582,8 @@ struct vcpu_vmx { > u64 msr_guest_kernel_gs_base; > #endif > u64 arch_capabilities; > + u64 spec_ctrl; > + bool save_spec_ctrl_on_exit; > > u32 vm_entry_controls_shadow; > u32 vm_exit_controls_shadow; > @@ -922,6 +924,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); > static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, > u16 error_code); > static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); > +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, > + u32 msr, int type); > > static DEFINE_PER_CPU(struct vmcs *, vmxarea); > static DEFINE_PER_CPU(struct vmcs *, current_vmcs); > @@ -3226,6 +3230,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > case MSR_IA32_TSC: > msr_info->data = guest_read_tsc(vcpu); > break; > + case MSR_IA32_SPEC_CTRL: > + if (!msr_info->host_initiated && > + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) > + return 1; > + > + msr_info->data = to_vmx(vcpu)->spec_ctrl; > + break; > case MSR_IA32_ARCH_CAPABILITIES: > if (!msr_info->host_initiated && > !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) > @@ -3339,6 +3350,31 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > case MSR_IA32_TSC: > kvm_write_tsc(vcpu, msr_info); > break; > + case MSR_IA32_SPEC_CTRL: > + if (!msr_info->host_initiated && > + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) > + return 1; > + > + /* The STIBP bit doesn't fault even if it's not advertised */ > + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) > + return 1; > + > + vmx->spec_ctrl = data; > + > + /* > + * When it's written (to non-zero) for the first time, pass > + * it through. This means we don't have to take the perf > + * hit of saving it on vmexit for the common case of guests > + * that don't use it. > + */ > + if (cpu_has_vmx_msr_bitmap() && data && > + !vmx->save_spec_ctrl_on_exit) { > + vmx->save_spec_ctrl_on_exit = true; > + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, > + MSR_IA32_SPEC_CTRL, > + MSR_TYPE_RW); > + } This code seems to assume that L1 is currently active. What if L2 is currently active? > + break; > case MSR_IA32_PRED_CMD: > if (!msr_info->host_initiated && > !guest_cpuid_has(vcpu, X86_FEATURE_IBPB)) > @@ -5644,6 +5680,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > u64 cr0; > > vmx->rmode.vm86_active = 0; > + vmx->spec_ctrl = 0; > > vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); > kvm_set_cr8(vcpu, 0); > @@ -9314,6 +9351,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) > > vmx_arm_hv_timer(vcpu); > > + /* > + * If this vCPU has touched SPEC_CTRL, restore the guest's value if > + * it's non-zero. Since vmentry is serialising on affected CPUs, there > + * is no need to worry about the conditional branch over the wrmsr > + * being speculatively taken. > + */ > + if (vmx->spec_ctrl) > + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); > + > vmx->__launched = vmx->loaded_vmcs->launched; > asm( > /* Store host registers */ > @@ -9420,6 +9466,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) > #endif > ); > > + /* > + * We do not use IBRS in the kernel. If this vCPU has used the > + * SPEC_CTRL MSR it may have left it on; save the value and > + * turn it off. This is much more efficient than blindly adding > + * it to the atomic save/restore list. Especially as the former > + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. > + */ > + if (vmx->save_spec_ctrl_on_exit) > + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); > + > + if (vmx->spec_ctrl) > + wrmsrl(MSR_IA32_SPEC_CTRL, 0); > + > /* Eliminate branch target predictions from guest mode */ > vmexit_fill_RSB(); > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 8e889dc..fc9724c 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1006,7 +1006,7 @@ static u32 msrs_to_save[] = { > #endif > MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, > MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, > - MSR_IA32_ARCH_CAPABILITIES > + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES > }; > > static unsigned num_msrs_to_save; > -- > 2.7.4 >
On 01/30/2018 06:49 PM, Jim Mattson wrote: > On Mon, Jan 29, 2018 at 4:10 PM, KarimAllah Ahmed <karahmed@amazon.de> wrote: >> [ Based on a patch from Ashok Raj <ashok.raj@intel.com> ] >> >> Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for >> guests that will only mitigate Spectre V2 through IBRS+IBPB and will not >> be using a retpoline+IBPB based approach. >> >> To avoid the overhead of atomically saving and restoring the >> MSR_IA32_SPEC_CTRL for guests that do not actually use the MSR, only >> add_atomic_switch_msr when a non-zero is written to it. >> >> No attempt is made to handle STIBP here, intentionally. Filtering STIBP >> may be added in a future patch, which may require trapping all writes >> if we don't want to pass it through directly to the guest. >> >> [dwmw2: Clean up CPUID bits, save/restore manually, handle reset] >> >> Cc: Asit Mallick <asit.k.mallick@intel.com> >> Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> >> Cc: Dave Hansen <dave.hansen@intel.com> >> Cc: Andi Kleen <ak@linux.intel.com> >> Cc: Andrea Arcangeli <aarcange@redhat.com> >> Cc: Linus Torvalds <torvalds@linux-foundation.org> >> Cc: Tim Chen <tim.c.chen@linux.intel.com> >> Cc: Thomas Gleixner <tglx@linutronix.de> >> Cc: Dan Williams <dan.j.williams@intel.com> >> Cc: Jun Nakajima <jun.nakajima@intel.com> >> Cc: Paolo Bonzini <pbonzini@redhat.com> >> Cc: David Woodhouse <dwmw@amazon.co.uk> >> Cc: Greg KH <gregkh@linuxfoundation.org> >> Cc: Andy Lutomirski <luto@kernel.org> >> Cc: Ashok Raj <ashok.raj@intel.com> >> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> >> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> >> --- >> v2: >> - remove 'host_spec_ctrl' in favor of only a comment (dwmw@). >> - special case writing '0' in SPEC_CTRL to avoid confusing live-migration >> when the instance never used the MSR (dwmw@). >> - depend on X86_FEATURE_IBRS instead of X86_FEATURE_SPEC_CTRL (dwmw@). >> - add MSR_IA32_SPEC_CTRL to the list of MSRs to save (dropped it by accident). >> v3: >> - Save/restore manually >> - Fix CPUID handling >> - Fix a copy & paste error in the name of SPEC_CTRL MSR in >> disable_intercept. >> - support !cpu_has_vmx_msr_bitmap() >> --- >> arch/x86/kvm/cpuid.c | 7 +++++-- >> arch/x86/kvm/vmx.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ >> arch/x86/kvm/x86.c | 2 +- >> 3 files changed, 65 insertions(+), 3 deletions(-) >> >> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c >> index 1909635..662d0c0 100644 >> --- a/arch/x86/kvm/cpuid.c >> +++ b/arch/x86/kvm/cpuid.c >> @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, >> >> /* cpuid 7.0.edx*/ >> const u32 kvm_cpuid_7_0_edx_x86_features = >> - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES); >> + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | >> + F(ARCH_CAPABILITIES); >> >> /* all calls to cpuid_count() should be made on the same cpu */ >> get_cpu(); >> @@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, >> g_phys_as = phys_as; >> entry->eax = g_phys_as | (virt_as << 8); >> entry->edx = 0; >> - /* IBPB isn't necessarily present in hardware cpuid */ >> + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ >> if (boot_cpu_has(X86_FEATURE_IBPB)) >> entry->ebx |= F(IBPB); >> + if (boot_cpu_has(X86_FEATURE_IBRS)) >> + entry->ebx |= F(IBRS); >> entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; >> cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); >> break; >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index 798a00b..9ac9747 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -582,6 +582,8 @@ struct vcpu_vmx { >> u64 msr_guest_kernel_gs_base; >> #endif >> u64 arch_capabilities; >> + u64 spec_ctrl; >> + bool save_spec_ctrl_on_exit; >> >> u32 vm_entry_controls_shadow; >> u32 vm_exit_controls_shadow; >> @@ -922,6 +924,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); >> static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, >> u16 error_code); >> static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); >> +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, >> + u32 msr, int type); >> >> static DEFINE_PER_CPU(struct vmcs *, vmxarea); >> static DEFINE_PER_CPU(struct vmcs *, current_vmcs); >> @@ -3226,6 +3230,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) >> case MSR_IA32_TSC: >> msr_info->data = guest_read_tsc(vcpu); >> break; >> + case MSR_IA32_SPEC_CTRL: >> + if (!msr_info->host_initiated && >> + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) >> + return 1; >> + >> + msr_info->data = to_vmx(vcpu)->spec_ctrl; >> + break; >> case MSR_IA32_ARCH_CAPABILITIES: >> if (!msr_info->host_initiated && >> !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) >> @@ -3339,6 +3350,31 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) >> case MSR_IA32_TSC: >> kvm_write_tsc(vcpu, msr_info); >> break; >> + case MSR_IA32_SPEC_CTRL: >> + if (!msr_info->host_initiated && >> + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) >> + return 1; >> + >> + /* The STIBP bit doesn't fault even if it's not advertised */ >> + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) >> + return 1; >> + >> + vmx->spec_ctrl = data; >> + >> + /* >> + * When it's written (to non-zero) for the first time, pass >> + * it through. This means we don't have to take the perf >> + * hit of saving it on vmexit for the common case of guests >> + * that don't use it. >> + */ >> + if (cpu_has_vmx_msr_bitmap() && data && >> + !vmx->save_spec_ctrl_on_exit) { >> + vmx->save_spec_ctrl_on_exit = true; >> + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, >> + MSR_IA32_SPEC_CTRL, >> + MSR_TYPE_RW); >> + } > > This code seems to assume that L1 is currently active. What if L2 is > currently active? Ooops! I did not think at all about nested :) This should be addressed now, I hope: http://git.infradead.org/linux-retpoline.git/commitdiff/f7f0cbba3e0cffcee050a8a5a9597a162d57e572 I have not tested it yet though. > >> + break; >> case MSR_IA32_PRED_CMD: >> if (!msr_info->host_initiated && >> !guest_cpuid_has(vcpu, X86_FEATURE_IBPB)) >> @@ -5644,6 +5680,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) >> u64 cr0; >> >> vmx->rmode.vm86_active = 0; >> + vmx->spec_ctrl = 0; >> >> vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); >> kvm_set_cr8(vcpu, 0); >> @@ -9314,6 +9351,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) >> >> vmx_arm_hv_timer(vcpu); >> >> + /* >> + * If this vCPU has touched SPEC_CTRL, restore the guest's value if >> + * it's non-zero. Since vmentry is serialising on affected CPUs, there >> + * is no need to worry about the conditional branch over the wrmsr >> + * being speculatively taken. >> + */ >> + if (vmx->spec_ctrl) >> + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); >> + >> vmx->__launched = vmx->loaded_vmcs->launched; >> asm( >> /* Store host registers */ >> @@ -9420,6 +9466,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) >> #endif >> ); >> >> + /* >> + * We do not use IBRS in the kernel. If this vCPU has used the >> + * SPEC_CTRL MSR it may have left it on; save the value and >> + * turn it off. This is much more efficient than blindly adding >> + * it to the atomic save/restore list. Especially as the former >> + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. >> + */ >> + if (vmx->save_spec_ctrl_on_exit) >> + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); >> + >> + if (vmx->spec_ctrl) >> + wrmsrl(MSR_IA32_SPEC_CTRL, 0); >> + >> /* Eliminate branch target predictions from guest mode */ >> vmexit_fill_RSB(); >> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index 8e889dc..fc9724c 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -1006,7 +1006,7 @@ static u32 msrs_to_save[] = { >> #endif >> MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, >> MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, >> - MSR_IA32_ARCH_CAPABILITIES >> + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES >> }; >> >> static unsigned num_msrs_to_save; >> -- >> 2.7.4 >> > Amazon Development Center Germany GmbH Berlin - Dresden - Aachen main office: Krausenstr. 38, 10117 Berlin Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger Ust-ID: DE289237879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
On Tue, Jan 30, 2018 at 1:00 PM, KarimAllah Ahmed <karahmed@amazon.com> wrote: > Ooops! I did not think at all about nested :) > > This should be addressed now, I hope: > > http://git.infradead.org/linux-retpoline.git/commitdiff/f7f0cbba3e0cffcee050a8a5a9597a162d57e572 + if (cpu_has_vmx_msr_bitmap() && data && + !vmx->save_spec_ctrl_on_exit) { + vmx->save_spec_ctrl_on_exit = true; + + msr_bitmap = is_guest_mode(vcpu) ? vmx->nested.vmcs02.msr_bitmap : + vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + } There are two ways to get to this point in vmx_set_msr while is_guest_mode(vcpu) is true: 1) L0 is processing vmcs12's VM-entry MSR load list on emulated VM-entry (see enter_vmx_non_root_mode). 2) L2 tried to execute WRMSR, writes to the MSR are intercepted in vmcs02's MSR permission bitmap, and writes to the MSR are not intercepted in vmcs12's MSR permission bitmap. In the first case, disabling the intercepts for the MSR in vmx->nested.vmcs02.msr_bitmap is incorrect, because we haven't yet determined that the intercepts are clear in vmcs12's MSR permission bitmap. In the second case, disabling *both* of the intercepts for the MSR in vmx->nested.vmcs02.msr_bitmap is incorrect, because we don't know that the read intercept is clear in vmcs12's MSR permission bitmap. Furthermore, disabling the write intercept for the MSR in vmx->nested.vmcs02.msr_bitmap is somewhat fruitless, because nested_vmx_merge_msr_bitmap is just going to undo that change on the next emulated VM-entry.
On 30/01/2018 17:49, Jim Mattson wrote: > On Tue, Jan 30, 2018 at 1:00 PM, KarimAllah Ahmed <karahmed@amazon.com> wrote: >> Ooops! I did not think at all about nested :) >> >> This should be addressed now, I hope: >> >> http://git.infradead.org/linux-retpoline.git/commitdiff/f7f0cbba3e0cffcee050a8a5a9597a162d57e572 > > + if (cpu_has_vmx_msr_bitmap() && data && > + !vmx->save_spec_ctrl_on_exit) { > + vmx->save_spec_ctrl_on_exit = true; > + > + msr_bitmap = is_guest_mode(vcpu) ? > vmx->nested.vmcs02.msr_bitmap : > + > vmx->vmcs01.msr_bitmap; > + vmx_disable_intercept_for_msr(msr_bitmap, > + MSR_IA32_SPEC_CTRL, > + MSR_TYPE_RW); > + } > > There are two ways to get to this point in vmx_set_msr while > is_guest_mode(vcpu) is true: > 1) L0 is processing vmcs12's VM-entry MSR load list on emulated > VM-entry (see enter_vmx_non_root_mode). > 2) L2 tried to execute WRMSR, writes to the MSR are intercepted in > vmcs02's MSR permission bitmap, and writes to the MSR are not > intercepted in vmcs12's MSR permission bitmap. > > In the first case, disabling the intercepts for the MSR in > vmx->nested.vmcs02.msr_bitmap is incorrect, because we haven't yet > determined that the intercepts are clear in vmcs12's MSR permission > bitmap. > In the second case, disabling *both* of the intercepts for the MSR in > vmx->nested.vmcs02.msr_bitmap is incorrect, because we don't know that > the read intercept is clear in vmcs12's MSR permission bitmap. > Furthermore, disabling the write intercept for the MSR in > vmx->nested.vmcs02.msr_bitmap is somewhat fruitless, because > nested_vmx_merge_msr_bitmap is just going to undo that change on the > next emulated VM-entry. > Let's keep the original code from David, touching the L0->L1 MSR bitmap unconditionally, and possibly add an "&& !is_guest_mode (vcpu)" to the condition. Paolo
On 01/30/2018 11:49 PM, Jim Mattson wrote: > On Tue, Jan 30, 2018 at 1:00 PM, KarimAllah Ahmed <karahmed@amazon.com> wrote: >> Ooops! I did not think at all about nested :) >> >> This should be addressed now, I hope: >> >> http://git.infradead.org/linux-retpoline.git/commitdiff/f7f0cbba3e0cffcee050a8a5a9597a162d57e572 > > + if (cpu_has_vmx_msr_bitmap() && data && > + !vmx->save_spec_ctrl_on_exit) { > + vmx->save_spec_ctrl_on_exit = true; > + > + msr_bitmap = is_guest_mode(vcpu) ? > vmx->nested.vmcs02.msr_bitmap : > + > vmx->vmcs01.msr_bitmap; > + vmx_disable_intercept_for_msr(msr_bitmap, > + MSR_IA32_SPEC_CTRL, > + MSR_TYPE_RW); > + } > > There are two ways to get to this point in vmx_set_msr while > is_guest_mode(vcpu) is true: > 1) L0 is processing vmcs12's VM-entry MSR load list on emulated > VM-entry (see enter_vmx_non_root_mode). > 2) L2 tried to execute WRMSR, writes to the MSR are intercepted in > vmcs02's MSR permission bitmap, and writes to the MSR are not > intercepted in vmcs12's MSR permission bitmap. > > In the first case, disabling the intercepts for the MSR in > vmx->nested.vmcs02.msr_bitmap is incorrect, because we haven't yet > determined that the intercepts are clear in vmcs12's MSR permission > bitmap. > In the second case, disabling *both* of the intercepts for the MSR in > vmx->nested.vmcs02.msr_bitmap is incorrect, because we don't know that > the read intercept is clear in vmcs12's MSR permission bitmap. > Furthermore, disabling the write intercept for the MSR in > vmx->nested.vmcs02.msr_bitmap is somewhat fruitless, because > nested_vmx_merge_msr_bitmap is just going to undo that change on the > next emulated VM-entry. Okay, I took a second look at the code (specially nested_vmx_merge_msr_bitmap). This means that I simply should not touch the MSR bitmap in set_msr in case of nested, I just need to properly update the l02 msr_bitmap in nested_vmx_merge_msr_bitmap. As in here: http://git.infradead.org/linux-retpoline.git/commitdiff/d90eedebdd16bb00741a2c93bc13c5e444c99c2b or am I still missing something? (sorry, did not actually look at the nested code before!) > Amazon Development Center Germany GmbH Berlin - Dresden - Aachen main office: Krausenstr. 38, 10117 Berlin Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger Ust-ID: DE289237879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
On Tue, Jan 30, 2018 at 3:50 PM, KarimAllah Ahmed <karahmed@amazon.com> wrote: > Okay, I took a second look at the code (specially > nested_vmx_merge_msr_bitmap). > > This means that I simply should not touch the MSR bitmap in set_msr in > case of nested, I just need to properly update the l02 msr_bitmap in > nested_vmx_merge_msr_bitmap. As in here: > > http://git.infradead.org/linux-retpoline.git/commitdiff/d90eedebdd16bb00741a2c93bc13c5e444c99c2b > > or am I still missing something? (sorry, did not actually look at the > nested code before!) + if (cpu_has_vmx_msr_bitmap() && data && + !vmx->save_spec_ctrl_on_exit) { + vmx->save_spec_ctrl_on_exit = true; + + if (is_guest_mode(vcpu)) + break; As Paolo suggested, the test for !is_guest_mode (vcpu) should just be folded into the condition above. If you aren't clearing a 'W' bit in the MSR permission bitmap, there's no need to set vmx->save_spec_ctrl_on_exit. + + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + } + break; ... + if (guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) { + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + } + However, here, you should set vmx->save_spec_ctrl_on_exit if nested_vmx_disable_intercept_for_msr clears the 'W' bit for MSR_IA32_SPEC_CTRL in msr_bitmap_l0. Perhaps this would be easier if nested_vmx_disable_intercept_for_msr returned something indicative of which bits it cleared (if any).
On 30/01/2018 18:50, KarimAllah Ahmed wrote: > On 01/30/2018 11:49 PM, Jim Mattson wrote: >> On Tue, Jan 30, 2018 at 1:00 PM, KarimAllah Ahmed >> <karahmed@amazon.com> wrote: >>> Ooops! I did not think at all about nested :) >>> >>> This should be addressed now, I hope: >>> >>> http://git.infradead.org/linux-retpoline.git/commitdiff/f7f0cbba3e0cffcee050a8a5a9597a162d57e572 >>> >> >> + if (cpu_has_vmx_msr_bitmap() && data && >> + !vmx->save_spec_ctrl_on_exit) { >> + vmx->save_spec_ctrl_on_exit = true; >> + >> + msr_bitmap = is_guest_mode(vcpu) ? >> vmx->nested.vmcs02.msr_bitmap : >> + >> vmx->vmcs01.msr_bitmap; >> + vmx_disable_intercept_for_msr(msr_bitmap, >> + MSR_IA32_SPEC_CTRL, >> + MSR_TYPE_RW); >> + } >> >> There are two ways to get to this point in vmx_set_msr while >> is_guest_mode(vcpu) is true: >> 1) L0 is processing vmcs12's VM-entry MSR load list on emulated >> VM-entry (see enter_vmx_non_root_mode). >> 2) L2 tried to execute WRMSR, writes to the MSR are intercepted in >> vmcs02's MSR permission bitmap, and writes to the MSR are not >> intercepted in vmcs12's MSR permission bitmap. >> >> In the first case, disabling the intercepts for the MSR in >> vmx->nested.vmcs02.msr_bitmap is incorrect, because we haven't yet >> determined that the intercepts are clear in vmcs12's MSR permission >> bitmap. >> In the second case, disabling *both* of the intercepts for the MSR in >> vmx->nested.vmcs02.msr_bitmap is incorrect, because we don't know that >> the read intercept is clear in vmcs12's MSR permission bitmap. >> Furthermore, disabling the write intercept for the MSR in >> vmx->nested.vmcs02.msr_bitmap is somewhat fruitless, because >> nested_vmx_merge_msr_bitmap is just going to undo that change on the >> next emulated VM-entry. > > Okay, I took a second look at the code (specially > nested_vmx_merge_msr_bitmap). > > This means that I simply should not touch the MSR bitmap in set_msr in > case of nested, I just need to properly update the l02 msr_bitmap in > nested_vmx_merge_msr_bitmap. As in here: > > http://git.infradead.org/linux-retpoline.git/commitdiff/d90eedebdd16bb00741a2c93bc13c5e444c99c2b > > > or am I still missing something? (sorry, did not actually look at the > nested code before!) The new code in nested_vmx_merge_msr_bitmap should be conditional on vmx->save_spec_ctrl_on_exit. Also, guest_cpuid_has is pretty slow (because of kvm_find_cpuid_entry); calling it once or twice on each and every nested vmexit is probably not a good idea. Apart from this, it looks good to me. Paolo
On Tue, Jan 30, 2018 at 4:19 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: > The new code in nested_vmx_merge_msr_bitmap should be conditional on > vmx->save_spec_ctrl_on_exit. But then if L1 doesn't use MSR_IA32_SPEC_CTRL itself and it uses the VM-entry MSR load list to set up L2's MSR_IA32_SPEC_CTRL, you will never set vmx->save_spec_ctrl_on_exit, and L2's accesses to the MSR will always be intercepted by L0.
On 01/31/2018 01:27 AM, Jim Mattson wrote: > On Tue, Jan 30, 2018 at 4:19 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: >> The new code in nested_vmx_merge_msr_bitmap should be conditional on >> vmx->save_spec_ctrl_on_exit. > > But then if L1 doesn't use MSR_IA32_SPEC_CTRL itself and it uses the > VM-entry MSR load list to set up L2's MSR_IA32_SPEC_CTRL, you will > never set vmx->save_spec_ctrl_on_exit, and L2's accesses to the MSR > will always be intercepted by L0. I can add another variable (actually two) to indicate if msr interception should be disabled or not for SPEC_CTRL and PRED_CMD in nested case. That would allow us to have a fast alternative to guest_cpuid_has in nested_vmx_merge_msr_bitmap and at the same time maintain the current semantics of save_spec_ctrl_on_exit (i.e we would still differentiate between set_msr that is called from the loading MSRs for the emulated vm-entry vs L2 actually writing to it). What do you think? Amazon Development Center Germany GmbH Berlin - Dresden - Aachen main office: Krausenstr. 38, 10117 Berlin Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger Ust-ID: DE289237879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
On 30/01/2018 19:27, Jim Mattson wrote: > On Tue, Jan 30, 2018 at 4:19 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: >> The new code in nested_vmx_merge_msr_bitmap should be conditional on >> vmx->save_spec_ctrl_on_exit. > > But then if L1 doesn't use MSR_IA32_SPEC_CTRL itself and it uses the > VM-entry MSR load list to set up L2's MSR_IA32_SPEC_CTRL, you will > never set vmx->save_spec_ctrl_on_exit, and L2's accesses to the MSR > will always be intercepted by L0. If you don't make it conditional, L0 will forget to read back at vmexit what value L2 has written to the MSR. The alternative is to set vmx->save_spec_ctrl_on_exit on all writes, including those coming from L2. That works for me. Paolo
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1909635..662d0c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES); + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(ARCH_CAPABILITIES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - /* IBPB isn't necessarily present in hardware cpuid */ + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ if (boot_cpu_has(X86_FEATURE_IBPB)) entry->ebx |= F(IBPB); + if (boot_cpu_has(X86_FEATURE_IBRS)) + entry->ebx |= F(IBRS); entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 798a00b..9ac9747 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -582,6 +582,8 @@ struct vcpu_vmx { u64 msr_guest_kernel_gs_base; #endif u64 arch_capabilities; + u64 spec_ctrl; + bool save_spec_ctrl_on_exit; u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; @@ -922,6 +924,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code); static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3226,6 +3230,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: msr_info->data = guest_read_tsc(vcpu); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; + break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) @@ -3339,6 +3350,31 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + return 1; + + vmx->spec_ctrl = data; + + /* + * When it's written (to non-zero) for the first time, pass + * it through. This means we don't have to take the perf + * hit of saving it on vmexit for the common case of guests + * that don't use it. + */ + if (cpu_has_vmx_msr_bitmap() && data && + !vmx->save_spec_ctrl_on_exit) { + vmx->save_spec_ctrl_on_exit = true; + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + } + break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_IBPB)) @@ -5644,6 +5680,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u64 cr0; vmx->rmode.vm86_active = 0; + vmx->spec_ctrl = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -9314,6 +9351,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_arm_hv_timer(vcpu); + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -9420,6 +9466,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. + */ + if (vmx->save_spec_ctrl_on_exit) + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e889dc..fc9724c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1006,7 +1006,7 @@ static u32 msrs_to_save[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_ARCH_CAPABILITIES + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES }; static unsigned num_msrs_to_save;