Message ID | e28c13c4-4a04-8a41-2a5c-16509e1bad0e@amazon.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
> >From 9c19a8ac3f021efba6f70ad7e28f7ad06bb97e43 Mon Sep 17 00:00:00 2001 > From: KarimAllah Ahmed <karahmed@amazon.de> > Date: Mon, 29 Jan 2018 19:58:10 +0000 > Subject: [PATCH] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL > > [ Based on a patch from Ashok Raj <ashok.raj@intel.com> ] > > Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for > guests that will only mitigate Spectre V2 through IBRS+IBPB and will not > be using a retpoline+IBPB based approach. > > To avoid the overhead of atomically saving and restoring the > MSR_IA32_SPEC_CTRL for guests that do not actually use the MSR, only > add_atomic_switch_msr when a non-zero is written to it. ^^^^^^^^^^^^^^^^^^^^^ That part of the comment does not seem to be in sync with the code. > > No attempt is made to handle STIBP here, intentionally. Filtering STIBP > may be added in a future patch, which may require trapping all writes > if we don't want to pass it through directly to the guest. > > [dwmw2: Clean up CPUID bits, save/restore manually, handle reset] > > Cc: Asit Mallick <asit.k.mallick@intel.com> > Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> > Cc: Dave Hansen <dave.hansen@intel.com> > Cc: Andi Kleen <ak@linux.intel.com> > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Linus Torvalds <torvalds@linux-foundation.org> > Cc: Tim Chen <tim.c.chen@linux.intel.com> > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Dan Williams <dan.j.williams@intel.com> > Cc: Jun Nakajima <jun.nakajima@intel.com> > Cc: Paolo Bonzini <pbonzini@redhat.com> > Cc: David Woodhouse <dwmw@amazon.co.uk> > Cc: Greg KH <gregkh@linuxfoundation.org> > Cc: Andy Lutomirski <luto@kernel.org> > Cc: Ashok Raj <ashok.raj@intel.com> > Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> > --- > v6: > - got rid of save_spec_ctrl_on_exit > - introduce spec_ctrl_intercepted > - introduce spec_ctrl_used > v5: > - Also check for X86_FEATURE_SPEC_CTRL for the msr 
reads/writes > v4: > - Add IBRS to kvm_cpuid_8000_0008_ebx_x86_features > - Handling nested guests > v3: > - Save/restore manually > - Fix CPUID handling > - Fix a copy & paste error in the name of SPEC_CTRL MSR in > disable_intercept. > - support !cpu_has_vmx_msr_bitmap() > v2: > - remove 'host_spec_ctrl' in favor of only a comment (dwmw@). > - special case writing '0' in SPEC_CTRL to avoid confusing live-migration > when the instance never used the MSR (dwmw@). > - depend on X86_FEATURE_IBRS instead of X86_FEATURE_SPEC_CTRL (dwmw@). > - add MSR_IA32_SPEC_CTRL to the list of MSRs to save (dropped it by accident). > --- > arch/x86/kvm/cpuid.c | 9 +++-- > arch/x86/kvm/vmx.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++- > arch/x86/kvm/x86.c | 2 +- > 3 files changed, 100 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index 1909635..13f5d42 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -367,7 +367,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > > /* cpuid 0x80000008.ebx */ > const u32 kvm_cpuid_8000_0008_ebx_x86_features = > - F(IBPB); > + F(IBPB) | F(IBRS); > > /* cpuid 0xC0000001.edx */ > const u32 kvm_cpuid_C000_0001_edx_x86_features = > @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > > /* cpuid 7.0.edx*/ > const u32 kvm_cpuid_7_0_edx_x86_features = > - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES); > + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | > + F(ARCH_CAPABILITIES); > > /* all calls to cpuid_count() should be made on the same cpu */ > get_cpu(); > @@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > g_phys_as = phys_as; > entry->eax = g_phys_as | (virt_as << 8); > entry->edx = 0; > - /* IBPB isn't necessarily present in hardware cpuid */ > + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ > if 
(boot_cpu_has(X86_FEATURE_IBPB)) > entry->ebx |= F(IBPB); > + if (boot_cpu_has(X86_FEATURE_IBRS)) > + entry->ebx |= F(IBRS); > entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; > cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); > break; > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 6a9f4ec..bfc80ff 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -594,6 +594,14 @@ struct vcpu_vmx { > #endif > > u64 arch_capabilities; > + u64 spec_ctrl; > + > + /* > + * This indicates that: > + * 1) guest_cpuid_has(X86_FEATURE_IBRS) == true && > + * 2) The guest has actually initiated a write against the MSR. > + */ > + bool spec_ctrl_used; > > /* > * This indicates that: > @@ -946,6 +954,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); > static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, > u16 error_code); > static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); > +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, > + u32 msr, int type); > > static DEFINE_PER_CPU(struct vmcs *, vmxarea); > static DEFINE_PER_CPU(struct vmcs *, current_vmcs); > @@ -1917,6 +1927,22 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) > vmcs_write32(EXCEPTION_BITMAP, eb); > } > > +/* Is SPEC_CTRL intercepted for the currently running vCPU? */ > +static bool spec_ctrl_intercepted(struct kvm_vcpu *vcpu) > +{ > + unsigned long *msr_bitmap; > + int f = sizeof(unsigned long); > + > + if (!cpu_has_vmx_msr_bitmap()) > + return true; > + > + msr_bitmap = is_guest_mode(vcpu) ? 
> + to_vmx(vcpu)->nested.vmcs02.msr_bitmap : > + to_vmx(vcpu)->vmcs01.msr_bitmap; > + > + return !!test_bit(MSR_IA32_SPEC_CTRL, msr_bitmap + 0x800 / f); > +} > + > static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, > unsigned long entry, unsigned long exit) > { > @@ -3246,6 +3272,14 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > case MSR_IA32_TSC: > msr_info->data = guest_read_tsc(vcpu); > break; > + case MSR_IA32_SPEC_CTRL: > + if (!msr_info->host_initiated && > + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) > + return 1; > + > + msr_info->data = to_vmx(vcpu)->spec_ctrl; > + break; > case MSR_IA32_ARCH_CAPABILITIES: > if (!msr_info->host_initiated && > !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) > @@ -3359,6 +3393,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > case MSR_IA32_TSC: > kvm_write_tsc(vcpu, msr_info); > break; > + case MSR_IA32_SPEC_CTRL: > + if (!msr_info->host_initiated && > + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) > + return 1; > + > + vmx->spec_ctrl_used = true; > + > + /* The STIBP bit doesn't fault even if it's not advertised */ > + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) > + return 1; > + > + vmx->spec_ctrl = data; > + > + /* > + * When it's written (to non-zero) for the first time, pass > + * it through. This means we don't have to take the perf .. But only if it is a nested guest (as you have && is_guest_mode). Do you want to update the comment a bit? > + * hit of saving it on vmexit for the common case of guests > + * that don't use it. 
> + */ > + if (cpu_has_vmx_msr_bitmap() && data && > + spec_ctrl_intercepted(vcpu) && > + is_guest_mode(vcpu)) ^^^^^^^^^^^^^^^^^^ <=== here > + vmx_disable_intercept_for_msr( > + vmx->vmcs01.msr_bitmap, > + MSR_IA32_SPEC_CTRL, > + MSR_TYPE_RW); > + break; > case MSR_IA32_PRED_CMD: > if (!msr_info->host_initiated && > !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
On Wed, 2018-01-31 at 23:26 -0500, Konrad Rzeszutek Wilk wrote: > > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > > index 6a9f4ec..bfc80ff 100644 > > --- a/arch/x86/kvm/vmx.c > > +++ b/arch/x86/kvm/vmx.c > > @@ -594,6 +594,14 @@ struct vcpu_vmx { > > #endif > > > > u64 arch_capabilities; > > + u64 spec_ctrl; > > + > > + /* > > + * This indicates that: > > + * 1) guest_cpuid_has(X86_FEATURE_IBRS) == true && > > + * 2) The guest has actually initiated a write against the MSR. > > + */ > > + bool spec_ctrl_used; > > > > /* > > * This indicates that: Thanks for persisting with the details here, Karim. In addition to Konrad's heckling at the comments, I'll add my own request to his... I'd like the comment for spec_ctrl_used to explain why it isn't entirely redundant with the spec_ctrl_intercepted() function. Without nesting, I believe it *would* be redundant, but the difference comes when an L2 is running for which L1 has not permitted the MSR to be passed through. That's when we have spec_ctrl_used = true but the MSR *isn't* actually passed through in the active msr_bitmap. Question: if spec_ctrl_used is always equivalent to the intercept bit in the vmcs01.msr_bitmap, just not the guest bitmap... should we ditch it and always use the bit from the vmcs01.msr_bitmap? Sorry :)
.snip.. > > +/* Is SPEC_CTRL intercepted for the currently running vCPU? */ > > +static bool spec_ctrl_intercepted(struct kvm_vcpu *vcpu) > > +{ > > + unsigned long *msr_bitmap; > > + int f = sizeof(unsigned long); > > + > > + if (!cpu_has_vmx_msr_bitmap()) > > + return true; > > + > > + msr_bitmap = is_guest_mode(vcpu) ? > > + to_vmx(vcpu)->nested.vmcs02.msr_bitmap : > > + to_vmx(vcpu)->vmcs01.msr_bitmap; > > + > > + return !!test_bit(MSR_IA32_SPEC_CTRL, msr_bitmap + 0x800 / f); > > +} > > + ..snip.. > > @@ -3359,6 +3393,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > > case MSR_IA32_TSC: > > kvm_write_tsc(vcpu, msr_info); > > break; > > + case MSR_IA32_SPEC_CTRL: > > + if (!msr_info->host_initiated && > > + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && > > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) > > + return 1; > > + > > + vmx->spec_ctrl_used = true; > > + > > + /* The STIBP bit doesn't fault even if it's not advertised */ > > + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) > > + return 1; > > + > > + vmx->spec_ctrl = data; > > + > > + /* > > + * When it's written (to non-zero) for the first time, pass > > + * it through. This means we don't have to take the perf > > .. But only if it is a nested guest (as you have && is_guest_mode). > > Do you want to update the comment a bit? > > > + * hit of saving it on vmexit for the common case of guests > > + * that don't use it. > > + */ > > + if (cpu_has_vmx_msr_bitmap() && data && > > + spec_ctrl_intercepted(vcpu) && > > + is_guest_mode(vcpu)) > ^^^^^^^^^^^^^^^^^^ <=== here Would it be perhaps also good to mention the complexity of how we ought to be handling L1 and L2 guests in the commit? We are all stressed and I am sure some of us haven't gotten much sleep - but it can help in say three months when some unluckly new soul is trying to understand this and gets utterly confused. 
> > + vmx_disable_intercept_for_msr( > > + vmx->vmcs01.msr_bitmap, > > + MSR_IA32_SPEC_CTRL, > > + MSR_TYPE_RW); > > + break; > > case MSR_IA32_PRED_CMD: > > if (!msr_info->host_initiated && > > !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
On 02/01/2018 03:19 PM, Konrad Rzeszutek Wilk wrote: > .snip.. >>> +/* Is SPEC_CTRL intercepted for the currently running vCPU? */ >>> +static bool spec_ctrl_intercepted(struct kvm_vcpu *vcpu) >>> +{ >>> + unsigned long *msr_bitmap; >>> + int f = sizeof(unsigned long); >>> + >>> + if (!cpu_has_vmx_msr_bitmap()) >>> + return true; >>> + >>> + msr_bitmap = is_guest_mode(vcpu) ? >>> + to_vmx(vcpu)->nested.vmcs02.msr_bitmap : >>> + to_vmx(vcpu)->vmcs01.msr_bitmap; >>> + >>> + return !!test_bit(MSR_IA32_SPEC_CTRL, msr_bitmap + 0x800 / f); >>> +} >>> + > ..snip.. >>> @@ -3359,6 +3393,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) >>> case MSR_IA32_TSC: >>> kvm_write_tsc(vcpu, msr_info); >>> break; >>> + case MSR_IA32_SPEC_CTRL: >>> + if (!msr_info->host_initiated && >>> + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && >>> + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) >>> + return 1; >>> + >>> + vmx->spec_ctrl_used = true; >>> + >>> + /* The STIBP bit doesn't fault even if it's not advertised */ >>> + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) >>> + return 1; >>> + >>> + vmx->spec_ctrl = data; >>> + >>> + /* >>> + * When it's written (to non-zero) for the first time, pass >>> + * it through. This means we don't have to take the perf >> >> .. But only if it is a nested guest (as you have && is_guest_mode). >> >> Do you want to update the comment a bit? >> >>> + * hit of saving it on vmexit for the common case of guests >>> + * that don't use it. >>> + */ >>> + if (cpu_has_vmx_msr_bitmap() && data && >>> + spec_ctrl_intercepted(vcpu) && >>> + is_guest_mode(vcpu)) >> ^^^^^^^^^^^^^^^^^^ <=== here > > Would it be perhaps also good to mention the complexity of how > we ought to be handling L1 and L2 guests in the commit? > > We are all stressed and I am sure some of us haven't gotten much > sleep - but it can help in say three months when some unluckly new > soul is trying to understand this and gets utterly confused. 
Yup, I will go through the patches and add as much detail as possible. And yes, the is_guest_mode(vcpu) here is inverted :D I blame the late night :) > >>> + vmx_disable_intercept_for_msr( >>> + vmx->vmcs01.msr_bitmap, >>> + MSR_IA32_SPEC_CTRL, >>> + MSR_TYPE_RW); >>> + break; >>> case MSR_IA32_PRED_CMD: >>> if (!msr_info->host_initiated && >>> !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && > Amazon Development Center Germany GmbH Berlin - Dresden - Aachen main office: Krausenstr. 38, 10117 Berlin Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger Ust-ID: DE289237879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
On 02/01/2018 02:25 PM, David Woodhouse wrote: > > > On Wed, 2018-01-31 at 23:26 -0500, Konrad Rzeszutek Wilk wrote: >> >>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>> index 6a9f4ec..bfc80ff 100644 >>> --- a/arch/x86/kvm/vmx.c >>> +++ b/arch/x86/kvm/vmx.c >>> @@ -594,6 +594,14 @@ struct vcpu_vmx { >>> #endif >>> >>> u64 arch_capabilities; >>> + u64 spec_ctrl; >>> + >>> + /* >>> + * This indicates that: >>> + * 1) guest_cpuid_has(X86_FEATURE_IBRS) == true && >>> + * 2) The guest has actually initiated a write against the MSR. >>> + */ >>> + bool spec_ctrl_used; >>> >>> /* >>> * This indicates that: > > Thanks for persisting with the details here, Karim. In addition to > Konrad's heckling at the comments, I'll add my own request to his... > > I'd like the comment for spec_ctrl_used to explain why it isn't > entirely redundant with the spec_ctrl_intercepted() function. > > Without nesting, I believe it *would* be redundant, but the difference > comes when an L2 is running for which L1 has not permitted the MSR to > be passed through. That's when we have spec_ctrl_used = true but the > MSR *isn't* actually passed through in the active msr_bitmap. > > Question: if spec_ctrl_used is always equivalent to the intercept bit > in the vmcs01.msr_bitmap, just not the guest bitmap... should we ditch > it and always use the bit from the vmcs01.msr_bitmap? If I used the vmcs01.msr_bitmap, spec_ctrl_used will always be true if L0 passed it to L1. Even if L1 did not actually pass it to L2 and even if L2 has not written to it yet (!used). This pretty much renders the short-circuit at nested_vmx_merge_msr_bitmap useless: if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && !to_vmx(vcpu)->pred_cmd_used && !to_vmx(vcpu)->spec_ctrl_used) return false; ... and the default path will be kvm_vcpu_gpa_to_page + kmap. That being said, I have to admit the logic for spec_ctrl_used is not perfect either. If L1 or any of the L2s touched the MSR, spec_ctrl_used will be set to true. 
So if one L2 used the MSR, all other L2s will also skip the short- circuit mentioned above and end up *always* going through kvm_vcpu_gpa_to_page + kmap. Maybe all of this is over-thinking and in reality the short-circuit above is really useless and all L2 guests are happily using x2apic :) > > Sorry :) > Amazon Development Center Germany GmbH Berlin - Dresden - Aachen main office: Krausenstr. 38, 10117 Berlin Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger Ust-ID: DE289237879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
On 02/01/2018 06:37 PM, KarimAllah Ahmed wrote: > On 02/01/2018 02:25 PM, David Woodhouse wrote: >> >> >> On Wed, 2018-01-31 at 23:26 -0500, Konrad Rzeszutek Wilk wrote: >>> >>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>>> index 6a9f4ec..bfc80ff 100644 >>>> --- a/arch/x86/kvm/vmx.c >>>> +++ b/arch/x86/kvm/vmx.c >>>> @@ -594,6 +594,14 @@ struct vcpu_vmx { >>>> #endif >>>> u64 arch_capabilities; >>>> + u64 spec_ctrl; >>>> + >>>> + /* >>>> + * This indicates that: >>>> + * 1) guest_cpuid_has(X86_FEATURE_IBRS) == true && >>>> + * 2) The guest has actually initiated a write against the MSR. >>>> + */ >>>> + bool spec_ctrl_used; >>>> /* >>>> * This indicates that: >> >> Thanks for persisting with the details here, Karim. In addition to >> Konrad's heckling at the comments, I'll add my own request to his... >> >> I'd like the comment for spec_ctrl_used to explain why it isn't >> entirely redundant with the spec_ctrl_intercepted() function. >> >> Without nesting, I believe it *would* be redundant, but the difference >> comes when an L2 is running for which L1 has not permitted the MSR to >> be passed through. That's when we have spec_ctrl_used = true but the >> MSR *isn't* actually passed through in the active msr_bitmap. >> >> Question: if spec_ctrl_used is always equivalent to the intercept bit >> in the vmcs01.msr_bitmap, just not the guest bitmap... should we ditch >> it and always use the bit from the vmcs01.msr_bitmap? > > If I used the vmcs01.msr_bitmap, spec_ctrl_used will always be true if > L0 passed it to L1. Even if L1 did not actually pass it to L2 and even > if L2 has not written to it yet (!used). > > This pretty much renders the short-circuit at > nested_vmx_merge_msr_bitmap useless: > > if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && > !to_vmx(vcpu)->pred_cmd_used && > !to_vmx(vcpu)->spec_ctrl_used) > return false; > > ... and the default path will be kvm_vcpu_gpa_to_page + kmap. 
> > That being said, I have to admit the logic for spec_ctrl_used is not > perfect either. > > If L1 or any of the L2s touched the MSR, spec_ctrl_used will be set to > true. So if one L2 used the MSR, all other L2s will also skip the short- > circuit mentioned above and end up *always* going through > kvm_vcpu_gpa_to_page + kmap. > > Maybe all of this is over-thinking and in reality the short-circuit > above is really useless and all L2 guests are happily using x2apic :) > hehe .. >> if spec_ctrl_used is always equivalent to the intercept bit in the vmcs01.msr_bitmap actually yes, we can. I just forgot that we update the msr bitmap lazily! :) >> >> Sorry :) >> Amazon Development Center Germany GmbH Berlin - Dresden - Aachen main office: Krausenstr. 38, 10117 Berlin Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger Ust-ID: DE289237879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
From 9c19a8ac3f021efba6f70ad7e28f7ad06bb97e43 Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed <karahmed@amazon.de> Date: Mon, 29 Jan 2018 19:58:10 +0000 Subject: [PATCH] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL [ Based on a patch from Ashok Raj <ashok.raj@intel.com> ] Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a retpoline+IBPB based approach. To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for guests that do not actually use the MSR, only start saving and restoring it once a non-zero value is written to it. No attempt is made to handle STIBP here, intentionally. Filtering STIBP may be added in a future patch, which may require trapping all writes if we don't want to pass it through directly to the guest. [dwmw2: Clean up CPUID bits, save/restore manually, handle reset] Cc: Asit Mallick <asit.k.mallick@intel.com> Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Jun Nakajima <jun.nakajima@intel.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: David Woodhouse <dwmw@amazon.co.uk> Cc: Greg KH <gregkh@linuxfoundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ashok Raj <ashok.raj@intel.com> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> --- v6: - got rid of save_spec_ctrl_on_exit - introduce spec_ctrl_intercepted - introduce spec_ctrl_used v5: - Also check for X86_FEATURE_SPEC_CTRL for the msr reads/writes v4: - Add IBRS to kvm_cpuid_8000_0008_ebx_x86_features - Handling nested guests v3: - Save/restore manually - Fix CPUID handling - Fix a copy & paste error in the name of 
SPEC_CTRL MSR in disable_intercept. - support !cpu_has_vmx_msr_bitmap() v2: - remove 'host_spec_ctrl' in favor of only a comment (dwmw@). - special case writing '0' in SPEC_CTRL to avoid confusing live-migration when the instance never used the MSR (dwmw@). - depend on X86_FEATURE_IBRS instead of X86_FEATURE_SPEC_CTRL (dwmw@). - add MSR_IA32_SPEC_CTRL to the list of MSRs to save (dropped it by accident). --- arch/x86/kvm/cpuid.c | 9 +++-- arch/x86/kvm/vmx.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 2 +- 3 files changed, 100 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1909635..13f5d42 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -367,7 +367,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(IBPB); + F(IBPB) | F(IBRS); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES); + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(ARCH_CAPABILITIES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - /* IBPB isn't necessarily present in hardware cpuid */ + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ if (boot_cpu_has(X86_FEATURE_IBPB)) entry->ebx |= F(IBPB); + if (boot_cpu_has(X86_FEATURE_IBRS)) + entry->ebx |= F(IBRS); entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 
6a9f4ec..bfc80ff 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -594,6 +594,14 @@ struct vcpu_vmx { #endif u64 arch_capabilities; + u64 spec_ctrl; + + /* + * This indicates that: + * 1) guest_cpuid_has(X86_FEATURE_IBRS) == true && + * 2) The guest has actually initiated a write against the MSR. + */ + bool spec_ctrl_used; /* * This indicates that: @@ -946,6 +954,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code); static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1917,6 +1927,22 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +/* Is SPEC_CTRL intercepted for the currently running vCPU? */ +static bool spec_ctrl_intercepted(struct kvm_vcpu *vcpu) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = is_guest_mode(vcpu) ? 
+ to_vmx(vcpu)->nested.vmcs02.msr_bitmap : + to_vmx(vcpu)->vmcs01.msr_bitmap; + + return !!test_bit(MSR_IA32_SPEC_CTRL, msr_bitmap + 0x800 / f); +} + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { @@ -3246,6 +3272,14 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: msr_info->data = guest_read_tsc(vcpu); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; + break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) @@ -3359,6 +3393,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + vmx->spec_ctrl_used = true; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + return 1; + + vmx->spec_ctrl = data; + + /* + * When it's written (to non-zero) for the first time, pass + * it through. This means we don't have to take the perf + * hit of saving it on vmexit for the common case of guests + * that don't use it. 
+ */ + if (cpu_has_vmx_msr_bitmap() && data && + spec_ctrl_intercepted(vcpu) && + !is_guest_mode(vcpu)) + vmx_disable_intercept_for_msr( + vmx->vmcs01.msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && @@ -5678,6 +5740,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u64 cr0; vmx->rmode.vm86_active = 0; + vmx->spec_ctrl = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -9349,6 +9412,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_arm_hv_timer(vcpu); + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -9467,6 +9539,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 
+ */ + if (!spec_ctrl_intercepted(vcpu)) + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); @@ -10092,7 +10177,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !to_vmx(vcpu)->pred_cmd_used) + !to_vmx(vcpu)->pred_cmd_used && + !to_vmx(vcpu)->spec_ctrl_used) return false; page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); @@ -10126,6 +10212,12 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, } } + if (to_vmx(vcpu)->spec_ctrl_used) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + if (to_vmx(vcpu)->pred_cmd_used) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ec142e..ac38143 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1009,7 +1009,7 @@ static u32 msrs_to_save[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_ARCH_CAPABILITIES + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES }; static unsigned num_msrs_to_save; -- 2.7.4