Message ID | 20211208000359.2853257-14-yang.zhong@intel.com (mailing list archive)
---|---
State | New, archived
Series | AMX Support in KVM
On 12/8/2021 8:03 AM, Yang Zhong wrote:
> From: Jing Liu <jing2.liu@intel.com>
>
> Always intercepting IA32_XFD causes non-negligible overhead when this
> register is updated frequently in the guest.
>
> Disable WRMSR interception to IA32_XFD after fpstate reallocation is
> completed. There are three options for when to disable the
> interception:
>
>   1) When emulating the 1st WRMSR which requires reallocation,
>      disable interception before exiting to userspace, with the
>      assumption that the userspace VMM should not bounce back to
>      the kernel if reallocation fails. However, it's not good to
>      design the kernel based on application behavior. If, due to a
>      bug, the vCPU thread comes back to the kernel after reallocation
>      fails, XFD passthrough may lead to host memory corruption
>      when doing XSAVES for guest fpstate, which has a smaller size
>      than what the guest XFD value allows.
>
>   2) Disable interception when coming back from the userspace VMM
>      (for the 1st WRMSR which triggers reallocation). Re-check
>      whether the fpstate size can serve the new guest XFD value, and
>      disable interception only when the check succeeds. This requires
>      KVM to store the guest XFD value somewhere and then compare it
>      to guest_fpu::user_xfeatures in the completion handler.

For option 2), we are considering using fpstate->size to indicate
whether reallocation succeeded. Once one of the XFD features (today,
only AMX) is enabled, the kernel needs to reallocate the full size
anyway; otherwise KVM would have no later chance to reallocate for
other XFD features, since the MSR is no longer trapped (to avoid
WRMSR VM exits due to the guest toggling XFD). With that, KVM doesn't
need to store the guest XFD value anywhere, though the kernel fpu
core may need an API to tell KVM the guest's permitted size. (A rough
sketch of this idea follows below the quoted text.)

Thanks,
Jing

>   3) Disable interception at the 2nd WRMSR which enables dynamic
>      XSTATE features. If guest_fpu::user_xfeatures already includes
>      bits for the dynamic features set in the guest XFD value,
>      disable interception.
>
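To make the fpstate->size idea concrete, below is a minimal sketch of
what the option-2 completion handler could look like. Note that
kvm_complete_xfd_realloc() and fpu_guest_permitted_size() are
hypothetical names for illustration (the latter being the assumed new
fpu-core API mentioned above), not functions from this series:

/*
 * Hypothetical option-2 completion handler: decide whether to drop
 * WRMSR interception based on fpstate->size rather than a stashed
 * guest XFD value.
 */
static void kvm_complete_xfd_realloc(struct kvm_vcpu *vcpu)
{
	struct fpstate *fps = vcpu->arch.guest_fpu.fpstate;

	/*
	 * If the buffer already covers everything the guest is permitted
	 * to enable, XSAVES can never overrun it, so letting the guest
	 * toggle XFD without interception is safe.
	 */
	if (fps->size >= fpu_guest_permitted_size(&vcpu->arch.guest_fpu) &&
	    kvm_x86_ops.set_xfd_passthrough)
		static_call(kvm_x86_set_xfd_passthrough)(vcpu);
}

This keeps option 1)'s corruption concern addressed: interception is
only dropped once the size check proves XSAVES cannot write past the
reallocated buffer.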
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index cefe1d81e2e8..60c27f9990e9 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -30,6 +30,7 @@ KVM_X86_OP(update_exception_bitmap)
 KVM_X86_OP(get_msr)
 KVM_X86_OP(set_msr)
 KVM_X86_OP(get_segment_base)
+KVM_X86_OP_NULL(set_xfd_passthrough)
 KVM_X86_OP(get_segment)
 KVM_X86_OP(get_cpl)
 KVM_X86_OP(set_segment)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ac61f85e07b..7c97cc1fea89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -640,6 +640,7 @@ struct kvm_vcpu_arch {
 	u64 smi_count;
 	bool tpr_access_reporting;
 	bool xsaves_enabled;
+	bool xfd_out_of_sync;
 	u64 ia32_xss;
 	u64 microcode_version;
 	u64 arch_capabilities;
@@ -1328,6 +1329,7 @@ struct kvm_x86_ops {
 	void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+	void (*set_xfd_passthrough)(struct kvm_vcpu *vcpu);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
 	void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var,
 			    int seg);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 971d60980d5b..6198b13c4846 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -160,6 +160,7 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
 	MSR_FS_BASE,
 	MSR_GS_BASE,
 	MSR_KERNEL_GS_BASE,
+	MSR_IA32_XFD,
 #endif
 	MSR_IA32_SYSENTER_CS,
 	MSR_IA32_SYSENTER_ESP,
@@ -1924,6 +1925,14 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
 	return debugctl;
 }
 
+#ifdef CONFIG_X86_64
+static void vmx_set_xfd_passthrough(struct kvm_vcpu *vcpu)
+{
+	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
+	vcpu->arch.xfd_out_of_sync = true;
+}
+#endif
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -7657,6 +7666,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 #ifdef CONFIG_X86_64
 	.set_hv_timer = vmx_set_hv_timer,
 	.cancel_hv_timer = vmx_cancel_hv_timer,
+	.set_xfd_passthrough = vmx_set_xfd_passthrough,
 #endif
 
 	.setup_mce = vmx_setup_mce,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4df2ac24ffc1..bf9d3051cd6c 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -340,7 +340,7 @@ struct vcpu_vmx {
 	struct lbr_desc lbr_desc;
 
 	/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS	13
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS	14
 	struct {
 		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b195f4fa888f..d127b229dd29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -974,6 +974,10 @@ bool kvm_check_guest_realloc_fpstate(struct kvm_vcpu *vcpu, u64 xfd)
 			vcpu->arch.guest_fpu.realloc_request = request;
 			return true;
 		}
+
+		/* Disable WRMSR interception if possible */
+		if (kvm_x86_ops.set_xfd_passthrough)
+			static_call(kvm_x86_set_xfd_passthrough)(vcpu);
 	}
 
 	return false;
@@ -10002,6 +10006,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
+	if (vcpu->arch.xfd_out_of_sync)
+		xfd_sync_state();
+
 	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
 	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
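One note on the vcpu_enter_guest() hunk above: xfd_sync_state() is
introduced earlier in this series. It is needed because, once IA32_XFD
is passed through, only the hardware value is authoritative after
VM-exit. A rough sketch of what such a sync presumably does (the
per-CPU shadow name x86_xfd_state is an assumption for illustration,
not necessarily the series' actual variable):

static void xfd_sync_state(void)
{
	if (cpu_feature_enabled(X86_FEATURE_XFD)) {
		u64 xfd;

		/* Refresh the kernel's per-CPU XFD shadow from the MSR. */
		rdmsrl(MSR_IA32_XFD, xfd);
		__this_cpu_write(x86_xfd_state, xfd);
	}
}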