Message ID | 20220424101557.134102-9-lei4.wang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: PKS Virtualization support | expand |
Nit, use "KVM: nVMX:" for the shortlog scope. On Sun, Apr 24, 2022, Lei Wang wrote: > @@ -2433,6 +2437,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) > if (kvm_mpx_supported() && vmx->nested.nested_run_pending && > (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) > vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); > + > + if (vmx->nested.nested_run_pending && > + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)) > + vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs); As mentioned in the BNDCFGS thread, this does the wrong thing for SMM. But, after a lot of thought, handling this in nested_vmx_enter_non_root_mode() would be little more than a band-aid, and a messy one at that, because KVM's SMM emulation is horrifically broken with respect to nVMX. Entry does to SMM does not modify _any_ state that is not saved in SMRAM. That we're having to deal with this crap is a symptom of KVM doing the complete wrong thing by piggybacking nested_vmx_vmexit() and nested_vmx_enter_non_root_mode(). The SDM's description of CET spells this out very, very clearly: On processors that support CET shadow stacks, when the processor enters SMM, the processor saves the SSP register to the SMRAM state save area (see Table 31-3) and clears CR4.CET to 0. Thus, the initial execution environment of the SMI handler has CET disabled and all of the CET state of the interrupted program is still in the machine. An SMM that uses CET is required to save the interrupted program’s CET state and restore the CET state prior to exiting SMM. It mostly works because no guest SMM handler does anything with most of the MSRs, but it's all wildy wrong. A concrete example of a lurking bug is if vmcs12 uses the VM-Exit MSR load list, in which case the forced nested_vmx_vmexit() will load state that is never undone. So, my very strong vote is to ignore SMM and let someone who actually cares about SMM fix that mess properly by adding custom flows for exiting/re-entering L2 on SMI/RSM. > } > > if (nested_cpu_has_xsaves(vmcs12)) > @@ -2521,6 +2529,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, > if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || > !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) > vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); > + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && ERROR: trailing whitespace #85: FILE: arch/x86/kvm/vmx/nested.c:3407: +^Iif (kvm_cpu_cap_has(X86_FEATURE_PKS) && $ > + (!vmx->nested.nested_run_pending || > + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) > + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs); > + > vmx_set_rflags(vcpu, vmcs12->guest_rflags); > > /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the > @@ -2897,6 +2910,10 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, > vmcs12->host_ia32_perf_global_ctrl))) > return -EINVAL; > > + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) && > + CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs))) > + return -EINVAL; > + > #ifdef CONFIG_X86_64 > ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); > #else > @@ -3049,6 +3066,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, > if (nested_check_guest_non_reg_state(vmcs12)) > return -EINVAL; > > + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) && > + CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs))) > + return -EINVAL; > + > return 0; > } > > @@ -3384,6 +3405,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, > (!from_vmentry || > !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) > vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); > + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && > + (!from_vmentry || This should be "!vmx->nested.nested_run_pending" instead of "!from_vmentry" to avoid the unnecessary VMREAD when restoring L2 with a pending VM-Enter. > + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) > + vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS); > > /* > * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* ... > diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h > index 91723a226bf3..82f79ac46d7b 100644 > --- a/arch/x86/kvm/vmx/vmx.h > +++ b/arch/x86/kvm/vmx/vmx.h > @@ -222,6 +222,8 @@ struct nested_vmx { > u64 vmcs01_debugctl; > u64 vmcs01_guest_bndcfgs; > Please pack these together, i.e. don't have a blank line between the various vmcs01_* fields. > + u64 vmcs01_guest_pkrs; > + > /* to migrate it to L1 if L2 writes to L1's CR8 directly */ > int l1_tpr_threshold; > > -- > 2.25.1 >
On 5/20/2022 9:24 AM, Sean Christopherson wrote: > Nit, use "KVM: nVMX:" for the shortlog scope. Will change it. > On Sun, Apr 24, 2022, Lei Wang wrote: >> @@ -2433,6 +2437,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) >> if (kvm_mpx_supported() && vmx->nested.nested_run_pending && >> (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) >> vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); >> + >> + if (vmx->nested.nested_run_pending && >> + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)) >> + vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs); > As mentioned in the BNDCFGS thread, this does the wrong thing for SMM. But, after > a lot of thought, handling this in nested_vmx_enter_non_root_mode() would be little > more than a band-aid, and a messy one at that, because KVM's SMM emulation is > horrifically broken with respect to nVMX. > > Entry does to SMM does not modify _any_ state that is not saved in SMRAM. That > we're having to deal with this crap is a symptom of KVM doing the complete wrong > thing by piggybacking nested_vmx_vmexit() and nested_vmx_enter_non_root_mode(). > > The SDM's description of CET spells this out very, very clearly: > > On processors that support CET shadow stacks, when the processor enters SMM, > the processor saves the SSP register to the SMRAM state save area (see Table 31-3) > and clears CR4.CET to 0. Thus, the initial execution environment of the SMI handler > has CET disabled and all of the CET state of the interrupted program is still in the > machine. An SMM that uses CET is required to save the interrupted program’s CET > state and restore the CET state prior to exiting SMM. > > It mostly works because no guest SMM handler does anything with most of the MSRs, > but it's all wildy wrong. A concrete example of a lurking bug is if vmcs12 uses > the VM-Exit MSR load list, in which case the forced nested_vmx_vmexit() will load > state that is never undone. > > So, my very strong vote is to ignore SMM and let someone who actually cares about > SMM fix that mess properly by adding custom flows for exiting/re-entering L2 on > SMI/RSM. OK, I will leave the mess alone. >> } >> >> if (nested_cpu_has_xsaves(vmcs12)) >> @@ -2521,6 +2529,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, >> if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || >> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) >> vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); >> + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && > ERROR: trailing whitespace > #85: FILE: arch/x86/kvm/vmx/nested.c:3407: > +^Iif (kvm_cpu_cap_has(X86_FEATURE_PKS) && $ Sorry for my carelessness, will remove the trailing whitespace. >> + (!vmx->nested.nested_run_pending || >> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) >> + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs); >> + >> vmx_set_rflags(vcpu, vmcs12->guest_rflags); >> >> /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the >> @@ -2897,6 +2910,10 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, >> vmcs12->host_ia32_perf_global_ctrl))) >> return -EINVAL; >> >> + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) && >> + CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs))) >> + return -EINVAL; >> + >> #ifdef CONFIG_X86_64 >> ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); >> #else >> @@ -3049,6 +3066,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, >> if (nested_check_guest_non_reg_state(vmcs12)) >> return -EINVAL; >> >> + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) && >> + CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs))) >> + return -EINVAL; >> + >> return 0; >> } >> >> @@ -3384,6 +3405,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, >> (!from_vmentry || >> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) >> vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); >> + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && >> + (!from_vmentry || > This should be "!vmx->nested.nested_run_pending" instead of "!from_vmentry" to > avoid the unnecessary VMREAD when restoring L2 with a pending VM-Enter. Will fix that. >> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) >> + vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS); >> >> /* >> * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* > ... > >> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h >> index 91723a226bf3..82f79ac46d7b 100644 >> --- a/arch/x86/kvm/vmx/vmx.h >> +++ b/arch/x86/kvm/vmx/vmx.h >> @@ -222,6 +222,8 @@ struct nested_vmx { >> u64 vmcs01_debugctl; >> u64 vmcs01_guest_bndcfgs; >> > Please pack these together, i.e. don't have a blank line between the various > vmcs01_* fields. OK, will check them and remove the blank lines.
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 58a1fa7defc9..dde359dacfcb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -252,6 +252,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, dest->ds_sel = src->ds_sel; dest->es_sel = src->es_sel; #endif + vmx_set_host_pkrs(dest, src->pkrs); } static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) @@ -685,6 +686,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PKRS, MSR_TYPE_RW); + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); vmx->nested.force_msr_bitmap_recalc = false; @@ -2433,6 +2437,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (kvm_mpx_supported() && vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)) + vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs); } if (nested_cpu_has_xsaves(vmcs12)) @@ -2521,6 +2529,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -2897,6 +2910,10 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, vmcs12->host_ia32_perf_global_ctrl))) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) && + CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs))) + return -EINVAL; + #ifdef CONFIG_X86_64 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); #else @@ -3049,6 +3066,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, if (nested_check_guest_non_reg_state(vmcs12)) return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) && + CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs))) + return -EINVAL; + return 0; } @@ -3384,6 +3405,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, (!from_vmentry || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && + (!from_vmentry || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) + vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* @@ -4029,6 +4054,7 @@ static bool is_vmcs12_ext_field(unsigned long field) case GUEST_IDTR_BASE: case GUEST_PENDING_DBG_EXCEPTIONS: case GUEST_BNDCFGS: + case GUEST_IA32_PKRS: return true; default: break; @@ -4080,6 +4106,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_IA32_PKRS) + vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } @@ -4317,6 +4345,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) + vmcs_write64(GUEST_IA32_PKRS, vmcs12->host_ia32_pkrs); + /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ seg = (struct kvm_segment) { @@ -6559,7 +6590,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_LOAD_IA32_PKRS; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | @@ -6579,7 +6611,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_PKRS; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 2251b60920f8..7aad1b2f1d81 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -62,9 +62,11 @@ const unsigned short vmcs12_field_offsets[] = { FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), FIELD64(GUEST_BNDCFGS, guest_bndcfgs), + FIELD64(GUEST_IA32_PKRS, guest_ia32_pkrs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), + FIELD64(HOST_IA32_PKRS, host_ia32_pkrs), FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), FIELD(EXCEPTION_BITMAP, exception_bitmap), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 746129ddd5ae..4f41be3c351c 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -185,6 +185,8 @@ struct __packed vmcs12 { u16 host_gs_selector; u16 host_tr_selector; u16 guest_pml_index; + u64 host_ia32_pkrs; + u64 guest_ia32_pkrs; }; /* @@ -359,6 +361,8 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(host_gs_selector, 992); CHECK_OFFSET(host_tr_selector, 994); CHECK_OFFSET(guest_pml_index, 996); + CHECK_OFFSET(host_ia32_pkrs, 998); + CHECK_OFFSET(guest_ia32_pkrs, 1006); } extern const unsigned short vmcs12_field_offsets[]; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cbcb0d7b47a4..a62dc65299d5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7294,6 +7294,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + cr4_fixed1_update(X86_CR4_PKS, ecx, feature_bit(PKS)); #undef cr4_fixed1_update } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 91723a226bf3..82f79ac46d7b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -222,6 +222,8 @@ struct nested_vmx { u64 vmcs01_debugctl; u64 vmcs01_guest_bndcfgs; + u64 vmcs01_guest_pkrs; + /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold;