From patchwork Thu Dec 10 18:38:23 2009
X-Patchwork-Submitter: oritw@il.ibm.com
X-Patchwork-Id: 66321
From: oritw@il.ibm.com
To: avi@redhat.com
Cc: kvm@vger.kernel.org, oritw@il.ibm.com, benami@il.ibm.com,
    abelg@il.ibm.com, muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com
Subject: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
Date: Thu, 10 Dec 2009 20:38:23 +0200
Message-Id: <1260470309-7166-2-git-send-email-oritw@il.ibm.com>
In-Reply-To: <1260470309-7166-1-git-send-email-oritw@il.ibm.com>
References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com>
X-Mailing-List: kvm@vger.kernel.org

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3de0b37..3f63cdd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -121,9 +121,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9a0a2cf..2726a6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -92,6 +92,16 @@ struct shared_msr_entry {
 	u64 mask;
 };
 
+struct __attribute__ ((__packed__)) level_state {
+};
+
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+	/* Level 1 state for switching to level 2 and back */
+	struct level_state *l1_state;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu vcpu;
 	struct list_head local_vcpus_link;
@@ -136,6 +146,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+	/* Nested vmx */
+	struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -201,6 +214,7 @@ static struct kvm_vmx_segment_field {
 static u64 host_efer;
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+static int create_l1_state(struct kvm_vcpu *vcpu);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -961,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles msr read for nested virtualization
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+			      u64 *pdata)
+{
+	u64 vmx_msr = 0;
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		*pdata = 0;
+		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+		*pdata = (vmx_msr & 0x00ffffcfffffffff);
+		break;
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
+		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	{
+		u32 vmx_msr_high, vmx_msr_low;
+		u32 control = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING |
+			CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING |
+			CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_INVLPG_EXITING |
+			CPU_BASED_TPR_SHADOW |
+			CPU_BASED_USE_MSR_BITMAPS |
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+		*pdata = (CPU_BASED_HLT_EXITING & control) |
+#ifdef CONFIG_X86_64
+			(CPU_BASED_CR8_LOAD_EXITING & control) |
+			(CPU_BASED_CR8_STORE_EXITING & control) |
+#endif
+			(CPU_BASED_CR3_LOAD_EXITING & control) |
+			(CPU_BASED_CR3_STORE_EXITING & control) |
+			(CPU_BASED_USE_IO_BITMAPS & control) |
+			(CPU_BASED_MOV_DR_EXITING & control) |
+			(CPU_BASED_USE_TSC_OFFSETING & control) |
+			(CPU_BASED_INVLPG_EXITING & control);
+
+		if (cpu_has_secondary_exec_ctrls())
+			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		if (vm_need_tpr_shadow(vcpu->kvm))
+			*pdata |= CPU_BASED_TPR_SHADOW;
+		break;
+	}
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = 0;
+		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		*pdata = 0;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1004,6 +1107,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		break;
 	default:
 		vmx_load_host_state(to_vmx(vcpu));
+		if (nested &&
+		    !nested_vmx_get_msr(vcpu, msr_index, &data))
+			break;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1018,6 +1124,27 @@
 }
 
 /*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		if ((data & (FEATURE_CONTROL_LOCKED |
+			     FEATURE_CONTROL_VMXON_ENABLED))
+		    != (FEATURE_CONTROL_LOCKED |
+			FEATURE_CONTROL_VMXON_ENABLED))
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1067,6 +1194,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		}
 		/* Otherwise falls through to kvm_set_msr_common */
 	default:
+		if (nested &&
+		    !nested_vmx_set_msr(vcpu, msr_index, data))
+			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			vmx_load_host_state(vmx);
@@ -1163,6 +1293,31 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+static struct level_state *create_state(void)
+{
+	struct level_state *state = NULL;
+
+	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+	if (!state) {
+		printk(KERN_INFO "Error creating level state\n");
+		return NULL;
+	}
+	return state;
+}
+
+static int create_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state) {
+		vmx->nested.l1_state = create_state();
+		if (!vmx->nested.l1_state)
+			return -ENOMEM;
+	} else
+		return 0;
+
+	return 0;
+}
 
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
@@ -1333,6 +1488,18 @@ static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+static void free_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state)
+		return;
+
+	kfree(vmx->nested.l1_state);
+	vmx->nested.l1_state = NULL;
+}
+
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -3146,12 +3313,105 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Check to see if vcpu can execute vmx command
+ * Inject the corresponding exception
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!vmx->nested.vmxon) {
+		pr_debug("%s: vmx not on\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+	    (is_long_mode(vcpu) && !cs.l)) {
+		pr_debug("%s: invalid mode cs.l %d is_long mode %d\n",
+			 __func__, cs.l, is_long_mode(vcpu));
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
 
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx->nested.vmxon = 0;
+
+	free_l1_state(vcpu);
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested) {
+		pr_debug("%s: nested vmx not enabled\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+
+	if (is_long_mode(vcpu) && !cs.l) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		printk(KERN_INFO "%s no permission\n", __func__);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	if (create_l1_state(vcpu)) {
+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx->nested.vmxon = 1;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3442,8 +3702,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
@@ -3823,6 +4083,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	free_l1_state(vcpu);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd15d7a..b698952 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,6 +88,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -505,7 +509,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return;
 	}
 
-	if (cr4 & X86_CR4_VMXE) {
+	if (cr4 & X86_CR4_VMXE && !nested) {
 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 		kvm_inject_gp(vcpu, 0);
 		return;
@@ -615,7 +619,10 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+	MSR_IA32_FEATURE_CONTROL, MSR_IA32_VMX_BASIC, MSR_IA32_VMX_PINBASED_CTLS,
+	MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_ENTRY_CTLS,
+	MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP
 };
 
 static unsigned num_msrs_to_save;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
 
+extern int nested;
+
 #endif
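
For context, the guest-side sequence that exercises these handlers looks roughly
as follows. This is an illustrative sketch, not part of the patch: an L1 kernel
must set the lock and VMXON-enable bits in IA32_FEATURE_CONTROL (the only write
nested_vmx_set_msr() accepts), set CR4.VMXE (which kvm_set_cr4() now tolerates
when nested is enabled), and then execute VMXON at CPL 0, which L0 now routes to
handle_vmon(). The enable_vmx() helper and the rdmsr64/wrmsr64 wrappers below
are hypothetical names; the constants and checks follow the Intel SDM and the
conditions tested by the patch.

#include <stdint.h>

#define MSR_IA32_FEATURE_CONTROL	0x0000003a
#define FEATURE_CONTROL_LOCKED		(1ULL << 0)
#define FEATURE_CONTROL_VMXON_ENABLED	(1ULL << 2)
#define X86_CR4_VMXE			(1UL << 13)

static inline uint64_t rdmsr64(uint32_t msr)
{
	uint32_t lo, hi;

	asm volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
	return ((uint64_t)hi << 32) | lo;
}

static inline void wrmsr64(uint32_t msr, uint64_t val)
{
	asm volatile("wrmsr" : : "c"(msr), "a"((uint32_t)val),
		     "d"((uint32_t)(val >> 32)));
}

/*
 * vmxon_region_pa: physical address of a page-aligned 4 KiB region whose
 * first dword holds the VMCS revision id from IA32_VMX_BASIC (the MSR L1
 * reads via nested_vmx_get_msr()).  Must run at CPL 0 in protected mode,
 * otherwise handle_vmon() injects #GP or #UD instead of enabling VMX.
 */
static int enable_vmx(uint64_t vmxon_region_pa)
{
	uint64_t fc = rdmsr64(MSR_IA32_FEATURE_CONTROL);
	unsigned long cr4;
	uint8_t fail;

	/* nested_vmx_set_msr() only accepts both bits set together */
	if (!(fc & FEATURE_CONTROL_LOCKED))
		wrmsr64(MSR_IA32_FEATURE_CONTROL,
			fc | FEATURE_CONTROL_LOCKED |
			FEATURE_CONTROL_VMXON_ENABLED);

	/* handle_vmon() injects #UD unless CR4.VMXE is already set */
	asm volatile("mov %%cr4, %0" : "=r"(cr4));
	cr4 |= X86_CR4_VMXE;
	asm volatile("mov %0, %%cr4" : : "r"(cr4));

	/* VMfailInvalid sets CF, VMfailValid sets ZF; setbe catches both */
	asm volatile("vmxon %1; setbe %0"
		     : "=q"(fail) : "m"(vmxon_region_pa) : "cc", "memory");
	return fail ? -1 : 0;
}

With nesting enabled on the L0 host (e.g. loading kvm with nested=1; the
parameter is readable via /sys/module/kvm/parameters/nested), the VMXON above
causes an EXIT_REASON_VMON exit that handle_vmon() accepts; with nested=0 the
pre-patch behaviour of injecting #UD is preserved.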