From patchwork Thu Dec 10 18:38:25 2009
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: oritw@il.ibm.com
X-Patchwork-Id: 66323
From: oritw@il.ibm.com
To: avi@redhat.com
Cc: kvm@vger.kernel.org, oritw@il.ibm.com, benami@il.ibm.com,
	abelg@il.ibm.com, muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com
Subject: [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst
Date: Thu, 10 Dec 2009 20:38:25 +0200
Message-Id: <1260470309-7166-4-git-send-email-oritw@il.ibm.com>
X-Mailer: git-send-email 1.6.0.4
In-Reply-To: <1260470309-7166-3-git-send-email-oritw@il.ibm.com>
References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com>
	<1260470309-7166-2-git-send-email-oritw@il.ibm.com>
	<1260470309-7166-3-git-send-email-oritw@il.ibm.com>
X-Mailing-List: kvm@vger.kernel.org

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a7ffd5e..46a4f3a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -92,9 +92,142 @@ struct shared_msr_entry {
 	u64 mask;
 };
 
+struct __attribute__ ((__packed__)) shadow_vmcs {
+	u16 virtual_processor_id;
+	u16 guest_es_selector;
+	u16 guest_cs_selector;
+	u16 guest_ss_selector;
+	u16 guest_ds_selector;
+	u16 guest_fs_selector;
+	u16 guest_gs_selector;
+	u16 guest_ldtr_selector;
+	u16 guest_tr_selector;
+	u16 host_es_selector;
+	u16 host_cs_selector;
+	u16 host_ss_selector;
+	u16 host_ds_selector;
+	u16 host_fs_selector;
+	u16 host_gs_selector;
+	u16 host_tr_selector;
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+	u64 vm_exit_msr_store_addr;
+	u64 vm_exit_msr_load_addr;
+	u64 vm_entry_msr_load_addr;
+	u64 tsc_offset;
+	u64 virtual_apic_page_addr;
+	u64 apic_access_addr;
+	u64 ept_pointer;
+	u64 guest_physical_address;
+	u64 vmcs_link_pointer;
+	u64 guest_ia32_debugctl;
+	u64 guest_ia32_pat;
+	u64 guest_pdptr0;
+	u64 guest_pdptr1;
+	u64 guest_pdptr2;
+	u64 guest_pdptr3;
+	u64 host_ia32_pat;
+	u32 pin_based_vm_exec_control;
+	u32 cpu_based_vm_exec_control;
+	u32 exception_bitmap;
+	u32 page_fault_error_code_mask;
+	u32 page_fault_error_code_match;
+	u32 cr3_target_count;
+	u32 vm_exit_controls;
+	u32 vm_exit_msr_store_count;
+	u32 vm_exit_msr_load_count;
+	u32 vm_entry_controls;
+	u32 vm_entry_msr_load_count;
+	u32 vm_entry_intr_info_field;
+	u32 vm_entry_exception_error_code;
+	u32 vm_entry_instruction_len;
+	u32 tpr_threshold;
+	u32 secondary_vm_exec_control;
+	u32 vm_instruction_error;
+	u32 vm_exit_reason;
+	u32 vm_exit_intr_info;
+	u32 vm_exit_intr_error_code;
+	u32 idt_vectoring_info_field;
+	u32 idt_vectoring_error_code;
+	u32 vm_exit_instruction_len;
+	u32 vmx_instruction_info;
+	u32 guest_es_limit;
+	u32 guest_cs_limit;
+	u32 guest_ss_limit;
+	u32 guest_ds_limit;
+	u32 guest_fs_limit;
+	u32 guest_gs_limit;
+	u32 guest_ldtr_limit;
+	u32 guest_tr_limit;
+	u32 guest_gdtr_limit;
+	u32 guest_idtr_limit;
+	u32 guest_es_ar_bytes;
+	u32 guest_cs_ar_bytes;
+	u32 guest_ss_ar_bytes;
+	u32 guest_ds_ar_bytes;
+	u32 guest_fs_ar_bytes;
+	u32 guest_gs_ar_bytes;
+	u32 guest_ldtr_ar_bytes;
+	u32 guest_tr_ar_bytes;
+	u32 guest_interruptibility_info;
+	u32 guest_activity_state;
+	u32 guest_sysenter_cs;
+	u32 host_ia32_sysenter_cs;
+	unsigned long cr0_guest_host_mask;
+	unsigned long cr4_guest_host_mask;
+	unsigned long cr0_read_shadow;
+	unsigned long cr4_read_shadow;
+	unsigned long cr3_target_value0;
+	unsigned long cr3_target_value1;
+	unsigned long cr3_target_value2;
+	unsigned long cr3_target_value3;
+	unsigned long exit_qualification;
+	unsigned long guest_linear_address;
+	unsigned long guest_cr0;
+	unsigned long guest_cr3;
+	unsigned long guest_cr4;
+	unsigned long guest_es_base;
+	unsigned long guest_cs_base;
+	unsigned long guest_ss_base;
+	unsigned long guest_ds_base;
+	unsigned long guest_fs_base;
+	unsigned long guest_gs_base;
+	unsigned long guest_ldtr_base;
+	unsigned long guest_tr_base;
+	unsigned long guest_gdtr_base;
+	unsigned long guest_idtr_base;
+	unsigned long guest_dr7;
+	unsigned long guest_rsp;
+	unsigned long guest_rip;
+	unsigned long guest_rflags;
+	unsigned long guest_pending_dbg_exceptions;
+	unsigned long guest_sysenter_esp;
+	unsigned long guest_sysenter_eip;
+	unsigned long host_cr0;
+	unsigned long host_cr3;
+	unsigned long host_cr4;
+	unsigned long host_fs_base;
+	unsigned long host_gs_base;
+	unsigned long host_tr_base;
+	unsigned long host_gdtr_base;
+	unsigned long host_idtr_base;
+	unsigned long host_ia32_sysenter_esp;
+	unsigned long host_ia32_sysenter_eip;
+	unsigned long host_rsp;
+	unsigned long host_rip;
+};
+
 
 struct __attribute__ ((__packed__)) level_state {
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+
+	bool first_launch;
 };
 
 /*
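The shadow_vmcs structure above is a packed, software-defined copy of the
VMCS fields KVM touches. Hardware only exposes the active VMCS through
VMREAD/VMWRITE, so a copy like this is what lets later patches in the
series keep L1's VMCS contents in ordinary memory while switching between
L1 and L2. As a rough illustration only -- shadow_vmcs_save_guest_area()
is a hypothetical helper, not part of this patch -- filling such a copy
from the active hardware VMCS would look something like:

/* Hypothetical sketch: copy a few guest fields from the active
 * hardware VMCS into a software shadow_vmcs, using the existing
 * vmcs_read16/vmcs_read64/vmcs_readl accessors in vmx.c and the
 * field encodings from asm/vmx.h.  Not part of this patch. */
static void shadow_vmcs_save_guest_area(struct shadow_vmcs *dst)
{
	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	dst->guest_rsp = vmcs_readl(GUEST_RSP);
	dst->guest_rip = vmcs_readl(GUEST_RIP);
	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
}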
@@ -122,6 +255,8 @@ struct nested_vmx {
 	gpa_t current_vmptr;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
+	/* Level 1 shadow vmcs for switching to level 2 and back */
+	struct shadow_vmcs *l1_shadow_vmcs;
 	/* list of vmcs for each l2 guest created by l1 */
 	struct list_head l2_vmcs_list;
 	/* l2 page corresponding to the current vmcs set by l1 */
@@ -187,10 +322,7 @@ static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 {
 	struct page *vmcs_page = NULL;
 
-	down_read(&current->mm->mmap_sem);
 	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
-
 	if (is_error_page(vmcs_page)) {
 		printk(KERN_ERR "%s error allocating page 0x%llx\n",
 		       __func__, vmcs_addr);
@@ -832,13 +964,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
+
 		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 			      : "cc");
+
 		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+			printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 			       vmx->vmcs, phys_addr);
 	}
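VMPTRLD makes the VMCS at the given physical address current and active;
on failure the CPU reports the error through CF or ZF, which the "setna"
above captures into `error`. The same idiom pulled out as a standalone
sketch -- vmptrld_checked() is a hypothetical wrapper mirroring the
inline asm in vmx_vcpu_load(), not something this patch adds:

/* Hypothetical wrapper around the VMPTRLD idiom used above:
 * returns 0 on success, -1 if the CPU signalled failure through
 * CF/ZF (captured by setna). */
static inline int vmptrld_checked(u64 phys_addr)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc");
	return error ? -1 : 0;
}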
@@ -1240,6 +1373,7 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	return 0;
 }
 
+
 static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
 {
 	int r = 0;
@@ -1430,6 +1564,18 @@ static struct level_state *create_state(void)
 	return state;
 }
 
+static struct vmcs *nested_get_current_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct nested_vmcs_list *list_item, *n;
+
+	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+		if (list_item->vmcs_addr == vmx->nested.current_vmptr)
+			return list_item->l2_vmcs;
+
+	return NULL;
+}
+
 static int create_l1_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1441,10 +1587,75 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
 	} else
 		return 0;
 
+	vmx->nested.l1_shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vmx->nested.l1_shadow_vmcs) {
+		printk(KERN_INFO "%s could not allocate memory for l1_shadow vmcs\n",
+		       __func__);
+		kfree(vmx->nested.l1_state);
+		return -ENOMEM;
+	}
+
 	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
 
 	return 0;
 }
 
+static struct vmcs *alloc_vmcs(void);
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs *l2_vmcs;
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_ERR "%s error mapping level 2 page", __func__);
+		return -ENOMEM;
+	}
+
+	l2_vmcs = nested_get_current_vmcs(vcpu);
+	if (!l2_vmcs) {
+		struct nested_vmcs_list *new_l2_guest =
+			(struct nested_vmcs_list *)
+			kmalloc(sizeof(struct nested_vmcs_list), GFP_KERNEL);
+
+		if (!new_l2_guest) {
+			printk(KERN_ERR "%s error could not allocate memory for a new l2 guest list item",
+			       __func__);
+			nested_unmap_current(vcpu);
+			return -ENOMEM;
+		}
+
+		l2_vmcs = alloc_vmcs();
+
+		if (!l2_vmcs) {
+			printk(KERN_ERR "%s error could not allocate memory for l2_vmcs",
+			       __func__);
+			kfree(new_l2_guest);
+			nested_unmap_current(vcpu);
+			return -ENOMEM;
+		}
+
+		new_l2_guest->vmcs_addr = vmx->nested.current_vmptr;
+		new_l2_guest->l2_vmcs = l2_vmcs;
+		list_add(&(new_l2_guest->list), &(vmx->nested.l2_vmcs_list));
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.current_l2_page->l2_state.msr_bitmap =
+			vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.current_l2_page->l2_state.msr_bitmap = 0;
+
+	vmx->nested.current_l2_page->l2_state.io_bitmap_a =
+		vmcs_read64(IO_BITMAP_A);
+	vmx->nested.current_l2_page->l2_state.io_bitmap_b =
+		vmcs_read64(IO_BITMAP_B);
+
+	vmx->nested.current_l2_page->l2_state.first_launch = true;
+
+	nested_unmap_current(vcpu);
+
+	return 0;
+}
+
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks. */
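create_l2_state() keeps one hardware VMCS per L2 guest, keyed by the
guest-physical address that L1 handed to VMPTRLD, so repeated VMPTRLDs
of the same address reuse the same VMCS. struct nested_vmcs_list itself
is introduced earlier in this series; from its use in
nested_get_current_vmcs() and create_l2_state() its layout is presumably:

/* Presumed layout of the per-L2 VMCS list node (defined in an
 * earlier patch of this series); every field below is dereferenced
 * by the lookup and creation paths above. */
struct nested_vmcs_list {
	gpa_t vmcs_addr;	/* guest-physical address L1 gave to VMPTRLD */
	struct vmcs *l2_vmcs;	/* host VMCS backing that L2 guest */
	struct list_head list;	/* linked into nested.l2_vmcs_list */
};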
@@ -1623,6 +1834,7 @@ static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
 		if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
 			free_vmcs(list_item->l2_vmcs);
 			list_del(&(list_item->list));
+			kfree(list_item);
 			return;
 		}
 	}
@@ -1637,11 +1849,14 @@ static void free_l1_state(struct kvm_vcpu *vcpu)
 
 	kfree(vmx->nested.l1_state);
 	vmx->nested.l1_state = NULL;
+	kfree(vmx->nested.l1_shadow_vmcs);
+	vmx->nested.l1_shadow_vmcs = NULL;
 
 	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
 				 list) {
 		free_vmcs(list_item->l2_vmcs);
 		list_del(&(list_item->list));
+		kfree(list_item);
 	}
 }
@@ -3604,6 +3819,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
 	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r = 0;
 
 	if (!nested) {
 		pr_debug("%s: nested vmx not enabled\n", __func__);
@@ -3633,8 +3849,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (create_l1_state(vcpu)) {
-		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+	r = create_l1_state(vcpu);
+	if (r) {
+		printk(KERN_ERR "%s create_l1_state failed: %d\n", __func__, r);
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
@@ -3645,6 +3862,63 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 guest_vmcs_addr;
+	gva_t vmcs_gva;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	int r = 0;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+				       vmx_instruction_info);
+
+	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+		return 1;
+
+	if (vmx->nested.current_vmptr != guest_vmcs_addr) {
+		vmx->nested.current_vmptr = guest_vmcs_addr;
+		r = create_l2_state(vcpu);
+		if (r) {
+			printk(KERN_ERR "%s create_l2_state failed: %d\n",
+			       __func__, r);
+			return 1;
+		}
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t vmcs_gva;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+				       vmx_instruction_info);
+
+	r = kvm_write_guest_virt(vmcs_gva,
+				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
+				 sizeof(u64), vcpu);
+	if (r) {
+		printk(KERN_INFO "%s failed to write vmptr\n", __func__);
+		return 1;
+	}
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3930,8 +4204,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]                 = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
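Both handlers signal success back to L1 through clear_rflags_cf_zf(),
following the VMX convention that a VM instruction reports VMsucceed by
clearing CF and ZF in RFLAGS. That helper comes from an earlier patch in
this series; its presumed shape, for reference only:

/* Presumed shape of clear_rflags_cf_zf() (introduced earlier in
 * this series): report VMsucceed to the L1 guest by clearing CF
 * and ZF in its RFLAGS. */
static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
{
	unsigned long rflags = vmx_get_rflags(vcpu);

	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
	vmx_set_rflags(vcpu, rflags);
}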
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5acf22..e990405 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2804,8 +2804,8 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu)
+int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			 struct kvm_vcpu *vcpu)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
@@ -2833,7 +2833,7 @@ static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 out:
 	return r;
 }
-
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt);
 
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d7b2dc..b49b55a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -38,6 +38,9 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
 			struct kvm_vcpu *vcpu);
 
+int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			 struct kvm_vcpu *vcpu);
+
 extern int nested;
 
 #endif
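kvm_write_guest_virt() translates a guest-virtual address through the
guest page tables and copies the data out page by page; un-staticing and
exporting it lets handle_vmptrst() store the current VMCS pointer at the
instruction's memory operand, symmetric to how read_guest_vmcs_gpa() uses
kvm_read_guest_virt() on the VMPTRLD side. A hypothetical helper
mirroring that read path, for illustration only:

/* Hypothetical mirror of read_guest_vmcs_gpa(): write a 64-bit
 * VMCS pointer to a guest-virtual address, as handle_vmptrst()
 * does inline above.  Not part of this patch. */
static int write_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 gentry)
{
	int r = kvm_write_guest_virt(gva, &gentry, sizeof(gentry), vcpu);

	if (r)
		printk(KERN_ERR "%s failed to write vmptr\n", __func__);
	return r;
}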