From patchwork Thu Dec 10 18:38:24 2009
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: oritw@il.ibm.com
X-Patchwork-Id: 66322
Received: from vger.kernel.org (vger.kernel.org [209.132.176.167])
	by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id nBAIch4x003678
	for ; Thu, 10 Dec 2009 18:38:43 GMT
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1761426AbZLJSib (ORCPT ); Thu, 10 Dec 2009 13:38:31 -0500
Received: (majordomo@vger.kernel.org) by vger.kernel.org
	id S1761424AbZLJSib (ORCPT ); Thu, 10 Dec 2009 13:38:31 -0500
Received: from mtagate2.de.ibm.com ([195.212.17.162]:38332 "EHLO
	mtagate2.de.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1761404AbZLJSi1 (ORCPT );
	Thu, 10 Dec 2009 13:38:27 -0500
Received: from d12nrmr1607.megacenter.de.ibm.com
	(d12nrmr1607.megacenter.de.ibm.com [9.149.167.49])
	by mtagate2.de.ibm.com (8.13.1/8.13.1) with ESMTP id nBAIcXQE010002
	for ; Thu, 10 Dec 2009 18:38:33 GMT
Received: from d12av02.megacenter.de.ibm.com
	(d12av02.megacenter.de.ibm.com [9.149.165.228])
	by d12nrmr1607.megacenter.de.ibm.com (8.13.8/8.13.8/NCO v10.0)
	with ESMTP id nBAIcXfJ1355786
	for ; Thu, 10 Dec 2009 19:38:33 +0100
Received: from d12av02.megacenter.de.ibm.com (loopback [127.0.0.1])
	by d12av02.megacenter.de.ibm.com (8.12.11.20060308/8.13.3)
	with ESMTP id nBAIcWtY005078
	for ; Thu, 10 Dec 2009 19:38:33 +0100
Received: from localhost.localdomain (cluwyn.haifa.ibm.com [9.148.27.75])
	by d12av02.megacenter.de.ibm.com (8.12.11.20060308/8.12.11)
	with ESMTP id nBAIcUFk004785; Thu, 10 Dec 2009 19:38:32 +0100
From: oritw@il.ibm.com
To: avi@redhat.com
Cc: kvm@vger.kernel.org, oritw@il.ibm.com, benami@il.ibm.com,
	abelg@il.ibm.com, muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com
Subject: [PATCH 2/7] Nested VMX patch 2 implements vmclear
Date: Thu, 10 Dec 2009 20:38:24 +0200
Message-Id: <1260470309-7166-3-git-send-email-oritw@il.ibm.com>
X-Mailer: git-send-email 1.6.0.4
In-Reply-To: <1260470309-7166-2-git-send-email-oritw@il.ibm.com>
References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com>
	<1260470309-7166-2-git-send-email-oritw@il.ibm.com>
Sender: kvm-owner@vger.kernel.org
Precedence: bulk
List-ID:
X-Mailing-List: kvm@vger.kernel.org

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2726a6c..a7ffd5e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,13 +93,39 @@ struct shared_msr_entry {
 };
 
 struct __attribute__ ((__packed__)) level_state {
+	/* Has the level1 guest done vmclear? */
+	bool vmclear;
+};
+
+/*
+ * This structure is mapped to guest memory.
+ * It is packed in order to preserve the binary content
+ * after live migration.
+ * If there are changes in the content or layout, the revision_id must be updated.
+ */
+struct __attribute__ ((__packed__)) nested_vmcs_page {
+	u32 revision_id;
+	u32 abort;
+	struct level_state l2_state;
+};
+
+struct nested_vmcs_list {
+	struct list_head list;
+	gpa_t vmcs_addr;
+	struct vmcs *l2_vmcs;
 };
 
 struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
+	/* What is the location of the current vmcs l1 keeps for l2 */
+	gpa_t current_vmptr;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
+	/* list of vmcs for each l2 guest created by l1 */
+	struct list_head l2_vmcs_list;
+	/* l2 page corresponding to the current vmcs set by l1 */
+	struct nested_vmcs_page *current_l2_page;
 };
 
 struct vcpu_vmx {
@@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+				    u64 vmcs_addr)
+{
+	struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error allocating page 0x%llx\n",
+		       __func__, vmcs_addr);
+		kvm_release_page_clean(vmcs_page);
+		return NULL;
+	}
+
+	return vmcs_page;
+
+}
+
+static int nested_map_current(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page =
+		nested_get_page(vcpu, vmx->nested.current_vmptr);
+	struct nested_vmcs_page *mapped_page;
+
+	if (vmcs_page == NULL) {
+		printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
+		return 0;
+	}
+
+	if (vmx->nested.current_l2_page) {
+		printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
+		WARN_ON(1);
+		return 0;
+	}
+
+	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
+
+	if (!mapped_page) {
+		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
+		return 0;
+	}
+
+	vmx->nested.current_l2_page = mapped_page;
+
+	return 1;
+}
+
+static void nested_unmap_current(struct kvm_vcpu *vcpu)
+{
+	struct page *page;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_INFO "Shadow vmcs already unmapped\n");
+		WARN_ON(1);
+		return;
+	}
+
+	page = kmap_atomic_to_page(vmx->nested.current_l2_page);
+
+	kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
+
+	kvm_release_page_dirty(page);
+
+	vmx->nested.current_l2_page = NULL;
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	return 0;
 }
 
+static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
+{
+	int r = 0;
+	uint size;
+
+	*gentry = 0;
+
+	if (is_long_mode(vcpu))
+		size = sizeof(u64);
+	else
+		size = sizeof(u32);
+
+	r = kvm_read_guest_virt(gva, gentry,
+				size, vcpu);
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
+		return r;
+	}
+
+	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+		printk(KERN_DEBUG "%s addr %llx not aligned\n",
+		       __func__, *gentry);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -1316,6 +1441,7 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
 	} else
 		return 0;
 
+	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
 	return 0;
 }
 
@@ -1488,15 +1614,35 @@ static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct nested_vmcs_list *list_item, *n;
+
+	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+		if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
+			free_vmcs(list_item->l2_vmcs);
+			list_del(&(list_item->list));
+			return;
+		}
+}
+
 static void free_l1_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct nested_vmcs_list *list_item, *n;
 
 	if (!vmx->nested.l1_state)
 		return;
 
 	kfree(vmx->nested.l1_state);
 	vmx->nested.l1_state = NULL;
+
+	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
+				 list) {
+		free_vmcs(list_item->l2_vmcs);
+		list_del(&(list_item->list));
+	}
 }
 
@@ -3352,6 +3498,93 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+	vmx_set_rflags(vcpu, rflags);
+}
+
+/*
+ * Decode the memory address (operand) of a vmx instruction according to Table 23-12/23-11.
+ * For additional information regarding offset calculation, see section 3.7.5.
+ */
+static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
+				 unsigned long exit_qualification,
+				 u32 vmx_instruction_info)
+{
+	int scaling = vmx_instruction_info & 3; /* bits 0:1 scaling */
+	int addr_size = (vmx_instruction_info >> 7) & 7; /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
+	bool is_reg = vmx_instruction_info & (1u << 10); /* bit 10: 1=register operand, 0=memory */
+	int seg_reg = (vmx_instruction_info >> 15) & 7; /* bits 15:17 segment register */
+	int index_reg = (vmx_instruction_info >> 18) & 0xf; /* bits 18:21 index register */
+	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit 22 index register validity, 0=valid, 1=invalid */
+	int base_reg = (vmx_instruction_info >> 23) & 0xf; /* bits 23:26 base register */
+	bool base_is_valid = !(vmx_instruction_info & (1u << 27)); /* bit 27 base register validity, 0=valid, 1=invalid */
+	gva_t addr;
+
+	if (is_reg)
+		return 0;
+
+	switch (addr_size) {
+	case 1:
+		exit_qualification &= 0xffffffff; /* high 32 bits are undefined according to the spec, page 23-7 */
+		break;
+	case 2:
+		break;
+	default:
+		return 0;
+	}
+
+	/* Addr = segment_base + offset */
+	/* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
+	addr = vmx_get_segment_base(vcpu, seg_reg);
+	if (base_is_valid)
+		addr += kvm_register_read(vcpu, base_reg);
+	if (index_is_valid)
+		addr += kvm_register_read(vcpu, index_reg)*scaling;
+	addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
+
+	return addr;
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct level_state *l2_state;
+	gpa_t guest_vmcs_addr;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t vmcs_gva;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+				       vmx_instruction_info);
+
+	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+		return 1;
+
+	vmx->nested.current_vmptr = guest_vmcs_addr;
+	if (!nested_map_current(vcpu))
+		return 1;
+
+	l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
+	l2_state->vmclear = 1;
+	nested_free_current_vmcs(vcpu);
+
+	vmx->nested.current_vmptr = -1ull;
+
+	nested_unmap_current(vcpu);
+
+	skip_emulated_instruction(vcpu);
+	clear_rflags_cf_zf(vcpu);
+
+	return 1;
+}
+
 static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3695,7 +3928,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVLPG]                  = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
+	[EXIT_REASON_VMCLEAR]                 = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
 	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b698952..e5acf22 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2773,8 +2773,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-			       struct kvm_vcpu *vcpu)
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			struct kvm_vcpu *vcpu)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
@@ -2802,6 +2802,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
 out:
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 				struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 57204cb..2d7b2dc 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,6 +35,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
 
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			struct kvm_vcpu *vcpu);
+
 extern int nested;
 
 #endif
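
[Editor's note, not part of the patch.] For readers new to the VMX-instruction-information format, below is a minimal user-space sketch of the same bit-field extraction that get_vmx_mem_address() performs in the hunk above. It assumes only the bit layout stated in the patch's own comments; the struct, function, and variable names here are made up for the illustration and do not exist in KVM.

/* vmx_insn_info_demo.c - illustrative decoder, compile with: gcc -Wall vmx_insn_info_demo.c */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Decoded fields of the 32-bit VMX-instruction-information word
 * (bit positions taken from the comments in get_vmx_mem_address()). */
struct vmx_insn_info {
	int scaling;      /* bits 0:1   scaling */
	int addr_size;    /* bits 7:9   0=16bit, 1=32bit, 2=64bit */
	bool is_reg;      /* bit 10     1=register operand, 0=memory */
	int seg_reg;      /* bits 15:17 segment register */
	int index_reg;    /* bits 18:21 index register */
	bool index_valid; /* bit 22     0=valid, 1=invalid */
	int base_reg;     /* bits 23:26 base register */
	bool base_valid;  /* bit 27     0=valid, 1=invalid */
};

static struct vmx_insn_info decode_vmx_insn_info(uint32_t info)
{
	struct vmx_insn_info d = {
		.scaling     = info & 3,
		.addr_size   = (info >> 7) & 7,
		.is_reg      = info & (1u << 10),
		.seg_reg     = (info >> 15) & 7,
		.index_reg   = (info >> 18) & 0xf,
		.index_valid = !(info & (1u << 22)),
		.base_reg    = (info >> 23) & 0xf,
		.base_valid  = !(info & (1u << 27)),
	};
	return d;
}

int main(void)
{
	/* Arbitrary example value, purely to exercise the decoder:
	 * 64-bit address size, segment register 3, base register 2, index invalid. */
	uint32_t info = (2u << 23) | (1u << 22) | (3u << 15) | (2u << 7);
	struct vmx_insn_info d = decode_vmx_insn_info(info);

	printf("addr_size=%d is_reg=%d seg_reg=%d base_reg=%d base_valid=%d index_valid=%d\n",
	       d.addr_size, d.is_reg, d.seg_reg, d.base_reg, d.base_valid, d.index_valid);
	return 0;
}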