@@ -93,13 +93,39 @@ struct shared_msr_entry {
};
struct __attribute__ ((__packed__)) level_state {
+ /* Has the level1 guest done vmclear? */
+ bool vmclear;
+};
+
+/*
+ * This structure is mapped to guest memory.
+ * It is packed in order to preserve the binary content
+ * after live migration.
+ * If the content or layout changes, the revision_id must be updated.
+ */
+struct __attribute__ ((__packed__)) nested_vmcs_page {
+ u32 revision_id;
+ u32 abort;
+ struct level_state l2_state;
+};
+
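+/*
+ * Tracks one host-side shadow vmcs per vmcs pointer that l1 has loaded,
+ * keyed by the guest-physical address l1 used.
+ */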
+struct nested_vmcs_list {
+ struct list_head list;
+ gpa_t vmcs_addr;
+ struct vmcs *l2_vmcs;
};
struct nested_vmx {
/* Has the level1 guest done vmxon? */
bool vmxon;
+ /* Guest-physical address of the current vmcs that l1 keeps for l2 */
+ gpa_t current_vmptr;
/* Level 1 state for switching to level 2 and back */
struct level_state *l1_state;
+ /* list of vmcs for each l2 guest created by l1 */
+ struct list_head l2_vmcs_list;
+ /* l2 page corresponding to the current vmcs set by l1 */
+ struct nested_vmcs_page *current_l2_page;
};
struct vcpu_vmx {
@@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+ u64 vmcs_addr)
+{
+ struct page *vmcs_page = NULL;
+
+ down_read(&current->mm->mmap_sem);
+ vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(vmcs_page)) {
+ printk(KERN_ERR "%s: error mapping guest page 0x%llx\n",
+ __func__, vmcs_addr);
+ kvm_release_page_clean(vmcs_page);
+ return NULL;
+ }
+
+ return vmcs_page;
+}
+
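+/*
+ * Map the guest page holding the vmcs that l1 last loaded (current_vmptr)
+ * so its l2_state can be read and updated. The mapping is atomic, so the
+ * caller must not sleep before calling nested_unmap_current().
+ */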
+static int nested_map_current(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *vmcs_page =
+ nested_get_page(vcpu, vmx->nested.current_vmptr);
+ struct nested_vmcs_page *mapped_page;
+
+ if (vmcs_page == NULL) {
+ printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
+ return 0;
+ }
+
+ if (vmx->nested.current_l2_page) {
+ printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
+ WARN_ON(1);
+ kvm_release_page_clean(vmcs_page);
+ return 0;
+ }
+
+ mapped_page = kmap_atomic(vmcs_page, KM_USER0);
+
+ vmx->nested.current_l2_page = mapped_page;
+
+ return 1;
+}
+
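+/*
+ * Undo nested_map_current(): drop the atomic mapping and release the
+ * guest page, marking it dirty because l2_state may have been written.
+ */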
+static void nested_unmap_current(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->nested.current_l2_page) {
+ printk(KERN_INFO "%s: shadow vmcs already unmapped\n", __func__);
+ WARN_ON(1);
+ return;
+ }
+
+ page = kmap_atomic_to_page(vmx->nested.current_l2_page);
+
+ kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
+
+ kvm_release_page_dirty(page);
+
+ vmx->nested.current_l2_page = NULL;
+}
+
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
@@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
return 0;
}
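+/*
+ * Read the vmcs pointer operand of a vmx instruction from guest memory
+ * and check that it is page aligned.
+ */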
+static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
+{
+ int r = 0;
+
+ *gentry = 0;
+
+ /*
+ * The vmcs pointer operand is always 64 bits and always in memory,
+ * regardless of the guest's current mode.
+ */
+ r = kvm_read_guest_virt(gva, gentry, sizeof(u64), vcpu);
+ if (r) {
+ printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+ __func__, gva, r);
+ return r;
+ }
+
+ if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+ printk(KERN_DEBUG "%s addr %llx not aligned\n",
+ __func__, *gentry);
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -1316,6 +1441,7 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
} else
return 0;
+ INIT_LIST_HEAD(&vmx->nested.l2_vmcs_list);
return 0;
}
@@ -1488,15 +1614,35 @@ static void free_vmcs(struct vmcs *vmcs)
free_pages((unsigned long)vmcs, vmcs_config.order);
}
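+/*
+ * Free the host-side shadow vmcs backing the guest vmcs currently
+ * referenced by current_vmptr, and drop its list entry.
+ */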
+static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct nested_vmcs_list *list_item, *n;
+
+ list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+ if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
+ free_vmcs(list_item->l2_vmcs);
+ list_del(&list_item->list);
+ kfree(list_item);
+ return;
+ }
+}
+
static void free_l1_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct nested_vmcs_list *list_item, *n;
if (!vmx->nested.l1_state)
return;
kfree(vmx->nested.l1_state);
vmx->nested.l1_state = NULL;
+
+ list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
+ list) {
+ free_vmcs(list_item->l2_vmcs);
+ list_del(&list_item->list);
+ kfree(list_item);
+ }
}
@@ -3352,6 +3498,93 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
return 1;
}
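+/*
+ * Signal vmx instruction success to the guest ("VMsucceed" in the spec)
+ * by clearing CF and ZF in the guest's rflags.
+ */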
+static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+ rflags = vmx_get_rflags(vcpu);
+ rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+ vmx_set_rflags(vcpu, rflags);
+}
+
+/*
+ * Decode the memory address (operand) of a vmx instruction according to
+ * Table 23-12/23-11 of the spec.
+ * For additional information regarding offset calculation see 3.7.5.
+ */
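+/*
+ * Worked example (illustrative value): vmx_instruction_info = 0x418100
+ * decodes as a 64-bit memory operand (bits 7:9 = 2) in segment DS
+ * (bits 15:17 = 3), with base register RAX (bits 23:26 = 0, bit 27 clear
+ * so the base is valid) and no index (bit 22 set). The address is then
+ * DS.base + RAX + displacement, with the displacement taken from the
+ * exit qualification.
+ */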
+static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
+ unsigned long exit_qualification,
+ u32 vmx_instruction_info)
+{
+ int scaling = vmx_instruction_info & 3; /* bits 0:1 scaling */
+ int addr_size = (vmx_instruction_info >> 7) & 7; /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
+ bool is_reg = vmx_instruction_info & (1u << 10); /* bit 10 1=register operand, 0= memory */
+ int seg_reg = (vmx_instruction_info >> 15) & 7; /* bits 15:17 segment register */
+ int index_reg = (vmx_instruction_info >> 18) & 0xf; /* bits 18:21 index register */
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit 22 index register validity, 0=valid, 1=invalid */
+ int base_reg = (vmx_instruction_info >> 23) & 0xf; /* bits 23:26 base register */
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27)); /* bit 27 base register validity, 0=valid, 1=invalid */
+ gva_t addr;
+
+ if (is_reg)
+ return 0;
+
+ switch (addr_size) {
+ case 1:
+ exit_qualification &= 0xffffffff; /* high 32 bits are undefined according to the spec, page 23-7 */
+ break;
+ case 2:
+ break;
+ default:
+ return 0;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
+ addr = vmx_get_segment_base(vcpu, seg_reg);
+ if (base_is_valid)
+ addr += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ addr += kvm_register_read(vcpu, index_reg) << scaling; /* scaling is log2 of the scale factor */
+ addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
+
+ return addr;
+}
+
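+/*
+ * Emulate VMCLEAR on behalf of l1: decode the vmcs pointer operand, mark
+ * the guest-visible vmcs page as cleared, free the host shadow vmcs that
+ * backed it, and report success through rflags.
+ */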
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct level_state *l2_state;
+ gpa_t guest_vmcs_addr;
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gva_t vmcs_gva;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info);
+
+ if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+ return 1;
+
+ vmx->nested.current_vmptr = guest_vmcs_addr;
+ if (!nested_map_current(vcpu))
+ return 1;
+
+ l2_state = &vmx->nested.current_l2_page->l2_state;
+ l2_state->vmclear = true;
+ nested_free_current_vmcs(vcpu);
+
+ vmx->nested.current_vmptr = -1ull;
+
+ nested_unmap_current(vcpu);
+
+ skip_emulated_instruction(vcpu);
+ clear_rflags_cf_zf(vcpu);
+
+ return 1;
+}
+
static int handle_vmoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3695,7 +3928,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
+ [EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
[EXIT_REASON_VMPTRLD] = handle_vmx_insn,
[EXIT_REASON_VMPTRST] = handle_vmx_insn,
@@ -2773,8 +2773,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
}
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -2802,6 +2802,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
out:
return r;
}
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu)
@@ -35,6 +35,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
extern int nested;
#endif