
[5/6] Nested VMX patch 5 implements vmlaunch and vmresume

Message ID 1251905916-2834-6-git-send-email-oritw@il.ibm.com (mailing list archive)
State New, archived

Commit Message

oritw@il.ibm.com Sept. 2, 2009, 3:38 p.m. UTC
From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c | 1142 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 1130 insertions(+), 12 deletions(-)

Comments

Avi Kivity Sept. 2, 2009, 9:38 p.m. UTC | #1
On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> -struct nested_vmx {
> -	/* Has the level1 guest done vmon? */
> +struct nested_vmx {	/* Has the level1 guest done vmon? */
>    

A \n died here.

>   	bool vmon;
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +
> +	/* Are we running nested guest */
> +	bool nested_mode;
> +
> +	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> +	bool nested_run_pending;
> +
> +	/* flag indicating if there was a valid IDT after exiting from l2 */
> +	bool nested_pending_valid_idt;
>    

What does this mean?  pending event?

>
> +
> +static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
> +{
> +	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> +		cpu_based_vm_exec_control&  CPU_BASED_TPR_SHADOW;
> +}
>    

Don't we need to check if the host supports it too?

> +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> +							   *vcpu)
> +{
> +	struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> +
> +	return (shadow->secondary_vm_exec_control&
> +		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)&&
> +		to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr != 0;
> +}
>    

Why check apic_access_addr?

> +
> +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> +{
> +	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> +		secondary_vm_exec_control&  SECONDARY_EXEC_ENABLE_EPT;
> +}
>    

Need to check if secondary controls enabled?

> +static void vmx_set_irq(struct kvm_vcpu *vcpu)
> +{
> +	if (to_vmx(vcpu)->nested.nested_mode)
> +		return;
>    

Why?

Note if the guest didn't enable external interrupt exiting, we need to 
inject as usual.
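
A minimal sketch of such a check (not from this patch; it reuses this
series' shadow_vmcs layout and the helper name is invented):

static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
{
	/* Did L1 ask for "external-interrupt exiting" for this L2 guest? */
	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
		pin_based_vm_exec_control & PIN_BASED_EXT_INTR_MASK;
}

vmx_set_irq() could then fall back to vmx_inject_irq() when this returns
false in nested mode, instead of returning early unconditionally.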

>
> +static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
> +{
>    

Again the name is confusing.  pending_event_injection?

> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int irq;
> +	int type;
> +	int errCodeValid;
> +	u32 idt_vectoring_info;
> +	u32 guest_intr;
> +	bool nmi_window_open;
> +	bool interrupt_window_open;
> +
> +	if (vmx->nested.nested_mode&&  vmx->nested.nested_pending_valid_idt) {
> +		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +		irq  = idt_vectoring_info&  VECTORING_INFO_VECTOR_MASK;
> +		type = idt_vectoring_info&  VECTORING_INFO_TYPE_MASK;
> +		errCodeValid = idt_vectoring_info&
> +			VECTORING_INFO_DELIVER_CODE_MASK;
> +
> +		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +		nmi_window_open =
> +			!(guest_intr&  (GUEST_INTR_STATE_STI |
> +					GUEST_INTR_STATE_MOV_SS |
> +					GUEST_INTR_STATE_NMI));
> +
> +		interrupt_window_open =
> +			((vmcs_readl(GUEST_RFLAGS)&  X86_EFLAGS_IF)&&
> +			 !(guest_intr&  (GUEST_INTR_STATE_STI |
> +					 GUEST_INTR_STATE_MOV_SS)));
> +
> +		if (type == INTR_TYPE_EXT_INTR&&  !interrupt_window_open) {
> +			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
> +			return 0;
> +		}
>    

How can this happen?  Unless it's on nested entry, in which case we need 
to abort the entry.

> +
>   #ifdef CONFIG_X86_64
>   #define R "r"
>   #define Q "q"
> @@ -4646,6 +4842,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> +	nested_handle_pending_idt(vcpu);
>    

You're not checking the return code (need to do that on entry).

> +
> +	if (vmx->nested.nested_mode) {
> +		vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
>    

Might not be legal.  We may also want to force-enable caching.  Lastly, 
don't we need to handle cr0.ts and cr0.mp specially to manage the fpu state?

>
> +	if (vmx->nested.nested_mode)
> +		vmx->nested.vmclear = 0;
> +
>    

Why?

>   free_vmcs:
> @@ -5122,6 +5339,228 @@ static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>
> +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> +{
> +	struct shadow_vmcs *l2_shadow_vmcs =
> +		to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> +	struct shadow_vmcs *l1_shadow_vmcs =
> +		to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
> +
> +	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +
> +	l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +
> +	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	l2_shadow_vmcs->guest_physical_address =
> +		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> +		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	l2_shadow_vmcs->vm_entry_intr_info_field =
> +		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	l2_shadow_vmcs->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	l2_shadow_vmcs->vm_entry_instruction_len =
> +		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vm_instruction_error =
> +		vmcs_read32(VM_INSTRUCTION_ERROR);
> +	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	l2_shadow_vmcs->vm_exit_intr_error_code =
> +		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	l2_shadow_vmcs->idt_vectoring_info_field =
> +		vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	l2_shadow_vmcs->idt_vectoring_error_code =
> +		vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	l2_shadow_vmcs->vm_exit_instruction_len =
> +		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vmx_instruction_info =
> +		vmcs_read32(VMX_INSTRUCTION_INFO);
> +	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	l2_shadow_vmcs->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	l2_shadow_vmcs->guest_activity_state =
> +		vmcs_read32(GUEST_ACTIVITY_STATE);
> +	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +
> +	l1_shadow_vmcs->host_ia32_sysenter_cs =
> +		vmcs_read32(HOST_IA32_SYSENTER_CS);
> +
> +	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> +
> +	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> +	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> +	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	l2_shadow_vmcs->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +
> +	l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
> +	l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
> +	l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
> +	l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	l1_shadow_vmcs->host_ia32_sysenter_esp =
> +		vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	l1_shadow_vmcs->host_ia32_sysenter_eip =
> +		vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
> +	l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
> +}
>    

Can't we do it lazily?  Only read these on demand?

> +
> +int load_vmcs_common(struct shadow_vmcs *src)
> +{
> +	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> +	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> +	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> +	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> +	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> +	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> +	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> +	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> +
> +	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> +	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> +
> +	if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> +		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> +
> +	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> +	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
> +	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> +		     src->vm_entry_exception_error_code);
> +	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
> +
> +	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> +	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> +	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> +	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> +	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> +	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> +	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> +	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> +	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> +	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> +	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> +	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> +	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> +	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> +	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> +	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> +	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> +	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> +	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> +		     src->guest_interruptibility_info);
> +	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> +	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> +
> +	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> +	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> +	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> +	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> +	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> +	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> +	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> +	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> +	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> +	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> +	vmcs_writel(GUEST_DR7, src->guest_dr7);
> +	vmcs_writel(GUEST_RSP, src->guest_rsp);
> +	vmcs_writel(GUEST_RIP, src->guest_rip);
> +	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> +	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> +		    src->guest_pending_dbg_exceptions);
> +	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> +	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> +
> +	return 0;
> +}
>    

If we do it lazily, we'll only need to reload bits that have changed.

>   struct level_state *create_state(void)
>   {
>   	struct level_state *state = NULL;
> @@ -5176,6 +5615,685 @@ int create_l2_state(struct kvm_vcpu *vcpu)
>
>   	return 0;
>   }
> +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
> +	u32 exec_control;
> +
> +	if (!src) {
> +		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> +		return 1;
> +	}
> +
> +	load_vmcs_common(src);
> +
> +	if (cpu_has_vmx_vpid()&&  vmx->nested.l2_state->vpid != 0)
> +		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
> +
> +	if (vmx->nested.l2_state->io_bitmap_a)
> +		vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> +
> +	if (vmx->nested.l2_state->io_bitmap_b)
> +		vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> +
> +	if (vmx->nested.l2_state->msr_bitmap)
> +		vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
>    

Don't we need to combine the host and guest msr bitmaps and I/O 
bitmaps?  If the host doesn't allow an msr or I/O access to the guest, 
it shouldn't allow it to nested guests.

> +
> +	if (src->vm_entry_msr_load_count>  0) {
> +		struct page *page;
> +
> +		page = nested_get_page(vcpu,
> +				       src->vm_entry_msr_load_addr);
> +		if (!page)
> +			return 1;
> +
> +		vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
>    

Luckily we don't use the msr autoload stuff.  If we did we'd have to 
merge it too.  But we have to emulate those loads (via vmx_set_msr); the 
guest can easily load bad msrs, which would kill the host.

> +	if (src->virtual_apic_page_addr != 0) {
> +		struct page *page;
> +
> +		page = nested_get_page(vcpu,
> +				       src->virtual_apic_page_addr);
> +		if (!page)
> +			return 1;
> +
> +		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> +
> +		kvm_release_page_clean(page);
> +	}  else {
> +		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
> +			     src->virtual_apic_page_addr);
> +	}
>    

Don't understand the special zero value.

> +
> +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> +		     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
> +		      src->pin_based_vm_exec_control));
> +
> +	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> +
> +	exec_control&= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +
> +	exec_control&= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +	exec_control&= ~CPU_BASED_TPR_SHADOW;
>    

Why?

> +	if (enable_vpid) {
> +		if (vmx->nested.l2_state->vpid == 0) {
> +			allocate_vpid(vmx);
> +			vmx->nested.l2_state->vpid = vmx->vpid;
>    

What if the guest has a nonzero vpid?

> +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> +			     bool is_interrupt)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int initial_pfu_active = vcpu->fpu_active;
> +
> +	if (!vmx->nested.nested_mode) {
> +		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> +		       __func__);
> +		return 0;
> +	}
> +
> +	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> +
> +	sync_cached_regs_to_vmcs(vcpu);
> +
> +	prepare_vmcs_12(vcpu);
> +	if (is_interrupt)
> +		vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
> +			EXIT_REASON_EXTERNAL_INTERRUPT;
>    

Need to auto-ack the interrupt if requested by the guest.
oritw@il.ibm.com Sept. 3, 2009, 2:53 p.m. UTC | #2
Avi Kivity <avi@redhat.com> wrote on 03/09/2009 00:38:16:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Muli Ben-
> Yehuda/Haifa/IBM@IBMIL, Abel Gordon/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mmday@us.ibm.com
>
> Date:
>
> 03/09/2009 00:38
>
> Subject:
>
> Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > -struct nested_vmx {
> > -   /* Has the level1 guest done vmon? */
> > +struct nested_vmx {   /* Has the level1 guest done vmon? */
> >
>
> A \n died here.
I will fix it.
>
> >      bool vmon;
> >      /* Has the level1 guest done vmclear? */
> >      bool vmclear;
> > +
> > +   /* Are we running nested guest */
> > +   bool nested_mode;
> > +
> > +   /* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> > +   bool nested_run_pending;
> > +
> > +   /* flag indicating if there was a valid IDT after exiting from l2
*/
> > +   bool nested_pending_valid_idt;
> >
>
> What does this mean?  pending event?
I will rename it.
> >
> > +
> > +static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu
*vcpu)
> > +{
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> > +      cpu_based_vm_exec_control&  CPU_BASED_TPR_SHADOW;
> > +}
> >
>
> Don't we need to check if the host supports it too?
We check it separately, but I can add it here.
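
A sketch of the combined check (illustrative only; cpu_has_vmx_tpr_shadow()
is the existing host-capability helper):

static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
{
	/* Honour L1's request only if the host supports the TPR shadow. */
	return cpu_has_vmx_tpr_shadow() &&
		(to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
		 cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW);
}
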
>
> > +static inline bool nested_vm_need_virtualize_apic_accesses(struct
kvm_vcpu
> > +                        *vcpu)
> > +{
> > +   struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->
shadow_vmcs;
> > +
> > +   return (shadow->secondary_vm_exec_control&
> > +      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)&&
> > +      to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr !=
0;
> > +}
> >
>
> Why check apic_access_addr?
I will remove it.
>
> > +
> > +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> > +{
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> > +      secondary_vm_exec_control&  SECONDARY_EXEC_ENABLE_EPT;
> > +}
> >
>
> Need to check if secondary controls enabled?
If the secondary controls are not enabled, this field is zero.
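
For illustration, a version that gates on the primary control explicitly
(a sketch only; the nested_cpu_has_secondary_exec_ctrls() helper added
elsewhere in this patch could be reused for the first test):

static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
{
	struct shadow_vmcs *svmcs = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;

	/* Secondary controls are only meaningful if L1 activated them. */
	if (!(svmcs->cpu_based_vm_exec_control &
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
		return 0;

	return svmcs->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
}
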
>
> > +static void vmx_set_irq(struct kvm_vcpu *vcpu)
> > +{
> > +   if (to_vmx(vcpu)->nested.nested_mode)
> > +      return;
> >
>
> Why?
>
> Note if the guest didn't enable external interrupt exiting, we need to
> inject as usual.
I will look into it.
>
> >
> > +static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
> > +{
> >
>
> Again the name is confusing.  pending_event_injection?
I will rename it.
>
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int irq;
> > +   int type;
> > +   int errCodeValid;
> > +   u32 idt_vectoring_info;
> > +   u32 guest_intr;
> > +   bool nmi_window_open;
> > +   bool interrupt_window_open;
> > +
> > +   if (vmx->nested.nested_mode&&  vmx->
nested.nested_pending_valid_idt) {
> > +      idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +      irq  = idt_vectoring_info&  VECTORING_INFO_VECTOR_MASK;
> > +      type = idt_vectoring_info&  VECTORING_INFO_TYPE_MASK;
> > +      errCodeValid = idt_vectoring_info&
> > +         VECTORING_INFO_DELIVER_CODE_MASK;
> > +
> > +      guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +      nmi_window_open =
> > +         !(guest_intr&  (GUEST_INTR_STATE_STI |
> > +               GUEST_INTR_STATE_MOV_SS |
> > +               GUEST_INTR_STATE_NMI));
> > +
> > +      interrupt_window_open =
> > +         ((vmcs_readl(GUEST_RFLAGS)&  X86_EFLAGS_IF)&&
> > +          !(guest_intr&  (GUEST_INTR_STATE_STI |
> > +                GUEST_INTR_STATE_MOV_SS)));
> > +
> > +      if (type == INTR_TYPE_EXT_INTR&&  !interrupt_window_open) {
> > +         printk(KERN_INFO "IDT ignored, l2 interrupt window
closed!\n");
> > +         return 0;
> > +      }
> >
>
> How can this happen?  Unless it's on nested entry, in which case we need
> to abort the entry.
OK, I will fix it. The truth is I never saw it happen.
>
> > +
> >   #ifdef CONFIG_X86_64
> >   #define R "r"
> >   #define Q "q"
> > @@ -4646,6 +4842,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >   {
> >      struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > +   nested_handle_pending_idt(vcpu);
> >
>
> You're not checking the return code (need to do that on entry).
I will fix it.
>
> > +
> > +   if (vmx->nested.nested_mode) {
> > +      vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->
guest_cr0);
> >
>
> Might not be legal.  We may also want to force-enable caching.  Lastly,
> don't we need to handle cr0.ts and cr0.mp specially to manage the fpu
state?
We are working on implementing this correctly. kvm seems to handle it fine,
but vmware doesn't like it.
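
A rough sketch of the direction (illustrative only; this is not what the
patch currently does): filter the cr0 value written for L2 through a
helper before loading it into GUEST_CR0, e.g.

static unsigned long nested_guest_cr0(struct kvm_vcpu *vcpu,
				      unsigned long cr0)
{
	/* Never let L2 disable caching on the real hardware. */
	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	/* Keep cr0.ts consistent with L0's lazy-FPU handling. */
	if (!vcpu->fpu_active)
		cr0 |= X86_CR0_TS;

	return cr0;
}
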
>
> >
> > +   if (vmx->nested.nested_mode)
> > +      vmx->nested.vmclear = 0;
> > +
> >
>
> Why?
I will check it.
>
> >   free_vmcs:
> > @@ -5122,6 +5339,228 @@ static int shadow_vmcs_load(struct kvm_vcpu
*vcpu)
> >      return 0;
> >   }
> >
> > +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> > +{
> > +   struct shadow_vmcs *l2_shadow_vmcs =
> > +      to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> > +   struct shadow_vmcs *l1_shadow_vmcs =
> > +      to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
> > +
> > +   l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16
(GUEST_LDTR_SELECTOR);
> > +   l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +
> > +   l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +
> > +   l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   l2_shadow_vmcs->guest_physical_address =
> > +      vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64
(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> > +      l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   l2_shadow_vmcs->vm_entry_intr_info_field =
> > +      vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   l2_shadow_vmcs->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_entry_instruction_len =
> > +      vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vm_instruction_error =
> > +      vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   l2_shadow_vmcs->vm_exit_intr_error_code =
> > +      vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   l2_shadow_vmcs->idt_vectoring_info_field =
> > +      vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   l2_shadow_vmcs->idt_vectoring_error_code =
> > +      vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_exit_instruction_len =
> > +      vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vmx_instruction_info =
> > +      vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32
(GUEST_LDTR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   l2_shadow_vmcs->guest_activity_state =
> > +      vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +
> > +   l1_shadow_vmcs->host_ia32_sysenter_cs =
> > +      vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +
> > +   l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   l2_shadow_vmcs->exit_qualification = vmcs_readl
(EXIT_QUALIFICATION);
> > +   l2_shadow_vmcs->guest_linear_address = vmcs_readl
(GUEST_LINEAR_ADDRESS);
> > +   l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +
> > +   l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> > +   l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   l2_shadow_vmcs->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl
(GUEST_SYSENTER_ESP);
> > +   l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl
(GUEST_SYSENTER_EIP);
> > +
> > +   l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
> > +   l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
> > +   l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
> > +   l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   l1_shadow_vmcs->host_ia32_sysenter_esp =
> > +      vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   l1_shadow_vmcs->host_ia32_sysenter_eip =
> > +      vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
> > +   l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
> > +}
> >
>
> Can't we do it lazily?  Only read these on demand?
We can optimize and read some fields only when they are changed (after we
switch to qemu, for example); we do this in our performance version. Also,
there are some fields that kvm only writes once, so we can read them once.
This can be dangerous for other hypervisors that may change them more
frequently.

>
> > +
> > +int load_vmcs_common(struct shadow_vmcs *src)
> > +{
> > +   vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> > +   vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> > +   vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> > +   vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> > +   vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> > +   vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> > +   vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> > +   vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> > +
> > +   vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> > +   vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> > +
> > +   if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> > +      vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> > +
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->
vm_entry_msr_load_count);
> > +   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->
vm_entry_intr_info_field);
> > +   vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> > +           src->vm_entry_exception_error_code);
> > +   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->
vm_entry_instruction_len);
> > +
> > +   vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> > +   vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> > +   vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> > +   vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> > +   vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> > +   vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> > +   vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> > +   vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> > +   vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> > +   vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> > +   vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> > +   vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> > +   vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> > +   vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> > +   vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> > +   vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> > +   vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> > +   vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> > +   vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> > +           src->guest_interruptibility_info);
> > +   vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> > +   vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> > +
> > +   vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> > +   vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> > +   vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> > +   vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> > +   vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> > +   vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> > +   vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> > +   vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> > +   vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> > +   vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> > +   vmcs_writel(GUEST_DR7, src->guest_dr7);
> > +   vmcs_writel(GUEST_RSP, src->guest_rsp);
> > +   vmcs_writel(GUEST_RIP, src->guest_rip);
> > +   vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> > +   vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> > +          src->guest_pending_dbg_exceptions);
> > +   vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> > +   vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> > +
> > +   return 0;
> > +}
> >
>
> If we do it lazily, we'll only need to reload bits that have changed.
True, we can add a bitmap and update only the fields that were written to.
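
A rough sketch of such a bitmap (illustrative only; the field indexing
and sizing are invented): handle_vmwrite() would mark a field dirty and
prepare_vmcs_02() would reload only the dirty ones.

#define NESTED_MAX_TRACKED_FIELDS	256

struct nested_field_dirty {
	DECLARE_BITMAP(dirty, NESTED_MAX_TRACKED_FIELDS);
};

static void nested_mark_field_dirty(struct nested_field_dirty *t,
				    unsigned int idx)
{
	__set_bit(idx, t->dirty);
}

static bool nested_test_and_clear_field_dirty(struct nested_field_dirty *t,
					      unsigned int idx)
{
	return test_and_clear_bit(idx, t->dirty);
}
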
>
> >   struct level_state *create_state(void)
> >   {
> >      struct level_state *state = NULL;
> > @@ -5176,6 +5615,685 @@ int create_l2_state(struct kvm_vcpu *vcpu)
> >
> >      return 0;
> >   }
> > +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
> > +   u32 exec_control;
> > +
> > +   if (!src) {
> > +      printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   load_vmcs_common(src);
> > +
> > +   if (cpu_has_vmx_vpid()&&  vmx->nested.l2_state->vpid != 0)
> > +      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
> > +
> > +   if (vmx->nested.l2_state->io_bitmap_a)
> > +      vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> > +
> > +   if (vmx->nested.l2_state->io_bitmap_b)
> > +      vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> > +
> > +   if (vmx->nested.l2_state->msr_bitmap)
> > +      vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
> >
>
> Don't we need to combine the host and guest msr bitmaps and I/O
> bitmaps?  If the host doesn't allow an msr or I/O access to the guest,
> it shouldn't allow it to nested guests.
Yes, we haven't implemented it yet.
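
A sketch of the merge (illustrative only): a set bit in the VMX MSR
bitmap means "cause a vmexit", so a bitwise OR of L0's and L1's bitmaps
is the conservative combination; the same idea applies to IO_BITMAP_A/B.

static void nested_merge_msr_bitmap(unsigned long *merged,
				    const unsigned long *l0_bitmap,
				    const unsigned long *l1_bitmap)
{
	int i;

	/* Intercept an MSR if either L0 or L1 wants to intercept it. */
	for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i++)
		merged[i] = l0_bitmap[i] | l1_bitmap[i];
}
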
>
> > +
> > +   if (src->vm_entry_msr_load_count>  0) {
> > +      struct page *page;
> > +
> > +      page = nested_get_page(vcpu,
> > +                   src->vm_entry_msr_load_addr);
> > +      if (!page)
> > +         return 1;
> > +
> > +      vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> >
>
> Luckily we don't use the msr autoload stuff.  If we did we'd have to
> merge it too.  But we have to emulate those loads (via vmx_set_msr); the
> guest can easily load bad msrs, which would kill the host.
Ok.
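
A sketch of emulating the VM-entry MSR-load list (illustrative only;
struct vmx_msr_entry, kvm_read_guest() and kvm_set_msr() are existing
definitions, the walk itself is invented):

static int nested_emulate_entry_msr_loads(struct kvm_vcpu *vcpu,
					  u64 load_addr, u32 count)
{
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (kvm_read_guest(vcpu->kvm, load_addr + i * sizeof(e),
				   &e, sizeof(e)))
			return -EFAULT;
		/* Go through the normal checks; refuse dangerous MSRs. */
		if (kvm_set_msr(vcpu, e.index, e.value))
			return -EINVAL;
	}
	return 0;
}
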
>
> > +   if (src->virtual_apic_page_addr != 0) {
> > +      struct page *page;
> > +
> > +      page = nested_get_page(vcpu,
> > +                   src->virtual_apic_page_addr);
> > +      if (!page)
> > +         return 1;
> > +
> > +      vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> > +
> > +      kvm_release_page_clean(page);
> > +   }  else {
> > +      vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
> > +              src->virtual_apic_page_addr);
> > +   }
> >
>
> Don't understand the special zero value.
I will look into it.
>
> > +
> > +   vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> > +           (vmx->nested.l1_state->shadow_vmcs->
pin_based_vm_exec_control |
> > +            src->pin_based_vm_exec_control));
> > +
> > +   exec_control =
> vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> > +
> > +   exec_control&= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> > +
> > +   exec_control&= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> > +
> > +   exec_control&= ~CPU_BASED_TPR_SHADOW;
> >
>
> Why?
We always use the values from VMCS12 for those controls.
>
> > +   if (enable_vpid) {
> > +      if (vmx->nested.l2_state->vpid == 0) {
> > +         allocate_vpid(vmx);
> > +         vmx->nested.l2_state->vpid = vmx->vpid;
> >
>
> What if the guest has a nonzero vpid?
>
> > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > +              bool is_interrupt)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int initial_pfu_active = vcpu->fpu_active;
> > +
> > +   if (!vmx->nested.nested_mode) {
> > +      printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> > +             __func__);
> > +      return 0;
> > +   }
> > +
> > +   save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> > +
> > +   sync_cached_regs_to_vmcs(vcpu);
> > +
> > +   prepare_vmcs_12(vcpu);
> > +   if (is_interrupt)
> > +      vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
> > +         EXIT_REASON_EXTERNAL_INTERRUPT;
> >
>
> Need to auto-ack the interrupt if requested by the guest.
The is_interrupt flag means L1 has pending interrupts; kvm's regular code will handle it.
>
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>

Avi Kivity Sept. 6, 2009, 9:29 a.m. UTC | #3
On 09/03/2009 05:53 PM, Orit Wasserman wrote:
>>
>> Need to auto-ack the interrupt if requested by the guest.
>>      
> The is_interrupt flag means L1 has pending interrupts; kvm's regular code will handle it.
>    

If the VM-Exit Controls bit 15 (Acknowledge interrupts on exit) is set, 
when the nested guest exits you need to run kvm_cpu_get_interrupt() and 
put the vector number in the VM-Exit interruption-information field.  
kvm doesn't set this bit but I think Xen does.
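
A sketch of what that could look like on the nested exit path
(illustrative only; it follows this series' shadow_vmcs naming, and a
VM_EXIT_ACK_INTR_ON_EXIT define for bit 15 of the VM-exit controls may
need to be added):

static void nested_ack_interrupt_on_exit(struct kvm_vcpu *vcpu)
{
	struct shadow_vmcs *l2_svmcs =
		to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
	int vector;

	if (!(l2_svmcs->vm_exit_controls & VM_EXIT_ACK_INTR_ON_EXIT))
		return;

	/* Ack the interrupt on L1's behalf and report it in VMCS12. */
	vector = kvm_cpu_get_interrupt(vcpu);
	l2_svmcs->vm_exit_intr_info = vector | INTR_TYPE_EXT_INTR |
		INTR_INFO_VALID_MASK;
}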
oritw@il.ibm.com Sept. 6, 2009, 1:38 p.m. UTC | #4
Avi Kivity <avi@redhat.com> wrote on 06/09/2009 12:29:58:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> Haifa/IBM@IBMIL, kvm@vger.kernel.org, mmday@us.ibm.com, Muli Ben-
> Yehuda/Haifa/IBM@IBMIL
>
> Date:
>
> 06/09/2009 12:30
>
> Subject:
>
> Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On 09/03/2009 05:53 PM, Orit Wasserman wrote:
> >>
> >> Need to auto-ack the interrupt if requested by the guest.
> >>
> > The is_interrupt flag means L1 has pending interrupts; kvm's regular code
> > will handle it.
> >
>
> If the VM-Exit Controls bit 15 (Acknowledge interrupts on exit) is set,
> when the nested guest exits you need to run kvm_cpu_get_interrupt() and
> put the vector number in the VM-Exit interruption-information field.
> kvm doesn't set this bit but I think Xen does.
VMware doesn't set it either.
We have to run L2 with the bit off even if the L1 hypervisor set it, and
emulate the L2 exit to the L1 hypervisor correctly.
I will look at it.
>
> --
> error compiling committee.c: too many arguments to function
>


Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 62139b5..a7a62df 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -221,17 +221,30 @@  static inline int vmcs_field_length(unsigned long field)
 	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
 }
 
+#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
+					VM_EXIT_SAVE_IA32_PAT))
+#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
+					 VM_ENTRY_IA32E_MODE))
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
 	char data[0];
 };
 
-struct nested_vmx {
-	/* Has the level1 guest done vmon? */
+struct nested_vmx {	/* Has the level1 guest done vmon? */
 	bool vmon;
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+
+	/* Are we running nested guest */
+	bool nested_mode;
+
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	bool nested_run_pending;
+
+	/* flag indicating if there was a valid IDT after exiting from l2 */
+	bool nested_pending_valid_idt;
+
 	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
 	u64 l1_cur_vmcs;
 	/*
@@ -704,6 +717,53 @@  static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
 #endif
 }
 
+
+static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
+		cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW;
+}
+
+static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->cpu_based_vm_exec_control &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+							   *vcpu)
+{
+	struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+
+	return (shadow->secondary_vm_exec_control &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+		to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr != 0;
+}
+
+static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
+		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->secondary_vm_exec_control &
+		SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->vm_entry_controls &
+		VM_ENTRY_LOAD_IA32_PAT;
+}
+
+static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->cpu_based_vm_exec_control &
+		CPU_BASED_USE_MSR_BITMAPS;
+}
+
 static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 				    u64 vmcs_addr)
 {
@@ -779,9 +839,16 @@  static struct kvm_vmx_segment_field {
 };
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
-
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
+static int launch_guest(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt);
 static int shadow_vmcs_load(struct kvm_vcpu *vcpu);
 
 /*
@@ -899,6 +966,18 @@  static inline bool cpu_has_vmx_ept_2m_page(void)
 	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
 }
 
+static inline int is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_nmi(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
 	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -1460,6 +1539,9 @@  static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -3238,6 +3320,14 @@  static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (nested_vmx_intr(vcpu))
+				return;
+		}
+		return;
+	}
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -3289,10 +3379,25 @@  static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 }
 
+static void vmx_set_irq(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode)
+		return;
+
+	if (nested_vmx_intr(vcpu))
+		return;
+
+	vmx_inject_irq(vcpu);
+}
+
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vmx->nested.nested_mode) {
+		return;
+	}
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon
@@ -3334,6 +3439,13 @@  static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (!nested_vmx_intr(vcpu))
+				return 0;
+		}
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3988,10 +4100,25 @@  static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
 {
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	if (!to_vmx(vcpu)->nested.vmclear)
+		return 1;
+
+	return launch_guest(vcpu);
+}
+
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (to_vmx(vcpu)->nested.vmclear)
+		return 1;
+
+	return launch_guest(vcpu);
 }
 
 static int handle_vmread(struct kvm_vcpu *vcpu)
@@ -4421,11 +4548,11 @@  static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -4452,6 +4579,16 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
 	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
 
 	/* If we need to emulate an MMIO from handle_invalid_guest_state
 	 * we just return 0 */
@@ -4481,7 +4618,7 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
 
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4528,10 +4665,13 @@  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
+	if (vmx->nested.nested_mode)
+		return;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
 	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
@@ -4634,6 +4774,62 @@  static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int irq;
+	int type;
+	int errCodeValid;
+	u32 idt_vectoring_info;
+	u32 guest_intr;
+	bool nmi_window_open;
+	bool interrupt_window_open;
+
+	if (vmx->nested.nested_mode && vmx->nested.nested_pending_valid_idt) {
+		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+		irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+		errCodeValid = idt_vectoring_info &
+			VECTORING_INFO_DELIVER_CODE_MASK;
+
+		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+		nmi_window_open =
+			!(guest_intr & (GUEST_INTR_STATE_STI |
+					GUEST_INTR_STATE_MOV_SS |
+					GUEST_INTR_STATE_NMI));
+
+		interrupt_window_open =
+			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+			 !(guest_intr & (GUEST_INTR_STATE_STI |
+					 GUEST_INTR_STATE_MOV_SS)));
+
+		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+			return 0;
+		}
+
+		if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+			return 0;
+		}
+
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			irq | type | INTR_INFO_VALID_MASK | errCodeValid);
+
+
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+		if (errCodeValid)
+			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				     vmcs_read32(IDT_VECTORING_ERROR_CODE));
+
+		return 1;
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -4646,6 +4842,15 @@  static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	nested_handle_pending_idt(vcpu);
+
+	if (vmx->nested.nested_mode) {
+		vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
+		vmcs_write32(EXCEPTION_BITMAP, vmx->nested.l2_state->shadow_vmcs->
+			     exception_bitmap |
+			     vmx->nested.l1_state->shadow_vmcs->exception_bitmap);
+	}
+
 	if (enable_ept && is_paging(vcpu)) {
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
 		ept_load_pdptrs(vcpu);
@@ -4783,12 +4988,19 @@  static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	get_debugreg(vcpu->arch.dr6, 6);
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+	vmx->nested.nested_pending_valid_idt = vmx->nested.nested_mode &&
+		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
+	if (vmx->nested.nested_mode)
+		vmx->nested.vmclear = 0;
+
 	vmx_complete_interrupts(vmx);
 }
 
@@ -4871,6 +5083,11 @@  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.l1_cur_vmcs = 0;
+
+	vmx->nested.l1_state = NULL;
+	vmx->nested.l2_state = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -5122,6 +5339,228 @@  static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *l2_shadow_vmcs =
+		to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+	struct shadow_vmcs *l1_shadow_vmcs =
+		to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
+
+	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+
+	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+	l2_shadow_vmcs->guest_physical_address =
+		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	l2_shadow_vmcs->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	l2_shadow_vmcs->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	l2_shadow_vmcs->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vm_instruction_error =
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	l2_shadow_vmcs->vm_exit_intr_error_code =
+		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	l2_shadow_vmcs->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	l2_shadow_vmcs->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	l2_shadow_vmcs->vm_exit_instruction_len =
+		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vmx_instruction_info =
+		vmcs_read32(VMX_INSTRUCTION_INFO);
+	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	l2_shadow_vmcs->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	l2_shadow_vmcs->guest_activity_state =
+		vmcs_read32(GUEST_ACTIVITY_STATE);
+	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	l1_shadow_vmcs->host_ia32_sysenter_cs =
+		vmcs_read32(HOST_IA32_SYSENTER_CS);
+
+	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
+	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
+	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
+	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
+	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	l2_shadow_vmcs->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+	l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
+	l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
+	l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
+	l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	l1_shadow_vmcs->host_ia32_sysenter_esp =
+		vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	l1_shadow_vmcs->host_ia32_sysenter_eip =
+		vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
+	l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
+}
+
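+/*
+ * Copy the guest-state fields that are handled identically in both switch
+ * directions (L1->L2 and L2->L1) from a shadow VMCS into the currently
+ * loaded hardware VMCS.
+ */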
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
+	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		     src->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		     src->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
+
+	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
+	vmcs_writel(GUEST_DR7, src->guest_dr7);
+	vmcs_writel(GUEST_RSP, src->guest_rsp);
+	vmcs_writel(GUEST_RIP, src->guest_rip);
+	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		    src->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
+
+	return 0;
+}
+
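+/*
+ * Copy the host-state area (plus the TSC offset) from a shadow VMCS into
+ * the currently loaded hardware VMCS.
+ */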
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+	vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+	vmcs_writel(HOST_CR0, src->host_cr0);
+	vmcs_writel(HOST_CR3, src->host_cr3);
+	vmcs_writel(HOST_CR4, src->host_cr4);
+	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+	vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+	vmcs_writel(HOST_RSP, src->host_rsp);
+	vmcs_writel(HOST_RIP, src->host_rip);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+	return 0;
+}
+
 struct level_state *create_state(void)
 {
 	struct level_state *state = NULL;
@@ -5176,6 +5615,685 @@  int create_l2_state(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
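+/*
+ * Build the VMCS that will actually be used to run the L2 guest ("vmcs02"):
+ * the guest state and controls requested by L1 through its shadow VMCS are
+ * merged with the controls and host state that L0 needs in order to keep
+ * control of the real hardware.
+ */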
+int prepare_vmcs_02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
+	u32 exec_control;
+
+	if (!src) {
+		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
+		return 1;
+	}
+
+	load_vmcs_common(src);
+
+	if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
+
+	if (vmx->nested.l2_state->io_bitmap_a)
+		vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
+
+	if (vmx->nested.l2_state->io_bitmap_b)
+		vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
+
+	if (vmx->nested.l2_state->msr_bitmap)
+		vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
+
+	if (src->vm_entry_msr_load_count > 0) {
+		struct page *page;
+
+		page = nested_get_page(vcpu,
+				       src->vm_entry_msr_load_addr);
+		if (!page)
+			return 1;
+
+		vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
+
+		kvm_release_page_clean(page);
+	}
+
+	if (src->virtual_apic_page_addr != 0) {
+		struct page *page;
+
+		page = nested_get_page(vcpu,
+				       src->virtual_apic_page_addr);
+		if (!page)
+			return 1;
+
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
+
+		kvm_release_page_clean(page);
+	} else {
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+			     src->virtual_apic_page_addr);
+	}
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm)) {
+		if (src->apic_access_addr != 0) {
+			struct page *page =
+				nested_get_page(vcpu, src->apic_access_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+			kvm_release_page_clean(page);
+		} else {
+			vmcs_write64(APIC_ACCESS_ADDR, 0);
+		}
+	}
+
+	if (vm_need_tpr_shadow(vcpu->kvm) &&
+	    nested_cpu_has_vmx_tpr_shadow(vcpu))
+		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
+
+	if (enable_ept) {
+		if (!nested_cpu_has_vmx_ept(vcpu)) {
+			vmcs_write64(EPT_POINTER,
+				     vmx->nested.l1_state->shadow_vmcs->ept_pointer);
+			vmcs_write64(GUEST_PDPTR0,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr0);
+			vmcs_write64(GUEST_PDPTR1,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr1);
+			vmcs_write64(GUEST_PDPTR2,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr2);
+			vmcs_write64(GUEST_PDPTR3,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr3);
+		}
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
+		      src->pin_based_vm_exec_control));
+
+	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+	exec_control |= src->cpu_based_vm_exec_control;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm) ||
+	    src->virtual_apic_page_addr == 0) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+			CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	} else if (exec_control & CPU_BASED_TPR_SHADOW) {
+
+#ifdef CONFIG_X86_64
+		exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
+		exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      src->exception_bitmap));
+
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_mask &
+		      src->page_fault_error_code_mask));
+
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_match &
+		      src->page_fault_error_code_match));
+
+	vmcs_write32(VM_EXIT_CONTROLS,
+		     ((vmx->nested.l1_state->shadow_vmcs->vm_exit_controls &
+		       NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     (vmx->nested.l1_state->shadow_vmcs->vm_entry_controls &
+		      NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls()) {
+
+		exec_control =
+			vmx->nested.l1_state->shadow_vmcs->secondary_vm_exec_control;
+
+		if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
+
+			exec_control |= src->secondary_vm_exec_control;
+
+			if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
+			    !nested_vm_need_virtualize_apic_accesses(vcpu))
+				exec_control &=
+				       ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		}
+
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+	vmcs_writel(CR0_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr0_guest_host_mask  &
+		     src->cr0_guest_host_mask));
+	vmcs_writel(CR4_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr4_guest_host_mask  &
+		     src->cr4_guest_host_mask));
+
+	load_vmcs_host_state(vmx->nested.l1_state->shadow_vmcs);
+
+	return 0;
+}
+
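+/*
+ * Reload the hardware VMCS control and guest-state fields from the L1
+ * shadow VMCS, undoing the merge performed by prepare_vmcs_02, when
+ * switching from L2 back to L1.
+ */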
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_state->shadow_vmcs);
+
+	return 0;
+}
+
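+/*
+ * Write any dirty cached registers back to the VMCS before it is switched;
+ * only RSP and RIP are expected to be dirty at this point.
+ */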
+void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+	unsigned long mask;
+
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
+
+	if (vcpu->arch.regs_dirty & mask) {
+		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
+		       vcpu->arch.regs_dirty, mask);
+		WARN_ON(1);
+	}
+
+	vcpu->arch.regs_dirty = 0;
+}
+
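+/*
+ * Enter the L2 guest: save L1 state, switch the hardware VMCS to the one
+ * allocated for L2, load the guest state requested by L1 and let the next
+ * vmx_vcpu_run() resume execution in L2.
+ */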
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+	/* verify that l1 has done vmptrld for l2 earlier */
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int initial_fpu_active = vcpu->fpu_active;
+	int r = 0;
+
+	if (vmx->nested.nested_mode) {
+		printk(KERN_INFO "Nested guest already running\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	vmx->nested.nested_mode = 1;
+
+	vcpu->arch.exception.pending = false;
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	save_vmcs(vmx->nested.l1_state->shadow_vmcs);
+
+	vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
+	if (!enable_ept)
+		vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
+	vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
+
+	if (enable_vpid) {
+		if (vmx->nested.l2_state->vpid == 0) {
+			allocate_vpid(vmx);
+			vmx->nested.l2_state->vpid = vmx->vpid;
+		}
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l1_state->msr_bitmap = 0;
+
+	vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	vmx->nested.l1_state->vmcs = vmx->vmcs;
+	vmx->nested.l1_state->cpu = vcpu->cpu;
+	vmx->nested.l1_state->launched = vmx->launched;
+
+	vmx->vmcs = vmx->nested.l2_state->vmcs;
+	vcpu->cpu = vmx->nested.l2_state->cpu;
+	vmx->launched = vmx->nested.l2_state->launched;
+
+	if (vmx->nested.vmclear || !vmx->launched) {
+		vmcs_clear(vmx->vmcs);
+		vmx->launched = 0;
+	}
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	prepare_vmcs_02(vcpu);
+
+	if (vmx->nested.l2_state->shadow_vmcs->vm_entry_controls &
+	    VM_ENTRY_IA32E_MODE) {
+		if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
+		      (vcpu->arch.shadow_efer & EFER_LME)))
+			vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
+	} else {
+		if ((vcpu->arch.shadow_efer & EFER_LMA) ||
+		    (vcpu->arch.shadow_efer & EFER_LME))
+			vcpu->arch.shadow_efer = 0;
+	}
+
+	vmx_set_cr0(vcpu, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW,
+		    vmx->nested.l2_state->shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l2_state->shadow_vmcs->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW,
+		    vmx->nested.l2_state->shadow_vmcs->cr4_read_shadow);
+
+	vcpu->arch.cr0 |= X86_CR0_PG;
+
+	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
+		vmcs_writel(GUEST_CR3, vmx->nested.l2_state->shadow_vmcs->guest_cr3);
+		vmx->vcpu.arch.cr3 = vmx->nested.l2_state->shadow_vmcs->guest_cr3;
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l2_state->shadow_vmcs->guest_cr3);
+		kvm_mmu_reset_context(vcpu);
+
+		r = kvm_mmu_load(vcpu);
+		if (unlikely(r)) {
+			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+			nested_vmx_vmexit(vcpu, false);
+			set_rflags_to_vmx_fail_valid(vcpu);
+			return 1;
+		}
+
+	}
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l2_state->shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l2_state->shadow_vmcs->guest_rip);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      vmx->nested.l2_state->shadow_vmcs->exception_bitmap));
+
+	if (initial_fpu_active)
+		vmx_fpu_activate(vcpu);
+
+	return 1;
+}
+
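+/* Common exit handler for VMLAUNCH and VMRESUME executed by L1. */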
+static int launch_guest(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+
+	nested_vmx_run(vcpu);
+
+	return 1;
+}
+
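+/*
+ * Emulate a VM exit from L2 to L1: save the L2 guest state into L1's shadow
+ * VMCS (prepare_vmcs_12), switch back to the L1 VMCS and restore L1's
+ * register and control-register state.
+ */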
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int initial_fpu_active = vcpu->fpu_active;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.l2_state->launched = vmx->launched;
+	vmx->nested.l2_state->cpu = vcpu->cpu;
+
+	vmx->vmcs = vmx->nested.l1_state->vmcs;
+	vcpu->cpu = vmx->nested.l1_state->cpu;
+	vmx->launched = vmx->nested.l1_state->launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.exception.pending = false;
+
+	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
+	vmx_set_cr0(vcpu, vmx->nested.l1_state->shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_state->shadow_vmcs->guest_cr3;
+		vmcs_writel(GUEST_CR3, vmx->nested.l1_state->shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
+	}
+
+	switch_back_vmcs(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	if (initial_fpu_active)
+		vmx_fpu_activate(vcpu);
+
+	return 0;
+}
+
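+/*
+ * Consult the MSR bitmap supplied by L1 to decide whether an MSR read/write
+ * exit that occurred in L2 must be reflected to L1 (return 1) or can be
+ * handled by L0 (return 0).
+ */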
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		struct page *msr_page = NULL;
+		void *msr_bitmap_va;
+		int handled = 0;
+		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+		struct shadow_vmcs *l2svmcs =
+			to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+
+		if (!cpu_has_vmx_msr_bitmap()
+		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+			return 1;
+
+		msr_page = nested_get_page(vcpu,
+					   l2svmcs->msr_bitmap);
+
+		if (!msr_page) {
+			printk(KERN_INFO "%s error in nested_get_page\n",
+			       __func__);
+			return 0;
+		}
+
+		/*
+		 * The MSR bitmap lives in L1 guest memory; map the page
+		 * before testing bits in it.
+		 */
+		msr_bitmap_va = kmap(msr_page);
+
+		switch (exit_code) {
+		case EXIT_REASON_MSR_READ:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_bitmap_va +
+							       0x000)))
+					handled = 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_bitmap_va +
+							       0x400)))
+					handled = 1;
+			}
+			break;
+		case EXIT_REASON_MSR_WRITE:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_bitmap_va +
+							       0x800)))
+					handled = 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_bitmap_va +
+							       0xc00)))
+					handled = 1;
+			}
+			break;
+		}
+
+		kunmap(msr_page);
+		kvm_release_page_clean(msr_page);
+
+		return handled;
+	}
+
+	return 0;
+}
+
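+/*
+ * Decide whether an exit that occurred while running L2 should be reflected
+ * to L1 (return 1) or handled directly by L0 (return 0), based on the exit
+ * reason and the controls L1 configured.  With kvm_override set, a few exit
+ * reasons are always taken by L0 first.
+ */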
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs = vmx->nested.l2_state->shadow_vmcs;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (kvm_override) {
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+
+			if (is_page_fault(intr_info) && (!enable_ept))
+				return 0;
+
+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+
+			break;
+		}
+	}
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_INVLPG_EXITING)
+			return 1;
+
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		return nested_vmx_exit_handled_msr(vcpu);
+	case EXIT_REASON_CR_ACCESS: {
+		unsigned long exit_qualification =
+			vmcs_readl(EXIT_QUALIFICATION);
+		int cr = exit_qualification & 15;
+		int reg = (exit_qualification >> 8) & 15;
+		unsigned long val = kvm_register_read(vcpu, reg);
+
+		switch ((exit_qualification >> 4) & 3) {
+		case 0: /* mov to cr */
+			switch (cr) {
+			case 0:
+				if (l2svmcs->cr0_guest_host_mask &
+				    (val ^ l2svmcs->cr0_read_shadow))
+					return 1;
+				break;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_LOAD_EXITING)
+					return 1;
+				break;
+			case 4:
+				if (l2svmcs->cr4_guest_host_mask &
+				    (l2svmcs->cr4_read_shadow ^ val))
+					return 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_LOAD_EXITING)
+					return 1;
+				break;
+			}
+			break;
+		case 2: /* clts */
+			/* CLTS clears only CR0.TS */
+			if ((l2svmcs->cr0_guest_host_mask & X86_CR0_TS) &&
+			    (l2svmcs->cr0_read_shadow & X86_CR0_TS))
+				return 1;
+			break;
+		case 1: /*mov from cr*/
+			switch (cr) {
+			case 0:
+				return 1;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_STORE_EXITING)
+					return 1;
+				break;
+			case 4:
+				return 1;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_STORE_EXITING)
+					return 1;
+				break;
+			}
+			break;
+		case 3: /* lmsw */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				return 1;
+			break;
+		}
+		break;
+	}
+	case EXIT_REASON_DR_ACCESS: {
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_MOV_DR_EXITING)
+			return 1;
+		break;
+	}
+
+	case EXIT_REASON_EXCEPTION_NMI: {
+
+		if (is_external_interrupt(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_EXT_INTR_MASK))
+			return 1;
+
+		if (is_nmi(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_NMI_EXITING))
+			return 1;
+
+		if (is_exception(intr_info) &&
+		    (l2svmcs->exception_bitmap &
+		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			return 1;
+
+		if (is_page_fault(intr_info))
+			return 1;
+
+		break;
+	}
+
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
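+/*
+ * If L1 intercepts this exception, emulate a VM exit to L1 with reason
+ * EXCEPTION_NMI and the matching interruption information.  Returns 1 if
+ * the exception was reflected to L1.
+ */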
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	if (vmx->nested.nested_mode) {
+		if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
+			nested_vmx_vmexit(&vmx->vcpu, false);
+			vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
+				EXIT_REASON_EXCEPTION_NMI;
+			vmx->nested.l2_state->shadow_vmcs->vm_exit_intr_info =
+				(nr | INTR_TYPE_HARD_EXCEPTION
+				 | (has_error_code ?
+				    INTR_INFO_DELIVER_CODE_MASK : 0)
+				 | INTR_INFO_VALID_MASK);
+
+			if (has_error_code)
+				vmx->nested.l2_state->shadow_vmcs->
+					vm_exit_intr_error_code = error_code;
+			return 1;
+		}
+	}
+	return 0;
+}
+
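+/*
+ * An external interrupt arrived while L2 was running.  If L1 requested
+ * external-interrupt exiting, emulate a VM exit to L1 so it can handle the
+ * interrupt; returns 1 if such an exit was performed.
+ */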
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->nested.nested_mode) {
+		if (vmx->nested.l2_state->shadow_vmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK) {
+			if (vmx->nested.nested_run_pending)
+				return 0;
+
+			nested_vmx_vmexit(vcpu, true);
+			return 1;
+		}
+	}
+
+	return 0;
+}
 
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
@@ -5224,7 +6342,7 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.set_interrupt_shadow = vmx_set_interrupt_shadow,
 	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
-	.set_irq = vmx_inject_irq,
+	.set_irq = vmx_set_irq,
 	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
 	.interrupt_allowed = vmx_interrupt_allowed,