[v3] KVM: x86: INIT and reset sequences are different

Message ID 1428924848-28212-1-git-send-email-namit@cs.technion.ac.il (mailing list archive)
State New, archived

Commit Message

Nadav Amit April 13, 2015, 11:34 a.m. UTC
The x86 architecture defines differences between the reset and INIT sequences.
INIT does not initialize the FPU (including MMX, XMM, YMM, etc.), TSC, PMU,
MSRs (in general), MTRRs, machine-check state, APIC ID, APIC arbitration ID
and the BSP flag.

References (from Intel SDM):

"If the MP protocol has completed and a BSP is chosen, subsequent INITs (either
to a specific processor or system wide) do not cause the MP protocol to be
repeated." [8.4.2: MP Initialization Protocol Requirements and Restrictions]

[Table 9-1. IA-32 Processor States Following Power-up, Reset, or INIT]

"If the processor is reset by asserting the INIT# pin, the x87 FPU state is not
changed." [9.2: X87 FPU INITIALIZATION]

"The state of the local APIC following an INIT reset is the same as it is after
a power-up or hardware reset, except that the APIC ID and arbitration ID
registers are not affected." [10.4.7.3: Local APIC State After an INIT Reset
(“Wait-for-SIPI” State)]

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>

---

v3:

- Leave EFER unchanged on INIT. Instead, set CR0 correctly so that vmx_set_cr0()
  recognizes that paging changed from on to off and clears LMA (sketched below).
- Clean up the surrounding code, removing unnecessary indirection through
  vmx->vcpu.
- Change svm similarly to vmx (UNTESTED).
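
A rough sketch of the mechanism the first item relies on, abridged from the
vmx.c of this era (treat the details as an approximation, not as part of
this patch):

	static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
	{
		/* ... */
		if (vcpu->arch.efer & EFER_LME) {
			if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
				enter_lmode(vcpu);
			if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
				exit_lmode(vcpu);  /* drops only EFER.LMA */
		}
		/* ... */
	}

Because vcpu->arch.cr0 still has PG set from before the INIT, passing a
PG-clear value makes vmx_set_cr0() take the exit_lmode() path, which clears
EFER.LMA while leaving the rest of EFER (notably LME) intact.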

v2:

- Same as v1 (was part of a patch-set that was modified due to missing tests)
---
 arch/x86/include/asm/kvm_host.h |  6 ++---
 arch/x86/kvm/lapic.c            | 11 +++++----
 arch/x86/kvm/lapic.h            |  2 +-
 arch/x86/kvm/svm.c              | 27 +++++++++++-----------
 arch/x86/kvm/vmx.c              | 51 +++++++++++++++++++++++------------------
 arch/x86/kvm/x86.c              | 17 ++++++++------
 6 files changed, 63 insertions(+), 51 deletions(-)
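
For orientation, the two entry points into the reset path after this patch,
distilled from the diff below:

	/* Full reset, e.g. on vCPU setup (x86.c): */
	kvm_vcpu_reset(vcpu, false);

	/* INIT, delivered through the local APIC (lapic.c): */
	if (test_bit(KVM_APIC_INIT, &pe)) {
		kvm_lapic_reset(vcpu, true);
		kvm_vcpu_reset(vcpu, true);
	}

With init_event == true, the reset path skips (among other things)
kvm_pmu_reset(), kvm_apic_set_id(), fpu_finit() and the EFER write,
matching the SDM behavior quoted above.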

Comments

Paolo Bonzini April 13, 2015, 2:45 p.m. UTC | #1
On 13/04/2015 13:34, Nadav Amit wrote:
> The x86 architecture defines differences between the reset and INIT sequences.
> INIT does not initialize the FPU (including MMX, XMM, YMM, etc.), TSC, PMU,
> MSRs (in general), MTRRs, machine-check state, APIC ID, APIC arbitration ID
> and the BSP flag.
> 
> References (from Intel SDM):
> 
> "If the MP protocol has completed and a BSP is chosen, subsequent INITs (either
> to a specific processor or system wide) do not cause the MP protocol to be
> repeated." [8.4.2: MP Initialization Protocol Requirements and Restrictions]
> 
> [Table 9-1. IA-32 Processor States Following Power-up, Reset, or INIT]
> 
> "If the processor is reset by asserting the INIT# pin, the x87 FPU state is not
> changed." [9.2: X87 FPU INITIALIZATION]
> 
> "The state of the local APIC following an INIT reset is the same as it is after
> a power-up or hardware reset, except that the APIC ID and arbitration ID
> registers are not affected." [10.4.7.3: Local APIC State After an INIT Reset
> (“Wait-for-SIPI” State)]
> 
> Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
> 
> ---
> 
> v3:
> 
> - Leave EFER unchanged on INIT. Instead, set CR0 correctly so that vmx_set_cr0()
>   recognizes that paging changed from on to off and clears LMA.
> - Clean up the surrounding code, removing unnecessary indirection through
>   vmx->vcpu.
> - Change svm similarly to vmx (UNTESTED).

Thanks, applied (locally) for 4.2.  It will take a few weeks before it
gets to kvm/next and in the meanwhile I'll make sure to test on SVM as
well, and to integrate the kvm-unit-tests patches.

Paolo

Paolo Bonzini Oct. 1, 2015, 12:55 p.m. UTC | #2
On 13/04/2015 13:34, Nadav Amit wrote:
> The x86 architecture defines differences between the reset and INIT sequences.
> INIT does not initialize the FPU (including MMX, XMM, YMM, etc.), TSC, PMU,
> MSRs (in general), MTRRs, machine-check state, APIC ID, APIC arbitration ID
> and the BSP flag.
> 
> References (from Intel SDM):
> 
> "If the MP protocol has completed and a BSP is chosen, subsequent INITs (either
> to a specific processor or system wide) do not cause the MP protocol to be
> repeated." [8.4.2: MP Initialization Protocol Requirements and Restrictions]
> 
> [Table 9-1. IA-32 Processor States Following Power-up, Reset, or INIT]
> 
> "If the processor is reset by asserting the INIT# pin, the x87 FPU state is not
> changed." [9.2: X87 FPU INITIALIZATION]
> 
> "The state of the local APIC following an INIT reset is the same as it is after
> a power-up or hardware reset, except that the APIC ID and arbitration ID
> registers are not affected." [10.4.7.3: Local APIC State After an INIT Reset
> (“Wait-for-SIPI” State)]
> 
> Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
> 
> ---
> 
> v3:
> 
> - Leave EFER unchanged on INIT. Instead, set CR0 correctly so that vmx_set_cr0()
>   recognizes that paging changed from on to off and clears LMA.

I wonder if this change from v2 to v3 was correct.

It means that a 32-bit firmware cannot enter paging mode without
clearing EFER.LME first (which it should not know about).

Yang, can you check what real hardware does to EFER on an INIT?  Perhaps
it only clears EFER.LME (in addition of course to EFER.LMA, which is
cleared as a side effect of writing CR0).

Thanks,

Paolo
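
(If hardware indeed clears only EFER.LME, as Paolo speculates, the INIT path
would need something like the following instead of leaving EFER completely
untouched. This is a hypothetical sketch, not code from this thread; EFER_LME
is the architectural bit, vcpu->arch.efer the cached guest value:

	if (init_event)
		/* hypothetical: keep EFER but drop LME on INIT */
		vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LME);
	else
		vmx_set_efer(vcpu, 0);

EFER.LMA would still be cleared as a side effect of the CR0 write, as
described above.)
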
Zhang, Yang Z Oct. 9, 2015, 8:06 a.m. UTC | #3
Paolo Bonzini wrote on 2015-10-01:

Hi Paolo

Sorry for the late reply. I am just back from vacation.

> On 13/04/2015 13:34, Nadav Amit wrote:
>> The x86 architecture defines differences between the reset and INIT
>> sequences. INIT does not initialize the FPU (including MMX, XMM, YMM,
>> etc.), TSC, PMU, MSRs (in general), MTRRs, machine-check state, APIC
>> ID, APIC arbitration ID and the BSP flag.
>>
>> References (from Intel SDM):
>>
>> "If the MP protocol has completed and a BSP is chosen, subsequent INITs
>> (either to a specific processor or system wide) do not cause the MP
>> protocol to be repeated." [8.4.2: MP Initialization Protocol
>> Requirements and Restrictions]
>>
>> [Table 9-1. IA-32 Processor States Following Power-up, Reset, or INIT]
>>
>> "If the processor is reset by asserting the INIT# pin, the x87 FPU
>> state is not changed." [9.2: X87 FPU INITIALIZATION]
>>
>> "The state of the local APIC following an INIT reset is the same as it
>> is after a power-up or hardware reset, except that the APIC ID and
>> arbitration ID registers are not affected." [10.4.7.3: Local APIC State
>> After an INIT Reset (“Wait-for-SIPI” State)]
>>
>> Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
>>
>> ---
>>
>> v3:
>>
>> - Leave EFER unchanged on INIT. Instead, set CR0 correctly so that
>>   vmx_set_cr0() recognizes that paging changed from on to off and
>>   clears LMA.
>
> I wonder if this change from v2 to v3 was correct.
>
> It means that a 32-bit firmware cannot enter paging mode without
> clearing EFER.LME first (which it should not know about).
>
> Yang, can you check what real hardware does to EFER on an INIT?  Perhaps
> it only clears EFER.LME (in addition of course to EFER.LMA, which is
> cleared as a side effect of writing CR0).

Sure, I will check it with our hardware expert.

> Thanks,
>
> Paolo

Best regards,
Yang

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f80ad59..3a19e30 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -711,7 +711,7 @@  struct kvm_x86_ops {
 	/* Create, but do not attach this VCPU */
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
-	void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
 	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -1001,7 +1001,7 @@  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-int fx_init(struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu, bool init_event);
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes);
@@ -1145,7 +1145,7 @@  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 					   unsigned long address);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fe2d89e..a91fb2f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1557,7 +1557,7 @@  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 }
 
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct kvm_lapic *apic;
 	int i;
@@ -1571,7 +1571,8 @@  void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	/* Stop the timer in case it's a reset to an active apic */
 	hrtimer_cancel(&apic->lapic_timer.timer);
 
-	kvm_apic_set_id(apic, vcpu->vcpu_id);
+	if (!init_event)
+		kvm_apic_set_id(apic, vcpu->vcpu_id);
 	kvm_apic_set_version(apic->vcpu);
 
 	for (i = 0; i < APIC_LVT_NUM; i++)
@@ -1713,7 +1714,7 @@  int kvm_create_lapic(struct kvm_vcpu *vcpu)
 			APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
 
 	static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
-	kvm_lapic_reset(vcpu);
+	kvm_lapic_reset(vcpu, false);
 	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
 	return 0;
@@ -2047,8 +2048,8 @@  void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 	pe = xchg(&apic->pending_events, 0);
 
 	if (test_bit(KVM_APIC_INIT, &pe)) {
-		kvm_lapic_reset(vcpu);
-		kvm_vcpu_reset(vcpu);
+		kvm_lapic_reset(vcpu, true);
+		kvm_vcpu_reset(vcpu, true);
 		if (kvm_vcpu_is_bsp(apic->vcpu))
 			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 		else
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383..793f761 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,7 +48,7 @@  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 46299da..b6ad0f9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1082,7 +1082,7 @@  static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 	return target_tsc - tsc;
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct vcpu_svm *svm, bool init_event)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1153,17 @@  static void init_vmcb(struct vcpu_svm *svm)
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-	svm_set_efer(&svm->vcpu, 0);
+	if (!init_event)
+		svm_set_efer(&svm->vcpu, 0);
 	save->dr6 = 0xffff0ff0;
 	kvm_set_rflags(&svm->vcpu, 2);
 	save->rip = 0x0000fff0;
 	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
 	/*
-	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+	 * It also updates the guest-visible cr0 value.
 	 */
-	svm->vcpu.arch.cr0 = 0;
 	(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
 
 	save->cr4 = X86_CR4_PAE;
@@ -1195,13 +1195,19 @@  static void init_vmcb(struct vcpu_svm *svm)
 	enable_gif(svm);
 }
 
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u32 dummy;
 	u32 eax = 1;
 
-	init_vmcb(svm);
+	if (!init_event) {
+		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+					   MSR_IA32_APICBASE_ENABLE;
+		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+	}
+	init_vmcb(svm, init_event);
 
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1263,7 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	clear_page(svm->vmcb);
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
-	init_vmcb(svm);
-
-	svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-				   MSR_IA32_APICBASE_ENABLE;
-	if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+	init_vmcb(svm, false);
 
 	svm_init_osvw(&svm->vcpu);
 
@@ -1884,7 +1885,7 @@  static int shutdown_interception(struct vcpu_svm *svm)
 	 * so reinitialize it.
 	 */
 	clear_page(svm->vmcb);
-	init_vmcb(svm);
+	init_vmcb(svm, false);
 
 	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
 	return 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8c14d6a..f7a0a7f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4694,22 +4694,27 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	return 0;
 }
 
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct msr_data apic_base_msr;
+	u64 cr0;
 
 	vmx->rmode.vm86_active = 0;
 
 	vmx->soft_vnmi_blocked = 0;
 
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	kvm_set_cr8(&vmx->vcpu, 0);
-	apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
-	if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
-		apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
-	apic_base_msr.host_initiated = true;
-	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
+	kvm_set_cr8(vcpu, 0);
+
+	if (!init_event) {
+		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+				     MSR_IA32_APICBASE_ENABLE;
+		if (kvm_vcpu_is_reset_bsp(vcpu))
+			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+		apic_base_msr.host_initiated = true;
+		kvm_set_apic_base(vcpu, &apic_base_msr);
+	}
 
 	vmx_segment_cache_clear(vmx);
 
@@ -4733,9 +4738,12 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
 	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 
-	vmcs_write32(GUEST_SYSENTER_CS, 0);
-	vmcs_writel(GUEST_SYSENTER_ESP, 0);
-	vmcs_writel(GUEST_SYSENTER_EIP, 0);
+	if (!init_event) {
+		vmcs_write32(GUEST_SYSENTER_CS, 0);
+		vmcs_writel(GUEST_SYSENTER_ESP, 0);
+		vmcs_writel(GUEST_SYSENTER_EIP, 0);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+	}
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
 	kvm_rip_write(vcpu, 0xfff0);
@@ -4750,18 +4758,15 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
-	/* Special registers */
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
 	setup_msrs(vmx);
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
 
-	if (cpu_has_vmx_tpr_shadow()) {
+	if (cpu_has_vmx_tpr_shadow() && !init_event) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+		if (vm_need_tpr_shadow(vcpu->kvm))
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-				     __pa(vmx->vcpu.arch.apic->regs));
+				     __pa(vcpu->arch.apic->regs));
 		vmcs_write32(TPR_THRESHOLD, 0);
 	}
 
@@ -4773,12 +4778,14 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	if (vmx->vpid != 0)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-	vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-	vmx_set_cr4(&vmx->vcpu, 0);
-	vmx_set_efer(&vmx->vcpu, 0);
-	vmx_fpu_activate(&vmx->vcpu);
-	update_exception_bitmap(&vmx->vcpu);
+	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+	vmx_set_cr0(vcpu, cr0); /* enter rmode */
+	vmx->vcpu.arch.cr0 = cr0;
+	vmx_set_cr4(vcpu, 0);
+	if (!init_event)
+		vmx_set_efer(vcpu, 0);
+	vmx_fpu_activate(vcpu);
+	update_exception_bitmap(vcpu);
 
 	vpid_sync_context(vmx);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c3859a6..ea3584b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7011,7 +7011,7 @@  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
 {
 	int err;
 
@@ -7019,7 +7019,9 @@  int fx_init(struct kvm_vcpu *vcpu)
 	if (err)
 		return err;
 
-	fpu_finit(&vcpu->arch.guest_fpu);
+	if (!init_event)
+		fpu_finit(&vcpu->arch.guest_fpu);
+
 	if (cpu_has_xsaves)
 		vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
 			host_xcr0 | XSTATE_COMPACTION_ENABLED;
@@ -7099,7 +7101,7 @@  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	kvm_vcpu_reset(vcpu);
+	kvm_vcpu_reset(vcpu, false);
 	kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
 
@@ -7137,7 +7139,7 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -7164,13 +7166,14 @@  void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
 
-	kvm_pmu_reset(vcpu);
+	if (!init_event)
+		kvm_pmu_reset(vcpu);
 
 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
 	vcpu->arch.regs_avail = ~0;
 	vcpu->arch.regs_dirty = ~0;
 
-	kvm_x86_ops->vcpu_reset(vcpu);
+	kvm_x86_ops->vcpu_reset(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7352,7 +7355,7 @@  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_mce_banks;
 	}
 
-	r = fx_init(vcpu);
+	r = fx_init(vcpu, false);
 	if (r)
 		goto fail_free_wbinvd_dirty_mask;