diff mbox

[v3,1/4] KVM: nVMX: fix CR3 load if L2 uses PAE paging and EPT

Message ID 1480518191-4572-2-git-send-email-lprosek@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ladi Prosek Nov. 30, 2016, 3:03 p.m. UTC
KVM does not correctly handle L1 hypervisors that emulate L2 real mode with
PAE and EPT, such as Hyper-V. In this mode, the L1 hypervisor populates guest
PDPTE VMCS fields and leaves guest CR3 uninitialized because it is not used
(see 26.3.2.4 Loading Page-Directory-Pointer-Table Entries). KVM always
dereferences CR3 and tries to load PDPTEs if PAE is on. This leads to two
related issues:

1) On the first nested vmentry, the guest PDPTEs, as populated by L1, are
overwritten in ept_load_pdptrs because the registers are believed to have
been loaded in load_pdptrs as part of kvm_set_cr3. This is incorrect. L2 is
running with PAE enabled but PDPTRs have been set up by L1.

2) When L2 is about to enable paging and loads its CR3, we, again, attempt
to load PDPTEs in load_pdptrs called from kvm_set_cr3. There are no guarantees
that this will succeed (it's just a CR3 load, paging is not enabled yet) and
if it doesn't, kvm_set_cr3 returns early without persisting the CR3 which is
then lost and L2 crashes right after it enables paging.

This patch replaces the kvm_set_cr3 call with a simple register write if PAE
and EPT are both on. CR3 is not to be interpreted in this case.

Signed-off-by: Ladi Prosek <lprosek@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

Comments

Wanpeng Li Dec. 21, 2016, 7:31 a.m. UTC | #1
2016-11-30 23:03 GMT+08:00 Ladi Prosek <lprosek@redhat.com>:
> KVM does not correctly handle L1 hypervisors that emulate L2 real mode with
> PAE and EPT, such as Hyper-V. In this mode, the L1 hypervisor populates guest
> PDPTE VMCS fields and leaves guest CR3 uninitialized because it is not used

How L1 hypervisor populates guest PDPTE VMCS fields if undereference CR3?

> (see 26.3.2.4 Loading Page-Directory-Pointer-Table Entries). KVM always
> dereferences CR3 and tries to load PDPTEs if PAE is on. This leads to two
> related issues:
>
> 1) On the first nested vmentry, the guest PDPTEs, as populated by L1, are
> overwritten in ept_load_pdptrs because the registers are believed to have
> been loaded in load_pdptrs as part of kvm_set_cr3. This is incorrect. L2 is
> running with PAE enabled but PDPTRs have been set up by L1.
>
> 2) When L2 is about to enable paging and loads its CR3, we, again, attempt
> to load PDPTEs in load_pdptrs called from kvm_set_cr3. There are no guarantees
> that this will succeed (it's just a CR3 load, paging is not enabled yet) and
> if it doesn't, kvm_set_cr3 returns early without persisting the CR3 which is
> then lost and L2 crashes right after it enables paging.
>
> This patch replaces the kvm_set_cr3 call with a simple register write if PAE
> and EPT are both on. CR3 is not to be interpreted in this case.
>
> Signed-off-by: Ladi Prosek <lprosek@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 16 ++++++++++++++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 81fbda0..034dc7a 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -9808,6 +9808,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  {
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
>         u32 exec_control;
> +       bool nested_ept_enabled = false;
>
>         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
> @@ -9972,6 +9973,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>                                 vmcs12->guest_intr_status);
>                 }
>
> +               nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
>                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
>         }
>
> @@ -10113,8 +10115,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
>         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
>
> -       /* shadow page tables on either EPT or shadow page tables */
> -       kvm_set_cr3(vcpu, vmcs12->guest_cr3);
> +       /*
> +        * Shadow page tables on either EPT or shadow page tables.
> +        * If PAE and EPT are both on, CR3 is not used by the CPU and must not
> +        * be dereferenced.
> +        */
> +       if (is_pae(vcpu) && is_paging(vcpu) && !is_long_mode(vcpu) &&
> +           nested_ept_enabled) {
> +               vcpu->arch.cr3 = vmcs12->guest_cr3;
> +               __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> +       } else
> +               kvm_set_cr3(vcpu, vmcs12->guest_cr3);
> +
>         kvm_mmu_reset_context(vcpu);
>
>         if (!enable_ept)
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ladi Prosek Dec. 21, 2016, 8:09 a.m. UTC | #2
On Wed, Dec 21, 2016 at 8:31 AM, Wanpeng Li <kernellwp@gmail.com> wrote:
> 2016-11-30 23:03 GMT+08:00 Ladi Prosek <lprosek@redhat.com>:
>> KVM does not correctly handle L1 hypervisors that emulate L2 real mode with
>> PAE and EPT, such as Hyper-V. In this mode, the L1 hypervisor populates guest
>> PDPTE VMCS fields and leaves guest CR3 uninitialized because it is not used
>
> How L1 hypervisor populates guest PDPTE VMCS fields if undereference CR3?

L2 is free to write anything to CR3. From its perspective it's still
running in real mode so CR3 is effectively a general purpose register.
If L1 intercepts CR3 accesses, it just shadows the value in the guest
CR3 VMCS field so it's preserved across vmexists. It does not and
should not dereference it.

Let me know if I misunderstood the question.

>> (see 26.3.2.4 Loading Page-Directory-Pointer-Table Entries). KVM always
>> dereferences CR3 and tries to load PDPTEs if PAE is on. This leads to two
>> related issues:
>>
>> 1) On the first nested vmentry, the guest PDPTEs, as populated by L1, are
>> overwritten in ept_load_pdptrs because the registers are believed to have
>> been loaded in load_pdptrs as part of kvm_set_cr3. This is incorrect. L2 is
>> running with PAE enabled but PDPTRs have been set up by L1.
>>
>> 2) When L2 is about to enable paging and loads its CR3, we, again, attempt
>> to load PDPTEs in load_pdptrs called from kvm_set_cr3. There are no guarantees
>> that this will succeed (it's just a CR3 load, paging is not enabled yet) and
>> if it doesn't, kvm_set_cr3 returns early without persisting the CR3 which is
>> then lost and L2 crashes right after it enables paging.
>>
>> This patch replaces the kvm_set_cr3 call with a simple register write if PAE
>> and EPT are both on. CR3 is not to be interpreted in this case.
>>
>> Signed-off-by: Ladi Prosek <lprosek@redhat.com>
>> ---
>>  arch/x86/kvm/vmx.c | 16 ++++++++++++++--
>>  1 file changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 81fbda0..034dc7a 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -9808,6 +9808,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  {
>>         struct vcpu_vmx *vmx = to_vmx(vcpu);
>>         u32 exec_control;
>> +       bool nested_ept_enabled = false;
>>
>>         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>>         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> @@ -9972,6 +9973,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>                                 vmcs12->guest_intr_status);
>>                 }
>>
>> +               nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
>>                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
>>         }
>>
>> @@ -10113,8 +10115,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
>>         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
>>
>> -       /* shadow page tables on either EPT or shadow page tables */
>> -       kvm_set_cr3(vcpu, vmcs12->guest_cr3);
>> +       /*
>> +        * Shadow page tables on either EPT or shadow page tables.
>> +        * If PAE and EPT are both on, CR3 is not used by the CPU and must not
>> +        * be dereferenced.
>> +        */
>> +       if (is_pae(vcpu) && is_paging(vcpu) && !is_long_mode(vcpu) &&
>> +           nested_ept_enabled) {
>> +               vcpu->arch.cr3 = vmcs12->guest_cr3;
>> +               __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
>> +       } else
>> +               kvm_set_cr3(vcpu, vmcs12->guest_cr3);
>> +
>>         kvm_mmu_reset_context(vcpu);
>>
>>         if (!enable_ept)
>> --
>> 2.7.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li Dec. 21, 2016, 12:20 p.m. UTC | #3
2016-12-21 16:09 GMT+08:00 Ladi Prosek <lprosek@redhat.com>:
> On Wed, Dec 21, 2016 at 8:31 AM, Wanpeng Li <kernellwp@gmail.com> wrote:
>> 2016-11-30 23:03 GMT+08:00 Ladi Prosek <lprosek@redhat.com>:
>>> KVM does not correctly handle L1 hypervisors that emulate L2 real mode with
>>> PAE and EPT, such as Hyper-V. In this mode, the L1 hypervisor populates guest
>>> PDPTE VMCS fields and leaves guest CR3 uninitialized because it is not used
>>
>> How L1 hypervisor populates guest PDPTE VMCS fields if undereference CR3?
>
> L2 is free to write anything to CR3. From its perspective it's still
> running in real mode so CR3 is effectively a general purpose register.
> If L1 intercepts CR3 accesses, it just shadows the value in the guest
> CR3 VMCS field so it's preserved across vmexists. It does not and
> should not dereference it.
>
> Let me know if I misunderstood the question.

I just confuse how PDPTE VMCS field is set when CR3 is uninitialized,
I think it should be from something like ept_identity_map_addr in
Hyper-V.

Regards,
Wanpeng Li
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fbda0..034dc7a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9808,6 +9808,7 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
+	bool nested_ept_enabled = false;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -9972,6 +9973,7 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmcs12->guest_intr_status);
 		}
 
+		nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
 
@@ -10113,8 +10115,18 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-	/* shadow page tables on either EPT or shadow page tables */
-	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+	/*
+	 * Shadow page tables on either EPT or shadow page tables.
+	 * If PAE and EPT are both on, CR3 is not used by the CPU and must not
+	 * be dereferenced.
+	 */
+	if (is_pae(vcpu) && is_paging(vcpu) && !is_long_mode(vcpu) &&
+	    nested_ept_enabled) {
+		vcpu->arch.cr3 = vmcs12->guest_cr3;
+		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+	} else
+		kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+
 	kvm_mmu_reset_context(vcpu);
 
 	if (!enable_ept)