diff mbox series

[kvm-unit-tests] x86: vmx: Add test for MTF on a guest MOV-to-CR0 that enables PAE

Message ID 20200818002537.207910-1-pshier@google.com (mailing list archive)
State New, archived
Headers show
Series [kvm-unit-tests] x86: vmx: Add test for MTF on a guest MOV-to-CR0 that enables PAE | expand

Commit Message

Peter Shier Aug. 18, 2020, 12:25 a.m. UTC
Verify that when an L2 guest enables PAE paging and L0's intercept of the L2
MOV to CR0 reflects an MTF exit to L1, a subsequent resume to L2 correctly
preserves the PDPTE array specified by the L2 CR3.

Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by:   Peter Shier <pshier@google.com>
Signed-off-by: Peter Shier <pshier@google.com>
---
 lib/x86/asm/page.h |   8 +++
 x86/vmx_tests.c    | 171 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)

Comments

Nadav Amit Oct. 10, 2020, 8:17 a.m. UTC | #1
> On Aug 17, 2020, at 5:25 PM, Peter Shier <pshier@google.com> wrote:
> 
> Verify that when L2 guest enables PAE paging and L0 intercept of L2
> MOV to CR0 reflects MTF exit to L1, subsequent resume to L2 correctly
> preserves PDPTE array specified by L2 CR3.
> 
> Signed-off-by: Jim Mattson <jmattson@google.com>
> Reviewed-by:   Peter Shier <pshier@google.com>
> Signed-off-by: Peter Shier <pshier@google.com>
> ---
> lib/x86/asm/page.h |   8 +++
> x86/vmx_tests.c    | 171 +++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 179 insertions(+)
> 
> diff --git a/lib/x86/asm/page.h b/lib/x86/asm/page.h
> index 7e2a3dd4b90a..1359eb74cde4 100644
> --- a/lib/x86/asm/page.h
> +++ b/lib/x86/asm/page.h
> @@ -36,10 +36,18 @@ typedef unsigned long pgd_t;
> #define PT64_NX_MASK		(1ull << 63)
> #define PT_ADDR_MASK		GENMASK_ULL(51, 12)
> 
> +#define PDPTE64_PAGE_SIZE_MASK	  (1ull << 7)
> +#define PDPTE64_RSVD_MASK	  GENMASK_ULL(51, cpuid_maxphyaddr())
> +
> #define PT_AD_MASK              (PT_ACCESSED_MASK | PT_DIRTY_MASK)
> 
> +#define PAE_PDPTE_RSVD_MASK     (GENMASK_ULL(63, cpuid_maxphyaddr()) |	\
> +				 GENMASK_ULL(8, 5) | GENMASK_ULL(2, 1))
> +
> +
> #ifdef __x86_64__
> #define	PAGE_LEVEL	4
> +#define	PDPT_LEVEL	3
> #define	PGDIR_WIDTH	9
> #define	PGDIR_MASK	511
> #else
> diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
> index 32e3d4f47b33..372e5efb6b5f 100644
> --- a/x86/vmx_tests.c
> +++ b/x86/vmx_tests.c
> @@ -5250,6 +5250,176 @@ static void vmx_mtf_test(void)
> 	enter_guest();
> }
> 
> +extern char vmx_mtf_pdpte_guest_begin;
> +extern char vmx_mtf_pdpte_guest_end;
> +
> +asm("vmx_mtf_pdpte_guest_begin:\n\t"
> +    "mov %cr0, %rax\n\t"    /* save CR0 with PG=1                 */
> +    "vmcall\n\t"            /* on return from this CR0.PG=0       */
> +    "mov %rax, %cr0\n\t"    /* restore CR0.PG=1 to enter PAE mode */
> +    "vmcall\n\t"
> +    "retq\n\t"
> +    "vmx_mtf_pdpte_guest_end:");
> +
> +static void vmx_mtf_pdpte_test(void)
> +{
> +	void *test_mtf_pdpte_guest;
> +	pteval_t *pdpt;
> +	u32 guest_ar_cs;
> +	u64 guest_efer;
> +	pteval_t *pte;
> +	u64 guest_cr0;
> +	u64 guest_cr3;
> +	u64 guest_cr4;
> +	u64 ent_ctls;
> +	int i;
> +
> +	if (setup_ept(false))
> +		return;
> +
> +	if (!(ctrl_cpu_rev[0].clr & CPU_MTF)) {
> +		printf("CPU does not support 'monitor trap flag.'\n");
> +		return;
> +	}
> +
> +	if (!(ctrl_cpu_rev[1].clr & CPU_URG)) {
> +		printf("CPU does not support 'unrestricted guest.'\n");
> +		return;
> +	}
> +
> +	vmcs_write(EXC_BITMAP, ~0);
> +	vmcs_write(CPU_EXEC_CTRL1, vmcs_read(CPU_EXEC_CTRL1) | CPU_URG);
> +
> +	/*
> +	 * Copy the guest code to an identity-mapped page.
> +	 */
> +	test_mtf_pdpte_guest = alloc_page();
> +	memcpy(test_mtf_pdpte_guest, &vmx_mtf_pdpte_guest_begin,
> +	       &vmx_mtf_pdpte_guest_end - &vmx_mtf_pdpte_guest_begin);
> +
> +	test_set_guest(test_mtf_pdpte_guest);
> +
> +	enter_guest();
> +	skip_exit_vmcall();
> +
> +	/*
> +	 * Put the guest in non-paged 32-bit protected mode, ready to enter
> +	 * PAE mode when CR0.PG is set. CR4.PAE will already have been set
> +	 * when the guest started out in long mode.
> +	 */
> +	ent_ctls = vmcs_read(ENT_CONTROLS);
> +	vmcs_write(ENT_CONTROLS, ent_ctls & ~ENT_GUEST_64);
> +
> +	guest_efer = vmcs_read(GUEST_EFER);
> +	vmcs_write(GUEST_EFER, guest_efer & ~(EFER_LMA | EFER_LME));
> +
> +	/*
> +	 * Set CS access rights bits for 32-bit protected mode:
> +	 * 3:0    B execute/read/accessed
> +	 * 4      1 code or data
> +	 * 6:5    0 descriptor privilege level
> +	 * 7      1 present
> +	 * 11:8   0 reserved
> +	 * 12     0 available for use by system software
> +	 * 13     0 64 bit mode not active
> +	 * 14     1 default operation size 32-bit segment
> +	 * 15     1 page granularity: segment limit in 4K units
> +	 * 16     0 segment usable
> +	 * 31:17  0 reserved
> +	 */
> +	guest_ar_cs = vmcs_read(GUEST_AR_CS);
> +	vmcs_write(GUEST_AR_CS, 0xc09b);
> +
> +	guest_cr0 = vmcs_read(GUEST_CR0);
> +	vmcs_write(GUEST_CR0, guest_cr0 & ~X86_CR0_PG);
> +
> +	guest_cr4 = vmcs_read(GUEST_CR4);
> +	vmcs_write(GUEST_CR4, guest_cr4 & ~X86_CR4_PCIDE);
> +
> +	guest_cr3 = vmcs_read(GUEST_CR3);
> +
> +	/*
> +	 * Turn the 4-level page table into a PAE page table by following the 0th
> +	 * PML4 entry to a PDPT page, and grab the first four PDPTEs from that
> +	 * page.
> +	 *
> +	 * Why does this work?
> +	 *
> +	 * PAE uses 32-bit addressing which implies:
> +	 * Bits 11:0   page offset
> +	 * Bits 20:12  entry into 512-entry page table
> +	 * Bits 29:21  entry into a 512-entry directory table
> +	 * Bits 31:30  entry into the page directory pointer table.
> +	 * Bits 63:32  zero
> +	 *
> +	 * As only 2 bits are needed to select the PDPTEs for the entire
> +	 * 32-bit address space, take the first 4 PDPTEs in the level 3 page
> +	 * directory pointer table. It doesn't matter which of these PDPTEs
> +	 * are present because they must cover the guest code given that it
> +	 * has already run successfully.
> +	 *
> +	 * Get a pointer to PTE for GVA=0 in the page directory pointer table
> +	 */
> +	pte = get_pte_level((pgd_t *)(guest_cr3 & ~X86_CR3_PCID_MASK), 0, PDPT_LEVEL);
> +
> +	/*
> +	 * Need some memory for the 4-entry PAE page directory pointer
> +	 * table. Use the end of the identity-mapped page where the guest code
> +	 * is stored. There is definitely space as the guest code is only a
> +	 * few bytes.
> +	 */
> +	pdpt = test_mtf_pdpte_guest + PAGE_SIZE - 4 * sizeof(pteval_t);
> +
> +	/*
> +	 * Copy the first four PDPTEs into the PAE page table with reserved
> +	 * bits cleared. Note that permission bits from the PML4E and PDPTE
> +	 * are not propagated.
> +	 */
> +	for (i = 0; i < 4; i++) {
> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_RSVD_MASK),
> +				   "PDPTE has invalid reserved bits");
> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_PAGE_SIZE_MASK),
> +				   "Cannot use 1GB super pages for PAE");
> +		pdpt[i] = pte[i] & ~(PAE_PDPTE_RSVD_MASK);
> +	}
> +	vmcs_write(GUEST_CR3, virt_to_phys(pdpt));
> +
> +	enable_mtf();
> +	enter_guest();

This entry failed on my bare-metal machine:

Test suite: vmx_mtf_pdpte_test
VM-Exit failure on vmresume (reason=0x80000021, qual=0): invalid guest state

Any idea why?
Nadav Amit Oct. 10, 2020, 9:52 a.m. UTC | #2
> On Oct 10, 2020, at 1:17 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> 
>> On Aug 17, 2020, at 5:25 PM, Peter Shier <pshier@google.com> wrote:
>> 
>> Verify that when L2 guest enables PAE paging and L0 intercept of L2
>> MOV to CR0 reflects MTF exit to L1, subsequent resume to L2 correctly
>> preserves PDPTE array specified by L2 CR3.
>> 
>> Signed-off-by: Jim Mattson <jmattson@google.com>
>> Reviewed-by:   Peter Shier <pshier@google.com>
>> Signed-off-by: Peter Shier <pshier@google.com>
>> ---
>> lib/x86/asm/page.h |   8 +++
>> x86/vmx_tests.c    | 171 +++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 179 insertions(+)
>> 
>> diff --git a/lib/x86/asm/page.h b/lib/x86/asm/page.h
>> index 7e2a3dd4b90a..1359eb74cde4 100644
>> --- a/lib/x86/asm/page.h
>> +++ b/lib/x86/asm/page.h
>> @@ -36,10 +36,18 @@ typedef unsigned long pgd_t;
>> #define PT64_NX_MASK		(1ull << 63)
>> #define PT_ADDR_MASK		GENMASK_ULL(51, 12)
>> 
>> +#define PDPTE64_PAGE_SIZE_MASK	  (1ull << 7)
>> +#define PDPTE64_RSVD_MASK	  GENMASK_ULL(51, cpuid_maxphyaddr())
>> +
>> #define PT_AD_MASK              (PT_ACCESSED_MASK | PT_DIRTY_MASK)
>> 
>> +#define PAE_PDPTE_RSVD_MASK     (GENMASK_ULL(63, cpuid_maxphyaddr()) |	\
>> +				 GENMASK_ULL(8, 5) | GENMASK_ULL(2, 1))
>> +
>> +
>> #ifdef __x86_64__
>> #define	PAGE_LEVEL	4
>> +#define	PDPT_LEVEL	3
>> #define	PGDIR_WIDTH	9
>> #define	PGDIR_MASK	511
>> #else
>> diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
>> index 32e3d4f47b33..372e5efb6b5f 100644
>> --- a/x86/vmx_tests.c
>> +++ b/x86/vmx_tests.c
>> @@ -5250,6 +5250,176 @@ static void vmx_mtf_test(void)
>> 	enter_guest();
>> }
>> 
>> +extern char vmx_mtf_pdpte_guest_begin;
>> +extern char vmx_mtf_pdpte_guest_end;
>> +
>> +asm("vmx_mtf_pdpte_guest_begin:\n\t"
>> +    "mov %cr0, %rax\n\t"    /* save CR0 with PG=1                 */
>> +    "vmcall\n\t"            /* on return from this CR0.PG=0       */
>> +    "mov %rax, %cr0\n\t"    /* restore CR0.PG=1 to enter PAE mode */
>> +    "vmcall\n\t"
>> +    "retq\n\t"
>> +    "vmx_mtf_pdpte_guest_end:");
>> +
>> +static void vmx_mtf_pdpte_test(void)
>> +{
>> +	void *test_mtf_pdpte_guest;
>> +	pteval_t *pdpt;
>> +	u32 guest_ar_cs;
>> +	u64 guest_efer;
>> +	pteval_t *pte;
>> +	u64 guest_cr0;
>> +	u64 guest_cr3;
>> +	u64 guest_cr4;
>> +	u64 ent_ctls;
>> +	int i;
>> +
>> +	if (setup_ept(false))
>> +		return;
>> +
>> +	if (!(ctrl_cpu_rev[0].clr & CPU_MTF)) {
>> +		printf("CPU does not support 'monitor trap flag.'\n");
>> +		return;
>> +	}
>> +
>> +	if (!(ctrl_cpu_rev[1].clr & CPU_URG)) {
>> +		printf("CPU does not support 'unrestricted guest.'\n");
>> +		return;
>> +	}
>> +
>> +	vmcs_write(EXC_BITMAP, ~0);
>> +	vmcs_write(CPU_EXEC_CTRL1, vmcs_read(CPU_EXEC_CTRL1) | CPU_URG);
>> +
>> +	/*
>> +	 * Copy the guest code to an identity-mapped page.
>> +	 */
>> +	test_mtf_pdpte_guest = alloc_page();
>> +	memcpy(test_mtf_pdpte_guest, &vmx_mtf_pdpte_guest_begin,
>> +	       &vmx_mtf_pdpte_guest_end - &vmx_mtf_pdpte_guest_begin);
>> +
>> +	test_set_guest(test_mtf_pdpte_guest);
>> +
>> +	enter_guest();
>> +	skip_exit_vmcall();
>> +
>> +	/*
>> +	 * Put the guest in non-paged 32-bit protected mode, ready to enter
>> +	 * PAE mode when CR0.PG is set. CR4.PAE will already have been set
>> +	 * when the guest started out in long mode.
>> +	 */
>> +	ent_ctls = vmcs_read(ENT_CONTROLS);
>> +	vmcs_write(ENT_CONTROLS, ent_ctls & ~ENT_GUEST_64);
>> +
>> +	guest_efer = vmcs_read(GUEST_EFER);
>> +	vmcs_write(GUEST_EFER, guest_efer & ~(EFER_LMA | EFER_LME));
>> +
>> +	/*
>> +	 * Set CS access rights bits for 32-bit protected mode:
>> +	 * 3:0    B execute/read/accessed
>> +	 * 4      1 code or data
>> +	 * 6:5    0 descriptor privilege level
>> +	 * 7      1 present
>> +	 * 11:8   0 reserved
>> +	 * 12     0 available for use by system software
>> +	 * 13     0 64 bit mode not active
>> +	 * 14     1 default operation size 32-bit segment
>> +	 * 15     1 page granularity: segment limit in 4K units
>> +	 * 16     0 segment usable
>> +	 * 31:17  0 reserved
>> +	 */
>> +	guest_ar_cs = vmcs_read(GUEST_AR_CS);
>> +	vmcs_write(GUEST_AR_CS, 0xc09b);
>> +
>> +	guest_cr0 = vmcs_read(GUEST_CR0);
>> +	vmcs_write(GUEST_CR0, guest_cr0 & ~X86_CR0_PG);
>> +
>> +	guest_cr4 = vmcs_read(GUEST_CR4);
>> +	vmcs_write(GUEST_CR4, guest_cr4 & ~X86_CR4_PCIDE);
>> +
>> +	guest_cr3 = vmcs_read(GUEST_CR3);
>> +
>> +	/*
>> +	 * Turn the 4-level page table into a PAE page table by following the 0th
>> +	 * PML4 entry to a PDPT page, and grab the first four PDPTEs from that
>> +	 * page.
>> +	 *
>> +	 * Why does this work?
>> +	 *
>> +	 * PAE uses 32-bit addressing which implies:
>> +	 * Bits 11:0   page offset
>> +	 * Bits 20:12  entry into 512-entry page table
>> +	 * Bits 29:21  entry into a 512-entry directory table
>> +	 * Bits 31:30  entry into the page directory pointer table.
>> +	 * Bits 63:32  zero
>> +	 *
>> +	 * As only 2 bits are needed to select the PDPTEs for the entire
>> +	 * 32-bit address space, take the first 4 PDPTEs in the level 3 page
>> +	 * directory pointer table. It doesn't matter which of these PDPTEs
>> +	 * are present because they must cover the guest code given that it
>> +	 * has already run successfully.
>> +	 *
>> +	 * Get a pointer to PTE for GVA=0 in the page directory pointer table
>> +	 */
>> +	pte = get_pte_level((pgd_t *)(guest_cr3 & ~X86_CR3_PCID_MASK), 0, PDPT_LEVEL);
>> +
>> +	/*
>> +	 * Need some memory for the 4-entry PAE page directory pointer
>> +	 * table. Use the end of the identity-mapped page where the guest code
>> +	 * is stored. There is definitely space as the guest code is only a
>> +	 * few bytes.
>> +	 */
>> +	pdpt = test_mtf_pdpte_guest + PAGE_SIZE - 4 * sizeof(pteval_t);
>> +
>> +	/*
>> +	 * Copy the first four PDPTEs into the PAE page table with reserved
>> +	 * bits cleared. Note that permission bits from the PML4E and PDPTE
>> +	 * are not propagated.
>> +	 */
>> +	for (i = 0; i < 4; i++) {
>> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_RSVD_MASK),
>> +				   "PDPTE has invalid reserved bits");
>> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_PAGE_SIZE_MASK),
>> +				   "Cannot use 1GB super pages for PAE");
>> +		pdpt[i] = pte[i] & ~(PAE_PDPTE_RSVD_MASK);
>> +	}
>> +	vmcs_write(GUEST_CR3, virt_to_phys(pdpt));
>> +
>> +	enable_mtf();
>> +	enter_guest();
> 
> This entry failed on my bare-metal machine:
> 
> Test suite: vmx_mtf_pdpte_test
> VM-Exit failure on vmresume (reason=0x80000021, qual=0): invalid guest state
> 
> Any idea why?

I guess that the test makes an assumption that there are no addresses
greater than 4GB. When I reduce the size of the memory, the test passes.
Jim Mattson Oct. 12, 2020, 6:16 p.m. UTC | #3
On Sat, Oct 10, 2020 at 2:52 AM Nadav Amit <nadav.amit@gmail.com> wrote:

> I guess that the test makes an assumption that there are no addresses
> greater than 4GB. When I reduce the size of the memory, the test passes.

Yes; the identity-mapped page used for real-address mode has to be
below 4GB.

I think this can be fixed with an entry in unittests.cfg that specifies -m 2048.
Nadav Amit Oct. 12, 2020, 6:23 p.m. UTC | #4
> On Oct 12, 2020, at 11:16 AM, Jim Mattson <jmattson@google.com> wrote:
> 
> On Sat, Oct 10, 2020 at 2:52 AM Nadav Amit <nadav.amit@gmail.com> wrote:
> 
>> I guess that the test makes an assumption that there are no addresses
>> greater than 4GB. When I reduce the size of the memory, the test passes.
> 
> Yes; the identity-mapped page used for real-address mode has to be
> less than 4Gb.
> 
> I think this can be fixed with an entry in unittests.cfg that specifies -m 2048.

I prefer to skip the test if the conditions do not allow it to run, as I do
not use unittests.cfg.

I will send a patch later.
Jim Mattson Oct. 12, 2020, 6:31 p.m. UTC | #5
On Mon, Oct 12, 2020 at 11:23 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Oct 12, 2020, at 11:16 AM, Jim Mattson <jmattson@google.com> wrote:
> >
> > On Sat, Oct 10, 2020 at 2:52 AM Nadav Amit <nadav.amit@gmail.com> wrote:
> >
> >> I guess that the test makes an assumption that there are no addresses
> >> greater than 4GB. When I reduce the size of the memory, the test passes.
> >
> > Yes; the identity-mapped page used for real-address mode has to be
> > less than 4Gb.
> >
> > I think this can be fixed with an entry in unittests.cfg that specifies -m 2048.
>
> I prefer to skip the test if the conditions do not allow to run it, as I do
> not use unittests.cfg.
>
> I will send a patch later.

It doesn't have to be skipped. We just need a variant of alloc_page()
that will dole out a page with a 32-bit address.
diff mbox series

Patch

diff --git a/lib/x86/asm/page.h b/lib/x86/asm/page.h
index 7e2a3dd4b90a..1359eb74cde4 100644
--- a/lib/x86/asm/page.h
+++ b/lib/x86/asm/page.h
@@ -36,10 +36,18 @@  typedef unsigned long pgd_t;
 #define PT64_NX_MASK		(1ull << 63)
 #define PT_ADDR_MASK		GENMASK_ULL(51, 12)
 
+#define PDPTE64_PAGE_SIZE_MASK	  (1ull << 7)
+#define PDPTE64_RSVD_MASK	  GENMASK_ULL(51, cpuid_maxphyaddr())
+
 #define PT_AD_MASK              (PT_ACCESSED_MASK | PT_DIRTY_MASK)
 
+#define PAE_PDPTE_RSVD_MASK     (GENMASK_ULL(63, cpuid_maxphyaddr()) |	\
+				 GENMASK_ULL(8, 5) | GENMASK_ULL(2, 1))
+
+
 #ifdef __x86_64__
 #define	PAGE_LEVEL	4
+#define	PDPT_LEVEL	3
 #define	PGDIR_WIDTH	9
 #define	PGDIR_MASK	511
 #else
diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
index 32e3d4f47b33..372e5efb6b5f 100644
--- a/x86/vmx_tests.c
+++ b/x86/vmx_tests.c
@@ -5250,6 +5250,176 @@  static void vmx_mtf_test(void)
 	enter_guest();
 }
 
+extern char vmx_mtf_pdpte_guest_begin;	/* start label of the stub below */
+extern char vmx_mtf_pdpte_guest_end;	/* end label of the stub below */
+
+asm("vmx_mtf_pdpte_guest_begin:\n\t"
+    "mov %cr0, %rax\n\t"    /* save CR0 with PG=1                 */
+    "vmcall\n\t"            /* on return from this CR0.PG=0       */
+    "mov %rax, %cr0\n\t"    /* restore CR0.PG=1 to enter PAE mode */
+    "vmcall\n\t"            /* exit to the test again, PAE now on */
+    "retq\n\t"              /* run after 64-bit state is restored */
+    "vmx_mtf_pdpte_guest_end:");
+
+static void vmx_mtf_pdpte_test(void)	/* PDPTEs across an MTF exit */
+{
+	void *test_mtf_pdpte_guest;	/* page with guest code + PAE PDPT */
+	pteval_t *pdpt;			/* 4-entry PAE PDPT in that page */
+	u32 guest_ar_cs;		/* saved 64-bit value, restored at end */
+	u64 guest_efer;			/* saved 64-bit value, restored at end */
+	pteval_t *pte;			/* PDPTEs of the 4-level page table */
+	u64 guest_cr0;			/* saved 64-bit value, restored at end */
+	u64 guest_cr3;			/* saved 64-bit value, restored at end */
+	u64 guest_cr4;			/* saved 64-bit value, restored at end */
+	u64 ent_ctls;			/* saved entry controls, restored at end */
+	int i;
+
+	if (setup_ept(false))		/* nonzero return: give up quietly */
+		return;
+
+	if (!(ctrl_cpu_rev[0].clr & CPU_MTF)) {
+		printf("CPU does not support 'monitor trap flag.'\n");
+		return;
+	}
+
+	if (!(ctrl_cpu_rev[1].clr & CPU_URG)) {
+		printf("CPU does not support 'unrestricted guest.'\n");
+		return;
+	}
+
+	vmcs_write(EXC_BITMAP, ~0);	/* set every exception-bitmap bit */
+	vmcs_write(CPU_EXEC_CTRL1, vmcs_read(CPU_EXEC_CTRL1) | CPU_URG);
+
+	/*
+	 * Copy the guest code to an identity-mapped page.
+	 */
+	test_mtf_pdpte_guest = alloc_page();
+	memcpy(test_mtf_pdpte_guest, &vmx_mtf_pdpte_guest_begin,
+	       &vmx_mtf_pdpte_guest_end - &vmx_mtf_pdpte_guest_begin);
+
+	test_set_guest(test_mtf_pdpte_guest);
+
+	enter_guest();			/* runs up to the stub's first vmcall */
+	skip_exit_vmcall();
+
+	/*
+	 * Put the guest in non-paged 32-bit protected mode, ready to enter
+	 * PAE mode when CR0.PG is set. CR4.PAE will already have been set
+	 * when the guest started out in long mode.
+	 */
+	ent_ctls = vmcs_read(ENT_CONTROLS);
+	vmcs_write(ENT_CONTROLS, ent_ctls & ~ENT_GUEST_64);
+
+	guest_efer = vmcs_read(GUEST_EFER);
+	vmcs_write(GUEST_EFER, guest_efer & ~(EFER_LMA | EFER_LME));
+
+	/*
+	 * Set CS access rights bits for 32-bit protected mode:
+	 * 3:0    B execute/read/accessed
+	 * 4      1 code or data
+	 * 6:5    0 descriptor privilege level
+	 * 7      1 present
+	 * 11:8   0 reserved
+	 * 12     0 available for use by system software
+	 * 13     0 64 bit mode not active
+	 * 14     1 default operation size 32-bit segment
+	 * 15     1 page granularity: segment limit in 4K units
+	 * 16     0 segment usable
+	 * 31:17  0 reserved
+	 */
+	guest_ar_cs = vmcs_read(GUEST_AR_CS);
+	vmcs_write(GUEST_AR_CS, 0xc09b);
+
+	guest_cr0 = vmcs_read(GUEST_CR0);
+	vmcs_write(GUEST_CR0, guest_cr0 & ~X86_CR0_PG);
+
+	guest_cr4 = vmcs_read(GUEST_CR4);
+	vmcs_write(GUEST_CR4, guest_cr4 & ~X86_CR4_PCIDE);
+
+	guest_cr3 = vmcs_read(GUEST_CR3);
+
+	/*
+	 * Turn the 4-level page table into a PAE page table by following the 0th
+	 * PML4 entry to a PDPT page, and grab the first four PDPTEs from that
+	 * page.
+	 *
+	 * Why does this work?
+	 *
+	 * PAE uses 32-bit addressing which implies:
+	 * Bits 11:0   page offset
+	 * Bits 20:12  entry into 512-entry page table
+	 * Bits 29:21  entry into a 512-entry directory table
+	 * Bits 31:30  entry into the page directory pointer table.
+	 * Bits 63:32  zero
+	 *
+	 * As only 2 bits are needed to select the PDPTEs for the entire
+	 * 32-bit address space, take the first 4 PDPTEs in the level 3 page
+	 * directory pointer table. It doesn't matter which of these PDPTEs
+	 * are present because they must cover the guest code given that it
+	 * has already run successfully.
+	 *
+	 * Get a pointer to the level-3 (PDPT) entry for GVA 0.
+	 */
+	pte = get_pte_level((pgd_t *)(guest_cr3 & ~X86_CR3_PCID_MASK), 0, PDPT_LEVEL);
+
+	/*
+	 * Place the 4-entry PAE page directory pointer table at the very end
+	 * of the identity-mapped guest code page; the code is only a few
+	 * bytes, so the two cannot overlap. NOTE(review): this page reportedly
+	 * has to be below 4GB and alloc_page() may not guarantee it - confirm.
+	 */
+	pdpt = test_mtf_pdpte_guest + PAGE_SIZE - 4 * sizeof(pteval_t);
+
+	/*
+	 * Copy the first four PDPTEs into the PAE page table with reserved
+	 * bits cleared. Note that permission bits from the PML4E and PDPTE
+	 * are not propagated.
+	 */
+	for (i = 0; i < 4; i++) {
+		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_RSVD_MASK),
+				   "PDPTE has invalid reserved bits");
+		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_PAGE_SIZE_MASK),
+				   "Cannot use 1GB super pages for PAE");
+		pdpt[i] = pte[i] & ~(PAE_PDPTE_RSVD_MASK);
+	}
+	vmcs_write(GUEST_CR3, virt_to_phys(pdpt));
+
+	enable_mtf();
+	enter_guest();			/* guest executes the MOV to CR0 */
+	assert_exit_reason(VMX_MTF);
+	disable_mtf();
+
+	/*
+	 * The four PDPTEs should have been loaded into the VMCS when
+	 * the guest set CR0.PG to enter PAE mode.
+	 */
+	for (i = 0; i < 4; i++) {
+		u64 pdpte = vmcs_read(GUEST_PDPTE + 2 * i);
+
+		report(pdpte == pdpt[i], "PDPTE%d is 0x%lx (expected 0x%lx)",
+		       i, pdpte, pdpt[i]);
+	}
+
+	/*
+	 * Now, try to enter the guest in PAE mode. If the PDPTEs in the
+	 * vmcs are wrong, this will fail.
+	 */
+	enter_guest();			/* runs up to the stub's second vmcall */
+	skip_exit_vmcall();
+
+	/*
+	 * Return guest to 64-bit mode and wrap up.
+	 */
+	vmcs_write(ENT_CONTROLS, ent_ctls);
+	vmcs_write(GUEST_EFER, guest_efer);
+	vmcs_write(GUEST_AR_CS, guest_ar_cs);
+	vmcs_write(GUEST_CR0, guest_cr0);
+	vmcs_write(GUEST_CR4, guest_cr4);
+	vmcs_write(GUEST_CR3, guest_cr3);
+
+	enter_guest();			/* resume at the stub's retq */
+}
+
 /*
  * Tests for VM-execution control fields
  */
@@ -10112,5 +10282,6 @@  struct vmx_test vmx_tests[] = {
 	TEST(atomic_switch_overflow_msrs_test),
 	TEST(rdtsc_vmexit_diff_test),
 	TEST(vmx_mtf_test),
+	TEST(vmx_mtf_pdpte_test),
 	{ NULL, NULL, NULL, NULL, NULL, {0} },
 };