
[RFC,16/28] kvm: mmu: Add direct MMU page fault handler

Message ID 20190926231824.149014-17-bgardon@google.com (mailing list archive)
State New, archived
Series kvm: mmu: Rework the x86 TDP direct mapped case

Commit Message

Ben Gardon Sept. 26, 2019, 11:18 p.m. UTC
Adds handler functions to replace __direct_map in handling direct page
faults. These functions, unlike __direct_map, can handle page faults on
multiple vCPUs simultaneously.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu.c | 192 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 179 insertions(+), 13 deletions(-)

Comments

Peter Xu Jan. 8, 2020, 5:20 p.m. UTC | #1
On Thu, Sep 26, 2019 at 04:18:12PM -0700, Ben Gardon wrote:

[...]

> +static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
> +		unsigned long mmu_seq, int write, int map_writable, int level,
> +		gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
> +{
> +	struct direct_walk_iterator iter;
> +	struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
> +	u64 *child_pt;
> +	u64 new_pte;
> +	int ret = RET_PF_RETRY;
> +
> +	direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
> +			kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
> +			(gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
> +	while (direct_walk_iterator_next_pte(&iter)) {
> +		if (iter.level == level) {
> +			ret = direct_page_fault_handle_target_level(vcpu,
> +					write, map_writable, &iter, pfn,
> +					prefault);
> +
> +			break;
> +		} else if (!is_present_direct_pte(iter.old_pte) ||
> +			   is_large_pte(iter.old_pte)) {
> +			/*
> +			 * The leaf PTE for this fault must be mapped at a
> +			 * lower level, so a non-leaf PTE must be inserted into
> +			 * the paging structure. If the assignment below
> +			 * succeeds, it will add the non-leaf PTE and a new
> +			 * page of page table memory. Then the iterator can
> +			 * traverse into that new page. If the atomic compare/
> +			 * exchange fails, the iterator will repeat the current
> +			 * PTE, so the only thing this function must do
> +			 * differently is return the page table memory to the
> +			 * vCPU's fault cache.
> +			 */
> +			child_pt = mmu_memory_cache_alloc(pf_pt_cache);
> +			new_pte = generate_nonleaf_pte(child_pt, false);
> +
> +			if (!direct_walk_iterator_set_pte(&iter, new_pte))
> +				mmu_memory_cache_return(pf_pt_cache, child_pt);
> +		}
> +	}

I have a question on how this will guarantee safe concurrency...

As you mentioned previously somewhere, the design somehow mimics how
the core mm works with process page tables, and IIUC here the rwlock
works really like the mmap_sem that we have for the process mm.  So
with this series we can now have multiple page faults happening with
the read lock of the mmu_lock held when we reach here.

Then I'm imagining a case where both vcpu threads faulted on the same
address range but wanted to do different things, like: (1) the vcpu1
thread wanted to map this as a 2M huge page, while (2) the vcpu2
thread wanted to map this as a 4K page.  Then is it possible that
vcpu2 is faster, so it first sets up the pmd as a page table page (via
direct_walk_iterator_set_pte above), then vcpu1 quickly overwrites it
as a huge page (via direct_page_fault_handle_target_level, level=2),
and then I feel like the page table page that was set up by vcpu2 can
be lost unnoticed.

I think the general process page table does not have this issue
because it has a per-pmd lock, so anyone who changes the pmd or
anything beneath it will need to take that.  However here we don't
have it; instead we only depend on the atomic ops, which seems to be
not enough for this?

Thanks,

> +	direct_walk_iterator_end_traversal(&iter);
> +
> +	/* If emulating, flush this vcpu's TLB. */
> +	if (ret == RET_PF_EMULATE)
> +		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> +
> +	return ret;
> +}
Ben Gardon Jan. 8, 2020, 6:15 p.m. UTC | #2
On Wed, Jan 8, 2020 at 9:20 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Sep 26, 2019 at 04:18:12PM -0700, Ben Gardon wrote:
>
> [...]
>
> > +static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
> > +             unsigned long mmu_seq, int write, int map_writable, int level,
> > +             gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
> > +{
> > +     struct direct_walk_iterator iter;
> > +     struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
> > +     u64 *child_pt;
> > +     u64 new_pte;
> > +     int ret = RET_PF_RETRY;
> > +
> > +     direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
> > +                     kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
> > +                     (gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
> > +     while (direct_walk_iterator_next_pte(&iter)) {
> > +             if (iter.level == level) {
> > +                     ret = direct_page_fault_handle_target_level(vcpu,
> > +                                     write, map_writable, &iter, pfn,
> > +                                     prefault);
> > +
> > +                     break;
> > +             } else if (!is_present_direct_pte(iter.old_pte) ||
> > +                        is_large_pte(iter.old_pte)) {
> > +                     /*
> > +                      * The leaf PTE for this fault must be mapped at a
> > +                      * lower level, so a non-leaf PTE must be inserted into
> > +                      * the paging structure. If the assignment below
> > +                      * succeeds, it will add the non-leaf PTE and a new
> > +                      * page of page table memory. Then the iterator can
> > +                      * traverse into that new page. If the atomic compare/
> > +                      * exchange fails, the iterator will repeat the current
> > +                      * PTE, so the only thing this function must do
> > +                      * differently is return the page table memory to the
> > +                      * vCPU's fault cache.
> > +                      */
> > +                     child_pt = mmu_memory_cache_alloc(pf_pt_cache);
> > +                     new_pte = generate_nonleaf_pte(child_pt, false);
> > +
> > +                     if (!direct_walk_iterator_set_pte(&iter, new_pte))
> > +                             mmu_memory_cache_return(pf_pt_cache, child_pt);
> > +             }
> > +     }
>
> I have a question on how this will guarantee safe concurrency...
>
> As you mentioned previously somewhere, the design somehow mimics how
> the core mm works with process page tables, and IIUC here the rwlock
> works really like the mmap_sem that we have for the process mm.  So
> with this series we can now have multiple page faults happening with
> the read lock of the mmu_lock held when we reach here.

Ah, I'm sorry if I put that down somewhere. I think that comparing the
MMU rwlock in this series to the core mm mmap_sem was a mistake. I do
not understand the ways in which the core mm uses the mmap_sem enough
to make such a comparison. You're correct that with two faulting vCPUs
we could have page faults on the same address range happening in
parallel. I'll try to elaborate more on why that's safe.

> Then I'm imagining a case where both vcpu threads faulted on the same
> address range but wanted to do different things, like: (1) the vcpu1
> thread wanted to map this as a 2M huge page, while (2) the vcpu2
> thread wanted to map this as a 4K page.

By vcpu thread, do you mean the page fault / EPT violation handler
wants to map memory at different levels? As far as I understand,
vCPUs do not have any intent to map a page at a certain level when
they take an EPT violation. The page fault handlers could certainly
want to map the memory at different levels. For example, if guest
memory was backed with 2M hugepages and one vCPU tried to do an
instruction fetch on an unmapped page while another tried to read it,
that should result in the page fault handler for the first vCPU trying
to map at 4K and the other trying to map at 2M, as in your example.

> Then is it possible that
> vcpu2 is faster, so it first sets up the pmd as a page table page (via
> direct_walk_iterator_set_pte above),

This is definitely possible

> then vcpu1 quickly overwrites it
> as a huge page (via direct_page_fault_handle_target_level, level=2),
> and then I feel like the page table page that was set up by vcpu2 can
> be lost unnoticed.

There are two possibilities here. 1.) vCPU1 saw vCPU2's modification
to the PTE during its walk. In this case, vCPU1 should not map the
memory at 2M. (I realize that in this example there is a discrepancy
as there's no NX hugepage support in this RFC. I need to add that in
the next version. In this case, vCPU2 would set a bit in the non-leaf
PTE to indicate it was split to allow X on a constituent 4K entry.)
2.) If vCPU1 did not see vCPU2's modification during its walk, it will
indeed try to map the memory at 2M. However in this case the atomic
cmpxchg on the PTE will fail because vCPU1 did not have the current
value of the PTE. In that case the PTE will be re-read and the walk
will continue or the page fault will be retried. When threads using
the direct walk iterator change PTEs with an atomic cmpxchg, they are
guaranteed to know what the value of the PTE was before the cmpxchg,
and so that thread is then responsible for any cleanup associated with
the PTE modification - e.g. freeing pages of page table memory.
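
To make that ownership rule concrete, here is a minimal user-space
sketch of the pattern using C11 atomics in place of the kernel's
cmpxchg; the names (try_set_pte, pte_slot) are illustrative only and
not taken from this patch:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * A thread may install new_pte only if the slot still holds the value
 * it observed during its walk (old_pte). If the exchange fails, the
 * thread learns nothing about who now owns the old value and simply
 * retries (or, as in the fault handler above, returns its preallocated
 * page of page table memory to the per-vCPU cache). If it succeeds,
 * the thread knows exactly which value it replaced, so it alone is
 * responsible for any cleanup that replacement implies, e.g. freeing a
 * disconnected page of page table memory.
 */
static bool try_set_pte(_Atomic uint64_t *pte_slot, uint64_t old_pte,
			uint64_t new_pte)
{
	return atomic_compare_exchange_strong(pte_slot, &old_pte, new_pte);
}

This is why, in handle_direct_page_fault above, a failed
direct_walk_iterator_set_pte() only requires giving the page table
page back to the fault cache.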

> I think the general process page table does not have this issue
> because it has a per-pmd lock, so anyone who changes the pmd or
> anything beneath it will need to take that.  However here we don't
> have it; instead we only depend on the atomic ops, which seems to be
> not enough for this?

I think that atomic ops (plus rcu to ensure no use-after-free) are
enough in this case, but I could definitely be wrong. If your concern
about the race requires the NX hugepages stuff, I need to get on top
of sending out those patches. If you can think of a race that doesn't
require that, I'd be very excited to hear it.
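
For what "plus rcu" means here, a rough kernel-style sketch (the
struct and function names below are hypothetical, not from this
series): walkers only dereference page table pages inside an RCU
read-side section, and the thread whose cmpxchg disconnected a page
table page defers the actual free with call_rcu(), so a concurrent
walker can never touch freed memory.

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct pt_page {
	u64 *sptes;		/* one page worth of PTEs */
	struct rcu_head rcu;
};

static void pt_page_free_rcu(struct rcu_head *head)
{
	struct pt_page *pt = container_of(head, struct pt_page, rcu);

	free_page((unsigned long)pt->sptes);
	kfree(pt);
}

/* Called only by the thread whose cmpxchg removed the link to @pt. */
static void disconnect_pt_page(struct pt_page *pt)
{
	/*
	 * Walkers take rcu_read_lock() before reading through this page,
	 * so the memory is not freed until all of them have finished.
	 */
	call_rcu(&pt->rcu, pt_page_free_rcu);
}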

> Thanks,
>
> > +     direct_walk_iterator_end_traversal(&iter);
> > +
> > +     /* If emulating, flush this vcpu's TLB. */
> > +     if (ret == RET_PF_EMULATE)
> > +             kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> > +
> > +     return ret;
> > +}
>
> --
> Peter Xu
>
Peter Xu Jan. 8, 2020, 7 p.m. UTC | #3
On Wed, Jan 08, 2020 at 10:15:41AM -0800, Ben Gardon wrote:
> On Wed, Jan 8, 2020 at 9:20 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Thu, Sep 26, 2019 at 04:18:12PM -0700, Ben Gardon wrote:
> >
> > [...]
> >
> > > +static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
> > > +             unsigned long mmu_seq, int write, int map_writable, int level,
> > > +             gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
> > > +{
> > > +     struct direct_walk_iterator iter;
> > > +     struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
> > > +     u64 *child_pt;
> > > +     u64 new_pte;
> > > +     int ret = RET_PF_RETRY;
> > > +
> > > +     direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
> > > +                     kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
> > > +                     (gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
> > > +     while (direct_walk_iterator_next_pte(&iter)) {
> > > +             if (iter.level == level) {
> > > +                     ret = direct_page_fault_handle_target_level(vcpu,
> > > +                                     write, map_writable, &iter, pfn,
> > > +                                     prefault);
> > > +
> > > +                     break;
> > > +             } else if (!is_present_direct_pte(iter.old_pte) ||
> > > +                        is_large_pte(iter.old_pte)) {
> > > +                     /*
> > > +                      * The leaf PTE for this fault must be mapped at a
> > > +                      * lower level, so a non-leaf PTE must be inserted into
> > > +                      * the paging structure. If the assignment below
> > > +                      * succeeds, it will add the non-leaf PTE and a new
> > > +                      * page of page table memory. Then the iterator can
> > > +                      * traverse into that new page. If the atomic compare/
> > > +                      * exchange fails, the iterator will repeat the current
> > > +                      * PTE, so the only thing this function must do
> > > +                      * differently is return the page table memory to the
> > > +                      * vCPU's fault cache.
> > > +                      */
> > > +                     child_pt = mmu_memory_cache_alloc(pf_pt_cache);
> > > +                     new_pte = generate_nonleaf_pte(child_pt, false);
> > > +
> > > +                     if (!direct_walk_iterator_set_pte(&iter, new_pte))
> > > +                             mmu_memory_cache_return(pf_pt_cache, child_pt);
> > > +             }
> > > +     }
> >
> > I have a question on how this will guarantee safe concurrency...
> >
> > As you mentioned previously somewhere, the design somehow mimics how
> > the core mm works with process page tables, and IIUC here the rwlock
> > works really like the mmap_sem that we have for the process mm.  So
> > with this series we can now have multiple page faults happening with
> > the read lock of the mmu_lock held when we reach here.
> 
> Ah, I'm sorry if I put that down somewhere. I think that comparing the
> MMU rwlock in this series to the core mm mmap_sem was a mistake. I do
> not understand the ways in which the core mm uses the mmap_sem enough
> to make such a comparison. You're correct that with two faulting vCPUs
> we could have page faults on the same address range happening in
> parallel. I'll try to elaborate more on why that's safe.
> 
> > Then I'm imagining a case where both vcpu threads faulted on the same
> > address range but wanted to do different things, like: (1) the vcpu1
> > thread wanted to map this as a 2M huge page, while (2) the vcpu2
> > thread wanted to map this as a 4K page.
> 
> By vcpu thread, do you mean the page fault / EPT violation handler
> wants to map memory at different levels? As far as I understand,
> vCPUs do not have any intent to map a page at a certain level when
> they take an EPT violation. The page fault handlers could certainly
> want to map the memory at different levels. For example, if guest
> memory was backed with 2M hugepages and one vCPU tried to do an
> instruction fetch on an unmapped page while another tried to read it,
> that should result in the page fault handler for the first vCPU trying
> to map at 4K and the other trying to map at 2M, as in your example.
> 
> > Then is it possible that
> > vcpu2 is faster, so it first sets up the pmd as a page table page (via
> > direct_walk_iterator_set_pte above),
> 
> This is definitely possible
> 
> > then vcpu1 quickly overwrites it
> > as a huge page (via direct_page_fault_handle_target_level, level=2),
> > and then I feel like the page table page that was set up by vcpu2 can
> > be lost unnoticed.
> 
> There are two possibilities here. 1.) vCPU1 saw vCPU2's modification
> to the PTE during its walk. In this case, vCPU1 should not map the
> memory at 2M. (I realize that in this example there is a discrepancy
> as there's no NX hugepage support in this RFC. I need to add that in
> the next version. In this case, vCPU2 would set a bit in the non-leaf
> PTE to indicate it was split to allow X on a constituent 4K entry.)
> 2.) If vCPU1 did not see vCPU2's modification during its walk, it will
> indeed try to map the memory at 2M. However in this case the atomic
> cmpxchg on the PTE will fail because vCPU1 did not have the current
> value of the PTE. In that case the PTE will be re-read and the walk
> will continue or the page fault will be retried. When threads using
> the direct walk iterator change PTEs with an atomic cmpxchg, they are
> guaranteed to know what the value of the PTE was before the cmpxchg,
> and so that thread is then responsible for any cleanup associated with
> the PTE modification - e.g. freeing pages of page table memory.
> 
> > I think the general process page table does not have this issue
> > because it has a per-pmd lock, so anyone who changes the pmd or
> > anything beneath it will need to take that.  However here we don't
> > have it; instead we only depend on the atomic ops, which seems to be
> > not enough for this?
> 
> I think that atomic ops (plus rcu to ensure no use-after-free) are
> enough in this case, but I could definitely be wrong. If your concern
> about the race requires the NX hugepages stuff, I need to get on top
> of sending out those patches. If you can think of a race that doesn't
> require that, I'd be very excited to hear it.

Actually nx_huge_pages is exactly the thing I thought about for this
case when I was trying to find a real scenario (because in most cases,
even if vcpu1 & vcpu2 trap at the same address, they still seem to
map the pages in the same way).  But yes, if you're even prepared for
that (so IIUC the 2M mapping will respect the 4K mappings in that
case) then it looks reasonable.

And I think I was wrong above in that the page should not be leaked
anyway, since I just noticed handle_changed_pte() should take care of
that, iiuc:

	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		/*
		 * The level of the page table being freed is one level lower
		 * than the level at which it is mapped.
		 */
		child_level = level - 1;

		/*
		 * If there was a present non-leaf entry before, and now the
		 * entry points elsewhere, the lpage stats and dirty logging /
		 * access tracking status for all the entries the old pte
		 * pointed to must be updated and the page table pages it
		 * pointed to must be freed.
		 */
		handle_disconnected_pt(kvm, as_id, gfn, spte_to_pfn(old_pte),
				       child_level, vm_teardown,
				       disconnected_pts);
	}

With that, I don't have any other concerns so far.  Will wait for your
next version.

Thanks,

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0696658b527c..f3a26a32c8174 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1117,6 +1117,24 @@  static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
+/*
+ * Return an unused object to the specified cache. The object's memory should
+ * be zeroed before being returned if that memory was modified after allocation
+ * from the cache.
+ */
+static void mmu_memory_cache_return(struct kvm_mmu_memory_cache *mc,
+				     void *obj)
+{
+	/*
+	 * Since this object was allocated from the cache, the cache should
+	 * have at least one spare capacity to put the object back.
+	 */
+	BUG_ON(mc->nobjs >= ARRAY_SIZE(mc->objects));
+
+	mc->objects[mc->nobjs] = obj;
+	mc->nobjs++;
+}
+
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
 	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
@@ -2426,6 +2444,21 @@  static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
 	return r;
 }
 
+static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
+{
+	u64 pte;
+
+	pte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+	if (ad_disabled)
+		pte |= shadow_acc_track_value;
+	else
+		pte |= shadow_accessed_mask;
+
+	return pte;
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
@@ -3432,13 +3465,7 @@  static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
-
-	if (sp_ad_disabled(sp))
-		spte |= shadow_acc_track_value;
-	else
-		spte |= shadow_accessed_mask;
+	spte = generate_nonleaf_pte(sp->spt, sp_ad_disabled(sp));
 
 	mmu_spte_set(sptep, spte);
 
@@ -4071,6 +4098,126 @@  static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 	return ret;
 }
 
+static int direct_page_fault_handle_target_level(struct kvm_vcpu *vcpu,
+		int write, int map_writable, struct direct_walk_iterator *iter,
+		kvm_pfn_t pfn, bool prefault)
+{
+	u64 new_pte;
+	int ret = 0;
+	int generate_pte_ret = 0;
+
+	if (unlikely(is_noslot_pfn(pfn))) {
+		new_pte = generate_mmio_pte(vcpu, iter->pte_gfn_start, ACC_ALL);
+	} else {
+		generate_pte_ret = generate_pte(vcpu, ACC_ALL, iter->level,
+						iter->pte_gfn_start, pfn,
+						iter->old_pte, prefault, false,
+						map_writable, false, &new_pte);
+		/* Failed to construct a PTE. Retry the page fault. */
+		if (!new_pte)
+			return RET_PF_RETRY;
+	}
+
+	/*
+	 * If the page fault was caused by a write but the page is write
+	 * protected, emulation is needed. If the emulation was skipped,
+	 * the vcpu would have the same fault again.
+	 */
+	if ((generate_pte_ret & SET_SPTE_WRITE_PROTECTED_PT) && write)
+		ret = RET_PF_EMULATE;
+
+	/* If an MMIO PTE was installed, the MMIO will need to be emulated. */
+	if (unlikely(is_mmio_spte(new_pte)))
+		ret = RET_PF_EMULATE;
+
+	/*
+	 * If this would not change the PTE then some other thread must have
+	 * already fixed the page fault and there's no need to proceed.
+	 */
+	if (iter->old_pte == new_pte)
+		return ret;
+
+	/*
+	 * If this warning were to trigger, it would indicate that there was a
+	 * missing MMU notifier or this thread raced with some notifier
+	 * handler. The page fault handler should never change a present, leaf
+	 * PTE to point to a different PFN. A notifier handler should have
+	 * zapped the PTE before the main MM's page table was changed.
+	 */
+	WARN_ON(is_present_direct_pte(iter->old_pte) &&
+		is_present_direct_pte(new_pte) &&
+		is_last_spte(iter->old_pte, iter->level) &&
+		is_last_spte(new_pte, iter->level) &&
+		spte_to_pfn(iter->old_pte) != spte_to_pfn(new_pte));
+
+	/*
+	 * If the page fault handler lost the race to set the PTE, retry the
+	 * page fault.
+	 */
+	if (!direct_walk_iterator_set_pte(iter, new_pte))
+		return RET_PF_RETRY;
+
+	/*
+	 * Update some stats for this page fault, if the page
+	 * fault was not speculative.
+	 */
+	if (!prefault)
+		vcpu->stat.pf_fixed++;
+
+	return ret;
+
+}
+
+static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
+		unsigned long mmu_seq, int write, int map_writable, int level,
+		gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+{
+	struct direct_walk_iterator iter;
+	struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
+	u64 *child_pt;
+	u64 new_pte;
+	int ret = RET_PF_RETRY;
+
+	direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
+			kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
+			(gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
+	while (direct_walk_iterator_next_pte(&iter)) {
+		if (iter.level == level) {
+			ret = direct_page_fault_handle_target_level(vcpu,
+					write, map_writable, &iter, pfn,
+					prefault);
+
+			break;
+		} else if (!is_present_direct_pte(iter.old_pte) ||
+			   is_large_pte(iter.old_pte)) {
+			/*
+			 * The leaf PTE for this fault must be mapped at a
+			 * lower level, so a non-leaf PTE must be inserted into
+			 * the paging structure. If the assignment below
+			 * succeeds, it will add the non-leaf PTE and a new
+			 * page of page table memory. Then the iterator can
+			 * traverse into that new page. If the atomic compare/
+			 * exchange fails, the iterator will repeat the current
+			 * PTE, so the only thing this function must do
+			 * differently is return the page table memory to the
+			 * vCPU's fault cache.
+			 */
+			child_pt = mmu_memory_cache_alloc(pf_pt_cache);
+			new_pte = generate_nonleaf_pte(child_pt, false);
+
+			if (!direct_walk_iterator_set_pte(&iter, new_pte))
+				mmu_memory_cache_return(pf_pt_cache, child_pt);
+		}
+	}
+	direct_walk_iterator_end_traversal(&iter);
+
+	/* If emulating, flush this vcpu's TLB. */
+	if (ret == RET_PF_EMULATE)
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+	return ret;
+}
+
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
 {
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
@@ -5014,7 +5161,7 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
-	bool map_writable;
+	bool map_writable = false;
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -5035,8 +5182,9 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
 
-	if (fast_page_fault(vcpu, gpa, level, error_code))
-		return RET_PF_RETRY;
+	if (!vcpu->kvm->arch.direct_mmu_enabled)
+		if (fast_page_fault(vcpu, gpa, level, error_code))
+			return RET_PF_RETRY;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -5048,17 +5196,31 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	r = RET_PF_RETRY;
-	write_lock(&vcpu->kvm->mmu_lock);
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		read_lock(&vcpu->kvm->mmu_lock);
+	else
+		write_lock(&vcpu->kvm->mmu_lock);
+
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		r = handle_direct_page_fault(vcpu, mmu_seq, write, map_writable,
+				level, gpa, gfn, pfn, prefault);
+	else
+		r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+				 prefault);
 
 out_unlock:
-	write_unlock(&vcpu->kvm->mmu_lock);
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		read_unlock(&vcpu->kvm->mmu_lock);
+	else
+		write_unlock(&vcpu->kvm->mmu_lock);
+
 	kvm_release_pfn_clean(pfn);
 	return r;
 }
@@ -6242,6 +6404,10 @@  static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
 	LIST_HEAD(invalid_list);
 
+	if (vcpu->arch.mmu->direct_map && vcpu->kvm->arch.direct_mmu_enabled)
+		/* Reclaim is a todo. */
+		return 0;
+
 	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
 		return 0;