[19/24] kvm: x86/mmu: Protect tdp_mmu_pages with a lock

Message ID 20210112181041.356734-20-bgardon@google.com (mailing list archive)
State New, archived
Series Allow parallel page faults with TDP MMU

Commit Message

Ben Gardon Jan. 12, 2021, 6:10 p.m. UTC
Add a lock to protect the data structures that track the page table
memory used by the TDP MMU. In order to handle multiple TDP MMU
operations in parallel, pages of PT memory must be added and removed
without the exclusive protection of the MMU lock. A new lock to protect
the list(s) of in-use pages will cause some serialization, but only on
non-leaf page table entries, so the lock is not expected to be very
contended.

Reviewed-by: Peter Feiner <pfeiner@google.com>

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/include/asm/kvm_host.h | 15 ++++++++
 arch/x86/kvm/mmu/tdp_mmu.c      | 67 +++++++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 8 deletions(-)

Comments

Sean Christopherson Jan. 21, 2021, 7:22 p.m. UTC | #1
On Tue, Jan 12, 2021, Ben Gardon wrote:
> Add a lock to protect the data structures that track the page table
> memory used by the TDP MMU. In order to handle multiple TDP MMU
> operations in parallel, pages of PT memory must be added and removed
> without the exclusive protection of the MMU lock. A new lock to protect
> the list(s) of in-use pages will cause some serialization, but only on
> non-leaf page table entries, so the lock is not expected to be very
> contended.
> 
> Reviewed-by: Peter Feiner <pfeiner@google.com>
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 15 ++++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 67 +++++++++++++++++++++++++++++----
>  2 files changed, 74 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 92d5340842c8..f8dccb27c722 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1034,6 +1034,21 @@ struct kvm_arch {
>  	 * tdp_mmu_page set and a root_count of 0.
>  	 */
>  	struct list_head tdp_mmu_pages;
> +
> +	/*
> +	 * Protects accesses to the following fields when the MMU lock is
> +	 * not held exclusively:
> +	 *  - tdp_mmu_pages (above)
> +	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
> +	 *    when they are part of tdp_mmu_pages (but not when they are part
> +	 *    of the tdp_mmu_free_list or tdp_mmu_disconnected_list)

Neither tdp_mmu_free_list nor tdp_mmu_disconnected_list exists.

> +	 *  - lpage_disallowed_mmu_pages
> +	 *  - the lpage_disallowed_link field of struct kvm_mmu_pages used
> +	 *    by the TDP MMU
> +	 *  May be acquired under the MMU lock in read mode or non-overlapping
> +	 *  with the MMU lock.
> +	 */
> +	spinlock_t tdp_mmu_pages_lock;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 8b61bdb391a0..264594947c3b 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -33,6 +33,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>  	kvm->arch.tdp_mmu_enabled = true;
>  
>  	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
> +	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
>  	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
>  }
>  
> @@ -262,6 +263,58 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
>  	}
>  }
>  
> +/**
> + * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
> + *
> + * @kvm: kvm instance
> + * @sp: the new page
> + * @atomic: This operation is not running under the exclusive use of the MMU
> + *	    lock and the operation must be atomic with respect to other threads
> + *	    that might be adding or removing pages.
> + * @account_nx: This page replaces an NX large page and should be marked for
> + *		eventual reclaim.
> + */
> +static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> +			      bool atomic, bool account_nx)
> +{
> +	if (atomic)

This is unnecessary; there is exactly one caller and it is always "atomic".

Assuming some of this code lives on (see below), I'd prefer a different name
than "atomic".  Writing the SPTE is atomic (though even that is a bit of a lie,
e.g. tdp_mmu_zap_spte_atomic() is very much not atomic), but all the other
operations are the exact opposite of atomic.

Maybe change it from a bool to an enum with READ/WRITE_LOCKED or something?
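
A rough sketch of that alternative, purely for illustration (the enum and its
values are invented here, not something in this series):

	enum tdp_mmu_lock_mode {
		TDP_MMU_LOCK_WRITE_LOCKED,	/* MMU lock held exclusively */
		TDP_MMU_LOCK_READ_LOCKED,	/* MMU lock held shared */
	};

	static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				      enum tdp_mmu_lock_mode lock_mode,
				      bool account_nx)
	{
		/* Only a shared MMU lock needs the pages lock for mutual exclusion. */
		if (lock_mode == TDP_MMU_LOCK_READ_LOCKED)
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		else
			kvm_mmu_lock_assert_held_exclusive(kvm);

		list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
		if (account_nx)
			account_huge_nx_page(kvm, sp);

		if (lock_mode == TDP_MMU_LOCK_READ_LOCKED)
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	}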

> +		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> +	else
> +		kvm_mmu_lock_assert_held_exclusive(kvm);
> +
> +	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
> +	if (account_nx)
> +		account_huge_nx_page(kvm, sp);
> +
> +	if (atomic)
> +		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +}
> +
> +/**
> + * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
> + *
> + * @kvm: kvm instance
> + * @sp: the page to be removed
> + * @atomic: This operation is not running under the exclusive use of the MMU
> + *	    lock and the operation must be atomic with respect to other threads
> + *	    that might be adding or removing pages.
> + */
> +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> +				bool atomic)
> +{
> +	if (atomic)

Summarizing an off-list discussion with Ben:

This path isn't reachable in this series, which means all the RCU stuff is more
or less untestable.  Only the page fault path modifies the MMU while holding a read
lock, and it can't zap non-leaf shadow pages (only zaps large SPTEs and installs
new SPs).

The intent is to convert other zap-happy paths to a read lock, notably
kvm_mmu_zap_collapsible_sptes() and kvm_recover_nx_lpages().  Ben will include
patches to convert at least one of those in the next version of this series so
that there is justification and coverage for the RCU-deferred freeing.

> +		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> +	else
> +		kvm_mmu_lock_assert_held_exclusive(kvm);
> +	list_del(&sp->link);
> +	if (sp->lpage_disallowed)
> +		unaccount_huge_nx_page(kvm, sp);
> +
> +	if (atomic)
> +		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +}
> +
>  /**
>   * handle_disconnected_tdp_mmu_page - handle a pt removed from the TDP structure
>   *
Sean Christopherson Jan. 21, 2021, 9:32 p.m. UTC | #2
On Thu, Jan 21, 2021, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Ben Gardon wrote:
> > +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> > +				bool atomic)
> > +{
> > +	if (atomic)
> 
> Summarizing an off-list discussion with Ben:
> 
> This path isn't reachable in this series, which means all the RCU stuff is more
> or less untestable.  Only the page fault path modifies the MMU while holding a read
> lock, and it can't zap non-leaf shadow pages (only zaps large SPTEs and installs
> new SPs).

Aha!  I was wrong.  This will be hit when KVM zaps a 4k SPTE and installs a
large SPTE overtop a SP, e.g. if the host migrates a page for compaction and
creates a new THP.

  tdp_mmu_map_handle_target_level()
     tdp_mmu_set_spte_atomic()
       handle_changed_spte()
         __handle_changed_spte()
	   handle_disconnected_tdp_mmu_page()
	     tdp_mmu_unlink_page()

> The intent is to convert other zap-happy paths to a read lock, notably
> kvm_mmu_zap_collapsible_sptes() and kvm_recover_nx_lpages().  Ben will include
> patches to convert at least one of those in the next version of this series so
> that there is justification and coverage for the RCU-deferred freeing.

Somewhat offtopic, zap_collapsible_spte_range() looks wrong.  It zaps non-leaf
SPs, and has several comments that make it quite clear that that's its intent,
but the logic is messed up.  For non-leaf SPs, PFN points at the next table, not
the final PFN that is mapped into the guest.  That absolutely should never be a
reserved PFN, and whether or not it's a huge page is irrelevant.  My analysis is
more or less confirmed by looking at Ben's internal code, which explicitly does
the exact opposite in that it explicitly zaps leaf SPTEs.

	tdp_root_for_each_pte(iter, root, start, end) {
		/* Ensure forward progress has been made before yielding. */
		if (iter.goal_gfn != last_goal_gfn &&
		    tdp_mmu_iter_flush_cond_resched(kvm, &iter)) {
			last_goal_gfn = iter.goal_gfn;
			spte_set = false;
			/*
			 * Yielding caused the paging structure walk to be
			 * reset so skip to the next iteration to continue the
			 * walk from the root.
			 */
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    is_last_spte(iter.old_spte, iter.level)) <--- inverted?
			continue;

		pfn = spte_to_pfn(iter.old_spte); <-- this would be the page table?
		if (kvm_is_reserved_pfn(pfn) ||
		    !PageTransCompoundMap(pfn_to_page(pfn)))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		spte_set = true;
	}


Coming back to this series, I wonder if the RCU approach is truly necessary to
get the desired scalability.  If both zap_collapsible_sptes() and NX huge page
recovery zap _only_ leaf SPTEs, then the only path that can actually unlink a
shadow page while holding the lock for read is the page fault path that installs
a huge page over an existing shadow page.

Assuming the above analysis is correct, I think it's worth exploring alternatives
to using RCU to defer freeing the SP memory, e.g. promoting to a write lock in
the specific case of overwriting a SP (though that may not exist for rwlocks),
or maybe something entirely different?

I actually do like the deferred free concept, but I find it difficult to reason
about exactly what protections are provided by RCU, and what even _needs_ to be
protected.  Maybe we just need to add some __rcu annotations?
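
For what it's worth, a minimal sketch of the sparse-checked annotation (the
typedef and accessor below are illustrative, not something this series adds):

	/* SPTE pointers handed out by the TDP iterator, valid only under RCU. */
	typedef u64 __rcu *tdp_ptep_t;

	static u64 tdp_mmu_read_spte(tdp_ptep_t sptep)
	{
		/* rcu_dereference() is what makes sparse enforce __rcu. */
		return READ_ONCE(*rcu_dereference(sptep));
	}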
Paolo Bonzini Jan. 26, 2021, 1:37 p.m. UTC | #3
On 12/01/21 19:10, Ben Gardon wrote:
> +	 *  May be acquired under the MMU lock in read mode or non-overlapping
> +	 *  with the MMU lock.
> +	 */
> +	spinlock_t tdp_mmu_pages_lock;

Is this correct?  My understanding is that:

- you can take tdp_mmu_pages_lock from a shared MMU lock critical section

- you don't need to take tdp_mmu_pages_lock from an exclusive MMU lock 
critical section, because you can't be concurrent with a shared critical 
section

- but then, you can't take tdp_mmu_pages_lock outside the MMU lock, 
because you could have

    write_lock(mmu_lock)
                                      spin_lock(tdp_mmu_pages_lock)
    do tdp_mmu_pages_lock stuff  !!!  do tdp_mmu_pages_lock stuff
    write_unlock(mmu_lock)
                                      spin_unlock(tdp_mmu_pages_lock)

Paolo
Paolo Bonzini Jan. 26, 2021, 2:27 p.m. UTC | #4
On 21/01/21 22:32, Sean Christopherson wrote:
> Coming back to this series, I wonder if the RCU approach is truly necessary to
> get the desired scalability.  If both zap_collapsible_sptes() and NX huge page
> recovery zap _only_ leaf SPTEs, then the only path that can actually unlink a
> shadow page while holding the lock for read is the page fault path that installs
> a huge page over an existing shadow page.
> 
> Assuming the above analysis is correct, I think it's worth exploring alternatives
> to using RCU to defer freeing the SP memory, e.g. promoting to a write lock in
> the specific case of overwriting a SP (though that may not exist for rwlocks),
> or maybe something entirely different?

You can do the deferred freeing with a short write-side critical section 
to ensure all readers have terminated.

If the bool argument to handle_disconnected_tdp_mmu_page is true(*), the 
pages would be added to an llist, instead of being freed immediately. 
At the end of a shared critical section you would do

	if (!llist_empty(&kvm->arch.tdp_mmu_disconnected_pages)) {
		struct llist_node *first;
		kvm_mmu_lock(kvm);
		first = __llist_del_all(&kvm->arch.tdp_mmu_disconnected_pages);
		kvm_mmu_unlock(kvm);

		/*
		 * All vCPUs have already stopped using the pages when
		 * their TLBs were flushed.  The exclusive critical
		 * section above means that there can be no readers
		 * either.
		 */
		tdp_mmu_free_disconnected_pages(first);
	}

So this is still deferred reclamation, but it's done by one of the vCPUs 
rather than a worker RCU thread.  This would replace patches 11/12/13 
and probably would be implemented after patch 18.
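
For reference, the producer side of that scheme might look roughly like the
sketch below; the llist_node field in struct kvm_mmu_page, the "shared" flag
and the free helper are assumptions for illustration:

	/* In handle_disconnected_tdp_mmu_page(): when the MMU lock is only
	 * held for read, defer the free to the next exclusive critical
	 * section instead of freeing the page immediately.
	 */
	if (shared)
		llist_add(&sp->deferred_free, &kvm->arch.tdp_mmu_disconnected_pages);
	else
		tdp_mmu_free_page(sp);

	/* Drained with the MMU lock held exclusively, as in the snippet above. */
	static void tdp_mmu_free_disconnected_pages(struct llist_node *first)
	{
		struct kvm_mmu_page *sp, *next;

		llist_for_each_entry_safe(sp, next, first, deferred_free)
			tdp_mmu_free_page(sp);
	}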

Paolo

(*) this idea is what prompted the comment about s/atomic/shared/
Ben Gardon Jan. 26, 2021, 9:07 p.m. UTC | #5
On Tue, Jan 26, 2021 at 5:37 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 12/01/21 19:10, Ben Gardon wrote:
> > +      *  May be acquired under the MMU lock in read mode or non-overlapping
> > +      *  with the MMU lock.
> > +      */
> > +     spinlock_t tdp_mmu_pages_lock;
>
> Is this correct?  My understanding is that:
>
> - you can take tdp_mmu_pages_lock from a shared MMU lock critical section
>
> - you don't need to take tdp_mmu_pages_lock from an exclusive MMU lock
> critical section, because you can't be concurrent with a shared critical
> section
>
> - but then, you can't take tdp_mmu_pages_lock outside the MMU lock,
> because you could have
>
>     write_lock(mmu_lock)
>                                       spin_lock(tdp_mmu_pages_lock)
>     do tdp_mmu_pages_lock stuff  !!!  do tdp_mmu_pages_lock stuff
>     write_unlock(mmu_lock)
>                                       spin_unlock(tdp_mmu_pages_lock)
>

You're absolutely right; that would cause a problem. I'll amend the
comment to specify that the lock should only be held under the mmu
lock in read mode.
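
(Presumably something along these lines for the last two lines of that
comment; the wording is just a guess at the amendment:)

	 *  May only be acquired while the MMU lock is held in read mode;
	 *  holders of the MMU lock in write mode touch these fields without
	 *  taking it, so acquiring it outside the MMU lock would race with
	 *  an exclusive MMU lock holder.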

> Paolo
>
Ben Gardon Jan. 26, 2021, 9:47 p.m. UTC | #6
On Tue, Jan 26, 2021 at 6:28 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 21/01/21 22:32, Sean Christopherson wrote:
> > Coming back to this series, I wonder if the RCU approach is truly necessary to
> > get the desired scalability.  If both zap_collapsible_sptes() and NX huge page
> > recovery zap _only_ leaf SPTEs, then the only path that can actually unlink a
> > shadow page while holding the lock for read is the page fault path that installs
> > a huge page over an existing shadow page.
> >
> > Assuming the above analysis is correct, I think it's worth exploring alternatives
> > to using RCU to defer freeing the SP memory, e.g. promoting to a write lock in
> > the specific case of overwriting a SP (though that may not exist for rwlocks),
> > or maybe something entirely different?
>
> You can do the deferred freeing with a short write-side critical section
> to ensure all readers have terminated.
>
> If the bool argument to handle_disconnected_tdp_mmu_page is true(*), the
> pages would be added to an llist, instead of being freed immediately.
> At the end of a shared critical section you would do
>
>         if (!llist_empty(&kvm->arch.tdp_mmu_disconnected_pages)) {
>                 struct llist_node *first;
>                 kvm_mmu_lock(kvm);
>                 first = __llist_del_all(&kvm->arch.tdp_mmu_disconnected_pages);
>                 kvm_mmu_unlock(kvm);
>
>                 /*
>                  * All vCPUs have already stopped using the pages when
>                  * their TLBs were flushed.  The exclusive critical
>                  * section above means that there can be no readers
>                  * either.
>                  */
>                 tdp_mmu_free_disconnected_pages(first);
>         }
>
> So this is still deferred reclamation, but it's done by one of the vCPUs
> rather than a worker RCU thread.  This would replace patches 11/12/13
> and probably would be implemented after patch 18.

While I agree that this would work, it could be a major performance
bottleneck as it could result in the MMU lock being acquired in write
mode by a VCPU thread handling a page fault. Even though the critical
section is very short it still has to serialize with the potentially
many overlapping page fault handlers which want the MMU read lock. In
order to perform well with hundreds of vCPUs, the vCPU threads really
cannot be acquiring the MMU lock in write mode. The MMU lock above
could be replaced with the TDP MMU pages lock, but that still adds
serialization where it's not really necessary.
The use of RCU also provides a nice separation of concerns, freeing
the various functions which need to remove pages from the paging
structure from having to follow up on freeing them later.

>
> Paolo
>
> (*) this idea is what prompted the comment about s/atomic/shared/
>
Sean Christopherson Jan. 26, 2021, 10:02 p.m. UTC | #7
On Tue, Jan 26, 2021, Paolo Bonzini wrote:
> On 21/01/21 22:32, Sean Christopherson wrote:
> > Coming back to this series, I wonder if the RCU approach is truly necessary to
> > get the desired scalability.  If both zap_collapsible_sptes() and NX huge page
> > recovery zap _only_ leaf SPTEs, then the only path that can actually unlink a
> > shadow page while holding the lock for read is the page fault path that installs
> > a huge page over an existing shadow page.
> > 
> > Assuming the above analysis is correct, I think it's worth exploring alternatives
> > to using RCU to defer freeing the SP memory, e.g. promoting to a write lock in
> > the specific case of overwriting a SP (though that may not exist for rwlocks),
> > or maybe something entirely different?
> 
> You can do the deferred freeing with a short write-side critical section to
> ensure all readers have terminated.

Hmm, the most obvious downside I see is that the zap_collapsible_sptes() case
will not scale as well as the RCU approach.  E.g. the lock may be heavily
contested when refaulting all of guest memory to (re)install huge pages after a
failed migration.

Though I wonder, could we do something even more clever for that particular
case?  And I suppose it would apply to NX huge pages as well.  Instead of
zapping the leaf PTEs and letting the fault handler install the huge page, do an
in-place promotion when dirty logging is disabled.  That could all be done under
the read lock, and with Paolo's method for deferred free on the back end.  That
way only the thread doing the memslot update would take mmu_lock for write, and
only once per memslot update.

> If the bool argument to handle_disconnected_tdp_mmu_page is true(*), the
> pages would be added to an llist, instead of being freed immediately. At the
> end of a shared critical section you would do
> 
> 	if (!llist_empty(&kvm->arch.tdp_mmu_disconnected_pages)) {
> 		struct llist_node *first;
> 		kvm_mmu_lock(kvm);
> 		first = __llist_del_all(&kvm->arch.tdp_mmu_disconnected_pages);
> 		kvm_mmu_unlock(kvm);
> 
> 		/*
> 		 * All vCPUs have already stopped using the pages when
> 		 * their TLBs were flushed.  The exclusive critical
> 		 * section above means that there can be no readers
> 		 * either.
> 		 */
> 		tdp_mmu_free_disconnected_pages(first);
> 	}
> 
> So this is still deferred reclamation, but it's done by one of the vCPUs
> rather than a worker RCU thread.  This would replace patches 11/12/13 and
> probably would be implemented after patch 18.
> 
> Paolo
> 
> (*) this idea is what prompted the comment about s/atomic/shared/
>
Sean Christopherson Jan. 26, 2021, 10:09 p.m. UTC | #8
On Tue, Jan 26, 2021, Sean Christopherson wrote:
> On Tue, Jan 26, 2021, Paolo Bonzini wrote:
> > On 21/01/21 22:32, Sean Christopherson wrote:
> > > Coming back to this series, I wonder if the RCU approach is truly necessary to
> > > get the desired scalability.  If both zap_collapsible_sptes() and NX huge page
> > > recovery zap _only_ leaf SPTEs, then the only path that can actually unlink a
> > > shadow page while holding the lock for read is the page fault path that installs
> > > a huge page over an existing shadow page.
> > > 
> > > Assuming the above analysis is correct, I think it's worth exploring alternatives
> > > to using RCU to defer freeing the SP memory, e.g. promoting to a write lock in
> > > the specific case of overwriting a SP (though that may not exist for rwlocks),
> > > or maybe something entirely different?
> > 
> > You can do the deferred freeing with a short write-side critical section to
> > ensure all readers have terminated.
> 
> Hmm, the most obvious downside I see is that the zap_collapsible_sptes() case
> will not scale as well as the RCU approach.  E.g. the lock may be heavily
> contested when refaulting all of guest memory to (re)install huge pages after a
> failed migration.
> 
> Though I wonder, could we do something even more clever for that particular
> case?  And I suppose it would apply to NX huge pages as well.  Instead of
> zapping the leaf PTEs and letting the fault handler install the huge page, do an
> in-place promotion when dirty logging is disabled.  That could all be done under
> the read lock, and with Paolo's method for deferred free on the back end.  That
> way only the thread doing the memslot update would take mmu_lock for write, and
> only once per memslot update.

Oh, and we could even skip the remote TLB flush in that case since the GPA->HPA
translation is unchanged.
Paolo Bonzini Jan. 27, 2021, 12:40 p.m. UTC | #9
On 26/01/21 23:02, Sean Christopherson wrote:
>> You can do the deferred freeing with a short write-side critical section to
>> ensure all readers have terminated.
>
> Hmm, the most obvious downside I see is that the zap_collapsible_sptes() case
> will not scale as well as the RCU approach.  E.g. the lock may be heavily
> contested when refaulting all of guest memory to (re)install huge pages after a
> failed migration.

The simplest solution is to use a write_trylock on the read_unlock() 
path; if it fails, schedule a delayed work item 1 second in the future 
so that it's possible to do some batching.

(The work item would also have to re-check the llist after each iteration.)
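
Roughly, something like the sketch below on the shared-unlock path; the
trylock wrapper, the delayed work item and the free helper are assumptions
(this also assumes mmu_lock is the rwlock introduced earlier in the series):

	/* Shared-unlock path: reclaim immediately if the write lock is free,
	 * otherwise batch the reclaim in a delayed work item (which must
	 * re-check the llist after each iteration).
	 */
	if (!llist_empty(&kvm->arch.tdp_mmu_disconnected_pages)) {
		if (kvm_mmu_write_trylock(kvm)) {
			struct llist_node *first;

			first = __llist_del_all(&kvm->arch.tdp_mmu_disconnected_pages);
			kvm_mmu_unlock(kvm);
			tdp_mmu_free_disconnected_pages(first);
		} else {
			schedule_delayed_work(&kvm->arch.tdp_mmu_reclaim_work, HZ);
		}
	}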

Paolo

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 92d5340842c8..f8dccb27c722 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1034,6 +1034,21 @@  struct kvm_arch {
 	 * tdp_mmu_page set and a root_count of 0.
 	 */
 	struct list_head tdp_mmu_pages;
+
+	/*
+	 * Protects accesses to the following fields when the MMU lock is
+	 * not held exclusively:
+	 *  - tdp_mmu_pages (above)
+	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
+	 *    when they are part of tdp_mmu_pages (but not when they are part
+	 *    of the tdp_mmu_free_list or tdp_mmu_disconnected_list)
+	 *  - lpage_disallowed_mmu_pages
+	 *  - the lpage_disallowed_link field of struct kvm_mmu_pages used
+	 *    by the TDP MMU
+	 *  May be acquired under the MMU lock in read mode or non-overlapping
+	 *  with the MMU lock.
+	 */
+	spinlock_t tdp_mmu_pages_lock;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 8b61bdb391a0..264594947c3b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -33,6 +33,7 @@  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	kvm->arch.tdp_mmu_enabled = true;
 
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
@@ -262,6 +263,58 @@  static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 	}
 }
 
+/**
+ * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the new page
+ * @atomic: This operation is not running under the exclusive use of the MMU
+ *	    lock and the operation must be atomic with respect to other threads
+ *	    that might be adding or removing pages.
+ * @account_nx: This page replaces an NX large page and should be marked for
+ *		eventual reclaim.
+ */
+static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+			      bool atomic, bool account_nx)
+{
+	if (atomic)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		kvm_mmu_lock_assert_held_exclusive(kvm);
+
+	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+	if (account_nx)
+		account_huge_nx_page(kvm, sp);
+
+	if (atomic)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ * @atomic: This operation is not running under the exclusive use of the MMU
+ *	    lock and the operation must be atomic with respect to other threads
+ *	    that might be adding or removing pages.
+ */
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				bool atomic)
+{
+	if (atomic)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		kvm_mmu_lock_assert_held_exclusive(kvm);
+
+	list_del(&sp->link);
+	if (sp->lpage_disallowed)
+		unaccount_huge_nx_page(kvm, sp);
+
+	if (atomic)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
 /**
  * handle_disconnected_tdp_mmu_page - handle a pt removed from the TDP structure
  *
@@ -285,10 +338,7 @@  static void handle_disconnected_tdp_mmu_page(struct kvm *kvm, u64 *pt)
 
 	trace_kvm_mmu_prepare_zap_page(sp);
 
-	list_del(&sp->link);
-
-	if (sp->lpage_disallowed)
-		unaccount_huge_nx_page(kvm, sp);
+	tdp_mmu_unlink_page(kvm, sp, atomic);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 		old_child_spte = READ_ONCE(*(pt + i));
@@ -719,15 +769,16 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 
 		if (!is_shadow_present_pte(iter.old_spte)) {
 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
-			list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
 			child_pt = sp->spt;
+
+			tdp_mmu_link_page(vcpu->kvm, sp, false,
+					  huge_page_disallowed &&
+					  req_level >= iter.level);
+
 			new_spte = make_nonleaf_spte(child_pt,
 						     !shadow_accessed_mask);
 
 			trace_kvm_mmu_get_page(sp, true);
-			if (huge_page_disallowed && req_level >= iter.level)
-				account_huge_nx_page(vcpu->kvm, sp);
-
 			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
 		}
 	}