[v2,20/28] KVM: x86/mmu: Use atomic ops to set SPTEs in TDP MMU map

Message ID 20210202185734.1680553-21-bgardon@google.com (mailing list archive)
State New, archived
Series Allow parallel MMU operations with TDP MMU

Commit Message

Ben Gardon Feb. 2, 2021, 6:57 p.m. UTC
To prepare for handling page faults in parallel, change the TDP MMU
page fault handler to use atomic operations to set SPTEs so that changes
are not lost if multiple threads attempt to modify the same SPTE.

Reviewed-by: Peter Feiner <pfeiner@google.com>
Signed-off-by: Ben Gardon <bgardon@google.com>

---

v1 -> v2
- Rename "atomic" arg to "shared" in multiple functions
- Merged the commit that protects the lists of TDP MMU pages with a new
  lock
- Merged the commits to add an atomic option for setting SPTEs and to
  use that option in the TDP MMU page fault handler

 arch/x86/include/asm/kvm_host.h |  13 +++
 arch/x86/kvm/mmu/tdp_mmu.c      | 142 ++++++++++++++++++++++++--------
 2 files changed, 122 insertions(+), 33 deletions(-)
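
For orientation before the diff and the discussion below, a minimal sketch of
the race the patch guards against; the helper name and simplified signature
are made up for illustration (the real code is tdp_mmu_set_spte_atomic() in
the diff):

#include <asm/cmpxchg.h>

/*
 * Hypothetical helper: install new_spte only if the SPTE still holds the
 * value this thread last read.  With a plain WRITE_ONCE(), two vCPUs
 * faulting in parallel under the read-held MMU lock could silently
 * overwrite each other's update; with cmpxchg64() the losing thread sees
 * the conflict and can retry the page fault instead.
 */
static bool try_set_spte(u64 *sptep, u64 old_spte, u64 new_spte)
{
	return cmpxchg64(sptep, old_spte, new_spte) == old_spte;
}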

Comments

kernel test robot Feb. 3, 2021, 2:48 a.m. UTC | #1
Hi Ben,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/master]
[also build test ERROR on linux/master linus/master v5.11-rc6 next-20210125]
[cannot apply to kvm/linux-next tip/sched/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ben-Gardon/Allow-parallel-MMU-operations-with-TDP-MMU/20210203-032259
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git a7e0bdf1b07ea6169930ec42b0bdb17e1c1e3bb0
config: i386-allyesconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/54f2f26ad4d34bc74287a904d2eebc011974147c
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ben-Gardon/Allow-parallel-MMU-operations-with-TDP-MMU/20210203-032259
        git checkout 54f2f26ad4d34bc74287a904d2eebc011974147c
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from arch/x86/include/asm/atomic.h:8,
                    from include/linux/atomic.h:7,
                    from include/linux/cpumask.h:13,
                    from arch/x86/include/asm/cpumask.h:5,
                    from arch/x86/include/asm/msr.h:11,
                    from arch/x86/include/asm/processor.h:22,
                    from arch/x86/include/asm/cpufeature.h:5,
                    from arch/x86/include/asm/thread_info.h:53,
                    from include/linux/thread_info.h:56,
                    from arch/x86/include/asm/preempt.h:7,
                    from include/linux/preempt.h:78,
                    from include/linux/percpu.h:6,
                    from include/linux/context_tracking_state.h:5,
                    from include/linux/hardirq.h:5,
                    from include/linux/kvm_host.h:7,
                    from arch/x86/kvm/mmu.h:5,
                    from arch/x86/kvm/mmu/tdp_mmu.c:3:
   In function 'handle_removed_tdp_mmu_page',
       inlined from '__handle_changed_spte' at arch/x86/kvm/mmu/tdp_mmu.c:454:3:
>> arch/x86/include/asm/cmpxchg.h:67:4: error: call to '__xchg_wrong_size' declared with attribute error: Bad argument size for xchg
      67 |    __ ## op ## _wrong_size();   \
         |    ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/include/asm/cmpxchg.h:78:27: note: in expansion of macro '__xchg_op'
      78 | #define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
         |                           ^~~~~~~~~
   include/asm-generic/atomic-instrumented.h:1649:2: note: in expansion of macro 'arch_xchg'
    1649 |  arch_xchg(__ai_ptr, __VA_ARGS__); \
         |  ^~~~~~~~~
   arch/x86/kvm/mmu/tdp_mmu.c:350:21: note: in expansion of macro 'xchg'
     350 |    old_child_spte = xchg(sptep, 0);
         |                     ^~~~


vim +/__xchg_wrong_size +67 arch/x86/include/asm/cmpxchg.h

e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  37  
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  38  /* 
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  39   * An exchange-type operation, which takes a value and a pointer, and
7f5281ae8a8e7f Li Zhong            2013-04-25  40   * returns the old value.
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  41   */
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  42  #define __xchg_op(ptr, arg, op, lock)					\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  43  	({								\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  44  	        __typeof__ (*(ptr)) __ret = (arg);			\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  45  		switch (sizeof(*(ptr))) {				\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  46  		case __X86_CASE_B:					\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  47  			asm volatile (lock #op "b %b0, %1\n"		\
2ca052a3710fac Jeremy Fitzhardinge 2012-04-02  48  				      : "+q" (__ret), "+m" (*(ptr))	\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  49  				      : : "memory", "cc");		\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  50  			break;						\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  51  		case __X86_CASE_W:					\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  52  			asm volatile (lock #op "w %w0, %1\n"		\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  53  				      : "+r" (__ret), "+m" (*(ptr))	\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  54  				      : : "memory", "cc");		\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  55  			break;						\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  56  		case __X86_CASE_L:					\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  57  			asm volatile (lock #op "l %0, %1\n"		\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  58  				      : "+r" (__ret), "+m" (*(ptr))	\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  59  				      : : "memory", "cc");		\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  60  			break;						\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  61  		case __X86_CASE_Q:					\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  62  			asm volatile (lock #op "q %q0, %1\n"		\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  63  				      : "+r" (__ret), "+m" (*(ptr))	\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  64  				      : : "memory", "cc");		\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  65  			break;						\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  66  		default:						\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30 @67  			__ ## op ## _wrong_size();			\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  68  		}							\
31a8394e069e47 Jeremy Fitzhardinge 2011-09-30  69  		__ret;							\
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  70  	})
e9826380d83d1b Jeremy Fitzhardinge 2011-08-18  71  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Paolo Bonzini Feb. 3, 2021, 11:14 a.m. UTC | #2
On 02/02/21 19:57, Ben Gardon wrote:
> To prepare for handling page faults in parallel, change the TDP MMU
> page fault handler to use atomic operations to set SPTEs so that changes
> are not lost if multiple threads attempt to modify the same SPTE.
> 
> Reviewed-by: Peter Feiner <pfeiner@google.com>
> Signed-off-by: Ben Gardon <bgardon@google.com>
> 
> ---
> 
> v1 -> v2
> - Rename "atomic" arg to "shared" in multiple functions
> - Merged the commit that protects the lists of TDP MMU pages with a new
>    lock
> - Merged the commits to add an atomic option for setting SPTEs and to
>    use that option in the TDP MMU page fault handler

I'll look at the kernel test robot report if nobody beats me to it.  In 
the meanwhile here's some doc to squash in:

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index b21a34c34a21..bd03638f1e55 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows:
  - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
    them together is quite rare.

-On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
+On x86:
+
+- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+
+- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
+  taken inside kvm->arch.mmu_lock, and cannot be taken without already
+  holding kvm->arch.mmu_lock (with ``read_lock``, otherwise there's
+  no need to take kvm->arch.tdp_mmu_pages_lock at all).

  Everything else is a leaf: no other lock is taken inside the critical
  sections.
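
For illustration, a usage sketch of the ordering being documented; the
function name is hypothetical and simply mirrors what tdp_mmu_link_page()
does in the patch when called with shared == true:

/*
 * Sketch only: assumes the usual KVM MMU types (struct kvm,
 * struct kvm_mmu_page) from kvm_host.h and mmu_internal.h.
 */
static void example_link_page_shared(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	read_lock(&kvm->mmu_lock);			/* outer: MMU lock, read mode */

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);	/* inner: pages lock */
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	read_unlock(&kvm->mmu_lock);
}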

Paolo

>   arch/x86/include/asm/kvm_host.h |  13 +++
>   arch/x86/kvm/mmu/tdp_mmu.c      | 142 ++++++++++++++++++++++++--------
>   2 files changed, 122 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b6ebf2558386..78ebf56f2b37 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1028,6 +1028,19 @@ struct kvm_arch {
>   	 * tdp_mmu_page set and a root_count of 0.
>   	 */
>   	struct list_head tdp_mmu_pages;
> +
> +	/*
> +	 * Protects accesses to the following fields when the MMU lock
> +	 * is held in read mode:
> +	 *  - tdp_mmu_pages (above)
> +	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
> +	 *  - lpage_disallowed_mmu_pages
> +	 *  - the lpage_disallowed_link field of struct kvm_mmu_pages used
> +	 *    by the TDP MMU
> +	 * It is acceptable, but not necessary, to acquire this lock when
> +	 * the thread holds the MMU lock in write mode.
> +	 */
> +	spinlock_t tdp_mmu_pages_lock;
>   };
>   
>   struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 5a9e964e0178..0b5a9339ac55 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -7,6 +7,7 @@
>   #include "tdp_mmu.h"
>   #include "spte.h"
>   
> +#include <asm/cmpxchg.h>
>   #include <trace/events/kvm.h>
>   
>   #ifdef CONFIG_X86_64
> @@ -33,6 +34,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>   	kvm->arch.tdp_mmu_enabled = true;
>   
>   	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
> +	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
>   	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
>   }
>   
> @@ -225,7 +227,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
>   }
>   
>   static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> -				u64 old_spte, u64 new_spte, int level);
> +				u64 old_spte, u64 new_spte, int level,
> +				bool shared);
>   
>   static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
>   {
> @@ -267,17 +270,26 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
>    *
>    * @kvm: kvm instance
>    * @sp: the new page
> + * @shared: This operation may not be running under the exclusive use of
> + *	    the MMU lock and the operation must synchronize with other
> + *	    threads that might be adding or removing pages.
>    * @account_nx: This page replaces a NX large page and should be marked for
>    *		eventual reclaim.
>    */
>   static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> -			      bool account_nx)
> +			      bool shared, bool account_nx)
>   {
> -	lockdep_assert_held_write(&kvm->mmu_lock);
> +	if (shared)
> +		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> +	else
> +		lockdep_assert_held_write(&kvm->mmu_lock);
>   
>   	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
>   	if (account_nx)
>   		account_huge_nx_page(kvm, sp);
> +
> +	if (shared)
> +		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>   }
>   
>   /**
> @@ -285,14 +297,24 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>    *
>    * @kvm: kvm instance
>    * @sp: the page to be removed
> + * @shared: This operation may not be running under the exclusive use of
> + *	    the MMU lock and the operation must synchronize with other
> + *	    threads that might be adding or removing pages.
>    */
> -static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> +				bool shared)
>   {
> -	lockdep_assert_held_write(&kvm->mmu_lock);
> +	if (shared)
> +		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> +	else
> +		lockdep_assert_held_write(&kvm->mmu_lock);
>   
>   	list_del(&sp->link);
>   	if (sp->lpage_disallowed)
>   		unaccount_huge_nx_page(kvm, sp);
> +
> +	if (shared)
> +		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>   }
>   
>   /**
> @@ -300,28 +322,39 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>    *
>    * @kvm: kvm instance
>    * @pt: the page removed from the paging structure
> + * @shared: This operation may not be running under the exclusive use
> + *	    of the MMU lock and the operation must synchronize with other
> + *	    threads that might be modifying SPTEs.
>    *
>    * Given a page table that has been removed from the TDP paging structure,
>    * iterates through the page table to clear SPTEs and free child page tables.
>    */
> -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
> +static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
> +					bool shared)
>   {
>   	struct kvm_mmu_page *sp = sptep_to_sp(pt);
>   	int level = sp->role.level;
>   	gfn_t gfn = sp->gfn;
>   	u64 old_child_spte;
> +	u64 *sptep;
>   	int i;
>   
>   	trace_kvm_mmu_prepare_zap_page(sp);
>   
> -	tdp_mmu_unlink_page(kvm, sp);
> +	tdp_mmu_unlink_page(kvm, sp, shared);
>   
>   	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
> -		old_child_spte = READ_ONCE(*(pt + i));
> -		WRITE_ONCE(*(pt + i), 0);
> +		sptep = pt + i;
> +
> +		if (shared) {
> +			old_child_spte = xchg(sptep, 0);
> +		} else {
> +			old_child_spte = READ_ONCE(*sptep);
> +			WRITE_ONCE(*sptep, 0);
> +		}
>   		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
>   			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
> -			old_child_spte, 0, level - 1);
> +			old_child_spte, 0, level - 1, shared);
>   	}
>   
>   	kvm_flush_remote_tlbs_with_address(kvm, gfn,
> @@ -338,12 +371,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
>    * @old_spte: The value of the SPTE before the change
>    * @new_spte: The value of the SPTE after the change
>    * @level: the level of the PT the SPTE is part of in the paging structure
> + * @shared: This operation may not be running under the exclusive use of
> + *	    the MMU lock and the operation must synchronize with other
> + *	    threads that might be modifying SPTEs.
>    *
>    * Handle bookkeeping that might result from the modification of a SPTE.
>    * This function must be called for all TDP SPTE modifications.
>    */
>   static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> -				u64 old_spte, u64 new_spte, int level)
> +				  u64 old_spte, u64 new_spte, int level,
> +				  bool shared)
>   {
>   	bool was_present = is_shadow_present_pte(old_spte);
>   	bool is_present = is_shadow_present_pte(new_spte);
> @@ -415,18 +452,51 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>   	 */
>   	if (was_present && !was_leaf && (pfn_changed || !is_present))
>   		handle_removed_tdp_mmu_page(kvm,
> -				spte_to_child_pt(old_spte, level));
> +				spte_to_child_pt(old_spte, level), shared);
>   }
>   
>   static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> -				u64 old_spte, u64 new_spte, int level)
> +				u64 old_spte, u64 new_spte, int level,
> +				bool shared)
>   {
> -	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
> +	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
> +			      shared);
>   	handle_changed_spte_acc_track(old_spte, new_spte, level);
>   	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
>   				      new_spte, level);
>   }
>   
> +/*
> + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
> + * associated bookkeeping
> + *
> + * @kvm: kvm instance
> + * @iter: a tdp_iter instance currently on the SPTE that should be set
> + * @new_spte: The value the SPTE should be set to
> + * Returns: true if the SPTE was set, false if it was not. If false is returned,
> + *	    this function will have no side-effects.
> + */
> +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
> +					   struct tdp_iter *iter,
> +					   u64 new_spte)
> +{
> +	u64 *root_pt = tdp_iter_root_pt(iter);
> +	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
> +	int as_id = kvm_mmu_page_as_id(root);
> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);
> +
> +	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
> +		      new_spte) != iter->old_spte)
> +		return false;
> +
> +	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
> +			    iter->level, true);
> +
> +	return true;
> +}
> +
> +
>   /*
>    * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
>    * @kvm: kvm instance
> @@ -456,7 +526,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>   	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
>   
>   	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
> -			      iter->level);
> +			      iter->level, false);
>   	if (record_acc_track)
>   		handle_changed_spte_acc_track(iter->old_spte, new_spte,
>   					      iter->level);
> @@ -630,23 +700,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
>   	int ret = 0;
>   	int make_spte_ret = 0;
>   
> -	if (unlikely(is_noslot_pfn(pfn))) {
> +	if (unlikely(is_noslot_pfn(pfn)))
>   		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
> -		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
> -				     new_spte);
> -	} else {
> +	else
>   		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
>   					 pfn, iter->old_spte, prefault, true,
>   					 map_writable, !shadow_accessed_mask,
>   					 &new_spte);
> -		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
> -				       rcu_dereference(iter->sptep));
> -	}
>   
>   	if (new_spte == iter->old_spte)
>   		ret = RET_PF_SPURIOUS;
> -	else
> -		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
> +	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
> +		return RET_PF_RETRY;
>   
>   	/*
>   	 * If the page fault was caused by a write but the page is write
> @@ -660,8 +725,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
>   	}
>   
>   	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
> -	if (unlikely(is_mmio_spte(new_spte)))
> +	if (unlikely(is_mmio_spte(new_spte))) {
> +		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
> +				     new_spte);
>   		ret = RET_PF_EMULATE;
> +	} else
> +		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
> +				       rcu_dereference(iter->sptep));
>   
>   	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
>   			       rcu_dereference(iter->sptep));
> @@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>   		 */
>   		if (is_shadow_present_pte(iter.old_spte) &&
>   		    is_large_pte(iter.old_spte)) {
> -			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
> +			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
> +				break;
>   
>   			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
>   					KVM_PAGES_PER_HPAGE(iter.level));
> @@ -737,19 +808,24 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>   			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
>   			child_pt = sp->spt;
>   
> -			tdp_mmu_link_page(vcpu->kvm, sp,
> -					  huge_page_disallowed &&
> -					  req_level >= iter.level);
> -
>   			new_spte = make_nonleaf_spte(child_pt,
>   						     !shadow_accessed_mask);
>   
> -			trace_kvm_mmu_get_page(sp, true);
> -			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
> +			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
> +						    new_spte)) {
> +				tdp_mmu_link_page(vcpu->kvm, sp, true,
> +						  huge_page_disallowed &&
> +						  req_level >= iter.level);
> +
> +				trace_kvm_mmu_get_page(sp, true);
> +			} else {
> +				tdp_mmu_free_sp(sp);
> +				break;
> +			}
>   		}
>   	}
>   
> -	if (WARN_ON(iter.level != level)) {
> +	if (iter.level != level) {
>   		rcu_read_unlock();
>   		return RET_PF_RETRY;
>   	}
>
Sean Christopherson Feb. 6, 2021, 12:26 a.m. UTC | #3
On Wed, Feb 03, 2021, Paolo Bonzini wrote:
> On 02/02/21 19:57, Ben Gardon wrote:
> > To prepare for handling page faults in parallel, change the TDP MMU
> > page fault handler to use atomic operations to set SPTEs so that changes
> > are not lost if multiple threads attempt to modify the same SPTE.
> > 
> > Reviewed-by: Peter Feiner <pfeiner@google.com>
> > Signed-off-by: Ben Gardon <bgardon@google.com>
> > 
> > ---
> > 
> > v1 -> v2
> > - Rename "atomic" arg to "shared" in multiple functions
> > - Merged the commit that protects the lists of TDP MMU pages with a new
> >    lock
> > - Merged the commits to add an atomic option for setting SPTEs and to
> >    use that option in the TDP MMU page fault handler
> 
> I'll look at the kernel test robot report if nobody beats me to it.

It's just a vanilla i386 compilation issue: the xchg() is on an 8-byte value.

We could fudge around it via #ifdef around the xchg().  Making all of tdp_mmu.c
x86-64 only would be nice to avoid future annoyance, though the number of stubs
required would be painful...
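
A rough sketch of what the #ifdef fudge could look like (hypothetical helper,
not necessarily the fix that was applied); it assumes the shared path is only
reachable on 64-bit hosts, matching the existing CONFIG_X86_64 guard in
tdp_mmu.c:

static u64 clear_child_spte(u64 *sptep, bool shared)
{
	u64 old_child_spte;

#ifdef CONFIG_X86_64
	if (shared)
		return xchg(sptep, 0);
#else
	/* The TDP MMU (and thus the shared path) is 64-bit only. */
	WARN_ON_ONCE(shared);
#endif
	old_child_spte = READ_ONCE(*sptep);
	WRITE_ONCE(*sptep, 0);
	return old_child_spte;
}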
Paolo Bonzini Feb. 8, 2021, 10:32 a.m. UTC | #4
On 06/02/21 01:26, Sean Christopherson wrote:
> We could fudge around it via #ifdef around the xchg().  Making all of tdp_mmu.c
> x86-64 only would be nice to avoid future annoyance, though the number of stubs
> required would be painful...

It's really just a handful, so it's worth it.

Paolo
Paolo Bonzini April 1, 2021, 10:32 a.m. UTC | #5
On 02/02/21 19:57, Ben Gardon wrote:
> @@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>   		 */
>   		if (is_shadow_present_pte(iter.old_spte) &&
>   		    is_large_pte(iter.old_spte)) {
> -			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
> +			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
> +				break;
>   
>   			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
>   					KVM_PAGES_PER_HPAGE(iter.level));
>
>  			/*
>  			 * The iter must explicitly re-read the spte here
>  			 * because the new value informs the !present
>                          * path below.
>                          */
>                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
>                 }
> 
>                 if (!is_shadow_present_pte(iter.old_spte)) {

Would it be easier to reason about this code by making it retry, like:

retry:
                 if (is_shadow_present_pte(iter.old_spte)) {
			if (is_large_pte(iter.old_spte)) {
	                        if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
	                                break;

				/*
				 * The iter must explicitly re-read the SPTE because
				 * the atomic cmpxchg failed.
				 */
	                        iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
				goto retry;
			}
                 } else {
			...
		}

?

Paolo
Ben Gardon April 1, 2021, 4:50 p.m. UTC | #6
On Thu, Apr 1, 2021 at 3:32 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 02/02/21 19:57, Ben Gardon wrote:
> > @@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> >                */
> >               if (is_shadow_present_pte(iter.old_spte) &&
> >                   is_large_pte(iter.old_spte)) {
> > -                     tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
> > +                     if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
> > +                             break;
> >
> >                       kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
> >                                       KVM_PAGES_PER_HPAGE(iter.level));
> >
> >                       /*
> >                        * The iter must explicitly re-read the spte here
> >                        * because the new value informs the !present
> >                          * path below.
> >                          */
> >                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
> >                 }
> >
> >                 if (!is_shadow_present_pte(iter.old_spte)) {
>
> Would it be easier to reason about this code by making it retry, like:
>
> retry:
>                  if (is_shadow_present_pte(iter.old_spte)) {
>                         if (is_large_pte(iter.old_spte)) {
>                                 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
>                                         break;
>
>                                 /*
>                                  * The iter must explicitly re-read the SPTE because
>                                  * the atomic cmpxchg failed.
>                                  */
>                                 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
>                                 goto retry;
>                         }
>                  } else {
>                         ...
>                 }
>
> ?

To be honest, that feels less readable to me. For me retry implies
that we failed to make progress and need to repeat an operation, but
the reality is that we did make progress and there are just multiple
steps to replace the large SPTE with a child PT.
Another option which could improve readability and performance would
be to use the retry to repeat failed cmpxchgs instead of breaking out
of the loop. Then we could avoid retrying the page fault each time a
cmpxchg failed, which may happen a lot as vCPUs allocate intermediate
page tables on boot. (Probably less common for leaf entries, but
possibly useful there too.)
Another-nother option would be to remove this two part process by
eagerly splitting large page mappings in a single step. This would
substantially reduce the number of page faults incurred for NX
splitting / dirty logging splitting. It's been on our list of features
to send upstream for a while and I hope we'll be able to get it into
shape and send it out reasonably soon.

>
> Paolo
>
Paolo Bonzini April 1, 2021, 5:32 p.m. UTC | #7
On 01/04/21 18:50, Ben Gardon wrote:
>> retry:
>>                   if (is_shadow_present_pte(iter.old_spte)) {
>>                          if (is_large_pte(iter.old_spte)) {
>>                                  if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
>>                                          break;
>>
>>                                  /*
>>                                   * The iter must explicitly re-read the SPTE because
>>                                   * the atomic cmpxchg failed.
>>                                   */
>>                                  iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
>>                                  goto retry;
>>                          }
>>                   } else {
>>                          ...
>>                  }
>>
>> ?
> To be honest, that feels less readable to me. For me retry implies
> that we failed to make progress and need to repeat an operation, but
> the reality is that we did make progress and there are just multiple
> steps to replace the large SPTE with a child PT.

You're right, it makes no sense---I misremembered the direction of
tdp_mmu_zap_spte_atomic's return value.  I was actually thinking of this:

> Another option which could improve readability and performance would
> be to use the retry to repeat failed cmpxchgs instead of breaking out
> of the loop. Then we could avoid retrying the page fault each time a
> cmpxchg failed, which may happen a lot as vCPUs allocate intermediate
> page tables on boot. (Probably less common for leaf entries, but
> possibly useful there too.)

which would be

retry:
                  if (is_shadow_present_pte(iter.old_spte)) {
                        if (is_large_pte(iter.old_spte) &&
                            !tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) {
                                 /*
                                  * The iter must explicitly re-read the SPTE because
                                  * the atomic cmpxchg failed.
                                  */
                                 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
                                 goto retry;
                             }
                             /* XXX move this to tdp_mmu_zap_spte_atomic? */
                             iter.old_spte = 0;
                        } else {
                             continue;
                        }
                  }
                  sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                  child_pt = sp->spt;

                  new_spte = make_nonleaf_spte(child_pt,
                                               !shadow_accessed_mask);

                  if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
                                              new_spte)) {
                       tdp_mmu_free_sp(sp);
                       /*
                        * The iter must explicitly re-read the SPTE because
                        * the atomic cmpxchg failed.
                        */
                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
                       goto retry;
                  }
                  tdp_mmu_link_page(vcpu->kvm, sp, true,
                                    huge_page_disallowed &&
                                    req_level >= iter.level);

                  trace_kvm_mmu_get_page(sp, true);

which survives at least a quick smoke test of booting a 20-vCPU Windows
guest.  If you agree I'll turn this into an actual patch.
Sean Christopherson April 1, 2021, 6:09 p.m. UTC | #8
On Thu, Apr 01, 2021, Paolo Bonzini wrote:
> On 01/04/21 18:50, Ben Gardon wrote:
> > > retry:
> > >                   if (is_shadow_present_pte(iter.old_spte)) {
> > >                          if (is_large_pte(iter.old_spte)) {
> > >                                  if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
> > >                                          break;
> > > 
> > >                                  /*
> > >                                   * The iter must explicitly re-read the SPTE because
> > >                                   * the atomic cmpxchg failed.
> > >                                   */
> > >                                  iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
> > >                                  goto retry;
> > >                          }
> > >                   } else {
> > >                          ...
> > >                  }
> > > 
> > > ?
> > To be honest, that feels less readable to me. For me retry implies
> > that we failed to make progress and need to repeat an operation, but
> > the reality is that we did make progress and there are just multiple
> > steps to replace the large SPTE with a child PT.
> 
> You're right, it makes no sense---I misremembered the direction of
> tdp_mmu_zap_spte_atomic's return value.  I was actually thinking of this:
> 
> > Another option which could improve readability and performance would
> > be to use the retry to repeat failed cmpxchgs instead of breaking out
> > of the loop. Then we could avoid retrying the page fault each time a
> > cmpxchg failed, which may happen a lot as vCPUs allocate intermediate
> > page tables on boot. (Probably less common for leaf entries, but
> > possibly useful there too.)
> 
> which would be
> 
> retry:
>                  if (is_shadow_present_pte(iter.old_spte)) {
>                        if (is_large_pte(iter.old_spte) &&
>                            !tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) {
>                                 /*
>                                  * The iter must explicitly re-read the SPTE because
>                                  * the atomic cmpxchg failed.
>                                  */
>                                 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
>                                 goto retry;
>                             }
>                             /* XXX move this to tdp_mmu_zap_spte_atomic? */
>                             iter.old_spte = 0;
>                        } else {
>                             continue;

This is wrong.  If a large PTE is successfully zapped, it will leave a !PRESENT
intermediate entry.  It's probably not fatal; I'm guessing it would lead to
RET_PF_RETRY and be cleaned up on the subsequent re-fault.

>                        }
>                  }
>                  sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
>                  child_pt = sp->spt;
> 
>                  new_spte = make_nonleaf_spte(child_pt,
>                                               !shadow_accessed_mask);
> 
>                  if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
>                                              new_spte)) {
>                       tdp_mmu_free_sp(sp);
>                       /*
>                        * The iter must explicitly re-read the SPTE because
>                        * the atomic cmpxchg failed.
>                        */
>                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
>                       goto retry;

I'm not sure that _always_ retrying is correct.  The conflict means something
else is writing the same SPTE.  That could be a different vCPU handling an
identical fault, but it could also be something else blasting away the SPTE.  If
an upper level SPTE was zapped, e.g. because the entire MMU instance was zapped,
installing a new SPTE would be wrong.

AFAICT, the only motivation for retrying in this loop is to handle the case
where a different vCPU is handling an identical fault.  It should be safe to
handle that, but if the conflicting SPTE is not-present, I believe this needs
to break to handle any pending updates.

			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			if (!is_shadow_present_pte(iter.old_spte))
				break;
			goto retry;

>                  }
>                  tdp_mmu_link_page(vcpu->kvm, sp, true,
>                                    huge_page_disallowed &&
>                                    req_level >= iter.level);
> 
>                  trace_kvm_mmu_get_page(sp, true);
> 
> which survives at least a quick smoke test of booting a 20-vCPU Windows
> guest.  If you agree I'll turn this into an actual patch.
>
Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b6ebf2558386..78ebf56f2b37 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1028,6 +1028,19 @@  struct kvm_arch {
 	 * tdp_mmu_page set and a root_count of 0.
 	 */
 	struct list_head tdp_mmu_pages;
+
+	/*
+	 * Protects accesses to the following fields when the MMU lock
+	 * is held in read mode:
+	 *  - tdp_mmu_pages (above)
+	 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
+	 *  - lpage_disallowed_mmu_pages
+	 *  - the lpage_disallowed_link field of struct kvm_mmu_pages used
+	 *    by the TDP MMU
+	 * It is acceptable, but not necessary, to acquire this lock when
+	 * the thread holds the MMU lock in write mode.
+	 */
+	spinlock_t tdp_mmu_pages_lock;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 5a9e964e0178..0b5a9339ac55 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -7,6 +7,7 @@ 
 #include "tdp_mmu.h"
 #include "spte.h"
 
+#include <asm/cmpxchg.h>
 #include <trace/events/kvm.h>
 
 #ifdef CONFIG_X86_64
@@ -33,6 +34,7 @@  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 	kvm->arch.tdp_mmu_enabled = true;
 
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
@@ -225,7 +227,8 @@  static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level);
+				u64 old_spte, u64 new_spte, int level,
+				bool shared);
 
 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 {
@@ -267,17 +270,26 @@  static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
  *
  * @kvm: kvm instance
  * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be adding or removing pages.
  * @account_nx: This page replaces a NX large page and should be marked for
  *		eventual reclaim.
  */
 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-			      bool account_nx)
+			      bool shared, bool account_nx)
 {
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	if (shared)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
 
 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
 	if (account_nx)
 		account_huge_nx_page(kvm, sp);
+
+	if (shared)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
 /**
@@ -285,14 +297,24 @@  static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
  *
  * @kvm: kvm instance
  * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be adding or removing pages.
  */
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				bool shared)
 {
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	if (shared)
+		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
 
 	list_del(&sp->link);
 	if (sp->lpage_disallowed)
 		unaccount_huge_nx_page(kvm, sp);
+
+	if (shared)
+		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
 /**
@@ -300,28 +322,39 @@  static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  *
  * @kvm: kvm instance
  * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ *	    of the MMU lock and the operation must synchronize with other
+ *	    threads that might be modifying SPTEs.
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+					bool shared)
 {
 	struct kvm_mmu_page *sp = sptep_to_sp(pt);
 	int level = sp->role.level;
 	gfn_t gfn = sp->gfn;
 	u64 old_child_spte;
+	u64 *sptep;
 	int i;
 
 	trace_kvm_mmu_prepare_zap_page(sp);
 
-	tdp_mmu_unlink_page(kvm, sp);
+	tdp_mmu_unlink_page(kvm, sp, shared);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-		old_child_spte = READ_ONCE(*(pt + i));
-		WRITE_ONCE(*(pt + i), 0);
+		sptep = pt + i;
+
+		if (shared) {
+			old_child_spte = xchg(sptep, 0);
+		} else {
+			old_child_spte = READ_ONCE(*sptep);
+			WRITE_ONCE(*sptep, 0);
+		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
 			gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
-			old_child_spte, 0, level - 1);
+			old_child_spte, 0, level - 1, shared);
 	}
 
 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
@@ -338,12 +371,16 @@  static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
  * @old_spte: The value of the SPTE before the change
  * @new_spte: The value of the SPTE after the change
  * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ *	    the MMU lock and the operation must synchronize with other
+ *	    threads that might be modifying SPTEs.
  *
  * Handle bookkeeping that might result from the modification of a SPTE.
  * This function must be called for all TDP SPTE modifications.
  */
 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level)
+				  u64 old_spte, u64 new_spte, int level,
+				  bool shared)
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
@@ -415,18 +452,51 @@  static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 	 */
 	if (was_present && !was_leaf && (pfn_changed || !is_present))
 		handle_removed_tdp_mmu_page(kvm,
-				spte_to_child_pt(old_spte, level));
+				spte_to_child_pt(old_spte, level), shared);
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level)
+				u64 old_spte, u64 new_spte, int level,
+				bool shared)
 {
-	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
+	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+			      shared);
 	handle_changed_spte_acc_track(old_spte, new_spte, level);
 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 				      new_spte, level);
 }
 
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ *	    this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+					   struct tdp_iter *iter,
+					   u64 new_spte)
+{
+	u64 *root_pt = tdp_iter_root_pt(iter);
+	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
+	int as_id = kvm_mmu_page_as_id(root);
+
+	lockdep_assert_held_read(&kvm->mmu_lock);
+
+	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+		      new_spte) != iter->old_spte)
+		return false;
+
+	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
+			    iter->level, true);
+
+	return true;
+}
+
+
 /*
  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
  * @kvm: kvm instance
@@ -456,7 +526,7 @@  static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
 	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-			      iter->level);
+			      iter->level, false);
 	if (record_acc_track)
 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
 					      iter->level);
@@ -630,23 +700,18 @@  static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 	int ret = 0;
 	int make_spte_ret = 0;
 
-	if (unlikely(is_noslot_pfn(pfn))) {
+	if (unlikely(is_noslot_pfn(pfn)))
 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
-		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
-				     new_spte);
-	} else {
+	else
 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
 					 pfn, iter->old_spte, prefault, true,
 					 map_writable, !shadow_accessed_mask,
 					 &new_spte);
-		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-				       rcu_dereference(iter->sptep));
-	}
 
 	if (new_spte == iter->old_spte)
 		ret = RET_PF_SPURIOUS;
-	else
-		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+		return RET_PF_RETRY;
 
 	/*
 	 * If the page fault was caused by a write but the page is write
@@ -660,8 +725,13 @@  static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 	}
 
 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
-	if (unlikely(is_mmio_spte(new_spte)))
+	if (unlikely(is_mmio_spte(new_spte))) {
+		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+				     new_spte);
 		ret = RET_PF_EMULATE;
+	} else
+		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+				       rcu_dereference(iter->sptep));
 
 	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
 			       rcu_dereference(iter->sptep));
@@ -720,7 +790,8 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		 */
 		if (is_shadow_present_pte(iter.old_spte) &&
 		    is_large_pte(iter.old_spte)) {
-			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
+			if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
+				break;
 
 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
 					KVM_PAGES_PER_HPAGE(iter.level));
@@ -737,19 +808,24 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
 			child_pt = sp->spt;
 
-			tdp_mmu_link_page(vcpu->kvm, sp,
-					  huge_page_disallowed &&
-					  req_level >= iter.level);
-
 			new_spte = make_nonleaf_spte(child_pt,
 						     !shadow_accessed_mask);
 
-			trace_kvm_mmu_get_page(sp, true);
-			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+						    new_spte)) {
+				tdp_mmu_link_page(vcpu->kvm, sp, true,
+						  huge_page_disallowed &&
+						  req_level >= iter.level);
+
+				trace_kvm_mmu_get_page(sp, true);
+			} else {
+				tdp_mmu_free_sp(sp);
+				break;
+			}
 		}
 	}
 
-	if (WARN_ON(iter.level != level)) {
+	if (iter.level != level) {
 		rcu_read_unlock();
 		return RET_PF_RETRY;
 	}