
[2/2] KVM: x86/mmu: Recover NX Huge pages belonging to TDP MMU under MMU read lock

Message ID 20240812171341.1763297-3-vipinsh@google.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Run NX huge page recovery under MMU read lock

Commit Message

Vipin Sharma Aug. 12, 2024, 5:13 p.m. UTC
Use MMU read lock to recover NX huge pages belonging to TDP MMU. Iterate
through kvm->arch.possible_nx_huge_pages while holding
kvm->arch.tdp_mmu_pages_lock. Rename kvm_tdp_mmu_zap_sp() to
tdp_mmu_zap_sp() and make it static as there are no callers outside of
TDP MMU.

Skip the zap if any of the following is true for the parent PTE:

 - It points to a different page table.
 - It points to a huge page.
 - It is not present.

These scenarios can happen because recovery now runs under the MMU read lock.
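
A simplified sketch of the check (the real code is in the tdp_mmu_zap_sp()
hunk of the patch below):

	iter.old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
retry:
	/*
	 * A parent SPTE that is not present, points to a huge page, or points
	 * to some other page table no longer maps sp->spt, so one child
	 * pointer comparison covers all three cases.
	 */
	if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
		return false;

	/* Zap atomically, as this can race with other MMU read lock holders. */
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
		goto retry;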

Zapping under the MMU read lock also unblocks vCPUs that are waiting for the
MMU read lock.

This optimization was created to solve a guest jitter issue on a Windows VM
which was observing an increase in network latency. The test workload sets
up two Windows VMs and uses the latte.exe[1] binary to run a network latency
benchmark. Running NX huge page recovery under the MMU write lock was causing
latency to increase by up to 30 ms because vCPUs were waiting for the MMU lock.

Running the tool on VMs with NX huge page recovery running under the MMU
read lock removed the jitter completely, and the MMU lock wait time of the
vCPUs was also reduced.

Command used for testing:

Server:
latte.exe -udp -a 192.168.100.1:9000 -i 10000000

Client:
latte.exe -c -udp -a 192.168.100.1:9000 -i 10000000 -hist -hl 1000 -hc 30

Output from the latency tool on client:

Before
------

Protocol      UDP
SendMethod    Blocking
ReceiveMethod Blocking
SO_SNDBUF     Default
SO_RCVBUF     Default
MsgSize(byte) 4
Iterations    10000000
Latency(usec) 70.41
CPU(%)        1.7
CtxSwitch/sec 31125     (2.19/iteration)
SysCall/sec   62184     (4.38/iteration)
Interrupt/sec 48319     (3.40/iteration)

Interval(usec)   Frequency
      0          9999964
   1000          12
   2000          3
   3000          0
   4000          0
   5000          0
   6000          0
   7000          1
   8000          1
   9000          1
  10000          2
  11000          1
  12000          0
  13000          4
  14000          1
  15000          1
  16000          4
  17000          1
  18000          2
  19000          0
  20000          0
  21000          1
  22000          0
  23000          0
  24000          1

After
-----

Protocol      UDP
SendMethod    Blocking
ReceiveMethod Blocking
SO_SNDBUF     Default
SO_RCVBUF     Default
MsgSize(byte) 4
Iterations    10000000
Latency(usec) 70.98
CPU(%)        1.3
CtxSwitch/sec 28967     (2.06/iteration)
SysCall/sec   48988     (3.48/iteration)
Interrupt/sec 47280     (3.36/iteration)

Interval(usec)   Frequency
      0          9999996
   1000          4

[1] https://github.com/microsoft/latte

Signed-off-by: Vipin Sharma <vipinsh@google.com>
---
 arch/x86/kvm/mmu/mmu.c     | 10 +++++---
 arch/x86/kvm/mmu/tdp_mmu.c | 48 ++++++++++++++++++++++++++------------
 arch/x86/kvm/mmu/tdp_mmu.h |  1 -
 3 files changed, 40 insertions(+), 19 deletions(-)

Comments

kernel test robot Aug. 14, 2024, 9:33 a.m. UTC | #1
Hi Vipin,

kernel test robot noticed the following build errors:

[auto build test ERROR on 332d2c1d713e232e163386c35a3ba0c1b90df83f]

url:    https://github.com/intel-lab-lkp/linux/commits/Vipin-Sharma/KVM-x86-mmu-Split-NX-hugepage-recovery-flow-into-TDP-and-non-TDP-flow/20240814-091542
base:   332d2c1d713e232e163386c35a3ba0c1b90df83f
patch link:    https://lore.kernel.org/r/20240812171341.1763297-3-vipinsh%40google.com
patch subject: [PATCH 2/2] KVM: x86/mmu: Recover NX Huge pages belonging to TDP MMU under MMU read lock
config: i386-buildonly-randconfig-005-20240814 (https://download.01.org/0day-ci/archive/20240814/202408141753.ZY1CSmGo-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240814/202408141753.ZY1CSmGo-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408141753.ZY1CSmGo-lkp@intel.com/

All errors (new ones prefixed by >>):

   arch/x86/kvm/mmu/mmu.c: In function 'kvm_mmu_possible_nx_huge_page':
>> arch/x86/kvm/mmu/mmu.c:7324:29: error: 'struct kvm_arch' has no member named 'tdp_mmu_pages_lock'
    7324 |         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         |                             ^
   arch/x86/kvm/mmu/mmu.c:7335:47: error: 'struct kvm_arch' has no member named 'tdp_mmu_pages_lock'
    7335 |                         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
         |                                               ^
   arch/x86/kvm/mmu/mmu.c:7340:31: error: 'struct kvm_arch' has no member named 'tdp_mmu_pages_lock'
    7340 |         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
         |                               ^


vim +7324 arch/x86/kvm/mmu/mmu.c

  7313	
  7314	/*
  7315	 * Get the first shadow mmu page of desired type from the NX huge pages list.
  7316	 * Return NULL if list doesn't have the needed page with in the first max pages.
  7317	 */
  7318	struct kvm_mmu_page *kvm_mmu_possible_nx_huge_page(struct kvm *kvm, bool tdp_mmu,
  7319							   ulong max)
  7320	{
  7321		struct kvm_mmu_page *sp = NULL;
  7322		ulong i = 0;
  7323	
> 7324		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
  7325		/*
  7326		 * We use a separate list instead of just using active_mmu_pages because
  7327		 * the number of shadow pages that be replaced with an NX huge page is
  7328		 * expected to be relatively small compared to the total number of shadow
  7329		 * pages. And because the TDP MMU doesn't use active_mmu_pages.
  7330		 */
  7331		list_for_each_entry(sp, &kvm->arch.possible_nx_huge_pages, possible_nx_huge_page_link) {
  7332			if (i++ >= max)
  7333				break;
  7334			if (is_tdp_mmu_page(sp) == tdp_mmu) {
  7335				spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  7336				return sp;
  7337			}
  7338		}
  7339	
  7340		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  7341		return NULL;
  7342	}
  7343
Vipin Sharma Aug. 14, 2024, 6:23 p.m. UTC | #2
On Wed, Aug 14, 2024 at 2:33 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi Vipin,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on 332d2c1d713e232e163386c35a3ba0c1b90df83f]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Vipin-Sharma/KVM-x86-mmu-Split-NX-hugepage-recovery-flow-into-TDP-and-non-TDP-flow/20240814-091542
> base:   332d2c1d713e232e163386c35a3ba0c1b90df83f
> patch link:    https://lore.kernel.org/r/20240812171341.1763297-3-vipinsh%40google.com
> patch subject: [PATCH 2/2] KVM: x86/mmu: Recover NX Huge pages belonging to TDP MMU under MMU read lock
> config: i386-buildonly-randconfig-005-20240814 (https://download.01.org/0day-ci/archive/20240814/202408141753.ZY1CSmGo-lkp@intel.com/config)
> compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240814/202408141753.ZY1CSmGo-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202408141753.ZY1CSmGo-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
>    arch/x86/kvm/mmu/mmu.c: In function 'kvm_mmu_possible_nx_huge_page':
> >> arch/x86/kvm/mmu/mmu.c:7324:29: error: 'struct kvm_arch' has no member named 'tdp_mmu_pages_lock'
>     7324 |         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>          |                             ^
>    arch/x86/kvm/mmu/mmu.c:7335:47: error: 'struct kvm_arch' has no member named 'tdp_mmu_pages_lock'
>     7335 |                         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>          |                                               ^
>    arch/x86/kvm/mmu/mmu.c:7340:31: error: 'struct kvm_arch' has no member named 'tdp_mmu_pages_lock'
>     7340 |         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>          |                               ^
My bad, I didn't check the 32-bit build. In the next version, I will take
the lock in tdp_mmu.c.
kernel test robot Aug. 14, 2024, 10:50 p.m. UTC | #3
Hi Vipin,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 332d2c1d713e232e163386c35a3ba0c1b90df83f]

url:    https://github.com/intel-lab-lkp/linux/commits/Vipin-Sharma/KVM-x86-mmu-Split-NX-hugepage-recovery-flow-into-TDP-and-non-TDP-flow/20240814-091542
base:   332d2c1d713e232e163386c35a3ba0c1b90df83f
patch link:    https://lore.kernel.org/r/20240812171341.1763297-3-vipinsh%40google.com
patch subject: [PATCH 2/2] KVM: x86/mmu: Recover NX Huge pages belonging to TDP MMU under MMU read lock
config: x86_64-randconfig-123-20240814 (https://download.01.org/0day-ci/archive/20240815/202408150646.VV4z8Znl-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240815/202408150646.VV4z8Znl-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408150646.VV4z8Znl-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> arch/x86/kvm/mmu/tdp_mmu.c:847:21: sparse: sparse: incompatible types in comparison expression (different address spaces):
   arch/x86/kvm/mmu/tdp_mmu.c:847:21: sparse:    unsigned long long [usertype] *
   arch/x86/kvm/mmu/tdp_mmu.c:847:21: sparse:    unsigned long long [noderef] [usertype] __rcu *
   arch/x86/kvm/mmu/tdp_mmu.c: note: in included file (through include/linux/rbtree.h, include/linux/mm_types.h, include/linux/mmzone.h, ...):
   include/linux/rcupdate.h:812:25: sparse: sparse: context imbalance in '__tdp_mmu_zap_root' - unexpected unlock
   arch/x86/kvm/mmu/tdp_mmu.c:1447:33: sparse: sparse: context imbalance in 'tdp_mmu_split_huge_pages_root' - unexpected unlock

vim +847 arch/x86/kvm/mmu/tdp_mmu.c

   819	
   820	static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
   821	{
   822		struct tdp_iter iter = {};
   823	
   824		lockdep_assert_held_read(&kvm->mmu_lock);
   825	
   826		/*
   827		 * This helper intentionally doesn't allow zapping a root shadow page,
   828		 * which doesn't have a parent page table and thus no associated entry.
   829		 */
   830		if (WARN_ON_ONCE(!sp->ptep))
   831			return false;
   832	
   833		iter.old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
   834		iter.sptep = sp->ptep;
   835		iter.level = sp->role.level + 1;
   836		iter.gfn = sp->gfn;
   837		iter.as_id = kvm_mmu_page_as_id(sp);
   838	
   839	retry:
   840		/*
   841		 * Since mmu_lock is held in read mode, it's possible to race with
   842		 * another CPU which can remove sp from the page table hierarchy.
   843		 *
   844		 * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
   845		 * update it in the case of failure.
   846		 */
 > 847		if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
   848			return false;
   849	
   850		if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
   851			goto retry;
   852	
   853		return true;
   854	}
   855
Vipin Sharma Aug. 15, 2024, 4:42 p.m. UTC | #4
On 2024-08-15 06:50:04, kernel test robot wrote:
> sparse warnings: (new ones prefixed by >>)
> >> arch/x86/kvm/mmu/tdp_mmu.c:847:21: sparse: sparse: incompatible types in comparison expression (different address spaces):
>    arch/x86/kvm/mmu/tdp_mmu.c:847:21: sparse:    unsigned long long [usertype] *
>    arch/x86/kvm/mmu/tdp_mmu.c:847:21: sparse:    unsigned long long [noderef] [usertype] __rcu *
>    arch/x86/kvm/mmu/tdp_mmu.c: note: in included file (through include/linux/rbtree.h, include/linux/mm_types.h, include/linux/mmzone.h, ...):
>    include/linux/rcupdate.h:812:25: sparse: sparse: context imbalance in '__tdp_mmu_zap_root' - unexpected unlock
>    arch/x86/kvm/mmu/tdp_mmu.c:1447:33: sparse: sparse: context imbalance in 'tdp_mmu_split_huge_pages_root' - unexpected unlock
> 
> vim +847 arch/x86/kvm/mmu/tdp_mmu.c
> 
>    819	
>    820	static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>    821	{
>    822		struct tdp_iter iter = {};
>    823	
>    824		lockdep_assert_held_read(&kvm->mmu_lock);
>    825	
>    826		/*
>    827		 * This helper intentionally doesn't allow zapping a root shadow page,
>    828		 * which doesn't have a parent page table and thus no associated entry.
>    829		 */
>    830		if (WARN_ON_ONCE(!sp->ptep))
>    831			return false;
>    832	
>    833		iter.old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
>    834		iter.sptep = sp->ptep;
>    835		iter.level = sp->role.level + 1;
>    836		iter.gfn = sp->gfn;
>    837		iter.as_id = kvm_mmu_page_as_id(sp);
>    838	
>    839	retry:
>    840		/*
>    841		 * Since mmu_lock is held in read mode, it's possible to race with
>    842		 * another CPU which can remove sp from the page table hierarchy.
>    843		 *
>    844		 * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
>    845		 * update it in the case of failure.
>    846		 */
>  > 847		if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))

Hmm, I need to wrap spte_to_child_pt() with rcu_access_pointer() before
comparing it to sp->spt. The following change makes this sparse warning go
away.

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7c7d207ee590..7d5dbfe48c4b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -820,6 +820,7 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        struct tdp_iter iter = {};
+       tdp_ptep_t pt;

        lockdep_assert_held_read(&kvm->mmu_lock);

@@ -844,7 +845,8 @@ static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
         * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
         * update it in the case of failure.
         */
-       if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
+       pt = spte_to_child_pt(iter.old_spte, iter.level);
+       if (sp->spt != rcu_access_pointer(pt))
                return false;

        if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
Sean Christopherson Aug. 16, 2024, 11:38 p.m. UTC | #5
On Mon, Aug 12, 2024, Vipin Sharma wrote:
> @@ -1807,7 +1822,7 @@ ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
>  	struct kvm_mmu_page *sp;
>  	bool flush = false;
>  
> -	lockdep_assert_held_write(&kvm->mmu_lock);
> +	lockdep_assert_held_read(&kvm->mmu_lock);
>  	/*
>  	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
>  	 * be done under RCU protection, because the pages are freed via RCU
> @@ -1821,7 +1836,6 @@ ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
>  		if (!sp)
>  			break;
>  
> -		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
>  		WARN_ON_ONCE(!sp->role.direct);
>  
>  		/*
> @@ -1831,12 +1845,17 @@ ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
>  		 * recovered, along with all the other huge pages in the slot,
>  		 * when dirty logging is disabled.
>  		 */
> -		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
> +		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
> +			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>  			unaccount_nx_huge_page(kvm, sp);
> -		else
> -			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
> -
> -		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> +			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> +			to_zap--;
> +			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> +		} else if (tdp_mmu_zap_sp(kvm, sp)) {
> +			flush = true;
> +			to_zap--;

This is actively dangerous.  In the old code, tdp_mmu_zap_sp() could fail only
in a WARN-able scenario, i.e. in practice it was guaranteed to succeed.  And, the
for-loop *always* decremented to_zap, i.e. couldn't get stuck in an infinite
loop.

Neither of those protections exist in this version.  Obviously it shouldn't happen,
but it's possible this could flail on the same SP over and over, since nothing
guarantees forward progress.  The cond_resched() would save KVM from true pain,
but it's still a wart in the implementation.

Rather than loop on to_zap, just do

	list_for_each_entry(...) {

		if (!to_zap)
			break;
	}

And if we don't use separate lists, that'll be an improvement too, as KVM
will only have to skip "wrong" shadow pages once, whereas this approach means
every iteration of the loop has to walk past the "wrong" shadow pages.

But I'd still prefer to use separate lists.
Vipin Sharma Aug. 19, 2024, 5:34 p.m. UTC | #6
On 2024-08-16 16:38:05, Sean Christopherson wrote:
> On Mon, Aug 12, 2024, Vipin Sharma wrote:
> > @@ -1831,12 +1845,17 @@ ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
> >  		 * recovered, along with all the other huge pages in the slot,
> >  		 * when dirty logging is disabled.
> >  		 */
> > -		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
> > +		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
> > +			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> >  			unaccount_nx_huge_page(kvm, sp);
> > -		else
> > -			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
> > -
> > -		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> > +			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> > +			to_zap--;
> > +			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> > +		} else if (tdp_mmu_zap_sp(kvm, sp)) {
> > +			flush = true;
> > +			to_zap--;
> 
> This is actively dangerous.  In the old code, tdp_mmu_zap_sp() could fail only
> in a WARN-able scenario, i.e. in practice it was guaranteed to succeed.  And, the
> for-loop *always* decremented to_zap, i.e. couldn't get stuck in an infinite
> loop.
> 
> Neither of those protections exist in this version.  Obviously it shouldn't happen,
> but it's possible this could flail on the same SP over and over, since nothing
> guarantees forward progress.  The cond_resched() would save KVM from true pain,
> but it's still a wart in the implementation.
> 
> Rather than loop on to_zap, just do
> 
> 	list_for_each_entry(...) {
> 
> 		if (!to_zap)
> 			break;
> 	}
> 
> And if we don't use separate lists, that'll be an improvement too, as KVM
> will only have to skip "wrong" shadow pages once, whereas this approach means
> every iteration of the loop has to walk past the "wrong" shadow pages.

TDP MMU accesses possible_nx_huge_pages using tdp_mmu_pages_lock spin
lock. We cannot hold it for recovery duration.

In this patch, tdp_mmu_zap_sp() has been modified to retry failures,
which is similar to other retry mechanism in TDP MMU. Won't it be the
same issue with other TDP MMU retry flows?

> 
> But I'd still prefer to use separate lists.
Sean Christopherson Aug. 19, 2024, 10:12 p.m. UTC | #7
On Mon, Aug 19, 2024, Vipin Sharma wrote:
> On 2024-08-16 16:38:05, Sean Christopherson wrote:
> > On Mon, Aug 12, 2024, Vipin Sharma wrote:
> > > @@ -1831,12 +1845,17 @@ ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
> > >  		 * recovered, along with all the other huge pages in the slot,
> > >  		 * when dirty logging is disabled.
> > >  		 */
> > > -		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
> > > +		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
> > > +			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> > >  			unaccount_nx_huge_page(kvm, sp);
> > > -		else
> > > -			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
> > > -
> > > -		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> > > +			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> > > +			to_zap--;
> > > +			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> > > +		} else if (tdp_mmu_zap_sp(kvm, sp)) {
> > > +			flush = true;
> > > +			to_zap--;
> > 
> > This is actively dangerous.  In the old code, tdp_mmu_zap_sp() could fail only
> > in a WARN-able scenario, i.e. in practice it was guaranteed to succeed.  And, the
> > for-loop *always* decremented to_zap, i.e. couldn't get stuck in an infinite
> > loop.
> > 
> > Neither of those protections exist in this version.  Obviously it shouldn't happen,
> > but it's possible this could flail on the same SP over and over, since nothing
> > guarantees forward progress.  The cond_resched() would save KVM from true pain,
> > but it's still a wart in the implementation.
> > 
> > Rather than loop on to_zap, just do
> > 
> > 	list_for_each_entry(...) {
> > 
> > 		if (!to_zap)
> > 			break;
> > 	}
> > 
> > And if we don't use separate lists, that'll be an improvement too, as KVM
> > will only have to skip "wrong" shadow pages once, whereas this approach means
> > every iteration of the loop has to walk past the "wrong" shadow pages.
> 
> TDP MMU accesses possible_nx_huge_pages using tdp_mmu_pages_lock spin
> lock. We cannot hold it for recovery duration.

Ah, right.  Hmm, we actually can.  More thoughts below.

> In this patch, tdp_mmu_zap_sp() has been modified to retry failures,
> which is similar to other retry mechanism in TDP MMU. Won't it be the
> same issue with other TDP MMU retry flows?

Similar, but not exactly the same.  The other flows are guaranteed to make forward
progress, as they'll never revisit a SPTE.  I.e. once a SPTE is observed to be
!shadow-present, that SPTE will never again be processed.

This is spinning on a pre-computed variable, and waiting until that many SPs have
been zapped.  The early break if the list is empty mostly protects against an
infinite loop, but it's theoretically possible other tasks could keep adding and
deleting from the list, in perpetuity.

Side topic, with the proposed changes, kvm_tdp_mmu_zap_sp() should return an int,
not a bool, i.e. 0/-errno, not true/false.  The existing code is specifically
returning whether or not a flush is needed, it does NOT indicate whether or not
the shadow page was successfully zapped, i.e. the !PRESENT case is treated as
being successful since something apparently already zapped the page.
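
E.g. a rough sketch of that convention (hypothetical; the exact error values
don't matter):

	static int kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		struct tdp_iter iter = {
			.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
			.sptep = sp->ptep,
			.level = sp->role.level + 1,
			.gfn = sp->gfn,
			.as_id = kvm_mmu_page_as_id(sp),
		};

		if (WARN_ON_ONCE(!sp->ptep))
			return -EINVAL;

		/* Someone else already zapped or replaced the parent SPTE. */
		if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
			return -EBUSY;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
			return -EBUSY;

		/* Zapped, the caller is responsible for the TLB flush. */
		return 0;
	}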

[never mind, I've since come up with a better idea, but I typed  all this out,
 so I'm leaving it]

What about something like this?  If the shadow page can't be zapped because
something else was modifying it, just move on and deal with it next time.

	for ( ; to_zap; --to_zap) {
		...


		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			unaccount_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		} else if (!tdp_mmu_zap_sp(...)) {
			flush = true;
		} else {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			list_move_tail(&sp->possible_nx_huge_page_link,
				       &kvm->arch.possible_nx_huge_pages);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

But jumping back to the "we actually can [hold tdp_mmu_pages_lock]", if the zap
is split into the actual CMPXCHG vs. handle_removed_pt() call, then the lock
can be held while walking+zapping.  And it's quite straightforward, if we're
willing to forego the sanity checks on the old_spte, which would require wrapping
the sp in a struct to create a tuple.

The only part that gives me pause is the fact that it's not super obvious that,
ignoring the tracepoint, handle_changed_spte() is just a fat wrapper for
handle_removed_pt() when zapping a SP.

Huh.  Actually, after a lot of fiddling and staring, there's a simpler solution,
and it would force us to comment/document an existing race that's subly ok.

For the dirty logging case, the result of kvm_mmu_sp_dirty_logging_enabled() is
visible to the NX recovery thread before the memslot update task is guaranteed
to finish (or even start) kvm_mmu_zap_collapsible_sptes().  I.e. KVM could
unaccount an NX shadow page before it is zapped, and that could lead to a vCPU
replacing the shadow page with an NX huge page.

Functionally, that's a-ok, because the accounting doesn't provide protection
against iTLB multi-hit bug, it's there purely to prevent KVM from bouncing a gfn
between an NX hugepage and an execute small page.  The only downside to the vCPU
doing the replacement is that the vCPU will get saddled with tearing down all the
child SPTEs.  But this should be a very rare race, so I can't imagine that would
be problematic in practice.

This contains some feedback I gathered along the way; I'll respond to the original
patch since the context is lost.

static bool tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
					      struct kvm_mmu_page *sp)
{
	struct tdp_iter iter = {
		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
		.sptep = sp->ptep,
		.level = sp->role.level + 1,
		.gfn = sp->gfn,
		.as_id = kvm_mmu_page_as_id(sp),
	};

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Root shadow pages don't have a parent page table and thus no associated
	 * entry, but they can never be possible NX huge pages.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	/*
	 * Since mmu_lock is held in read mode, it's possible another task has
	 * already modified the SPTE.  Zap the SPTE if and only if the SPTE
	 * points at the SP's page table, as checking  shadow-present isn't
	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
	 * another SP.  Note, spte_to_child_pt() also checks that the SPTE is
	 * shadow-present, i.e. guards against zapping a frozen SPTE.
	 */
	if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
		return false;

	/*
	 * If a different task modified the SPTE, then it should be impossible
	 * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
	 * creating non-leaf SPTEs, and all other bits are immutable for non-
	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
	 * zapping and replacement.
	 */
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
		WARN_ON_ONCE(sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
		return false;
	}

	return true;
}

void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, unsigned long to_zap)
{
	struct kvm_mmu_page *sp;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);
	/*
	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
	 * be done under RCU protection, because the pages are freed via RCU
	 * callback.
	 */
	rcu_read_lock();

	for ( ; to_zap; --to_zap) {
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		if (list_empty(&kvm->arch.possible_tdp_mmu_nx_huge_pages))
			break;

		sp = list_first_entry(&kvm->arch.possible_tdp_mmu_nx_huge_pages,
				      struct kvm_mmu_page,
				      possible_nx_huge_page_link);

		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
		WARN_ON_ONCE(!sp->role.direct);

		/*
		 * Unaccount the shadow page before zapping its SPTE so as to
		 * avoid bouncing tdp_mmu_pages_lock() more than is necessary.
		 * Clearing nx_huge_page_disallowed before zapping is safe, as
		 * the flag doesn't protect against iTLB multi-hit, it's there
		 * purely to prevent bouncing the gfn between an NX huge page
		 * and an X small page.  A vCPU could get stuck tearing down
		 * the shadow page, e.g. if it happens to fault on the region
		 * before the SPTE is zapped and replaces the shadow page with
		 * an NX huge page and get stuck tearing down the child SPTEs,
		 * but that is a rare race, i.e. shouldn't impact performance.
		 */
		unaccount_nx_huge_page(kvm, sp);

		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

		/*
		 * Don't bother zapping shadow pages if the memslot is being
		 * dirty logged, as the relevant pages would just be faulted
		 * back in as 4KiB pages.  Potential NX Huge Pages in this slot
		 * will be recovered, along with all the other huge pages in
		 * the slot, when dirty logging is disabled.
		 */
		if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
			flush |= tdp_mmu_zap_possible_nx_huge_page(kvm, sp);

		WARN_ON_ONCE(sp->nx_huge_page_disallowed);

		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			if (flush) {
				kvm_flush_remote_tlbs(kvm);
				flush = false;
			}
			rcu_read_unlock();
			cond_resched_rwlock_read(&kvm->mmu_lock);
			rcu_read_lock();
		}
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);
	rcu_read_unlock();
}
Sean Christopherson Aug. 19, 2024, 10:19 p.m. UTC | #8
On Mon, Aug 12, 2024, Vipin Sharma wrote:
> Use MMU read lock to recover NX huge pages belonging to TDP MMU. Iterate
> through kvm->arch.possible_nx_huge_pages while holding
> kvm->arch.tdp_mmu_pages_lock. Rename kvm_tdp_mmu_zap_sp() to
> tdp_mmu_zap_sp() and make it static as there are no callers outside of
> TDP MMU.
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 933bb8b11c9f..7c7d207ee590 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -817,9 +817,11 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  	rcu_read_unlock();
>  }
>  
> -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)

At this point, I think we should rename this to tdp_mmu_zap_possible_nx_huge_page(),
as I can't imagine there's another use case where we'll zap a SP starting from the
SP itself, i.e. without first walking from the root.

>  {
> -	u64 old_spte;
> +	struct tdp_iter iter = {};

Rather than initializing the on-stack structure, I think it makes sense to directly
initialize the whole thing and then WARN after, e.g. so that it's easier to see
that "iter" is simply being filled from @sp.

	struct tdp_iter iter = {
		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
		.sptep = sp->ptep,
		.level = sp->role.level + 1,
		.gfn = sp->gfn,
		.as_id = kvm_mmu_page_as_id(sp),
	};

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Root shadow pages don't have a parent page table and thus no associated
	 * entry, but they can never be possible NX huge pages.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);
>  
>  	/*
>  	 * This helper intentionally doesn't allow zapping a root shadow page,
> @@ -828,12 +830,25 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  	if (WARN_ON_ONCE(!sp->ptep))
>  		return false;
>  
> -	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> -	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
> +	iter.old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> +	iter.sptep = sp->ptep;
> +	iter.level = sp->role.level + 1;
> +	iter.gfn = sp->gfn;
> +	iter.as_id = kvm_mmu_page_as_id(sp);
> +
> +retry:
> +	/*
> +	 * Since mmu_lock is held in read mode, it's possible to race with
> +	 * another CPU which can remove sp from the page table hierarchy.
> +	 *
> +	 * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
> +	 * update it in the case of failure.
> +	 */
> +	if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
>  		return false;
>  
> -	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> -			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
> +	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
> +		goto retry;

I'm pretty sure there's no need to retry.  Non-leaf SPTEs don't have Dirty bits,
and KVM always sets the Accessed bit (and never clears it) for non-leaf SPTEs.
Ditto for the Writable bit.

So unless I'm missing something, the only way for the CMPXCHG to fail is if the
SPTE was zapped or replaced with something else, in which case the sp->spt will
fail.  I would much prefer to WARN on that logic failing than have what appears
to be a potential infinite loop.
Vipin Sharma Aug. 23, 2024, 7:27 p.m. UTC | #9
On 2024-08-19 15:19:19, Sean Christopherson wrote:
> On Mon, Aug 12, 2024, Vipin Sharma wrote:
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -817,9 +817,11 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
> >  	rcu_read_unlock();
> >  }
> >  
> > -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> > +static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> 
> At this point, I think we should rename this to tdp_mmu_zap_possible_nx_huge_page(),
> as I can't imagine there's another use case where we'll zap a SP starting from the
> SP itself, i.e. without first walking from the root.
> 

Okay.

> >  {
> > -	u64 old_spte;
> > +	struct tdp_iter iter = {};
> 
> Rather than initializing the on-stack structure, I think it makes sense to directly
> initialize the whole thing and then WARN after, e.g. so that it's easier to see
> that "iter" is simply being filled from @sp.
> 
> 	struct tdp_iter iter = {
> 		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
> 		.sptep = sp->ptep,
> 		.level = sp->role.level + 1,
> 		.gfn = sp->gfn,
> 		.as_id = kvm_mmu_page_as_id(sp),
> 	};

Okay.

> > +retry:
> > +	/*
> > +	 * Since mmu_lock is held in read mode, it's possible to race with
> > +	 * another CPU which can remove sp from the page table hierarchy.
> > +	 *
> > +	 * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
> > +	 * update it in the case of failure.
> > +	 */
> > +	if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
> >  		return false;
> >  
> > -	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> > -			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
> > +	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
> > +		goto retry;
> 
> I'm pretty sure there's no need to retry.  Non-leaf SPTEs don't have Dirty bits,
> and KVM always sets the Accessed bit (and never clears it) for non-leaf SPTEs.
> Ditto for the Writable bit.
> 
> So unless I'm missing something, the only way for the CMPXCHG to fail is if the
> SPTE was zapped or replaced with something else, in which case the sp->spt will
> fail.  I would much prefer to WARN on that logic failing than have what appears
> to be a potential infinite loop.

I don't think we should WARN() in that scenario. Because there is
nothing wrong with someone racing with NX huge page recovery for zapping
or replacing the SPTE. This function should be just trying to zap a page
and if that didn't succeed then return the error and let the caller handle
however they want to.

NX huge page recovery should be tolerant of this zapping failure and
move on to the next shadow page. Maybe we can put a WARN if NX huge page
recovery couldn't zap any pages during its run time. For example, if it
was supposed to zap 10 pages and it couldn't zap any of them then print
using WARN_ON_ONCE. This is with the assumption that if more than one
page is queued to be zapped then at least some of them will get zapped
whenever the recovery worker kicks in.
Sean Christopherson Aug. 23, 2024, 8:45 p.m. UTC | #10
On Fri, Aug 23, 2024, Vipin Sharma wrote:
> On 2024-08-19 15:19:19, Sean Christopherson wrote:
> > On Mon, Aug 12, 2024, Vipin Sharma wrote:
> > > +retry:
> > > +	/*
> > > +	 * Since mmu_lock is held in read mode, it's possible to race with
> > > +	 * another CPU which can remove sp from the page table hierarchy.
> > > +	 *
> > > +	 * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
> > > +	 * update it in the case of failure.
> > > +	 */
> > > +	if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
> > >  		return false;
> > >  
> > > -	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> > > -			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
> > > +	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
> > > +		goto retry;
> > 
> > I'm pretty sure there's no need to retry.  Non-leaf SPTEs don't have Dirty bits,
> > and KVM always sets the Accessed bit (and never clears it) for non-leaf SPTEs.
> > Ditto for the Writable bit.
> > 
> > So unless I'm missing something, the only way for the CMPXCHG to fail is if the
> > SPTE was zapped or replaced with something else, in which case the sp->spt will
> > fail.  I would much prefer to WARN on that logic failing than have what appears
> > to be a potential infinite loop.
> 
> I don't think we should WARN() in that scenario. Because there is
> nothing wrong with someone racing with NX huge page recovery for zapping
> or replacing the SPTE.

Agreed, but as sketched out in my other reply[*], I'm saying KVM should WARN like so:

	/*
	 * If a different task modified the SPTE, then it should be impossible
	 * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
	 * creating non-leaf SPTEs, and all other bits are immutable for non-
	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
	 * zapping and replacement.
	 */
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
		WARN_ON_ONCE(sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
		return false;
	}

[*] https://lore.kernel.org/all/ZsPDWqOiv_g7Wh_H@google.com
	
> This function should be just trying to zap a page

Yes and no.  KVM, very deliberately, has multiple different flows for zapping
SPTEs, because the requirements, rules, and performance needs vary.

Initially, the TDP MMU had one common flow for zapping, and it was difficult to
maintain and understand, because trying to optimize and adjust for the various
scenarios in a single path was difficult and convoluted.  E.g. when zapping for
mmu_notifier invalidations, mmu_lock must be held for read, a flush is required,
KVM _must_ process stale/obsolete roots, and KVM only needs to zap leaf SPTEs.

Contrast that with zapping SPTEs in the back half of a fast zap, e.g. after a
memslot deletion, which runs with mmu_lock held for read, only processes invalid
roots, doesn't need to flush, but does need to zap _everything_.

This API is no different.  It is very specifically for zapping non-leaf, non-root
SPTEs in live roots.

So "yes", it's "just" trying to zap a page, but "no" in the sense that there are
a lot of rules and requirements behind that simple statement.

> and if that didn't succeed then return the error and let the caller handle however
> they want to.  NX huge page recovery should be tolerant of this zapping
> failure and move on to the next shadow page.

Again, I agree, but that is orthogonal to the WARN I am suggesting.  The intent
of the WARN isn't to detect that zapping failed, it's to flag that the impossible
has happened.

> Maybe we can put a WARN if NX huge page recovery couldn't zap any pages during
> its run time. For example, if it was supposed to zap 10 pages and it couldn't
> zap any of them then print using WARN_ON_ONCE.

Why?  It's improbable, but absolutely possible that zapping 10 SPTEs could fail.
Would it make sense to publish a stat so that userspace can alert on NX huge page
recovery not being as effective as it should be?  Maybe.  But the kernel should
never WARN because an unlikely scenario occurred.

If you look at all the KVM_MMU_WARN_ON() and (hopefully) WARN_ON_ONCE() calls in
KVM x86, they're either for things that should never happen absent software or
hardware bugs, or to ensure future changes update all relevant code paths.

The WARN I am suggesting is the same.  It can only fire if there's a hardware
bug, a KVM bug, or if KVM changes how it behaves, e.g. starts doing A/D tracking
on non-leaf SPTEs.

> This is with the assumption that if more than one page is queued to be zapped
> then at least some of them will get zapped whenever the recovery worker kicks in.
Vipin Sharma Aug. 23, 2024, 9:41 p.m. UTC | #11
On Fri, Aug 23, 2024 at 1:45 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Aug 23, 2024, Vipin Sharma wrote:
> > and if that didn't succeed then return the error and let the caller handle however
> > they want to.  NX huge page recovery should be tolerant of this zapping
> > failure and move on to the next shadow page.
>
> Again, I agree, but that is orthogonal to the WARN I am suggesting.  The intent
> of the WARN isn't to detect that zapping failed, it's to flag that the impossible
> has happened.
>
> > Maybe we can put a WARN if NX huge page recovery couldn't zap any pages during
> > its run time. For example, if it was supposed to zap 10 pages and it couldn't
> > zap any of them then print using WARN_ON_ONCE.
>
> Why?  It's improbable, but absolutely possible that zapping 10 SPTEs could fail.
> Would it make sense to publish a stat so that userspace can alert on NX huge page
> recovery not being as effective as it should be?  Maybe.  But the kernel should
> never WARN because an unlikely scenario occurred.

I misunderstood your WARN requirement in the response to this patch,
and that's why I suggested that if you prefer a WARN this much :) maybe
we can move it outside and print only if nothing got zapped. Glad
it's not.

We don't need a stat; userspace can use page stats and NX parameters to
observe whether it is working effectively or not.

> If you look at all the KVM_MMU_WARN_ON() and (hopefully) WARN_ON_ONCE() calls in
> KVM x86, they're either for things that should never happen absent software or
> hardware bugs, or to ensure future changes update all relevant code paths.
>
> The WARN I am suggesting is the same.  It can only fire if there's a hardware
> bug, a KVM bug, or if KVM changes how it behaves, e.g. starts doing A/D tracking
> on non-leaf SPTEs.
>

I responded to this email before moving on to your next email :)

I agree with the WARN there, which checks whether the new SPTE is
pointing to the old page table. I have a few other comments (not
related to the WARN); I will respond there.

Thanks!
Vipin Sharma Aug. 23, 2024, 10:38 p.m. UTC | #12
On 2024-08-19 15:12:42, Sean Christopherson wrote:
> On Mon, Aug 19, 2024, Vipin Sharma wrote:
> > On 2024-08-16 16:38:05, Sean Christopherson wrote:
> > In this patch, tdp_mmu_zap_sp() has been modified to retry failures,
> > which is similar to other retry mechanism in TDP MMU. Won't it be the
> > same issue with other TDP MMU retry flows?
> 
> Similar, but not exactly the same.  The other flows are guaranteed to make forward
> progress, as they'll never revisit a SPTE.  I.e. once a SPTE is observed to be
> !shadow-present, that SPTE will never again be processed.
> 
> This is spinning on a pre-computed variable, and waiting until that many SPs have
> been zapped.  The early break if the list is empty mostly protects against an
> infinite loop, but it's theoretically possible other tasks could keep adding and
> deleting from the list, in perpetuity.

Got it.

> What about something like this?  If the shadow page can't be zapped because
> something else was modifying it, just move on and deal with it next time.

This sounds good. I was trying to force zapping "to_zap" shadow pages
to keep it similar to the existing NX huge page recovery approach.

> But jumping back to the "we actually can [hold tdp_mmu_pages_lock]", if the zap
> is split into the actual CMPXCHG vs. handle_removed_pt() call, then the lock
> can be held while walking+zapping.  And it's quite straightforward, if we're
> willing to forego the sanity checks on the old_spte, which would require wrapping
> the sp in a struct to create a tuple.
> 
> The only part that gives me pause is the fact that it's not super obvious that,
> ignoring the tracepoint, handle_changed_spte() is just a fat wrapper for
> handle_removed_pt() when zapping a SP.
> 
> Huh.  Actually, after a lot of fiddling and staring, there's a simpler solution,
> and it would force us to comment/document an existing race that's subtly ok.
> 
> For the dirty logging case, the result of kvm_mmu_sp_dirty_logging_enabled() is
> visible to the NX recovery thread before the memslot update task is guaranteed
> to finish (or even start) kvm_mmu_zap_collapsible_sptes().  I.e. KVM could
> unaccount an NX shadow page before it is zapped, and that could lead to a vCPU
> replacing the shadow page with an NX huge page.
> 
> Functionally, that's a-ok, because the accounting doesn't provide protection
> against iTLB multi-hit bug, it's there purely to prevent KVM from bouncing a gfn
> between an NX hugepage and an execute small page.  The only downside to the vCPU
> doing the replacement is that the vCPU will get saddled with tearing down all the
> child SPTEs.  But this should be a very rare race, so I can't imagine that would
> be problematic in practice.

I am worried that whenever this happens it might cause guest jitter
which we are trying to avoid, as handle_changed_spte() might keep a
vCPU busy for some time.

> static bool tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
> 					      struct kvm_mmu_page *sp)
> {
> 	/*
> 	 * If a different task modified the SPTE, then it should be impossible
> 	 * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
> 	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
> 	 * creating non-leaf SPTEs, and all other bits are immutable for non-
> 	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
> 	 * zapping and replacement.
> 	 */
> 	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
> 		WARN_ON_ONCE(sp->spt == spte_to_child_pt(iter.old_spte, iter.level));

I responded in another patch before reading all this here. This looks
good.

> void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, unsigned long to_zap)
> 
> 		/*
> 		 * Unaccount the shadow page before zapping its SPTE so as to
> 		 * avoid bouncing tdp_mmu_pages_lock() more than is necessary.
> 		 * Clearing nx_huge_page_disallowed before zapping is safe, as
> 		 * the flag doesn't protect against iTLB multi-hit, it's there
> 		 * purely to prevent bouncing the gfn between an NX huge page
> 		 * and an X small page.  A vCPU could get stuck tearing down
> 		 * the shadow page, e.g. if it happens to fault on the region
> 		 * before the SPTE is zapped and replaces the shadow page with
> 		 * an NX huge page and get stuck tearing down the child SPTEs,
> 		 * but that is a rare race, i.e. shouldn't impact performance.
> 		 */
> 		unaccount_nx_huge_page(kvm, sp);

Might cause jitter. A long jitter might cause an escalation.

What if I do not unaccount in the beginning, and move the page to the end
of the list only if it is still in the list? If zapping failed because
some other flow might be removing this page but it is still in the
possible_nx_huge_pages list, then just move it to the end. The thread
which is removing it will remove it from the list eventually.

for ( ; to_zap; --to_zap) {
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	if (list_empty(&kvm->arch.possible_tdp_mmu_nx_huge_pages)) {
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		break;
	}

	sp = list_first_entry(&kvm->arch.possible_tdp_mmu_nx_huge_pages,
			      struct kvm_mmu_page,
			      possible_nx_huge_page_link);

	WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
	WARN_ON_ONCE(!sp->role.direct);

	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);


	/*
	 * Don't bother zapping shadow pages if the memslot is being
	 * dirty logged, as the relevant pages would just be faulted
	 * back in as 4KiB pages.  Potential NX Huge Pages in this slot
	 * will be recovered, along with all the other huge pages in
	 * the slot, when dirty logging is disabled.
	 */
	if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		unaccount_nx_huge_page(kvm, sp);
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
	} else if (tdp_mmu_zap_possible_nx_huge_page(kvm, sp)) {
		flush = true;
		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
	} else {
		/*
		 * Try again in future if the page is still in the
		 * list
		 */
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		if (!list_empty(&sp->possible_nx_huge_page_link))
			list_move_tail(&sp->possible_nx_huge_page_link,
			       &kvm->arch.possible_nx_huge_pages);
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	}

	/* Resched code below */
}
Sean Christopherson Aug. 26, 2024, 2:34 p.m. UTC | #13
On Fri, Aug 23, 2024, Vipin Sharma wrote:
> On 2024-08-19 15:12:42, Sean Christopherson wrote:
> > On Mon, Aug 19, 2024, Vipin Sharma wrote:
> > Huh.  Actually, after a lot of fiddling and staring, there's a simpler solution,
> > and it would force us to comment/document an existing race that's subtly ok.
> > 
> > For the dirty logging case, the result of kvm_mmu_sp_dirty_logging_enabled() is
> > visible to the NX recovery thread before the memslot update task is guaranteed
> > to finish (or even start) kvm_mmu_zap_collapsible_sptes().  I.e. KVM could
> > unaccount an NX shadow page before it is zapped, and that could lead to a vCPU
> > replacing the shadow page with an NX huge page.
> > 
> > Functionally, that's a-ok, because the accounting doesn't provide protection
> > against iTLB multi-hit bug, it's there purely to prevent KVM from bouncing a gfn
> > between an NX hugepage and an execute small page.  The only downside to the vCPU
> > doing the replacement is that the vCPU will get saddled with tearing down all the
> > child SPTEs.  But this should be a very rare race, so I can't imagine that would
> > be problematic in practice.
> 
> I am worried that whenever this happens it might cause guest jitter
> which we are trying to avoid, as handle_changed_spte() might keep a
> vCPU busy for some time.

That race already exists today, and your series already extends the ways in which
the race can be hit.  My suggestion is to (a) explicitly document that race and (b)
expand the window in which it can occur to also apply to dirty logging being off.

> > void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, unsigned long to_zap)
> > 
> > 		/*
> > 		 * Unaccount the shadow page before zapping its SPTE so as to
> > 		 * avoid bouncing tdp_mmu_pages_lock() more than is necessary.
> > 		 * Clearing nx_huge_page_disallowed before zapping is safe, as
> > 		 * the flag doesn't protect against iTLB multi-hit, it's there
> > 		 * purely to prevent bouncing the gfn between an NX huge page
> > 		 * and an X small page.  A vCPU could get stuck tearing down
> > 		 * the shadow page, e.g. if it happens to fault on the region
> > 		 * before the SPTE is zapped and replaces the shadow page with
> > 		 * an NX huge page and get stuck tearing down the child SPTEs,
> > 		 * but that is a rare race, i.e. shouldn't impact performance.
> > 		 */
> > 		unaccount_nx_huge_page(kvm, sp);
> 
> Might cause jitter. A long jitter might cause an escalation.
> 
> What if I do not unaccount in the beginning, and move the page to the end
> of the list only if it is still in the list?

The race between kvm_mmu_sp_dirty_logging_enabled() vs. kvm_tdp_mmu_map() vs.
kvm_mmu_zap_collapsible_sptes() still exists.

> If zapping failed because some other flow might be removing this page but it
> is still in the possible_nx_huge_pages list, then just move it to the end. The
> thread which is removing it will remove it from the list eventually.
> 
> for ( ; to_zap; --to_zap) {
> 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> 	if (list_empty(&kvm->arch.possible_tdp_mmu_nx_huge_pages)) {
> 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> 		break;
> 	}
> 
> 	sp = list_first_entry(&kvm->arch.possible_tdp_mmu_nx_huge_pages,
> 			      struct kvm_mmu_page,
> 			      possible_nx_huge_page_link);
> 
> 	WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
> 	WARN_ON_ONCE(!sp->role.direct);
> 
> 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> 
> 
> 	/*
> 	 * Don't bother zapping shadow pages if the memslot is being
> 	 * dirty logged, as the relevant pages would just be faulted
> 	 * back in as 4KiB pages.  Potential NX Huge Pages in this slot
> 	 * will be recovered, along with all the other huge pages in
> 	 * the slot, when dirty logging is disabled.
> 	 */
> 	if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
> 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> 		unaccount_nx_huge_page(kvm, sp);
> 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> 		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> 	} else if (tdp_mmu_zap_possible_nx_huge_page(kvm, sp)) {
> 		flush = true;
> 		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> 	} else {
> 		/*
> 		 * Try again in future if the page is still in the
> 		 * list
> 		 */
> 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> 		if (!list_empty(&sp->possible_nx_huge_page_link))
> 			list_move_tail(&sp->possible_nx_huge_page_link,
> 			kvm-> &kvm->arch.possible_nx_huge_pages);

This is unsafe.  The only thing that prevents a use-after-free of "sp" is the fact
that this task holds rcu_read_lock().  The sp could already have been queued for freeing
via call_rcu().

> 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> 	}
> 
> 	/* Resched code below */
> }
Vipin Sharma Aug. 26, 2024, 7:24 p.m. UTC | #14
On 2024-08-26 07:34:35, Sean Christopherson wrote:
> On Fri, Aug 23, 2024, Vipin Sharma wrote:
> > On 2024-08-19 15:12:42, Sean Christopherson wrote:
> > > On Mon, Aug 19, 2024, Vipin Sharma wrote:
> > > Huh.  Actually, after a lot of fiddling and staring, there's a simpler solution,
> > > and it would force us to comment/document an existing race that's subtly ok.
> > > 
> > > For the dirty logging case, the result of kvm_mmu_sp_dirty_logging_enabled() is
> > > visible to the NX recovery thread before the memslot update task is guaranteed
> > > to finish (or even start) kvm_mmu_zap_collapsible_sptes().  I.e. KVM could
> > > unaccount an NX shadow page before it is zapped, and that could lead to a vCPU
> > > replacing the shadow page with an NX huge page.
> > > 
> > > Functionally, that's a-ok, because the accounting doesn't provide protection
> > > against iTLB multi-hit bug, it's there purely to prevent KVM from bouncing a gfn
> > > between an NX hugepage and an execute small page.  The only downside to the vCPU
> > > doing the replacement is that the vCPU will get saddle with tearing down all the
> > > child SPTEs.  But this should be a very rare race, so I can't imagine that would
> > > be problematic in practice.
> > 
> > I am worried that whenever this happens it might cause guest jitter
> > which we are trying to avoid, as handle_changed_spte() might keep a
> > vCPU busy for some time.
> 
> That race already exists today, and your series already extends the ways in which
> the race can be hit.  My suggestion is to (a) explicitly document that race and (b)
> expand the window in which it can occur to also apply to dirty logging being off.
> 

I was not clear about the vCPU doing the replacement part. I see how this
change is expanding the window.

> > 	} else {
> > 		/*
> > 		 * Try again in future if the page is still in the
> > 		 * list
> > 		 */
> > 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> > 		if (!list_empty(&sp->possible_nx_huge_page_link))
> > 			list_move_tail(&sp->possible_nx_huge_page_link,
> > 			       &kvm->arch.possible_nx_huge_pages);
> 
> This is unsafe.  The only thing that prevents a use-after-free of "sp" is the fact
> that this task holds rcu_read_lock().  The sp could already have been queued for freeing
> via call_rcu().

Before call_rcu() happens, that page will be removed from
kvm->arch.possible_nx_huge_pages list in handle_removed_pt() via
tdp_mmu_unlink_sp() using kvm->arch.tdp_mmu_pages_lock. Here, we are
using the same lock and checking if page is in the list or not. If it is
in the list move to end and if it is not then don't do anything.

Am I missing something else? Otherwise, this logic seems correct to me.

Overall, I will be using your example code, so you won't see this code
in next version but just want to understand the concern with this else
part.

> 
> > 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> > 	}
> > 
> > 	/* Resched code below */
> > }
Sean Christopherson Aug. 26, 2024, 7:51 p.m. UTC | #15
On Mon, Aug 26, 2024, Vipin Sharma wrote:
> On 2024-08-26 07:34:35, Sean Christopherson wrote:
> > > 	} else {
> > > 		/*
> > > 		 * Try again in future if the page is still in the
> > > 		 * list
> > > 		 */
> > > 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> > > 		if (!list_empty(&sp->possible_nx_huge_page_link))
> > > 			list_move_tail(&sp->possible_nx_huge_page_link,
> > > 			       &kvm->arch.possible_nx_huge_pages);
> > 
> > This is unsafe.  The only thing that prevents a use-after-free of "sp" is the fact
> > that this task holds rcu_read_lock().  The sp could already have been queued for freeing
> > via call_rcu().
> 
> Before call_rcu() happens, that page will be removed from
> kvm->arch.possible_nx_huge_pages list in handle_removed_pt() via
> tdp_mmu_unlink_sp() using kvm->arch.tdp_mmu_pages_lock.

Gah, my bad, I inverted the list_empty() check when reading.

> Here, we are using the same lock and checking if page is in the list or not.
> If it is in the list move to end and if it is not then don't do anything.
> 
> Am I missing something else? Otherwise, this logic seems correct to me.

Nope, poor code reading on my part, especially since the _move_ action should have
made it obvious the SP is still live.

> Overall, I will be using your example code, so you won't see this code
> in next version but just want to understand the concern with this else
> part.

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5534fcc9d1b5..d95770d5303a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7321,6 +7321,7 @@  struct kvm_mmu_page *kvm_mmu_possible_nx_huge_page(struct kvm *kvm, bool tdp_mmu
 	struct kvm_mmu_page *sp = NULL;
 	ulong i = 0;
 
+	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	/*
 	 * We use a separate list instead of just using active_mmu_pages because
 	 * the number of shadow pages that be replaced with an NX huge page is
@@ -7330,10 +7331,13 @@  struct kvm_mmu_page *kvm_mmu_possible_nx_huge_page(struct kvm *kvm, bool tdp_mmu
 	list_for_each_entry(sp, &kvm->arch.possible_nx_huge_pages, possible_nx_huge_page_link) {
 		if (i++ >= max)
 			break;
-		if (is_tdp_mmu_page(sp) == tdp_mmu)
+		if (is_tdp_mmu_page(sp) == tdp_mmu) {
+			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 			return sp;
+		}
 	}
 
+	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 	return NULL;
 }
 
@@ -7422,9 +7426,9 @@  static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 	rcu_idx = srcu_read_lock(&kvm->srcu);
 
 	if (to_zap && tdp_mmu_enabled) {
-		write_lock(&kvm->mmu_lock);
+		read_lock(&kvm->mmu_lock);
 		to_zap = kvm_tdp_mmu_recover_nx_huge_pages(kvm, to_zap);
-		write_unlock(&kvm->mmu_lock);
+		read_unlock(&kvm->mmu_lock);
 	}
 
 	if (to_zap && kvm_memslots_have_rmaps(kvm)) {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 933bb8b11c9f..7c7d207ee590 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -817,9 +817,11 @@  static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 	rcu_read_unlock();
 }
 
-bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+static bool tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	u64 old_spte;
+	struct tdp_iter iter = {};
+
+	lockdep_assert_held_read(&kvm->mmu_lock);
 
 	/*
 	 * This helper intentionally doesn't allow zapping a root shadow page,
@@ -828,12 +830,25 @@  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (WARN_ON_ONCE(!sp->ptep))
 		return false;
 
-	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
-	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+	iter.old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
+	iter.sptep = sp->ptep;
+	iter.level = sp->role.level + 1;
+	iter.gfn = sp->gfn;
+	iter.as_id = kvm_mmu_page_as_id(sp);
+
+retry:
+	/*
+	 * Since mmu_lock is held in read mode, it's possible to race with
+	 * another CPU which can remove sp from the page table hierarchy.
+	 *
+	 * No need to re-read iter.old_spte as tdp_mmu_set_spte_atomic() will
+	 * update it in the case of failure.
+	 */
+	if (sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
 		return false;
 
-	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
-			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
+	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
+		goto retry;
 
 	return true;
 }
@@ -1807,7 +1822,7 @@  ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
 	struct kvm_mmu_page *sp;
 	bool flush = false;
 
-	lockdep_assert_held_write(&kvm->mmu_lock);
+	lockdep_assert_held_read(&kvm->mmu_lock);
 	/*
 	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
 	 * be done under RCU protection, because the pages are freed via RCU
@@ -1821,7 +1836,6 @@  ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
 		if (!sp)
 			break;
 
-		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
 		WARN_ON_ONCE(!sp->role.direct);
 
 		/*
@@ -1831,12 +1845,17 @@  ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
 		 * recovered, along with all the other huge pages in the slot,
 		 * when dirty logging is disabled.
 		 */
-		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
+		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 			unaccount_nx_huge_page(kvm, sp);
-		else
-			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
-
-		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+			to_zap--;
+			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+		} else if (tdp_mmu_zap_sp(kvm, sp)) {
+			flush = true;
+			to_zap--;
+			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+		}
 
 		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 			if (flush) {
@@ -1844,10 +1863,9 @@  ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap)
 				flush = false;
 			}
 			rcu_read_unlock();
-			cond_resched_rwlock_write(&kvm->mmu_lock);
+			cond_resched_rwlock_read(&kvm->mmu_lock);
 			rcu_read_lock();
 		}
-		to_zap--;
 	}
 
 	if (flush)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 7d68c2ddf78c..e0315cce6798 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -20,7 +20,6 @@  __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
-bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);