diff mbox series

KVM: SVM: Connect 'npt' module param to KVM's internal 'npt_enabled'

Message ID 20210305021637.3768573-1-seanjc@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: SVM: Connect 'npt' module param to KVM's internal 'npt_enabled' | expand

Commit Message

Sean Christopherson March 5, 2021, 2:16 a.m. UTC
Directly connect the 'npt' param to the 'npt_enabled' variable so that
runtime adjustments to npt_enabled are reflected in sysfs.  Move the
!PAE restriction to a runtime check to ensure NPT is forced off if the
host is using 2-level paging, and add a comment explicitly stating why
NPT requires a 64-bit kernel or a kernel with PAE enabled.

Opportunistically switch the param to octal permissions.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

Comments

Vitaly Kuznetsov March 5, 2021, 9:54 a.m. UTC | #1
Sean Christopherson <seanjc@google.com> writes:

> Directly connect the 'npt' param to the 'npt_enabled' variable so that
> runtime adjustments to npt_enabled are reflected in sysfs.  Move the
> !PAE restriction to a runtime check to ensure NPT is forced off if the
> host is using 2-level paging, and add a comment explicitly stating why
> NPT requires a 64-bit kernel or a kernel with PAE enabled.
>
> Opportunistically switch the param to octal permissions.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/svm/svm.c | 27 ++++++++++++++-------------
>  1 file changed, 14 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 54610270f66a..0ee74321461e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs {
>  	{ .index = MSR_INVALID,				.always = false },
>  };
>  
> -/* enable NPT for AMD64 and X86 with PAE */
> -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
> -bool npt_enabled = true;
> -#else
> -bool npt_enabled;
> -#endif
> -
>  /*
>   * These 2 parameters are used to config the controls for Pause-Loop Exiting:
>   * pause_filter_count: On processors that support Pause filtering(indicated
> @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
>  static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
>  module_param(pause_filter_count_max, ushort, 0444);
>  
> -/* allow nested paging (virtualized MMU) for all guests */
> -static int npt = true;
> -module_param(npt, int, S_IRUGO);
> +/*
> + * Use nested page tables by default.  Note, NPT may get forced off by
> + * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
> + */
> +bool npt_enabled = true;
> +module_param_named(npt, npt_enabled, bool, 0444);
>  
>  /* allow nested virtualization in KVM/SVM */
>  static int nested = true;
> @@ -988,12 +984,17 @@ static __init int svm_hardware_setup(void)
>  			goto err;
>  	}
>  
> +	/*
> +	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
> +	 * NPT isn't supported if the host is using 2-level paging since host
> +	 * CR4 is unchanged on VMRUN.
> +	 */
> +	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
> +		npt_enabled = false;
> +
>  	if (!boot_cpu_has(X86_FEATURE_NPT))
>  		npt_enabled = false;
>  
> -	if (npt_enabled && !npt)
> -		npt_enabled = false;
> -

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>

>  	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
>  	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");

(unrelated to your patch but) I'd suggest we demote this pr_info() to
pr_debug() or keep it but print the message only when NPT is disabled as
in the overwhelming majority of cases it is enabled. Also, we don't seem
to print EPT status when kvm-intel is loaded.
Paolo Bonzini March 5, 2021, 1:33 p.m. UTC | #2
On 05/03/21 03:16, Sean Christopherson wrote:
> Directly connect the 'npt' param to the 'npt_enabled' variable so that
> runtime adjustments to npt_enabled are reflected in sysfs.  Move the
> !PAE restriction to a runtime check to ensure NPT is forced off if the
> host is using 2-level paging, and add a comment explicitly stating why
> NPT requires a 64-bit kernel or a kernel with PAE enabled.
> 
> Opportunistically switch the param to octal permissions.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/kvm/svm/svm.c | 27 ++++++++++++++-------------
>   1 file changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 54610270f66a..0ee74321461e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs {
>   	{ .index = MSR_INVALID,				.always = false },
>   };
>   
> -/* enable NPT for AMD64 and X86 with PAE */
> -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
> -bool npt_enabled = true;
> -#else
> -bool npt_enabled;
> -#endif
> -
>   /*
>    * These 2 parameters are used to config the controls for Pause-Loop Exiting:
>    * pause_filter_count: On processors that support Pause filtering(indicated
> @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
>   static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
>   module_param(pause_filter_count_max, ushort, 0444);
>   
> -/* allow nested paging (virtualized MMU) for all guests */
> -static int npt = true;
> -module_param(npt, int, S_IRUGO);
> +/*
> + * Use nested page tables by default.  Note, NPT may get forced off by
> + * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
> + */
> +bool npt_enabled = true;
> +module_param_named(npt, npt_enabled, bool, 0444);
>   
>   /* allow nested virtualization in KVM/SVM */
>   static int nested = true;
> @@ -988,12 +984,17 @@ static __init int svm_hardware_setup(void)
>   			goto err;
>   	}
>   
> +	/*
> +	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
> +	 * NPT isn't supported if the host is using 2-level paging since host
> +	 * CR4 is unchanged on VMRUN.
> +	 */
> +	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
> +		npt_enabled = false;
> +
>   	if (!boot_cpu_has(X86_FEATURE_NPT))
>   		npt_enabled = false;
>   
> -	if (npt_enabled && !npt)
> -		npt_enabled = false;
> -
>   	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
>   	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
>   
> 

Queued, thanks.

Paolo
Maxim Levitsky March 8, 2021, 11:18 a.m. UTC | #3
On Thu, 2021-03-04 at 18:16 -0800, Sean Christopherson wrote:
> Directly connect the 'npt' param to the 'npt_enabled' variable so that
> runtime adjustments to npt_enabled are reflected in sysfs.  Move the
> !PAE restriction to a runtime check to ensure NPT is forced off if the
> host is using 2-level paging, and add a comment explicitly stating why
> NPT requires a 64-bit kernel or a kernel with PAE enabled.

Let me ask a small question for a personal itch.

Do you think it is feasable to allow the user to enable npt/ept per guest?
(the default should still of course come from npt module parameter)

This weekend I checked it a bit and I think that it shouldn't be hard
to do.

There are some old and broken OSes which can't work with npt=1
https://blog.stuffedcow.net/2015/08/win9x-tlb-invalidation-bug/
https://blog.stuffedcow.net/2015/08/pagewalk-coherence/

I won't be surprised if some other old OSes
are affected by this as well knowing from the above 
that on Intel the MMU speculates less and doesn't
break their assumptions up to today.
(This is tested to be true on my Kabylake laptop)

In addition to that, on semi-unrelated note,
our shadowing MMU also shows up the exact same issue since it
also caches translations in form of unsync MMU pages.

But I can (and did disable) this using a hack (see below)
and this finally made my win98 "hobby" guest actually work fine 
on AMD for me.

I am also thinking to make this "sync" mmu mode to be 
another module param (this can also be useful for debug,
see below)
What do you think?

On yet another semi-unrelated note,
A "sync" mmu mode affects another bug I am tracking,
but I don't yet understand why:

I found out that while windows 10 doesn't boot at all with 
disabled tdp on the host (npt/ept - I tested both) 
 the "sync" mmu mode does make it work.

I was also able to reproduce a crash on Linux 
(but only with nested migration loop)
Without "sync" mmu mode and without npt on the host.
With "sync" mmu mode it passed an overnight test of more 
that 1000 iterations.

For reference this is my "sync" mmu hack:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index febe71935bb5a..1046d8c97702d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2608,7 +2608,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        }
 
        set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
-                               speculative, true, host_writable);
+                               speculative, false, host_writable);
        if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write_fault)
                        ret = RET_PF_EMULATE;


It is a hack since it only happens to work because we eventually
unprotect the guest mmu pages when we detect write flooding to them.
Still performance wise, my win98 guest works very well with this
(with npt=0 on host)

Best regards,
	Maxim Levitsky


> 
> Opportunistically switch the param to octal permissions.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/svm/svm.c | 27 ++++++++++++++-------------
>  1 file changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 54610270f66a..0ee74321461e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs {
>  	{ .index = MSR_INVALID,				.always = false },
>  };
>  
> -/* enable NPT for AMD64 and X86 with PAE */
> -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
> -bool npt_enabled = true;
> -#else
> -bool npt_enabled;
> -#endif
> -
>  /*
>   * These 2 parameters are used to config the controls for Pause-Loop Exiting:
>   * pause_filter_count: On processors that support Pause filtering(indicated
> @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
>  static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
>  module_param(pause_filter_count_max, ushort, 0444);
>  
> -/* allow nested paging (virtualized MMU) for all guests */
> -static int npt = true;
> -module_param(npt, int, S_IRUGO);
> +/*
> + * Use nested page tables by default.  Note, NPT may get forced off by
> + * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
> + */
> +bool npt_enabled = true;
> +module_param_named(npt, npt_enabled, bool, 0444);
>  
>  /* allow nested virtualization in KVM/SVM */
>  static int nested = true;
> @@ -988,12 +984,17 @@ static __init int svm_hardware_setup(void)
>  			goto err;
>  	}
>  
> +	/*
> +	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
> +	 * NPT isn't supported if the host is using 2-level paging since host
> +	 * CR4 is unchanged on VMRUN.
> +	 */
> +	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
> +		npt_enabled = false;
> +
>  	if (!boot_cpu_has(X86_FEATURE_NPT))
>  		npt_enabled = false;
>  
> -	if (npt_enabled && !npt)
> -		npt_enabled = false;
> -
>  	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
>  	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
>
Sean Christopherson March 8, 2021, 5:18 p.m. UTC | #4
On Mon, Mar 08, 2021, Maxim Levitsky wrote:
> On Thu, 2021-03-04 at 18:16 -0800, Sean Christopherson wrote:
> > Directly connect the 'npt' param to the 'npt_enabled' variable so that
> > runtime adjustments to npt_enabled are reflected in sysfs.  Move the
> > !PAE restriction to a runtime check to ensure NPT is forced off if the
> > host is using 2-level paging, and add a comment explicitly stating why
> > NPT requires a 64-bit kernel or a kernel with PAE enabled.
> 
> Let me ask a small question for a personal itch.
> 
> Do you think it is feasable to allow the user to enable npt/ept per guest?
> (the default should still of course come from npt module parameter)

Feasible, yes.  Worth the extra maintenance, probably not.  It's a niche use
case, and only viable if you have a priori knowledge of the guest being run.
I doubt there are more than a few people in the world that meet those criteria,
and want to run multiple VMs, and also care deeply about the performance
degregation of the other VMs.

> This weekend I checked it a bit and I think that it shouldn't be hard
> to do.
> 
> There are some old and broken OSes which can't work with npt=1
> https://blog.stuffedcow.net/2015/08/win9x-tlb-invalidation-bug/
> https://blog.stuffedcow.net/2015/08/pagewalk-coherence/
> 
> I won't be surprised if some other old OSes
> are affected by this as well knowing from the above 
> that on Intel the MMU speculates less and doesn't
> break their assumptions up to today.
> (This is tested to be true on my Kabylake laptop)

Heh, I would be quite surprised if Intel CPUs speculate less.  I wouldn't be
surprised if the old Windows behavior got grandfathered into Intel CPUs because
the buggy behavior worked on old CPUs and so must continue to work on new CPUs.

> In addition to that, on semi-unrelated note,
> our shadowing MMU also shows up the exact same issue since it
> also caches translations in form of unsync MMU pages.
> 
> But I can (and did disable) this using a hack (see below)
> and this finally made my win98 "hobby" guest actually work fine 
> on AMD for me.
> 
> I am also thinking to make this "sync" mmu mode to be 
> another module param (this can also be useful for debug,
> see below)
> What do you think?
> 
> On yet another semi-unrelated note,
> A "sync" mmu mode affects another bug I am tracking,
> but I don't yet understand why:
> 
> I found out that while windows 10 doesn't boot at all with 
> disabled tdp on the host (npt/ept - I tested both) 
>  the "sync" mmu mode does make it work.

Intel and AMD?  Or just AMD?  If both architectures fail, this definitely needs
to be debugged and fixed.  Given the lack of bug reports, most KVM users
obviously don't care about TDP=0, but any bug in the unsync code likely affects
nested TDP as well, which far more people do care about.

> I was also able to reproduce a crash on Linux 
> (but only with nested migration loop)

With or without TDP enabled?
Maxim Levitsky March 8, 2021, 9:44 p.m. UTC | #5
On Mon, 2021-03-08 at 09:18 -0800, Sean Christopherson wrote:
> On Mon, Mar 08, 2021, Maxim Levitsky wrote:
> > On Thu, 2021-03-04 at 18:16 -0800, Sean Christopherson wrote:
> > > Directly connect the 'npt' param to the 'npt_enabled' variable so that
> > > runtime adjustments to npt_enabled are reflected in sysfs.  Move the
> > > !PAE restriction to a runtime check to ensure NPT is forced off if the
> > > host is using 2-level paging, and add a comment explicitly stating why
> > > NPT requires a 64-bit kernel or a kernel with PAE enabled.
> > 
> > Let me ask a small question for a personal itch.
> > 
> > Do you think it is feasable to allow the user to enable npt/ept per guest?
> > (the default should still of course come from npt module parameter)
> 
> Feasible, yes.  Worth the extra maintenance, probably not.  It's a niche use
> case, and only viable if you have a priori knowledge of the guest being run.
> I doubt there are more than a few people in the world that meet those criteria,
> and want to run multiple VMs, and also care deeply about the performance
> degregation of the other VMs.
I understand.
On one of weekends when I am bored I probably implement it anyway,
and post it upstream. I don't count on getting this merged.

It just that I often run VMs which I don't want to stop, and sometimes I want
to boot my retro VM which is finally working but I need for that
to reload KVM and disable NPT.

> 
> > This weekend I checked it a bit and I think that it shouldn't be hard
> > to do.
> > 
> > There are some old and broken OSes which can't work with npt=1
> > https://blog.stuffedcow.net/2015/08/win9x-tlb-invalidation-bug/
> > https://blog.stuffedcow.net/2015/08/pagewalk-coherence/
> > 
> > I won't be surprised if some other old OSes
> > are affected by this as well knowing from the above 
> > that on Intel the MMU speculates less and doesn't
> > break their assumptions up to today.
> > (This is tested to be true on my Kabylake laptop)
> 
> Heh, I would be quite surprised if Intel CPUs speculate less.  I wouldn't be
> surprised if the old Windows behavior got grandfathered into Intel CPUs because
> the buggy behavior worked on old CPUs and so must continue to work on new CPUs.

Yes, this sounds exactly what did happen. Besides we might not care but other
hypervisors are often sold as a means to run very old software, and that includes
very old operation systems. 
So Intel might have kept this working for that reason as well, 
while AMD didn't have time to care for an obvious OS bug which is 
even given as an example of what not to do in the manual.

> 
> > In addition to that, on semi-unrelated note,
> > our shadowing MMU also shows up the exact same issue since it
> > also caches translations in form of unsync MMU pages.
> > 
> > But I can (and did disable) this using a hack (see below)
> > and this finally made my win98 "hobby" guest actually work fine 
> > on AMD for me.
> > 
> > I am also thinking to make this "sync" mmu mode to be 
> > another module param (this can also be useful for debug,
> > see below)
> > What do you think?
> > 
> > On yet another semi-unrelated note,
> > A "sync" mmu mode affects another bug I am tracking,
> > but I don't yet understand why:
> > 
> > I found out that while windows 10 doesn't boot at all with 
> > disabled tdp on the host (npt/ept - I tested both) 
> >  the "sync" mmu mode does make it work.
> 
> Intel and AMD?  Or just AMD?  If both architectures fail, this definitely needs
> to be debugged and fixed.  Given the lack of bug reports, most KVM users
> obviously don't care about TDP=0, but any bug in the unsync code likely affects
> nested TDP as well, which far more people do care about.

Both Intel and AMD in exactly the same way. 
Win10 fails to boot always, with various blue screens or just
hangs. With 'sync' mmu hack it slow but boots always.

It even boots nested (very slow) with TDP disabled on host.
(with my fix for booting nested guests on AMD with TDP disabled on the host)

Note that otherwise this isn't related to nesting, I just boot a regular win10 guest.
I also see this happen with several different win10 VMs.


> 
> > I was also able to reproduce a crash on Linux 
> > (but only with nested migration loop)
> 
> With or without TDP enabled?
Without TDP enabled. I also was able to reproduce this on both Intel and AMD.

For this case as Linux does seem to boot, I did run my nested migration test,
and its is the nested guest that crashes, but also most likely not related to
nesting as no TDP was enabled on the host (L0)

With sync mmu I wasn't able to make anything crash (I think though that I 
tested this case only on AMD so far)

I also did *lot* of various hacks to mmu code.
(like to avoid any prefetching, sync everything on every cr0/cr3/cr4 write,
flush the real TLB on each guest entry, and stuff like that, and nothing seemes to help).


Best regards,
	Maxim Levitsky

>
diff mbox series

Patch

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 54610270f66a..0ee74321461e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -115,13 +115,6 @@  static const struct svm_direct_access_msrs {
 	{ .index = MSR_INVALID,				.always = false },
 };
 
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-bool npt_enabled = true;
-#else
-bool npt_enabled;
-#endif
-
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * pause_filter_count: On processors that support Pause filtering(indicated
@@ -170,9 +163,12 @@  module_param(pause_filter_count_shrink, ushort, 0444);
 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 module_param(pause_filter_count_max, ushort, 0444);
 
-/* allow nested paging (virtualized MMU) for all guests */
-static int npt = true;
-module_param(npt, int, S_IRUGO);
+/*
+ * Use nested page tables by default.  Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
 
 /* allow nested virtualization in KVM/SVM */
 static int nested = true;
@@ -988,12 +984,17 @@  static __init int svm_hardware_setup(void)
 			goto err;
 	}
 
+	/*
+	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
+	 * NPT isn't supported if the host is using 2-level paging since host
+	 * CR4 is unchanged on VMRUN.
+	 */
+	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+		npt_enabled = false;
+
 	if (!boot_cpu_has(X86_FEATURE_NPT))
 		npt_enabled = false;
 
-	if (npt_enabled && !npt)
-		npt_enabled = false;
-
 	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
 	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");