diff mbox series

KVM: x86: SVM: fix nested PAUSE filtering

Message ID 20220518072709.730031-1-mlevitsk@redhat.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: SVM: fix nested PAUSE filtering | expand

Commit Message

Maxim Levitsky May 18, 2022, 7:27 a.m. UTC
Commit 74fd41ed16fd
("KVM: x86: nSVM: support PAUSE filtering when L0 doesn't intercept PAUSE")

introduced passthrough support for nested PAUSE filtering for the case
when the host doesn't intercept PAUSE (either because pause filtering is
disabled with a kvm module param, or with '-overcommit cpu-pm=on')

However the feature was exposed as supported by KVM cpuid unconditionally,
thus if the guest is launched with -cpu host, it could use it even when
KVM can't really support it.

While this use case should be avoided by the management software
(e.g. libvirt), a fallback was added for this case to intercept
each PAUSE instruction.

Turns out that in some cases, such intercept can slow down the
nested guest so much that it can fail to boot.

Also it turns out that when pause filtering is not supported,
KVM doesn't intercept PAUSE at all, so before this patch,
this slowdown didn't exist.

To fix this, change the fallback strategy - ignore the guest threshold
values, but use/update the host threshold values, instead of using zeros.

Also fix a minor bug: on nested VM exit, when the PAUSE filter counter
was copied back to vmcb01, a dirty bit was not set.

Finally a note on why it 'worked' before the problematic commit in
regard to nesting:

KVM was setting both thresholds to 0 in vmcb02, but very soon afterwards,
(after a first userspace VM exit), the shrink_ple_window was called
which would reset the pause_filter_count to the default value.

Thanks a lot to Suravee Suthikulpanit for debugging this!

Fixes: 74fd41ed16fd ("KVM: x86: nSVM: support PAUSE filtering when L0 doesn't intercept PAUSE")

Reported-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Tested-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 16 ++++++----------
 arch/x86/kvm/svm/svm.c    |  4 ++--
 2 files changed, 8 insertions(+), 12 deletions(-)

Comments

Paolo Bonzini May 20, 2022, 2:05 p.m. UTC | #1
On 5/18/22 09:27, Maxim Levitsky wrote:
> To fix this, change the fallback strategy - ignore the guest threshold
> values, but use/update the host threshold values, instead of using zeros.

Hmm, now I remember why it was using the guest values.  It's because, if
the L1 hypervisor specifies COUNT=0 or does not have filtering enabled,
we need to obey and inject a vmexit on every PAUSE.  So something like this:

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index f209c1ca540c..e6153fd3ae47 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
  	struct kvm_vcpu *vcpu = &svm->vcpu;
  	struct vmcb *vmcb01 = svm->vmcb01.ptr;
  	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+	u32 pause_count12;
+	u32 pause_thresh12;
  
  	/*
  	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
@@ -671,20 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
  	if (!nested_vmcb_needs_vls_intercept(svm))
  		vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
  
+	pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+	pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
  	if (kvm_pause_in_guest(svm->vcpu.kvm)) {
-		/* use guest values since host doesn't use them */
-		vmcb02->control.pause_filter_count =
-				svm->pause_filter_enabled ?
-				svm->nested.ctl.pause_filter_count : 0;
-
-		vmcb02->control.pause_filter_thresh =
-				svm->pause_threshold_enabled ?
-				svm->nested.ctl.pause_filter_thresh : 0;
+		/* use guest values since host doesn't intercept PAUSE */
+		vmcb02->control.pause_filter_count = pause_count12;
+		vmcb02->control.pause_filter_thresh = pause_thresh12;
  
  	} else {
-		/* use host values otherwise */
+		/* start from host values otherwise */
  		vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
  		vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+
+		/* ... but ensure filtering is disabled if so requested.  */
+		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+			if (!pause_count12)
+				vmcb02->control.pause_filter_count = 0;
+			if (!pause_thresh12)
+				vmcb02->control.pause_filter_thresh = 0;
+		}
  	}
  
  	nested_svm_transition_tlb_flush(vcpu);


What do you think?
Maxim Levitsky May 22, 2022, 8:24 a.m. UTC | #2
On Fri, 2022-05-20 at 16:05 +0200, Paolo Bonzini wrote:
> On 5/18/22 09:27, Maxim Levitsky wrote:
> > To fix this, change the fallback strategy - ignore the guest threshold
> > values, but use/update the host threshold values, instead of using zeros.
> 
> Hmm, now I remember why it was using the guest values.  It's because, if
> the L1 hypervisor specifies COUNT=0 or does not have filtering enabled,
> we need to obey and inject a vmexit on every PAUSE.  So something like this:
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index f209c1ca540c..e6153fd3ae47 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
>   	struct kvm_vcpu *vcpu = &svm->vcpu;
>   	struct vmcb *vmcb01 = svm->vmcb01.ptr;
>   	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
> +	u32 pause_count12;
> +	u32 pause_thresh12;
>   
>   	/*
>   	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
> @@ -671,20 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
>   	if (!nested_vmcb_needs_vls_intercept(svm))
>   		vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
>   
> +	pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
> +	pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
>   	if (kvm_pause_in_guest(svm->vcpu.kvm)) {
> -		/* use guest values since host doesn't use them */
> -		vmcb02->control.pause_filter_count =
> -				svm->pause_filter_enabled ?
> -				svm->nested.ctl.pause_filter_count : 0;
> -
> -		vmcb02->control.pause_filter_thresh =
> -				svm->pause_threshold_enabled ?
> -				svm->nested.ctl.pause_filter_thresh : 0;
> +		/* use guest values since host doesn't intercept PAUSE */
> +		vmcb02->control.pause_filter_count = pause_count12;
> +		vmcb02->control.pause_filter_thresh = pause_thresh12;
>   
>   	} else {
> -		/* use host values otherwise */
> +		/* start from host values otherwise */
>   		vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
>   		vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
> +
> +		/* ... but ensure filtering is disabled if so requested.  */
> +		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
> +			if (!pause_count12)
> +				vmcb02->control.pause_filter_count = 0;
> +			if (!pause_thresh12)
> +				vmcb02->control.pause_filter_thresh = 0;
> +		}

Makes sense!

I also need to remember to return the '!old' check to the shrink_ple_window,
otherwise it will once again convert 0 to the minimum value.

I'll send a patch soon with this.

Thanks!

>   	}
>   
>   	nested_svm_transition_tlb_flush(vcpu);
> 
> 
> What do you think?
> 


Best regards,
	Maxim Levitsky
diff mbox series

Patch

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bed5e1692cef0..f209c1ca540c9 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -681,17 +681,10 @@  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 				svm->pause_threshold_enabled ?
 				svm->nested.ctl.pause_filter_thresh : 0;
 
-	} else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
-		/* use host values when guest doesn't use them */
+	} else {
+		/* use host values otherwise */
 		vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
 		vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
-	} else {
-		/*
-		 * Intercept every PAUSE otherwise and
-		 * ignore both host and guest values
-		 */
-		vmcb02->control.pause_filter_count = 0;
-		vmcb02->control.pause_filter_thresh = 0;
 	}
 
 	nested_svm_transition_tlb_flush(vcpu);
@@ -951,8 +944,11 @@  int nested_svm_vmexit(struct vcpu_svm *svm)
 	vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
 	vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;
 
-	if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+	if (!kvm_pause_in_guest(vcpu->kvm)) {
 		vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+	}
 
 	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3b49337998ec9..aa7b387e0b7c4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -909,7 +909,7 @@  static void grow_ple_window(struct kvm_vcpu *vcpu)
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	int old = control->pause_filter_count;
 
-	if (kvm_pause_in_guest(vcpu->kvm) || !old)
+	if (kvm_pause_in_guest(vcpu->kvm))
 		return;
 
 	control->pause_filter_count = __grow_ple_window(old,
@@ -930,7 +930,7 @@  static void shrink_ple_window(struct kvm_vcpu *vcpu)
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	int old = control->pause_filter_count;
 
-	if (kvm_pause_in_guest(vcpu->kvm) || !old)
+	if (kvm_pause_in_guest(vcpu->kvm))
 		return;
 
 	control->pause_filter_count =