kvm: x86: move srcu lock out of kvm_vcpu_check_block

Message ID 20210428173820.13051-1-jon@nutanix.com (mailing list archive)
State New, archived
Series kvm: x86: move srcu lock out of kvm_vcpu_check_block

Commit Message

Jon Kohler April 28, 2021, 5:38 p.m. UTC
When using halt polling on x86, srcu_read_lock + srcu_read_unlock
overhead [1] is high in a bursty workload, with each helper showing up
as roughly 8% of samples in a 60-second flame graph.

kvm_vcpu_block calls kvm_vcpu_check_block for both halt polling and
normal blocking. kvm_vcpu_check_block takes srcu_read_lock on
kvm->srcu. This locking was added in commit 50c28f21d0 to support fast
CR3 switching; it was questioned [2] at the time but went in as-is, so
the lock is taken even for non-nested guests. The lock only appears to
be needed in nested situations, where kvm_vcpu_running eventually calls
vmx_check_nested_events. That call is guarded by is_guest_mode() and
therefore does not apply to non-nested workloads.

To improve performance, this moves kvm->srcu lock logic from
kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
check_events. Also adds a hint for callers to tell
kvm_vcpu_running whether or not to acquire srcu, which is useful in
situations where the lock may already be held. With this in place, we
see roughly 5% improvement in an internal benchmark [3] and no more
impact from this lock on non-nested workloads.
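
In outline, the nested path ends up as below (condensed from the full
diff below); kvm->srcu is touched only in guest mode, and only when the
caller does not already hold it:

static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
{
	if (is_guest_mode(vcpu)) {
		if (acquire_srcu) {
			/* kvm_arch_vcpu_runnable path: kvm->srcu not held yet. */
			int idx = srcu_read_lock(&vcpu->kvm->srcu);

			kvm_x86_ops.nested_ops->check_events(vcpu);
			srcu_read_unlock(&vcpu->kvm->srcu, idx);
		} else {
			/* vcpu_run path: the caller already holds kvm->srcu. */
			kvm_x86_ops.nested_ops->check_events(vcpu);
		}
	}

	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
		!vcpu->arch.apf.halted);
}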

[1] perf top output in heavy workload
Overhead  Shared Object  Symbol
   9.24%  [kernel]       [k] __srcu_read_lock
   7.48%  [kernel]       [k] __srcu_read_unlock

[2] Locking originally discussed here
https://patchwork.kernel.org/project/kvm/patch/20180612225244.71856-9-junaids@google.com/

[3] Internal benchmark details
Fixed-rate 100 GBytes/second of 1MB random read IO run against the
internal in-memory read cache of Nutanix AOS, using 16 threads on a
22-vCPU CentOS 7.9 VM. Before: ~120us average latency; after: ~113us.

Fixes: 50c28f21d0 ("kvm: x86: Use fast CR3 switch for nested VMX")
Signed-off-by: Jon Kohler <jon@nutanix.com>
Reviewed-by: Bijan Mottahedeh <bijan.mottahedeh@nutanix.com>
Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Cc: Junaid Shahid <junaids@google.com>
---
 arch/x86/kvm/x86.c  | 24 +++++++++++++++++++-----
 virt/kvm/kvm_main.c | 21 +++++++--------------
 2 files changed, 26 insertions(+), 19 deletions(-)

--
2.24.3 (Apple Git-128)

Comments

Sean Christopherson April 30, 2021, 8:45 p.m. UTC | #1
On Wed, Apr 28, 2021, Jon Kohler wrote:
> To improve performance, this moves kvm->srcu lock logic from
> kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
> check_events. Also adds a hint for callers to tell
> kvm_vcpu_running whether or not to acquire srcu, which is useful in
> situations where the lock may already be held. With this in place, we
> see roughly 5% improvement in an internal benchmark [3] and no more
> impact from this lock on non-nested workloads.

...

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index efc7a82ab140..354f690cc982 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
>  	return 1;
>  }
> 
> -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
> +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
>  {
> -	if (is_guest_mode(vcpu))
> -		kvm_x86_ops.nested_ops->check_events(vcpu);
> +	if (is_guest_mode(vcpu)) {
> +		if (acquire_srcu) {
> +			/*
> +			 * We need to lock because check_events could call
> +			 * nested_vmx_vmexit() which might need to resolve a
> +			 * valid memslot. We will have this lock only when
> +			 * called from vcpu_run but not when called from
> +			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
> +			 */
> +			int idx = srcu_read_lock(&vcpu->kvm->srcu);
> +			kvm_x86_ops.nested_ops->check_events(vcpu);
> +			srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +		} else {
> +			kvm_x86_ops.nested_ops->check_events(vcpu);
> +		}
> +	}

Obviously not your fault, but I absolutely detest calling check_events() from
kvm_vcpu_running.  I would much prefer to make baby steps toward cleaning up the
existing mess instead of piling more weirdness on top.

Ideally, APICv support would be fixed to not require a deep probe into nested
events just to see if a vCPU can run.  But, that's probably more than we want to
bite off at this time.

What if we add another nested_ops API to check if the vCPU has an event, but not
actually process the event?  I think that would allow eliminating the SRCU lock,
and would get rid of the most egregious behavior of triggering a nested VM-Exit
in a seemingly innocuous helper.

If this works, we could even explore moving the call to nested_ops->has_events()
out of kvm_vcpu_running() and into kvm_vcpu_has_events(); I can't tell if the
side effects in vcpu_block() would get messed up with that change :-/

Incomplete patch...

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 00339d624c92..15f514891326 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3771,15 +3771,17 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
               to_vmx(vcpu)->nested.preemption_timer_expired;
 }

-static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+static int __vmx_check_nested_events(struct kvm_vcpu *vcpu, bool only_check)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qual;
-       bool block_nested_events =
-           vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
        bool mtf_pending = vmx->nested.mtf_pending;
        struct kvm_lapic *apic = vcpu->arch.apic;

+       bool block_nested_events = only_check ||
+                                  vmx->nested.nested_run_pending ||
+                                  kvm_event_needs_reinjection(vcpu);
+
        /*
         * Clear the MTF state. If a higher priority VM-exit is delivered first,
         * this state is discarded.
@@ -3837,7 +3839,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        }

        if (vcpu->arch.exception.pending) {
-               if (vmx->nested.nested_run_pending)
+               if (vmx->nested.nested_run_pending || only_check)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -3886,10 +3888,23 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        }

 no_vmexit:
-       vmx_complete_nested_posted_interrupt(vcpu);
+       if (!only_check)
+               vmx_complete_nested_posted_interrupt(vcpu);
+       else if (vmx->nested.pi_desc && vmx->nested.pi_pending)
+               return -EBUSY;
        return 0;
 }

+static bool vmx_has_nested_event(struct kvm_vcpu *vcpu)
+{
+       return !!__vmx_check_nested_events(vcpu, true);
+}
+
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       return __vmx_check_nested_events(vcpu, false);
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
        ktime_t remaining =
@@ -6627,6 +6642,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 }

 struct kvm_x86_nested_ops vmx_nested_ops = {
+       .has_event = vmx_has_nested_event,
        .check_events = vmx_check_nested_events,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .triple_fault = nested_vmx_triple_fault,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a829f1ab60c3..5df01012cb1f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9310,6 +9310,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        update_cr8_intercept(vcpu);
                        kvm_lapic_sync_to_vapic(vcpu);
                }
+       } else if (is_guest_mode(vcpu)) {
+               r = kvm_check_nested_events(vcpu);
+               if (r < 0)
+                       req_immediate_exit = true;
        }

        r = kvm_mmu_reload(vcpu);
@@ -9516,8 +9520,10 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)

 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu))
-               kvm_check_nested_events(vcpu);
+       if (is_guest_mode(vcpu) &&
+           (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu) ||
+            kvm_x86_ops.nested_ops->has_event(vcpu)))
+               return true;

        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
Paolo Bonzini May 1, 2021, 1:05 p.m. UTC | #2
On 30/04/21 22:45, Sean Christopherson wrote:
> On Wed, Apr 28, 2021, Jon Kohler wrote:
>> To improve performance, this moves kvm->srcu lock logic from
>> kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
>> check_events. Also adds a hint for callers to tell
>> kvm_vcpu_running whether or not to acquire srcu, which is useful in
>> situations where the lock may already be held. With this in place, we
>> see roughly 5% improvement in an internal benchmark [3] and no more
>> impact from this lock on non-nested workloads.
> 
> ...
> 
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index efc7a82ab140..354f690cc982 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
>>   	return 1;
>>   }
>>
>> -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
>> +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
>>   {
>> -	if (is_guest_mode(vcpu))
>> -		kvm_x86_ops.nested_ops->check_events(vcpu);
>> +	if (is_guest_mode(vcpu)) {
>> +		if (acquire_srcu) {
>> +			/*
>> +			 * We need to lock because check_events could call
>> +			 * nested_vmx_vmexit() which might need to resolve a
>> +			 * valid memslot. We will have this lock only when
>> +			 * called from vcpu_run but not when called from
>> +			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
>> +			 */
>> +			int idx = srcu_read_lock(&vcpu->kvm->srcu);
>> +			kvm_x86_ops.nested_ops->check_events(vcpu);
>> +			srcu_read_unlock(&vcpu->kvm->srcu, idx);
>> +		} else {
>> +			kvm_x86_ops.nested_ops->check_events(vcpu);
>> +		}
>> +	}
> 
> Obviously not your fault, but I absolutely detest calling check_events() from
> kvm_vcpu_running.  I would much prefer to make baby steps toward cleaning up the
> existing mess instead of piling more weirdness on top.
> 
> Ideally, APICv support would be fixed to not require a deep probe into nested
> events just to see if a vCPU can run.  But, that's probably more than we want to
> bite off at this time.
> 
> What if we add another nested_ops API to check if the vCPU has an event, but not
> actually process the event?  I think that would allow eliminating the SRCU lock,
> and would get rid of the most egregious behavior of triggering a nested VM-Exit
> in a seemingly innocuous helper.
> 
> If this works, we could even explore moving the call to nested_ops->has_events()
> out of kvm_vcpu_running() and into kvm_vcpu_has_events(); I can't tell if the
> side effects in vcpu_block() would get messed up with that change :-/
> 
> Incomplete patch...

I think it doesn't even have to be *nested* events.  Most events are the 
same inside or outside guest mode, as they already special case guest 
mode inside the kvm_x86_ops callbacks (e.g. kvm_arch_interrupt_allowed 
is already called by kvm_vcpu_has_events).

I think we only need to extend kvm_x86_ops.nested_ops->hv_timer_pending 
to cover MTF, plus double check that INIT and SIPI are handled 
correctly, and then the call to check_nested_events can go away.
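
Very roughly, just to illustrate the shape of it (untested, and the
name nested_vmx_event_pending is only a placeholder):

static bool nested_vmx_event_pending(struct kvm_vcpu *vcpu)
{
	/* What hv_timer_pending already covers today... */
	if (nested_vmx_preemption_timer_pending(vcpu))
		return true;

	/* ...extended to also report a pending MTF VM-exit. */
	return to_vmx(vcpu)->nested.mtf_pending;
}

INIT and SIPI would still need the double check mentioned above.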

Paolo

Jon Kohler May 5, 2021, 3:46 p.m. UTC | #3
> On May 1, 2021, at 9:05 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 30/04/21 22:45, Sean Christopherson wrote:
>> On Wed, Apr 28, 2021, Jon Kohler wrote:
>>> To improve performance, this moves kvm->srcu lock logic from
>>> kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
>>> check_events. Also adds a hint for callers to tell
>>> kvm_vcpu_running whether or not to acquire srcu, which is useful in
>>> situations where the lock may already be held. With this in place, we
>>> see roughly 5% improvement in an internal benchmark [3] and no more
>>> impact from this lock on non-nested workloads.
>> ...
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index efc7a82ab140..354f690cc982 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
>>>  	return 1;
>>>  }
>>> 
>>> -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
>>> +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
>>>  {
>>> -	if (is_guest_mode(vcpu))
>>> -		kvm_x86_ops.nested_ops->check_events(vcpu);
>>> +	if (is_guest_mode(vcpu)) {
>>> +		if (acquire_srcu) {
>>> +			/*
>>> +			 * We need to lock because check_events could call
>>> +			 * nested_vmx_vmexit() which might need to resolve a
>>> +			 * valid memslot. We will have this lock only when
>>> +			 * called from vcpu_run but not when called from
>>> +			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
>>> +			 */
>>> +			int idx = srcu_read_lock(&vcpu->kvm->srcu);
>>> +			kvm_x86_ops.nested_ops->check_events(vcpu);
>>> +			srcu_read_unlock(&vcpu->kvm->srcu, idx);
>>> +		} else {
>>> +			kvm_x86_ops.nested_ops->check_events(vcpu);
>>> +		}
>>> +	}
>> Obviously not your fault, but I absolutely detest calling check_events() from
>> kvm_vcpu_running.  I would much prefer to make baby steps toward cleaning up the
>> existing mess instead of piling more weirdness on top.
>> Ideally, APICv support would be fixed to not require a deep probe into nested
>> events just to see if a vCPU can run.  But, that's probably more than we want to
>> bite off at this time.
>> What if we add another nested_ops API to check if the vCPU has an event, but not
>> actually process the event?  I think that would allow eliminating the SRCU lock,
>> and would get rid of the most egregious behavior of triggering a nested VM-Exit
>> in a seemingly innocuous helper.
>> If this works, we could even explore moving the call to nested_ops->has_events()
>> out of kvm_vcpu_running() and into kvm_vcpu_has_events(); I can't tell if the
>> side effects in vcpu_block() would get messed up with that change :-/
>> Incomplete patch...
> 
> I think it doesn't even have to be *nested* events.  Most events are the same inside or outside guest mode, as they already special case guest mode inside the kvm_x86_ops callbacks (e.g. kvm_arch_interrupt_allowed is already called by kvm_vcpu_has_events).
> 
> I think we only need to extend kvm_x86_ops.nested_ops->hv_timer_pending to cover MTF, plus double check that INIT and SIPI are handled correctly, and then the call to check_nested_events can go away.
> 
> Paolo

[ resending as my editor switched to html :( ]

Thanks, Paolo, Sean. I appreciate the prompt response. Sorry for the slow reply; I was out with a hand injury for a few days and am caught up now.

Just to confirm - in the spirit of baby steps, I'm happy to take up the idea Sean suggested; that makes sense to me. Perhaps we can do that small patch and leave a TODO to do a tune-up for hv_timer_pending and the other double checks Paolo mentioned.

Or would you rather skip that approach and go right to making check_nested_events go away first? Guessing that might be a larger effort with more nuances, though?

Happy to help, thanks again,
Jon

Sean Christopherson May 19, 2021, 9:53 p.m. UTC | #4
On Wed, May 05, 2021, Jon Kohler wrote:
> 
> > On May 1, 2021, at 9:05 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> > On 30/04/21 22:45, Sean Christopherson wrote:
> >> On Wed, Apr 28, 2021, Jon Kohler wrote:
> >>> To improve performance, this moves kvm->srcu lock logic from
> >>> kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
> >>> check_events. Also adds a hint for callers to tell
> >>> kvm_vcpu_running whether or not to acquire srcu, which is useful in
> >>> situations where the lock may already be held. With this in place, we
> >>> see roughly 5% improvement in an internal benchmark [3] and no more
> >>> impact from this lock on non-nested workloads.
> >> ...
> >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>> index efc7a82ab140..354f690cc982 100644
> >>> --- a/arch/x86/kvm/x86.c
> >>> +++ b/arch/x86/kvm/x86.c
> >>> @@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
> >>>  	return 1;
> >>>  }
> >>> 
> >>> -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
> >>> +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
> >>>  {
> >>> -	if (is_guest_mode(vcpu))
> >>> -		kvm_x86_ops.nested_ops->check_events(vcpu);
> >>> +	if (is_guest_mode(vcpu)) {
> >>> +		if (acquire_srcu) {
> >>> +			/*
> >>> +			 * We need to lock because check_events could call
> >>> +			 * nested_vmx_vmexit() which might need to resolve a
> >>> +			 * valid memslot. We will have this lock only when
> >>> +			 * called from vcpu_run but not when called from
> >>> +			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
> >>> +			 */
> >>> +			int idx = srcu_read_lock(&vcpu->kvm->srcu);
> >>> +			kvm_x86_ops.nested_ops->check_events(vcpu);
> >>> +			srcu_read_unlock(&vcpu->kvm->srcu, idx);
> >>> +		} else {
> >>> +			kvm_x86_ops.nested_ops->check_events(vcpu);
> >>> +		}
> >>> +	}
> >> Obviously not your fault, but I absolutely detest calling check_events() from
> >> kvm_vcpu_running.  I would much prefer to make baby steps toward cleaning up the
> >> existing mess instead of piling more weirdness on top.
> >>
> >> Ideally, APICv support would be fixed to not require a deep probe into nested
> >> events just to see if a vCPU can run.  But, that's probably more than we want to
> >> bite off at this time.
> >>
> >> What if we add another nested_ops API to check if the vCPU has an event, but not
> >> actually process the event?  I think that would allow eliminating the SRCU lock,
> >> and would get rid of the most egregious behavior of triggering a nested VM-Exit
> >> in a seemingly innocuous helper.
> >>
> >> If this works, we could even explore moving the call to nested_ops->has_events()
> >> out of kvm_vcpu_running() and into kvm_vcpu_has_events(); I can't tell if the
> >> side effects in vcpu_block() would get messed up with that change :-/
> >> Incomplete patch...
> > 
> > I think it doesn't even have to be *nested* events.  Most events are the
> > same inside or outside guest mode, as they already special case guest mode
> > inside the kvm_x86_ops callbacks (e.g. kvm_arch_interrupt_allowed is
> > already called by kvm_vcpu_has_events).
> > 
> > I think we only need to extend kvm_x86_ops.nested_ops->hv_timer_pending to
> > cover MTF, plus double check that INIT and SIPI are handled correctly, and
> > then the call to check_nested_events can go away.
> 
> Thanks, Paolo, Sean. I appreciate the prompt response. Sorry for the slow
> reply; I was out with a hand injury for a few days and am caught up now.
> 
> Just to confirm - in the spirit of baby steps, I'm happy to take up the idea
> Sean suggested; that makes sense to me. Perhaps we can do that small patch
> and leave a TODO to do a tune-up for hv_timer_pending and the other double
> checks Paolo mentioned.

Paolo was pointing out that kvm_vcpu_has_events() already checks hv_timer_pending,
and that we could add the few missing nested event cases to kvm_vcpu_has_events()
instead of wholesale checking everything that's in check_nested_events().

I believe that would work, as I suspect the underlying bug that was alluded to
by commit 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")
has since been fixed.  But, I'm not sure it makes much difference in practice
since we'll likely end up with nested_ops->has_events() either way.

Staring a bit more, I'm pretty sure hv_timer_pending() can be made obsolete and
dropped.  Unless Paolo objects, I still like my original proposal.

I think the safest approach from a bisection standpoint would be to do this in
3-4 stages:

  1. Refactor check_nested_events() to split out a has_events() helper.
  2. Move the has_events() call from kvm_vcpu_running() into kvm_vcpu_has_events()
  3. Drop the explicit hv_timer_pending() in inject_pending_event().  It should
     be dead code since it's just a pointer to nested_vmx_preemption_timer_pending(),
     which is handled by vmx_check_nested_events() and called earlier.
  4. Drop the explicit hv_timer_pending() in kvm_vcpu_has_events() for the same
     reasons as (3).  This can also drop hv_timer_pending() entirely.
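
For illustration, after (1) and (2) the split would look roughly like
this (sketch only, not compile tested; has_event() is the helper added
in step 1, and the rest of kvm_vcpu_has_events() is elided):

static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
	/* No nested event processing here anymore. */
	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
		!vcpu->arch.apf.halted);
}

static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
	/* ... existing wakeup/event checks stay as they are ... */

	if (is_guest_mode(vcpu) &&
	    (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu) ||
	     kvm_x86_ops.nested_ops->has_event(vcpu)))
		return true;

	return false;
}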

> Or would you rather skip that approach and go right to making
> check_nested_events go away first? Guessing that might be a larger effort
> with more nuances, though?
Paolo Bonzini May 20, 2021, 12:31 p.m. UTC | #5
On 19/05/21 23:53, Sean Christopherson wrote:
>    1. Refactor check_nested_events() to split out a has_events() helper.
>    2. Move the has_events() call from kvm_vcpu_running() into kvm_vcpu_has_events()
>    3. Drop the explicit hv_timer_pending() in inject_pending_event().  It should
>       be dead code since it's just a pointer to nested_vmx_preemption_timer_pending(),
>       which is handled by vmx_check_nested_events() and called earlier.
>    4. Drop the explicit hv_timer_pending() in kvm_vcpu_has_events() for the same
>       reasons as (3).  This can also drop hv_timer_pending() entirely.

Sounds good except that I would do (3) first, since if I understand 
correctly it's a valid cleanup in the current code as well, and do (2) 
and (4) at the same time since you're basically enlarging the scope of 
the existing hv_timer_pending call to include all nested events.

Paolo

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efc7a82ab140..354f690cc982 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9273,10 +9273,24 @@  static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 	return 1;
 }

-static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
 {
-	if (is_guest_mode(vcpu))
-		kvm_x86_ops.nested_ops->check_events(vcpu);
+	if (is_guest_mode(vcpu)) {
+		if (acquire_srcu) {
+			/*
+			 * We need to lock because check_events could call
+			 * nested_vmx_vmexit() which might need to resolve a
+			 * valid memslot. We will have this lock only when
+			 * called from vcpu_run but not when called from
+			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
+			 */
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			kvm_x86_ops.nested_ops->check_events(vcpu);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		} else {
+			kvm_x86_ops.nested_ops->check_events(vcpu);
+		}
+	}

 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
@@ -9291,7 +9305,7 @@  static int vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.l1tf_flush_l1d = true;

 	for (;;) {
-		if (kvm_vcpu_running(vcpu)) {
+		if (kvm_vcpu_running(vcpu, false)) {
 			r = vcpu_enter_guest(vcpu);
 		} else {
 			r = vcpu_block(kvm, vcpu);
@@ -10999,7 +11013,7 @@  static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)

 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+	return kvm_vcpu_running(vcpu, true) || kvm_vcpu_has_events(vcpu);
 }

 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 383df23514b9..05e29aed35b5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2783,22 +2783,15 @@  static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)

 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
-	int ret = -EINTR;
-	int idx = srcu_read_lock(&vcpu->kvm->srcu);
-
 	if (kvm_arch_vcpu_runnable(vcpu)) {
 		kvm_make_request(KVM_REQ_UNHALT, vcpu);
-		goto out;
+		return -EINTR;
 	}
-	if (kvm_cpu_has_pending_timer(vcpu))
-		goto out;
-	if (signal_pending(current))
-		goto out;

-	ret = 0;
-out:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	return ret;
+	if (kvm_cpu_has_pending_timer(vcpu) || signal_pending(current))
+		return -EINTR;
+
+	return 0;
 }

 static inline void