kvm: Fix perf timer mode IP reporting

Message ID 20170726002032.18233-1-andi@firstfloor.org (mailing list archive)
State New, archived

Commit Message

Andi Kleen July 26, 2017, 12:20 a.m. UTC
From: Andi Kleen <ak@linux.intel.com>

KVM and perf have a special backdoor mechanism to report the IP for interrupts
re-executed after vm exit. This works for the NMIs that perf normally uses.

However, when perf is in timer mode it doesn't work because the timer interrupt
doesn't get this special treatment. This is common when KVM is running
nested in another hypervisor which may not implement the PMU, so only
timer mode is available.

Call the functions that set up the backdoor IP for non-NMI interrupts as well.

I renamed the functions that set up the backdoor IP reporting to be more
appropriate for their new use.  The SVM change is only compile tested.

v2: Moved the functions inline.
For the normal interrupt case the before/after functions are now
called from x86.c, not arch-specific code.
For the NMI case we still need to call them in the architecture-specific
code, because they are already needed in the low-level *_run
functions.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kvm/svm.c |  6 ++++--
 arch/x86/kvm/vmx.c |  6 ++++--
 arch/x86/kvm/x86.c | 17 ++++-------------
 arch/x86/kvm/x86.h | 14 ++++++++++++--
 4 files changed, 24 insertions(+), 19 deletions(-)
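
For context, the "backdoor" is a pair of perf guest-info callbacks that
KVM registers: a per-CPU current_vcpu pointer is set while an interrupt
taken on a VM exit is being handled, and perf queries it to report the
guest RIP instead of the host IP of the re-executed interrupt handler.
A rough sketch of how this is wired up in arch/x86/kvm/x86.c (abridged,
not a verbatim listing; kvm_is_user_mode(), kvm_rip_read() and
perf_register_guest_info_callbacks() are assumed to be the existing
upstream helpers):

DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);

int kvm_is_in_guest(void)
{
        /* Non-NULL only between kvm_before_interrupt()/kvm_after_interrupt() */
        return __this_cpu_read(current_vcpu) != NULL;
}

static unsigned long kvm_get_guest_ip(void)
{
        unsigned long ip = 0;

        /* For samples taken in that window, report the guest RIP */
        if (__this_cpu_read(current_vcpu))
                ip = kvm_rip_read(__this_cpu_read(current_vcpu));

        return ip;
}

static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .is_in_guest            = kvm_is_in_guest,
        .is_user_mode           = kvm_is_user_mode,
        .get_guest_ip           = kvm_get_guest_ip,
};

/* registered once at init: perf_register_guest_info_callbacks(&kvm_guest_cbs); */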

Comments

Paolo Bonzini July 26, 2017, 9:26 a.m. UTC | #1
On 26/07/2017 02:20, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> KVM and perf have a special backdoor mechanism to report the IP for interrupts
> re-executed after vm exit. This works for the NMIs that perf normally uses.
> 
> However when perf is in timer mode it doesn't work because the timer interrupt
> doesn't get this special treatment. This is common when KVM is running
> nested in another hypervisor which may not implement the PMU, so only
> timer mode is available.
> 
> Call the functions to set up the backdoor IP also for non NMI interrupts.
> 
> I renamed the functions to set up the backdoor IP reporting to be more
> appropriate for their new use.  The SVM change is only compile tested.
> 
> v2: Moved the functions inline.
> For the normal interrupt case the before/after functions are now
> called from x86.c, not arch specific code.

You haven't removed the code from vmx_handle_external_intr and 
svm_handle_external_intr.

> For the NMI case we still need to call it in the architecture
> specific code, because it's already needed in the low level *_run
> functions.

I must have been unclear; what I was asking is, can the calls cover
a much wider range of vcpu_enter_guest?

Like this:

        /*
         * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
         * IPI are then delayed after guest entry, which ensures that they
         * result in virtual interrupt delivery.
         */
        local_irq_disable();
        __this_cpu_write(current_vcpu, vcpu);
        vcpu->mode = IN_GUEST_MODE;
        ...
        kvm_x86_ops->handle_external_intr(vcpu);
        __this_cpu_write(current_vcpu, NULL);

This would be much cleaner and not require exporting the functions at
all to the kvm_amd and kvm_intel modules.

Thanks,

Paolo

> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/kvm/svm.c |  6 ++++--
>  arch/x86/kvm/vmx.c |  6 ++++--
>  arch/x86/kvm/x86.c | 17 ++++-------------
>  arch/x86/kvm/x86.h | 14 ++++++++++++--
>  4 files changed, 24 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index ba9891ac5c56..41bf6a9de853 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -4872,14 +4872,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
>  	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
>  
>  	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
> -		kvm_before_handle_nmi(&svm->vcpu);
> +		kvm_before_interrupt(&svm->vcpu);
>  
>  	stgi();
>  
>  	/* Any pending NMI will happen here */
>  
>  	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
> -		kvm_after_handle_nmi(&svm->vcpu);
> +		kvm_after_interrupt(&svm->vcpu);
>  
>  	sync_cr8_to_lapic(vcpu);
>  
> @@ -5234,6 +5234,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
>  
>  static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
>  {
> +	kvm_before_interrupt(vcpu);
>  	local_irq_enable();
>  	/*
>  	 * We must have an instruction with interrupts enabled, so
> @@ -5241,6 +5242,7 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
>  	 */
>  	asm("nop");
>  	local_irq_disable();
> +	kvm_after_interrupt(vcpu);
>  }
>  
>  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index ca5d2b93385c..a02178914f6c 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -8606,9 +8606,9 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>  
>  	/* We need to handle NMIs before interrupts are enabled */
>  	if (is_nmi(exit_intr_info)) {
> -		kvm_before_handle_nmi(&vmx->vcpu);
> +		kvm_before_interrupt(&vmx->vcpu);
>  		asm("int $2");
> -		kvm_after_handle_nmi(&vmx->vcpu);
> +		kvm_after_interrupt(&vmx->vcpu);
>  	}
>  }
>  
> @@ -8627,6 +8627,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  		unsigned long tmp;
>  #endif
>  
> +		kvm_before_interrupt(vcpu);
>  		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
>  		desc = (gate_desc *)vmx->host_idt_base + vector;
>  		entry = gate_offset(*desc);
> @@ -8650,6 +8651,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		kvm_after_interrupt(vcpu);
>  	}
>  }
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0e846f0cb83b..23ed7315f294 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5942,7 +5942,8 @@ static void kvm_timer_init(void)
>  			  kvmclock_cpu_online, kvmclock_cpu_down_prep);
>  }
>  
> -static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
> +DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
> +EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
>  
>  int kvm_is_in_guest(void)
>  {
> @@ -5975,18 +5976,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
>  	.get_guest_ip		= kvm_get_guest_ip,
>  };
>  
> -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
> -{
> -	__this_cpu_write(current_vcpu, vcpu);
> -}
> -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
> -
> -void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
> -{
> -	__this_cpu_write(current_vcpu, NULL);
> -}
> -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
> -
>  static void kvm_set_mmio_spte_mask(void)
>  {
>  	u64 mask;
> @@ -6963,7 +6952,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	kvm_put_guest_xcr0(vcpu);
>  
> +	kvm_before_interrupt(vcpu);
>  	kvm_x86_ops->handle_external_intr(vcpu);
> +	kvm_after_interrupt(vcpu);
>  
>  	++vcpu->stat.exits;
>  
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 612067074905..19e1b332ced8 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -155,8 +155,6 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
>  	return !(kvm->arch.disabled_quirks & quirk);
>  }
>  
> -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
> -void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
>  void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
>  int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
>  
> @@ -248,4 +246,16 @@ static inline bool kvm_mwait_in_guest(void)
>  	return true;
>  }
>  
> +DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
> +
> +static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
> +{
> +	__this_cpu_write(current_vcpu, vcpu);
> +}
> +
> +static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> +{
> +	__this_cpu_write(current_vcpu, NULL);
> +}
> +
>  #endif
>
Andi Kleen July 26, 2017, 2:07 p.m. UTC | #2
On Wed, Jul 26, 2017 at 11:26:07AM +0200, Paolo Bonzini wrote:
> On 26/07/2017 02:20, Andi Kleen wrote:
> > From: Andi Kleen <ak@linux.intel.com>
> > 
> > KVM and perf have a special backdoor mechanism to report the IP for interrupts
> > re-executed after vm exit. This works for the NMIs that perf normally uses.
> > 
> > However when perf is in timer mode it doesn't work because the timer interrupt
> > doesn't get this special treatment. This is common when KVM is running
> > nested in another hypervisor which may not implement the PMU, so only
> > timer mode is available.
> > 
> > Call the functions to set up the backdoor IP also for non NMI interrupts.
> > 
> > I renamed the functions to set up the backdoor IP reporting to be more
> > appropriate for their new use.  The SVM change is only compile tested.
> > 
> > v2: Moved the functions inline.
> > For the normal interrupt case the before/after functions are now
> > called from x86.c, not arch specific code.
> 
> You haven't removed the code from vmx_handle_external_intr and 
> svm_handle_external_intr.

Ok.

> 
> > For the NMI case we still need to call it in the architecture
> > specific code, because it's already needed in the low level *_run
> > functions.
> 
> I must have been unclear; what I was asking is, can the calls cover
> a much wider range of vcpu_enter_guest?

Handling the external interrupt case from x86.c without exporting anything
is no problem (which is what my patch did).

But to handle the NMI case the helpers need to be exported, because the
NMI case is handled directly in the low-level *_vcpu_run functions.

And I'm not sure it's safe to keep current_vcpu set across the actual
low-level guest run; e.g. the AMD code already seems to change the
interrupt state there.

-andi
Paolo Bonzini July 26, 2017, 2:25 p.m. UTC | #3
On 26/07/2017 16:07, Andi Kleen wrote:
> On Wed, Jul 26, 2017 at 11:26:07AM +0200, Paolo Bonzini wrote:
>> On 26/07/2017 02:20, Andi Kleen wrote:
>>> From: Andi Kleen <ak@linux.intel.com>
>>>
>>> KVM and perf have a special backdoor mechanism to report the IP for interrupts
>>> re-executed after vm exit. This works for the NMIs that perf normally uses.
>>>
>>> However when perf is in timer mode it doesn't work because the timer interrupt
>>> doesn't get this special treatment. This is common when KVM is running
>>> nested in another hypervisor which may not implement the PMU, so only
>>> timer mode is available.
>>>
>>> Call the functions to set up the backdoor IP also for non NMI interrupts.
>>>
>>> I renamed the functions to set up the backdoor IP reporting to be more
> >>> appropriate for their new use.  The SVM change is only compile tested.
>>>
>>> v2: Moved the functions inline.
>>> For the normal interrupt case the before/after functions are now
>>> called from x86.c, not arch specific code.
>>
>> You haven't removed the code from vmx_handle_external_intr and 
>> svm_handle_external_intr.
> 
> Ok.
> 
>>
>>> For the NMI case we still need to call it in the architecture
>>> specific code, because it's already needed in the low level *_run
>>> functions.
>>
>> I must have been unclear; what I was asking is, can the calls cover
>> a much wider range of vcpu_enter_guest?
> 
> Handling the external interrupt case from x86 without exporting
> is no problem (like my patch did)
> 
> But to handle the NMI case it needs to be exported because
> the NMI case is directly handled in the lowlevel vcpu_runs*.
> 
> And I'm not sure it's safe to set over the actual low level
> guest runs. e.g. the AMD code already seems to change interrupts
> there.

AMD only enables interrupts while GIF is 0, so it would be safe.  (The
processor switches GIF atomically to 1 with VMRUN and to 0 with #VMEXIT,
so "clgi;sti" means "enable interrupts while the guest runs" and is
paired with "cli;stgi" after VMRUN returns).

The main difference would be for an NMI happening before the
VMRUN/VMRESUME instruction; it would be accounted to the guest rather
than the host.
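
In code terms, the window being described looks roughly like this (a
sketch following the svm_vcpu_run() shape in the hunk above, with
commentary added; the ellipsis stands for the VMRUN/#VMEXIT assembly
and the register save/restore, which are omitted):

        clgi();                 /* GIF = 0: host interrupts and NMIs held off */
        local_irq_enable();     /* "sti" half: IRQs stay pending while GIF = 0 */

        /* ... VMRUN runs the guest with GIF = 1; #VMEXIT clears GIF again ... */

        local_irq_disable();    /* "cli" half, still with GIF = 0 */

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_before_interrupt(&svm->vcpu);

        stgi();                 /* GIF = 1: a pending NMI (or IRQ) is taken here */

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_after_interrupt(&svm->vcpu);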

I guess it would be better done as a separate change anyway.  I'll
commit your v2 with fixed vmx.c and svm.c.

Paolo
Radim Krčmář March 28, 2018, 8:41 p.m. UTC | #4
2017-07-26 16:25+0200, Paolo Bonzini:
> On 26/07/2017 16:07, Andi Kleen wrote:
> > On Wed, Jul 26, 2017 at 11:26:07AM +0200, Paolo Bonzini wrote:
> >> On 26/07/2017 02:20, Andi Kleen wrote:
> >>> From: Andi Kleen <ak@linux.intel.com>
> >>>
> >>> KVM and perf have a special backdoor mechanism to report the IP for interrupts
> >>> re-executed after vm exit. This works for the NMIs that perf normally uses.
> >>>
> >>> However when perf is in timer mode it doesn't work because the timer interrupt
> >>> doesn't get this special treatment. This is common when KVM is running
> >>> nested in another hypervisor which may not implement the PMU, so only
> >>> timer mode is available.
> >>>
> >>> Call the functions to set up the backdoor IP also for non NMI interrupts.
> >>>
> >>> I renamed the functions to set up the backdoor IP reporting to be more
> >>> appropriate for their new use.  The SVM change is only compile tested.
> >>>
> >>> v2: Moved the functions inline.
> >>> For the normal interrupt case the before/after functions are now
> >>> called from x86.c, not arch specific code.
> >>
> >> You haven't removed the code from vmx_handle_external_intr and 
> >> svm_handle_external_intr.
> > 
> > Ok.
> > 
> >>
> >>> For the NMI case we still need to call it in the architecture
> >>> specific code, because it's already needed in the low level *_run
> >>> functions.
> >>
> >> I must have been unclear; what I was asking is, can the calls cover
> >> a much wider range of vcpu_enter_guest?
> > 
> > Handling the external interrupt case from x86 without exporting
> > is no problem (like my patch did)
> > 
> > But to handle the NMI case it needs to be exported because
> > the NMI case is directly handled in the lowlevel vcpu_runs*.
> > 
> > And I'm not sure it's safe to set over the actual low level
> > guest runs. e.g. the AMD code already seems to change interrupts
> > there.
> 
> AMD only enables interrupts while GIF is 0, so it would be safe.  (The
> processor switches GIF atomically to 1 with VMRUN and to 0 with #VMEXIT,
> so "clgi;sti" means "enable interrupts while the guest runs" and is
> paired with "cli;stgi" after VMRUN returns).
> 
> The main difference would be for an NMI happening before the
> VMRUN/VMRESUME instruction; it would be accounted to the guest rather
> than the host.
> 
> I guess it would be better done as a separate change anyway.  I'll
> commit your v2 with fixed vmx.c and svm.c.

I just removed the extra code from arch handle_external_intr and queued,
thanks.
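
In other words, the queued version presumably keeps the
kvm_before_interrupt()/kvm_after_interrupt() pair only around the
handle_external_intr() call in vcpu_enter_guest() and in the NMI paths,
with the arch callbacks back in their plain form.  A sketch of the SVM
side under that assumption (not the committed diff):

static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
{
        /*
         * No kvm_before_interrupt()/kvm_after_interrupt() here any more:
         * vcpu_enter_guest() already brackets this call.  Just open a
         * one-instruction window so a pending host interrupt can be taken.
         */
        local_irq_enable();
        asm("nop");
        local_irq_disable();
}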

Patch

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ba9891ac5c56..41bf6a9de853 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4872,14 +4872,14 @@  static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-		kvm_before_handle_nmi(&svm->vcpu);
+		kvm_before_interrupt(&svm->vcpu);
 
 	stgi();
 
 	/* Any pending NMI will happen here */
 
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-		kvm_after_handle_nmi(&svm->vcpu);
+		kvm_after_interrupt(&svm->vcpu);
 
 	sync_cr8_to_lapic(vcpu);
 
@@ -5234,6 +5234,7 @@  static int svm_check_intercept(struct kvm_vcpu *vcpu,
 
 static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
 {
+	kvm_before_interrupt(vcpu);
 	local_irq_enable();
 	/*
 	 * We must have an instruction with interrupts enabled, so
@@ -5241,6 +5242,7 @@  static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
 	 */
 	asm("nop");
 	local_irq_disable();
+	kvm_after_interrupt(vcpu);
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b93385c..a02178914f6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8606,9 +8606,9 @@  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 
 	/* We need to handle NMIs before interrupts are enabled */
 	if (is_nmi(exit_intr_info)) {
-		kvm_before_handle_nmi(&vmx->vcpu);
+		kvm_before_interrupt(&vmx->vcpu);
 		asm("int $2");
-		kvm_after_handle_nmi(&vmx->vcpu);
+		kvm_after_interrupt(&vmx->vcpu);
 	}
 }
 
@@ -8627,6 +8627,7 @@  static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 		unsigned long tmp;
 #endif
 
+		kvm_before_interrupt(vcpu);
 		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
 		desc = (gate_desc *)vmx->host_idt_base + vector;
 		entry = gate_offset(*desc);
@@ -8650,6 +8651,7 @@  static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		kvm_after_interrupt(vcpu);
 	}
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0cb83b..23ed7315f294 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5942,7 +5942,8 @@  static void kvm_timer_init(void)
 			  kvmclock_cpu_online, kvmclock_cpu_down_prep);
 }
 
-static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
 
 int kvm_is_in_guest(void)
 {
@@ -5975,18 +5976,6 @@  static struct perf_guest_info_callbacks kvm_guest_cbs = {
 	.get_guest_ip		= kvm_get_guest_ip,
 };
 
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
-{
-	__this_cpu_write(current_vcpu, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
-
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
-{
-	__this_cpu_write(current_vcpu, NULL);
-}
-EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-
 static void kvm_set_mmio_spte_mask(void)
 {
 	u64 mask;
@@ -6963,7 +6952,9 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	kvm_put_guest_xcr0(vcpu);
 
+	kvm_before_interrupt(vcpu);
 	kvm_x86_ops->handle_external_intr(vcpu);
+	kvm_after_interrupt(vcpu);
 
 	++vcpu->stat.exits;
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 612067074905..19e1b332ced8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -155,8 +155,6 @@  static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 	return !(kvm->arch.disabled_quirks & quirk);
 }
 
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
@@ -248,4 +246,16 @@  static inline bool kvm_mwait_in_guest(void)
 	return true;
 }
 
+DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+{
+	__this_cpu_write(current_vcpu, vcpu);
+}
+
+static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+{
+	__this_cpu_write(current_vcpu, NULL);
+}
+
 #endif