[1/2] KVM: x86: Synthesize at most one PMI per VM-exit

Message ID 20230901185646.2823254-1-jmattson@google.com (mailing list archive)
State New, archived
Series [1/2] KVM: x86: Synthesize at most one PMI per VM-exit

Commit Message

Jim Mattson Sept. 1, 2023, 6:56 p.m. UTC
When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
VM-exit that also invokes __kvm_perf_overflow() as a result of
instruction emulation, kvm_pmu_deliver_pmi() will be called twice
before the next VM-entry.

That shouldn't be a problem. The local APIC is supposed to
automatically set the mask flag in LVTPC when it handles a PMI, so the
second PMI should be inhibited. However, KVM's local APIC emulation
fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
are delivered via the local APIC. In the common case, where LVTPC is
configured to deliver an NMI, the first NMI is vectored through the
guest IDT, and the second one is held pending. When the NMI handler
returns, the second NMI is vectored through the IDT. For Linux guests,
this results in the "dazed and confused" spurious NMI message.

Though the obvious fix is to set the mask flag in LVTPC when handling
a PMI, KVM's logic around synthesizing a PMI is unnecessarily
convoluted.

Remove the irq_work callback for synthesizing a PMI, and all of the
logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().

Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions")
Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/pmu.c              | 27 +--------------------------
 arch/x86/kvm/x86.c              |  3 +++
 3 files changed, 4 insertions(+), 27 deletions(-)
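
The functional core of the change, lifted from the diff quoted in the replies below, is that the overflow path now always defers PMI delivery to a KVM_REQ_PMI request, and that a pending request is treated as a wake event so a halted vCPU still sees the PMI:

	/* arch/x86/kvm/pmu.c, __kvm_perf_overflow(): the single delivery path */
	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);

	/* arch/x86/kvm/x86.c, kvm_vcpu_has_events(): a pending PMI counts as a
	 * wake event, so a vCPU cannot block in HLT with a PMI pending.
	 */
	if (kvm_test_request(KVM_REQ_PMI, vcpu))
		return true;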

Comments

Mingwei Zhang Sept. 2, 2023, 7:05 p.m. UTC | #1
On Fri, Sep 01, 2023, Jim Mattson wrote:
> When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> VM-exit that also invokes __kvm_perf_overflow() as a result of
> instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> before the next VM-entry.
> 
> That shouldn't be a problem. The local APIC is supposed to
> automatically set the mask flag in LVTPC when it handles a PMI, so the
> second PMI should be inhibited. However, KVM's local APIC emulation
> fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> are delivered via the local APIC. In the common case, where LVTPC is
> configured to deliver an NMI, the first NMI is vectored through the
> guest IDT, and the second one is held pending. When the NMI handler
> returns, the second NMI is vectored through the IDT. For Linux guests,
> this results in the "dazed and confused" spurious NMI message.
> 
> Though the obvious fix is to set the mask flag in LVTPC when handling
> a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> convoluted.
> 
> Remove the irq_work callback for synthesizing a PMI, and all of the
> logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
> a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
> 
> Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions")
> Signed-off-by: Jim Mattson <jmattson@google.com>
Tested-by: Mingwei Zhang <mizhang@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 -
>  arch/x86/kvm/pmu.c              | 27 +--------------------------
>  arch/x86/kvm/x86.c              |  3 +++
>  3 files changed, 4 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3bc146dfd38d..f6b9e3ae08bf 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -528,7 +528,6 @@ struct kvm_pmu {
>  	u64 raw_event_mask;
>  	struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
>  	struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
> -	struct irq_work irq_work;
>  
>  	/*
>  	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index bf653df86112..0c117cd24077 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
>  #undef __KVM_X86_PMU_OP
>  }
>  
> -static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
> -{
> -	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
> -	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
> -
> -	kvm_pmu_deliver_pmi(vcpu);
> -}
> -
>  static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>  {
>  	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>  		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
>  	}
>  
> -	if (!pmc->intr || skip_pmi)
> -		return;
> -
> -	/*
> -	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
> -	 * can be ejected on a guest mode re-entry. Otherwise we can't
> -	 * be sure that vcpu wasn't executing hlt instruction at the
> -	 * time of vmexit and is not going to re-enter guest mode until
> -	 * woken up. So we should wake it, but this is impossible from
> -	 * NMI context. Do it from irq work instead.
> -	 */
> -	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
> -		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> -	else
> +	if (pmc->intr && !skip_pmi)
>  		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
>  }
>  
> @@ -677,9 +656,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
>  
>  void kvm_pmu_reset(struct kvm_vcpu *vcpu)
>  {
> -	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> -
> -	irq_work_sync(&pmu->irq_work);
>  	static_call(kvm_x86_pmu_reset)(vcpu);
>  }
>  
> @@ -689,7 +665,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
>  
>  	memset(pmu, 0, sizeof(*pmu));
>  	static_call(kvm_x86_pmu_init)(vcpu);
> -	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
>  	pmu->event_count = 0;
>  	pmu->need_cleanup = false;
>  	kvm_pmu_refresh(vcpu);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c381770bcbf1..0732c09fbd2d 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12841,6 +12841,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>  		return true;
>  #endif
>  
> +	if (kvm_test_request(KVM_REQ_PMI, vcpu))
> +		return true;
> +
>  	if (kvm_arch_interrupt_allowed(vcpu) &&
>  	    (kvm_cpu_has_interrupt(vcpu) ||
>  	    kvm_guest_apic_has_interrupt(vcpu)))
> -- 
> 2.42.0.283.g2d96d420d3-goog
>
Mi, Dapeng Sept. 6, 2023, 9:17 a.m. UTC | #2
On 9/2/2023 2:56 AM, Jim Mattson wrote:
> When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> VM-exit that also invokes __kvm_perf_overflow() as a result of
> instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> before the next VM-entry.


Do we have a way to reproduce this spurious NMI error consistently?


>
> That shouldn't be a problem. The local APIC is supposed to
> automatically set the mask flag in LVTPC when it handles a PMI, so the
> second PMI should be inhibited. However, KVM's local APIC emulation
> fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> are delivered via the local APIC. In the common case, where LVTPC is
> configured to deliver an NMI, the first NMI is vectored through the
> guest IDT, and the second one is held pending. When the NMI handler
> returns, the second NMI is vectored through the IDT. For Linux guests,
> this results in the "dazed and confused" spurious NMI message.
>
> Though the obvious fix is to set the mask flag in LVTPC when handling
> a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> convoluted.
>
> Remove the irq_work callback for synthesizing a PMI, and all of the
> logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
> a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
>
> Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions")
> Signed-off-by: Jim Mattson <jmattson@google.com>
> ---
>   arch/x86/include/asm/kvm_host.h |  1 -
>   arch/x86/kvm/pmu.c              | 27 +--------------------------
>   arch/x86/kvm/x86.c              |  3 +++
>   3 files changed, 4 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3bc146dfd38d..f6b9e3ae08bf 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -528,7 +528,6 @@ struct kvm_pmu {
>   	u64 raw_event_mask;
>   	struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
>   	struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
> -	struct irq_work irq_work;
>   
>   	/*
>   	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index bf653df86112..0c117cd24077 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
>   #undef __KVM_X86_PMU_OP
>   }
>   
> -static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
> -{
> -	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
> -	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
> -
> -	kvm_pmu_deliver_pmi(vcpu);
> -}
> -
>   static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>   {
>   	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>   		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
>   	}
>   
> -	if (!pmc->intr || skip_pmi)
> -		return;
> -
> -	/*
> -	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
> -	 * can be ejected on a guest mode re-entry. Otherwise we can't
> -	 * be sure that vcpu wasn't executing hlt instruction at the
> -	 * time of vmexit and is not going to re-enter guest mode until
> -	 * woken up. So we should wake it, but this is impossible from
> -	 * NMI context. Do it from irq work instead.
> -	 */
> -	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
> -		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> -	else
> +	if (pmc->intr && !skip_pmi)
>   		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
>   }
>   
> @@ -677,9 +656,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
>   
>   void kvm_pmu_reset(struct kvm_vcpu *vcpu)
>   {
> -	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> -
> -	irq_work_sync(&pmu->irq_work);
>   	static_call(kvm_x86_pmu_reset)(vcpu);
>   }
>   
> @@ -689,7 +665,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
>   
>   	memset(pmu, 0, sizeof(*pmu));
>   	static_call(kvm_x86_pmu_init)(vcpu);
> -	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
>   	pmu->event_count = 0;
>   	pmu->need_cleanup = false;
>   	kvm_pmu_refresh(vcpu);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c381770bcbf1..0732c09fbd2d 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12841,6 +12841,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>   		return true;
>   #endif
>   
> +	if (kvm_test_request(KVM_REQ_PMI, vcpu))
> +		return true;
> +
>   	if (kvm_arch_interrupt_allowed(vcpu) &&
>   	    (kvm_cpu_has_interrupt(vcpu) ||
>   	    kvm_guest_apic_has_interrupt(vcpu)))
Mingwei Zhang Sept. 6, 2023, 8:54 p.m. UTC | #3
On Wed, Sep 06, 2023, Mi, Dapeng wrote:
> On 9/2/2023 2:56 AM, Jim Mattson wrote:
> > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > before the next VM-entry.
> 
> 
> Do we have a way to reproduce this spurious NMI error consistently?
> 
I think I shared with you in another thread. I also shared the event
list and command here:

https://lore.kernel.org/all/ZKCD30QrE5g9XGIh@google.com/

To observe the spurious NMIs, you can just continuously look at the
NMIs/PMIs in /proc/interrupts and see if you have a huge number within 2
minutes when running the above command in an 8-vcpu QEMU VM on an Intel
SPR machine. Huge here means more than 10K.

In addition, you may observe the following warning from kernel dmesg:

[3939579.462832] Uhhuh. NMI received for unknown reason 30 on CPU 43.
[3939579.462836] Dazed and confused, but trying to continue

> 
> > 
> > That shouldn't be a problem. The local APIC is supposed to
> > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > second PMI should be inhibited. However, KVM's local APIC emulation
> > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > are delivered via the local APIC. In the common case, where LVTPC is
> > configured to deliver an NMI, the first NMI is vectored through the
> > guest IDT, and the second one is held pending. When the NMI handler
> > returns, the second NMI is vectored through the IDT. For Linux guests,
> > this results in the "dazed and confused" spurious NMI message.
> > 
> > Though the obvious fix is to set the mask flag in LVTPC when handling
> > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > convoluted.
> > 
> > Remove the irq_work callback for synthesizing a PMI, and all of the
> > logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
> > a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
> > 
> > Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions")
> > Signed-off-by: Jim Mattson <jmattson@google.com>
> > ---
> >   arch/x86/include/asm/kvm_host.h |  1 -
> >   arch/x86/kvm/pmu.c              | 27 +--------------------------
> >   arch/x86/kvm/x86.c              |  3 +++
> >   3 files changed, 4 insertions(+), 27 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 3bc146dfd38d..f6b9e3ae08bf 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -528,7 +528,6 @@ struct kvm_pmu {
> >   	u64 raw_event_mask;
> >   	struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
> >   	struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
> > -	struct irq_work irq_work;
> >   	/*
> >   	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
> > diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> > index bf653df86112..0c117cd24077 100644
> > --- a/arch/x86/kvm/pmu.c
> > +++ b/arch/x86/kvm/pmu.c
> > @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
> >   #undef __KVM_X86_PMU_OP
> >   }
> > -static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
> > -{
> > -	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
> > -	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
> > -
> > -	kvm_pmu_deliver_pmi(vcpu);
> > -}
> > -
> >   static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
> >   {
> >   	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> > @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
> >   		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> >   	}
> > -	if (!pmc->intr || skip_pmi)
> > -		return;
> > -
> > -	/*
> > -	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
> > -	 * can be ejected on a guest mode re-entry. Otherwise we can't
> > -	 * be sure that vcpu wasn't executing hlt instruction at the
> > -	 * time of vmexit and is not going to re-enter guest mode until
> > -	 * woken up. So we should wake it, but this is impossible from
> > -	 * NMI context. Do it from irq work instead.
> > -	 */
> > -	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
> > -		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> > -	else
> > +	if (pmc->intr && !skip_pmi)
> >   		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
> >   }
> > @@ -677,9 +656,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
> >   void kvm_pmu_reset(struct kvm_vcpu *vcpu)
> >   {
> > -	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> > -
> > -	irq_work_sync(&pmu->irq_work);
> >   	static_call(kvm_x86_pmu_reset)(vcpu);
> >   }
> > @@ -689,7 +665,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
> >   	memset(pmu, 0, sizeof(*pmu));
> >   	static_call(kvm_x86_pmu_init)(vcpu);
> > -	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
> >   	pmu->event_count = 0;
> >   	pmu->need_cleanup = false;
> >   	kvm_pmu_refresh(vcpu);
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index c381770bcbf1..0732c09fbd2d 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12841,6 +12841,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
> >   		return true;
> >   #endif
> > +	if (kvm_test_request(KVM_REQ_PMI, vcpu))
> > +		return true;
> > +
> >   	if (kvm_arch_interrupt_allowed(vcpu) &&
> >   	    (kvm_cpu_has_interrupt(vcpu) ||
> >   	    kvm_guest_apic_has_interrupt(vcpu)))
Mi, Dapeng Sept. 7, 2023, 6:29 a.m. UTC | #4
On 9/7/2023 4:54 AM, Mingwei Zhang wrote:
> On Wed, Sep 06, 2023, Mi, Dapeng wrote:
>> On 9/2/2023 2:56 AM, Jim Mattson wrote:
>>> When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
>>> VM-exit that also invokes __kvm_perf_overflow() as a result of
>>> instruction emulation, kvm_pmu_deliver_pmi() will be called twice
>>> before the next VM-entry.
>>
>> Do we have a way to reproduce this spurious NMI error consistently?
>>
> I think I shared with you in another thread. I also shared the event
> list and command here:
>
> https://lore.kernel.org/all/ZKCD30QrE5g9XGIh@google.com/
>
> To observe the spurious NMIs, you can just continuously look at the
> NMIs/PMIs in /proc/interrupts and see if you have a huge number within 2
> minutes when running the above command in an 8-vcpu QEMU VM on an Intel
> SPR machine. Huge here means more than 10K.
>
> In addition, you may observe the following warning from kernel dmesg:
>
> [3939579.462832] Uhhuh. NMI received for unknown reason 30 on CPU 43.
> [3939579.462836] Dazed and confused, but trying to continue

Thanks. I ran the perf command you shared in a VM for 10 minutes on SPR.
I do see the unknown NMI warning messages, but I didn't see a huge burst
of NMIs; the NMI count increased steadily rather than in a burst.

After applying this patchset, the unknown NMI warning is indeed gone.

Tested-by: Dapeng Mi <dapeng1.mi@linux.intel.com>

>
>>> That shouldn't be a problem. The local APIC is supposed to
>>> automatically set the mask flag in LVTPC when it handles a PMI, so the
>>> second PMI should be inhibited. However, KVM's local APIC emulation
>>> fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
>>> are delivered via the local APIC. In the common case, where LVTPC is
>>> configured to deliver an NMI, the first NMI is vectored through the
>>> guest IDT, and the second one is held pending. When the NMI handler
>>> returns, the second NMI is vectored through the IDT. For Linux guests,
>>> this results in the "dazed and confused" spurious NMI message.
>>>
>>> Though the obvious fix is to set the mask flag in LVTPC when handling
>>> a PMI, KVM's logic around synthesizing a PMI is unnecessarily
>>> convoluted.
>>>
>>> Remove the irq_work callback for synthesizing a PMI, and all of the
>>> logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
>>> a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
>>>
>>> Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions")
>>> Signed-off-by: Jim Mattson <jmattson@google.com>
>>> ---
>>>    arch/x86/include/asm/kvm_host.h |  1 -
>>>    arch/x86/kvm/pmu.c              | 27 +--------------------------
>>>    arch/x86/kvm/x86.c              |  3 +++
>>>    3 files changed, 4 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>> index 3bc146dfd38d..f6b9e3ae08bf 100644
>>> --- a/arch/x86/include/asm/kvm_host.h
>>> +++ b/arch/x86/include/asm/kvm_host.h
>>> @@ -528,7 +528,6 @@ struct kvm_pmu {
>>>    	u64 raw_event_mask;
>>>    	struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
>>>    	struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
>>> -	struct irq_work irq_work;
>>>    	/*
>>>    	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
>>> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
>>> index bf653df86112..0c117cd24077 100644
>>> --- a/arch/x86/kvm/pmu.c
>>> +++ b/arch/x86/kvm/pmu.c
>>> @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
>>>    #undef __KVM_X86_PMU_OP
>>>    }
>>> -static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
>>> -{
>>> -	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
>>> -	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
>>> -
>>> -	kvm_pmu_deliver_pmi(vcpu);
>>> -}
>>> -
>>>    static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>>>    {
>>>    	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
>>> @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>>>    		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
>>>    	}
>>> -	if (!pmc->intr || skip_pmi)
>>> -		return;
>>> -
>>> -	/*
>>> -	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
>>> -	 * can be ejected on a guest mode re-entry. Otherwise we can't
>>> -	 * be sure that vcpu wasn't executing hlt instruction at the
>>> -	 * time of vmexit and is not going to re-enter guest mode until
>>> -	 * woken up. So we should wake it, but this is impossible from
>>> -	 * NMI context. Do it from irq work instead.
>>> -	 */
>>> -	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
>>> -		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
>>> -	else
>>> +	if (pmc->intr && !skip_pmi)
>>>    		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
>>>    }
>>> @@ -677,9 +656,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
>>>    void kvm_pmu_reset(struct kvm_vcpu *vcpu)
>>>    {
>>> -	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>> -
>>> -	irq_work_sync(&pmu->irq_work);
>>>    	static_call(kvm_x86_pmu_reset)(vcpu);
>>>    }
>>> @@ -689,7 +665,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
>>>    	memset(pmu, 0, sizeof(*pmu));
>>>    	static_call(kvm_x86_pmu_init)(vcpu);
>>> -	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
>>>    	pmu->event_count = 0;
>>>    	pmu->need_cleanup = false;
>>>    	kvm_pmu_refresh(vcpu);
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index c381770bcbf1..0732c09fbd2d 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -12841,6 +12841,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>>>    		return true;
>>>    #endif
>>> +	if (kvm_test_request(KVM_REQ_PMI, vcpu))
>>> +		return true;
>>> +
>>>    	if (kvm_arch_interrupt_allowed(vcpu) &&
>>>    	    (kvm_cpu_has_interrupt(vcpu) ||
>>>    	    kvm_guest_apic_has_interrupt(vcpu)))
Like Xu Sept. 14, 2023, 11:57 a.m. UTC | #5
On 2/9/2023 2:56 am, Jim Mattson wrote:
> When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> VM-exit that also invokes __kvm_perf_overflow() as a result of
> instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> before the next VM-entry.
> 
> That shouldn't be a problem. The local APIC is supposed to

As you said, that shouldn't be a problem.

> automatically set the mask flag in LVTPC when it handles a PMI, so the
> second PMI should be inhibited. However, KVM's local APIC emulation
> fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> are delivered via the local APIC. In the common case, where LVTPC is
> configured to deliver an NMI, the first NMI is vectored through the
> guest IDT, and the second one is held pending. When the NMI handler
> returns, the second NMI is vectored through the IDT. For Linux guests,
> this results in the "dazed and confused" spurious NMI message.
> 
> Though the obvious fix is to set the mask flag in LVTPC when handling
> a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> convoluted.

Are there any obstacles to fixing it in this direction?

> 
> Remove the irq_work callback for synthesizing a PMI, and all of the
> logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
> a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
> 
> Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions")
> Signed-off-by: Jim Mattson <jmattson@google.com>
> ---
>   arch/x86/include/asm/kvm_host.h |  1 -
>   arch/x86/kvm/pmu.c              | 27 +--------------------------
>   arch/x86/kvm/x86.c              |  3 +++
>   3 files changed, 4 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3bc146dfd38d..f6b9e3ae08bf 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -528,7 +528,6 @@ struct kvm_pmu {
>   	u64 raw_event_mask;
>   	struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
>   	struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
> -	struct irq_work irq_work;
>   
>   	/*
>   	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index bf653df86112..0c117cd24077 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
>   #undef __KVM_X86_PMU_OP
>   }
>   
> -static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
> -{
> -	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
> -	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
> -
> -	kvm_pmu_deliver_pmi(vcpu);
> -}
> -
>   static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>   {
>   	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>   		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
>   	}
>   
> -	if (!pmc->intr || skip_pmi)
> -		return;
> -
> -	/*
> -	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
> -	 * can be ejected on a guest mode re-entry. Otherwise we can't
> -	 * be sure that vcpu wasn't executing hlt instruction at the
> -	 * time of vmexit and is not going to re-enter guest mode until
> -	 * woken up. So we should wake it, but this is impossible from
> -	 * NMI context. Do it from irq work instead.
> -	 */
> -	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
> -		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> -	else
> +	if (pmc->intr && !skip_pmi)
>   		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
>   }
>   
> @@ -677,9 +656,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
>   
>   void kvm_pmu_reset(struct kvm_vcpu *vcpu)
>   {
> -	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> -
> -	irq_work_sync(&pmu->irq_work);
>   	static_call(kvm_x86_pmu_reset)(vcpu);
>   }
>   
> @@ -689,7 +665,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
>   
>   	memset(pmu, 0, sizeof(*pmu));
>   	static_call(kvm_x86_pmu_init)(vcpu);
> -	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
>   	pmu->event_count = 0;
>   	pmu->need_cleanup = false;
>   	kvm_pmu_refresh(vcpu);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c381770bcbf1..0732c09fbd2d 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12841,6 +12841,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>   		return true;
>   #endif
>   
> +	if (kvm_test_request(KVM_REQ_PMI, vcpu))
> +		return true;
> +
>   	if (kvm_arch_interrupt_allowed(vcpu) &&
>   	    (kvm_cpu_has_interrupt(vcpu) ||
>   	    kvm_guest_apic_has_interrupt(vcpu)))
Sean Christopherson Sept. 14, 2023, 2:27 p.m. UTC | #6
On Thu, Sep 14, 2023, Like Xu wrote:
> On 2/9/2023 2:56 am, Jim Mattson wrote:
> > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > before the next VM-entry.
> > 
> > That shouldn't be a problem. The local APIC is supposed to
> 
> As you said, that shouldn't be a problem.

It's still a bug though, overflow should only happen once.

> > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > second PMI should be inhibited. However, KVM's local APIC emulation
> > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > are delivered via the local APIC. In the common case, where LVTPC is
> > configured to deliver an NMI, the first NMI is vectored through the
> > guest IDT, and the second one is held pending. When the NMI handler
> > returns, the second NMI is vectored through the IDT. For Linux guests,
> > this results in the "dazed and confused" spurious NMI message.
> > 
> > Though the obvious fix is to set the mask flag in LVTPC when handling
> > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > convoluted.
> 
> Are there any obstacles to fixing it in this direction?

No, patch 2/2 in this series fixes the LVTPC masking bug.  I haven't dug through all
of this yet, but my gut reaction is that I'm very strongly in favor of not using
irq_work just to ensure KVM kicks a vCPU out of HLT.  That's just ridiculous.
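
For context, the 2/2 fix referenced here presumably boils down to masking LVTPC at delivery time in KVM's local APIC emulation, mirroring what real hardware does. A rough sketch against arch/x86/kvm/lapic.c (not the actual patch):

	/* Sketch: when a PMI is delivered through the local APIC, set the mask
	 * bit in LVTPC so a second PMI is inhibited until the guest's handler
	 * clears the mask.
	 */
	u32 reg = kvm_lapic_get_reg(apic, APIC_LVTPC);

	if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
		__apic_accept_irq(apic, reg & APIC_MODE_MASK,
				  reg & APIC_VECTOR_MASK, 1, 0, NULL);
		kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
	}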
Sean Christopherson Sept. 22, 2023, 6:46 p.m. UTC | #7
On Fri, Sep 01, 2023, Jim Mattson wrote:
> When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> VM-exit that also invokes __kvm_perf_overflow() as a result of
> instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> before the next VM-entry.
> 
> That shouldn't be a problem. The local APIC is supposed to
> automatically set the mask flag in LVTPC when it handles a PMI, so the
> second PMI should be inhibited. However, KVM's local APIC emulation
> fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> are delivered via the local APIC. In the common case, where LVTPC is
> configured to deliver an NMI, the first NMI is vectored through the
> guest IDT, and the second one is held pending. When the NMI handler
> returns, the second NMI is vectored through the IDT. For Linux guests,
> this results in the "dazed and confused" spurious NMI message.
> 
> Though the obvious fix is to set the mask flag in LVTPC when handling
> a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> convoluted.

To address Like's question about whether or not this is necessary, I think we should
rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
masking thing.

And I think it makes sense to swap the order of the two patches.  The LVTPC masking
fix is a clearcut architectural violation.  This is a bit more of a grey area,
though still blatantly buggy.

So, put this patch second, and replace the above paragraphs with something like?

  Calling kvm_pmu_deliver_pmi() twice is unlikely to be problematic now that
  KVM sets the LVTPC mask bit when delivering a PMI.  But using IRQ work to
  trigger the PMI is still broken, albeit very theoretically.

  E.g. if the self-IPI to trigger IRQ work is delayed long enough for the
  vCPU to be migrated to a different pCPU, then it's possible for
  kvm_pmi_trigger_fn() to race with the kvm_pmu_deliver_pmi() from
  KVM_REQ_PMI and still generate two PMIs.

  KVM could set the mask bit using an atomic operation, but that'd just be
  piling on unnecessary code to workaround what is effectively a hack.  The
  *only* reason KVM uses IRQ work is to ensure the PMI is treated as a wake
  event, e.g. if the vCPU just executed HLT.

> Remove the irq_work callback for synthesizing a PMI, and all of the
> logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
> a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
Jim Mattson Sept. 22, 2023, 7:04 p.m. UTC | #8
On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Sep 01, 2023, Jim Mattson wrote:
> > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > before the next VM-entry.
> >
> > That shouldn't be a problem. The local APIC is supposed to
> > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > second PMI should be inhibited. However, KVM's local APIC emulation
> > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > are delivered via the local APIC. In the common case, where LVTPC is
> > configured to deliver an NMI, the first NMI is vectored through the
> > guest IDT, and the second one is held pending. When the NMI handler
> > returns, the second NMI is vectored through the IDT. For Linux guests,
> > this results in the "dazed and confused" spurious NMI message.
> >
> > Though the obvious fix is to set the mask flag in LVTPC when handling
> > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > convoluted.
>
> To address Like's question about whether or not this is necessary, I think we should
> rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> masking thing.
>
> And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> fix is a clearcut architectural violation.  This is a bit more of a grey area,
> though still blatantly buggy.

The reason I ordered the patches as I did is that when this patch
comes first, it actually fixes the problem that was introduced in
commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
instructions"). If this patch comes second, it's less clear that it
fixes a bug, since the other patch renders this one essentially moot.

> So, put this patch second, and replace the above paragraphs with something like?
>
>   Calling kvm_pmu_deliver_pmi() twice is unlikely to be problematic now that
>   KVM sets the LVTPC mask bit when delivering a PMI.  But using IRQ work to
>   trigger the PMI is still broken, albeit very theoretically.
>
>   E.g. if the self-IPI to trigger IRQ work is delayed long enough for the
>   vCPU to be migrated to a different pCPU, then it's possible for
>   kvm_pmi_trigger_fn() to race with the kvm_pmu_deliver_pmi() from
>   KVM_REQ_PMI and still generate two PMIs.
>
>   KVM could set the mask bit using an atomic operation, but that'd just be
>   piling on unnecessary code to workaround what is effectively a hack.  The
>   *only* reason KVM uses IRQ work is to ensure the PMI is treated as a wake
>   event, e.g. if the vCPU just executed HLT.
>
> > Remove the irq_work callback for synthesizing a PMI, and all of the
> > logic for invoking it. Instead, to prevent a vcpu from leaving C0 with
> > a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events().
Sean Christopherson Sept. 22, 2023, 7:21 p.m. UTC | #9
On Fri, Sep 22, 2023, Jim Mattson wrote:
> On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, Sep 01, 2023, Jim Mattson wrote:
> > > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > > before the next VM-entry.
> > >
> > > That shouldn't be a problem. The local APIC is supposed to
> > > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > > second PMI should be inhibited. However, KVM's local APIC emulation
> > > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > > are delivered via the local APIC. In the common case, where LVTPC is
> > > configured to deliver an NMI, the first NMI is vectored through the
> > > guest IDT, and the second one is held pending. When the NMI handler
> > > returns, the second NMI is vectored through the IDT. For Linux guests,
> > > this results in the "dazed and confused" spurious NMI message.
> > >
> > > Though the obvious fix is to set the mask flag in LVTPC when handling
> > > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > > convoluted.
> >
> > To address Like's question about whether or not this is necessary, I think we should
> > rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> > masking thing.
> >
> > And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> > fix is a clearcut architectural violation.  This is a bit more of a grey area,
> > though still blatantly buggy.
> 
> The reason I ordered the patches as I did is that when this patch
> comes first, it actually fixes the problem that was introduced in
> commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
> instructions"). If this patch comes second, it's less clear that it
> fixes a bug, since the other patch renders this one essentially moot.

Yeah, but as Like pointed out, the way the changelog is worded just raises the
question of why this change is necessary.

I think we should tag them both for stable.  They're both bug fixes, regardless
of the ordering.
Mingwei Zhang Sept. 22, 2023, 8:25 p.m. UTC | #10
On Fri, Sep 22, 2023 at 12:21 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Sep 22, 2023, Jim Mattson wrote:
> > On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, Sep 01, 2023, Jim Mattson wrote:
> > > > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > > > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > > > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > > > before the next VM-entry.
> > > >
> > > > That shouldn't be a problem. The local APIC is supposed to
> > > > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > > > second PMI should be inhibited. However, KVM's local APIC emulation
> > > > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > > > are delivered via the local APIC. In the common case, where LVTPC is
> > > > configured to deliver an NMI, the first NMI is vectored through the
> > > > guest IDT, and the second one is held pending. When the NMI handler
> > > > returns, the second NMI is vectored through the IDT. For Linux guests,
> > > > this results in the "dazed and confused" spurious NMI message.
> > > >
> > > > Though the obvious fix is to set the mask flag in LVTPC when handling
> > > > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > > > convoluted.
> > >
> > > To address Like's question about whether or not this is necessary, I think we should
> > > rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> > > masking thing.
> > >
> > > And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> > > fix is a clearcut architectural violation.  This is a bit more of a grey area,
> > > though still blatantly buggy.
> >
> > The reason I ordered the patches as I did is that when this patch
> > comes first, it actually fixes the problem that was introduced in
> > commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
> > instructions"). If this patch comes second, it's less clear that it
> > fixes a bug, since the other patch renders this one essentially moot.
>
> Yeah, but as Like pointed out, the way the changelog is worded just raises the
> question of why this change is necessary.
>
> I think we should tag them both for stable.  They're both bug fixes, regardless
> of the ordering.

Agree. Both patches are fixing the general potential buggy situation
of multiple PMI injection on one vm entry: one software level defense
(forcing the usage of KVM_REQ_PMI) and one hardware level defense
(preventing PMI injection using mask).

Although neither patch in this series is fixing the root cause of this
specific double PMI injection bug, I don't see a reason why we cannot
add a "fixes" tag to them, since we may fix it and create it again.

I am currently working on it and testing my patch. Please give me some
time, I think I could try sending out one version today. Once that is
done, I will combine mine with the existing patch and send it out as a
series.

Thanks.
-Mingwei
Sean Christopherson Sept. 22, 2023, 8:34 p.m. UTC | #11
On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> On Fri, Sep 22, 2023 at 12:21 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, Sep 22, 2023, Jim Mattson wrote:
> > > On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
> > > >
> > > > On Fri, Sep 01, 2023, Jim Mattson wrote:
> > > > > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > > > > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > > > > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > > > > before the next VM-entry.
> > > > >
> > > > > That shouldn't be a problem. The local APIC is supposed to
> > > > > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > > > > second PMI should be inhibited. However, KVM's local APIC emulation
> > > > > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > > > > are delivered via the local APIC. In the common case, where LVTPC is
> > > > > configured to deliver an NMI, the first NMI is vectored through the
> > > > > guest IDT, and the second one is held pending. When the NMI handler
> > > > > returns, the second NMI is vectored through the IDT. For Linux guests,
> > > > > this results in the "dazed and confused" spurious NMI message.
> > > > >
> > > > > Though the obvious fix is to set the mask flag in LVTPC when handling
> > > > > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > > > > convoluted.
> > > >
> > > > To address Like's question about whether or not this is necessary, I think we should
> > > > rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> > > > masking thing.
> > > >
> > > > And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> > > > fix is a clearcut architectural violation.  This is a bit more of a grey area,
> > > > though still blatantly buggy.
> > >
> > > The reason I ordered the patches as I did is that when this patch
> > > comes first, it actually fixes the problem that was introduced in
> > > commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
> > > instructions"). If this patch comes second, it's less clear that it
> > > fixes a bug, since the other patch renders this one essentially moot.
> >
> > Yeah, but as Like pointed out, the way the changelog is worded just raises the
> > question of why this change is necessary.
> >
> > I think we should tag them both for stable.  They're both bug fixes, regardless
> > of the ordering.
> 
> Agree. Both patches are fixing the general potential buggy situation
> of multiple PMI injection on one vm entry: one software level defense
> (forcing the usage of KVM_REQ_PMI) and one hardware level defense
> (preventing PMI injection using mask).
> 
> Although neither patch in this series is fixing the root cause of this
> specific double PMI injection bug, I don't see a reason why we cannot
> add a "fixes" tag to them, since we may fix it and create it again.
> 
> I am currently working on it and testing my patch. Please give me some
> time, I think I could try sending out one version today. Once that is
> done, I will combine mine with the existing patch and send it out as a
> series.

Me confused, what patch?  And what does this patch have to do with Jim's series?
Unless I've missed something, Jim's patches are good to go with my nits addressed.
Mingwei Zhang Sept. 22, 2023, 8:49 p.m. UTC | #12
On Fri, Sep 22, 2023 at 1:34 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > On Fri, Sep 22, 2023 at 12:21 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, Sep 22, 2023, Jim Mattson wrote:
> > > > On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > >
> > > > > On Fri, Sep 01, 2023, Jim Mattson wrote:
> > > > > > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > > > > > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > > > > > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > > > > > before the next VM-entry.
> > > > > >
> > > > > > That shouldn't be a problem. The local APIC is supposed to
> > > > > > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > > > > > second PMI should be inhibited. However, KVM's local APIC emulation
> > > > > > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > > > > > are delivered via the local APIC. In the common case, where LVTPC is
> > > > > > configured to deliver an NMI, the first NMI is vectored through the
> > > > > > guest IDT, and the second one is held pending. When the NMI handler
> > > > > > returns, the second NMI is vectored through the IDT. For Linux guests,
> > > > > > this results in the "dazed and confused" spurious NMI message.
> > > > > >
> > > > > > Though the obvious fix is to set the mask flag in LVTPC when handling
> > > > > > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > > > > > convoluted.
> > > > >
> > > > > To address Like's question about whether or not this is necessary, I think we should
> > > > > rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> > > > > masking thing.
> > > > >
> > > > > And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> > > > > fix is a clearcut architectural violation.  This is a bit more of a grey area,
> > > > > though still blatantly buggy.
> > > >
> > > > The reason I ordered the patches as I did is that when this patch
> > > > comes first, it actually fixes the problem that was introduced in
> > > > commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
> > > > instructions"). If this patch comes second, it's less clear that it
> > > > fixes a bug, since the other patch renders this one essentially moot.
> > >
> > > Yeah, but as Like pointed out, the way the changelog is worded just raises the
> > > question of why this change is necessary.
> > >
> > > I think we should tag them both for stable.  They're both bug fixes, regardless
> > > of the ordering.
> >
> > Agree. Both patches are fixing the general potential buggy situation
> > of multiple PMI injection on one vm entry: one software level defense
> > (forcing the usage of KVM_REQ_PMI) and one hardware level defense
> > (preventing PMI injection using mask).
> >
> > Although neither patch in this series is fixing the root cause of this
> > specific double PMI injection bug, I don't see a reason why we cannot
> > add a "fixes" tag to them, since we may fix it and create it again.
> >
> > I am currently working on it and testing my patch. Please give me some
> > time, I think I could try sending out one version today. Once that is
> > done, I will combine mine with the existing patch and send it out as a
> > series.
>
> Me confused, what patch?  And what does this patch have to do with Jim's series?
> Unless I've missed something, Jim's patches are good to go with my nits addressed.

Let me step back.

We have the following problem when we run perf inside guest:

[ 1437.487320] Uhhuh. NMI received for unknown reason 20 on CPU 3.
[ 1437.487330] Dazed and confused, but trying to continue

This means there are more NMIs than the guest PMI handler can account for. So
there are potentially two approaches to solve the problem: 1) fix the
PMI injection issue: only one can be injected; 2) fix the code that
causes the (incorrect) multiple PMI injection.

I am working on the 2nd one. So, the property of the 2nd one is:
without patches in 1) (Jim's patches), we could still avoid the above
warning messages.

Thanks.
-Mingwei
Mingwei Zhang Sept. 22, 2023, 9:02 p.m. UTC | #13
On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> On Fri, Sep 22, 2023 at 1:34 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > > On Fri, Sep 22, 2023 at 12:21 PM Sean Christopherson <seanjc@google.com> wrote:
> > > >
> > > > On Fri, Sep 22, 2023, Jim Mattson wrote:
> > > > > On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > >
> > > > > > On Fri, Sep 01, 2023, Jim Mattson wrote:
> > > > > > > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > > > > > > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > > > > > > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > > > > > > before the next VM-entry.
> > > > > > >
> > > > > > > That shouldn't be a problem. The local APIC is supposed to
> > > > > > > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > > > > > > second PMI should be inhibited. However, KVM's local APIC emulation
> > > > > > > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > > > > > > are delivered via the local APIC. In the common case, where LVTPC is
> > > > > > > configured to deliver an NMI, the first NMI is vectored through the
> > > > > > > guest IDT, and the second one is held pending. When the NMI handler
> > > > > > > returns, the second NMI is vectored through the IDT. For Linux guests,
> > > > > > > this results in the "dazed and confused" spurious NMI message.
> > > > > > >
> > > > > > > Though the obvious fix is to set the mask flag in LVTPC when handling
> > > > > > > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > > > > > > convoluted.
> > > > > >
> > > > > > To address Like's question about whether or not this is necessary, I think we should
> > > > > > rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> > > > > > masking thing.
> > > > > >
> > > > > > And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> > > > > > fix is a clearcut architectural violation.  This is a bit more of a grey area,
> > > > > > though still blatantly buggy.
> > > > >
> > > > > The reason I ordered the patches as I did is that when this patch
> > > > > comes first, it actually fixes the problem that was introduced in
> > > > > commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
> > > > > instructions"). If this patch comes second, it's less clear that it
> > > > > fixes a bug, since the other patch renders this one essentially moot.
> > > >
> > > > Yeah, but as Like pointed out, the way the changelog is worded just raises the
> > > > question of why this change is necessary.
> > > >
> > > > I think we should tag them both for stable.  They're both bug fixes, regardless
> > > > of the ordering.
> > >
> > > Agree. Both patches are fixing the general potential buggy situation
> > > of multiple PMI injection on one vm entry: one software level defense
> > > (forcing the usage of KVM_REQ_PMI) and one hardware level defense
> > > (preventing PMI injection using mask).
> > >
> > > Although neither patch in this series is fixing the root cause of this
> > > specific double PMI injection bug, I don't see a reason why we cannot
> > > add a "fixes" tag to them, since we may fix it and create it again.
> > >
> > > I am currently working on it and testing my patch. Please give me some
> > > time, I think I could try sending out one version today. Once that is
> > > done, I will combine mine with the existing patch and send it out as a
> > > series.
> >
> > Me confused, what patch?  And what does this patch have to do with Jim's series?
> > Unless I've missed something, Jim's patches are good to go with my nits addressed.
> 
> Let me step back.
> 
> We have the following problem when we run perf inside guest:
> 
> [ 1437.487320] Uhhuh. NMI received for unknown reason 20 on CPU 3.
> [ 1437.487330] Dazed and confused, but trying to continue
> 
> This means there are more NMIs than the guest PMI handler can account for. So
> there are potentially two approaches to solve the problem: 1) fix the
> PMI injection issue: only one can be injected; 2) fix the code that
> causes the (incorrect) multiple PMI injection.
> 
> I am working on the 2nd one. So, the property of the 2nd one is:
> without patches in 1) (Jim's patches), we could still avoid the above
> warning messages.
> 
> Thanks.
> -Mingwei

This is my draft version. If you don't have full-width counter support, this
patch needs to be placed on top of this one:
https://lore.kernel.org/all/20230504120042.785651-1-rkagan@amazon.de/

My initial testing on both QEMU and our GCP testing environment shows no
"Uhhuh..." dmesg in guest.

Please take a look...

From 47e629269d8b0ff65c242334f068300216cb7f91 Mon Sep 17 00:00:00 2001
From: Mingwei Zhang <mizhang@google.com>
Date: Fri, 22 Sep 2023 17:13:55 +0000
Subject: [PATCH] KVM: x86/pmu: Fix emulated counter increment due to
 instruction emulation

Fix the emulated counter increment that KVM performs during instruction
emulation. While a counter is running, KVM's pmc->counter is only a
snapshot and does not represent the counter's actual value, so it is
inappropriate to compare it with other counter values. The existing code
compares pmc->prev_counter with pmc->counter directly, but
pmc->prev_counter is a snapshot assigned from pmc->counter while the
counter may still be running. This comparison logic in
reprogram_counter() therefore generates incorrect invocations of
__kvm_perf_overflow(in_pmi=false) and duplicate PMI injection requests.

Fix this issue by adding an emulated_counter field and only doing the
counter calculation after the counter is paused.

Change-Id: I2d59e68557fd35f7bbcfe09ea42ad81bd36776b7
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/pmu.c              | 15 ++++++++-------
 arch/x86/kvm/svm/pmu.c          |  1 +
 arch/x86/kvm/vmx/pmu_intel.c    |  2 ++
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1a4def36d5bb..47bbfbc0aa35 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -494,6 +494,7 @@ struct kvm_pmc {
 	bool intr;
 	u64 counter;
 	u64 prev_counter;
+	u64 emulated_counter;
 	u64 eventsel;
 	struct perf_event *perf_event;
 	struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index edb89b51b383..47acf3a2b077 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -240,12 +240,13 @@ static void pmc_pause_counter(struct kvm_pmc *pmc)
 {
 	u64 counter = pmc->counter;

-	if (!pmc->perf_event || pmc->is_paused)
-		return;
-
 	/* update counter, reset event value to avoid redundant accumulation */
-	counter += perf_event_pause(pmc->perf_event, true);
-	pmc->counter = counter & pmc_bitmask(pmc);
+	if (pmc->perf_event && !pmc->is_paused)
+		counter += perf_event_pause(pmc->perf_event, true);
+
+	pmc->prev_counter = counter & pmc_bitmask(pmc);
+	pmc->counter = (counter + pmc->emulated_counter) & pmc_bitmask(pmc);
+	pmc->emulated_counter = 0;
 	pmc->is_paused = true;
 }

@@ -452,6 +453,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
 reprogram_complete:
 	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
 	pmc->prev_counter = 0;
+	pmc->emulated_counter = 0;
 }

 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
@@ -725,8 +727,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)

 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 {
-	pmc->prev_counter = pmc->counter;
-	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
+	pmc->emulated_counter += 1;
 	kvm_pmu_request_counter_reprogram(pmc);
 }

diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index a25b91ff9aea..b88fab4ae1d7 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -243,6 +243,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)

 		pmc_stop_counter(pmc);
 		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
+		pmc->emulated_counter = 0;
 	}

 	pmu->global_ctrl = pmu->global_status = 0;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 626df5fdf542..d03c4ec7273d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -641,6 +641,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)

 		pmc_stop_counter(pmc);
 		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
+		pmc->emulated_counter = 0;
 	}

 	for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
@@ -648,6 +649,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)

 		pmc_stop_counter(pmc);
 		pmc->counter = pmc->prev_counter = 0;
+		pmc->emulated_counter = 0;
 	}

 	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
--
2.42.0.515.g380fc7ccd1-goog
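
For context on the draft above: the comparison it reworks is the emulated-overflow check in reprogram_counter(), which in the current code looks roughly like the snippet below. Because pmc->counter can be a stale snapshot while the perf event is running, the check can fire spuriously and queue an extra PMI:

	/* Approximate form of the existing check in reprogram_counter():
	 * emulate an overflow if the counter appears to have wrapped since the
	 * snapshot taken in kvm_pmu_incr_counter().
	 */
	if (pmc->counter < pmc->prev_counter)
		__kvm_perf_overflow(pmc, false);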
Sean Christopherson Sept. 22, 2023, 9:06 p.m. UTC | #14
On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> On Fri, Sep 22, 2023 at 1:34 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > > Agree. Both patches are fixing the general potential buggy situation
> > > of multiple PMI injection on one vm entry: one software level defense
> > > (forcing the usage of KVM_REQ_PMI) and one hardware level defense
> > > (preventing PMI injection using mask).
> > >
> > > Although neither patch in this series is fixing the root cause of this
> > > specific double PMI injection bug, I don't see a reason why we cannot
> > > add a "fixes" tag to them, since we may fix it and create it again.
> > >
> > > I am currently working on it and testing my patch. Please give me some
> > > time, I think I could try sending out one version today. Once that is
> > > done, I will combine mine with the existing patch and send it out as a
> > > series.
> >
> > Me confused, what patch?  And what does this patch have to do with Jim's series?
> > Unless I've missed something, Jim's patches are good to go with my nits addressed.
> 
> Let me step back.
> 
> We have the following problem when we run perf inside guest:
> 
> [ 1437.487320] Uhhuh. NMI received for unknown reason 20 on CPU 3.
> [ 1437.487330] Dazed and confused, but trying to continue
> 
> This means there are more NMIs that guest PMI could understand. So
> there are potentially two approaches to solve the problem: 1) fix the
> PMI injection issue: only one can be injected; 2) fix the code that
> causes the (incorrect) multiple PMI injection.

No, because the LVTPC masking fix isn't optional: the current KVM behavior is a
clear violation of the SDM.  And I'm struggling to think of a sane way to fix the
IRQ work bug, e.g. KVM would have to busy-wait on the work finishing before resuming
the guest, which is rather crazy.
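
Just to illustrate the point (completely untested, and the helper below is
hypothetical -- nothing like it exists in KVM today), "busy-wait on the work
finishing" would boil down to flushing the queued irq_work on the entry path:

static void kvm_flush_pmi_work(struct kvm_vcpu *vcpu)
{
	/*
	 * irq_work_sync() busy-waits until a queued kvm_pmi_trigger_fn()
	 * callback has completed, so the synthesized PMI can't race with
	 * re-entering the guest.
	 */
	irq_work_sync(&vcpu_to_pmu(vcpu)->irq_work);
}

Burning cycles like that on every VM-entry just to close the race is exactly
the kind of thing I want to avoid.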

I'm not saying there isn't more work to be done, nor am I saying that we shouldn't
further harden KVM against double-injection.  I'm just truly confused as to what
that has to do with Jim's fixes.

> I am working on the 2nd one. So, the property of the 2nd one is:
> without patches in 1) (Jim's patches), we could still avoid the above
> warning messages.
Mingwei Zhang Sept. 22, 2023, 10:42 p.m. UTC | #15
On Fri, Sep 22, 2023 at 2:06 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > On Fri, Sep 22, 2023 at 1:34 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > > > Agree. Both patches are fixing the general potential buggy situation
> > > > of multiple PMI injection on one vm entry: one software level defense
> > > > (forcing the usage of KVM_REQ_PMI) and one hardware level defense
> > > > (preventing PMI injection using mask).
> > > >
> > > > Although neither patch in this series is fixing the root cause of this
> > > > specific double PMI injection bug, I don't see a reason why we cannot
> > > > add a "fixes" tag to them, since we may fix it and create it again.
> > > >
> > > > I am currently working on it and testing my patch. Please give me some
> > > > time, I think I could try sending out one version today. Once that is
> > > > done, I will combine mine with the existing patch and send it out as a
> > > > series.
> > >
> > > Me confused, what patch?  And what does this patch have to do with Jim's series?
> > > Unless I've missed something, Jim's patches are good to go with my nits addressed.
> >
> > Let me step back.
> >
> > We have the following problem when we run perf inside guest:
> >
> > [ 1437.487320] Uhhuh. NMI received for unknown reason 20 on CPU 3.
> > [ 1437.487330] Dazed and confused, but trying to continue
> >
> > This means there are more NMIs that guest PMI could understand. So
> > there are potentially two approaches to solve the problem: 1) fix the
> > PMI injection issue: only one can be injected; 2) fix the code that
> > causes the (incorrect) multiple PMI injection.
>
> No, because the LVTPC masking fix isn't optional, the current KVM behavior is a
> clear violation of the SDM.  And I'm struggling to think of a sane way to fix the
> IRQ work bug, e.g. KVM would have to busy on the work finishing before resuming
> the guest, which is rather crazy.
>
> I'm not saying there isn't more work to be done, nor am I saying that we shouldn't
> further harden KVM against double-injection.  I'm just truly confused as to what
> that has to do with Jim's fixes.
>
Hmm, I will take the "two approaches" back. You are right on that;
"two directions" is what I meant.

Oh, I think I may not have elaborated on the full context for you. That
might have caused confusion; sorry about that.

The context of Jim's patches is to fix the multiple PMI injections
when using perf, starting from
https://lore.kernel.org/all/ZJ7y9DuedQyBb9eU@u40bc5e070a0153.ant.amazon.com/

So, regarding the fix, there are multiple layers and they may or may
not be logically connected closely, but we are solving the same
problem. In fact, I was asking Jim to help me with this specific issue
:)

So yes, they could be put together or posted separately.
But I don't see why they _cannot_ go together, or why doing so would cause
confusion. So, I would like to put them together in the same context with a
cover letter fully describing the details.

FYI for reviewers: to reproduce the spurious PMI issue in the guest
VM, you need to let KVM emulate some instructions at runtime,
so that kvm_pmu_incr_counter() is triggered more often. One
option is to add a kernel cmdline parameter such as "idle=nomwait" to your
guest kernel. For the workload in the guest VM, please run the perf
command specified in
https://lore.kernel.org/all/ZKCD30QrE5g9XGIh@google.com/

Thanks.
-Mingwei
Sean Christopherson Sept. 22, 2023, 10:44 p.m. UTC | #16
On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> My initial testing on both QEMU and our GCP testing environment shows no
> "Uhhuh..." dmesg in guest.
> 
> Please take a look...

And now I'm extra confused, I thought the plan was for me to post a proper series
for the emulated_counter idea[*], get the code base healthy/functional, and then
build on top, e.g. improve performance and whatnot.

The below is just a stripped down version of that, and doesn't look quite right.
Specifically, pmc_write_counter() needs to purge emulated_counter (my pseudo-patch
subtly handled that by pausing the counter).
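
E.g. something along these lines (untested; pmc_write_counter() isn't shown in
this thread, so treat the exact shape as an assumption):

static void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * A guest write establishes a new absolute count; any emulated
	 * events that haven't been folded into pmc->counter yet must be
	 * dropped, not added on top of the new value.
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
}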

I totally realize I'm moving sloooow, but I pinky swear I'm getting there.  My
compile-tested-only branch can be found at

  https://github.com/sean-jc/linux x86/pmu_refactor

There's a lot more in there, e.g. it has fixes from Roman and Jim, along with
some other found-by-inspection cleanups.

I dropped the "pause on WRMSR" proposal.  I still don't love the offset approach,
but I agree that pausing and reprogramming counters on writes could introduce an
entirely new set of problems.

I'm logging off for the weekend, but I'll pick this back up next week (it's at the
top of my priority list, assuming guest_memfd doesn't somehow catch fire).

[*] https://lore.kernel.org/all/ZJ9IaskpbIK9q4rt@google.com

> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index edb89b51b383..47acf3a2b077 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -240,12 +240,13 @@ static void pmc_pause_counter(struct kvm_pmc *pmc)
>  {
>  	u64 counter = pmc->counter;
> 
> -	if (!pmc->perf_event || pmc->is_paused)
> -		return;
> -
>  	/* update counter, reset event value to avoid redundant accumulation */
> -	counter += perf_event_pause(pmc->perf_event, true);
> -	pmc->counter = counter & pmc_bitmask(pmc);
> +	if (pmc->perf_event && !pmc->is_paused)
> +		counter += perf_event_pause(pmc->perf_event, true);
> +
> +	pmc->prev_counter = counter & pmc_bitmask(pmc);

Honest question, is it actually correct/necessary to mask the counter at the
intermediate steps?  Conceptually, the hardware count and the emulated count are
the same thing, just tracked separately.

> +	pmc->counter = (counter + pmc->emulated_counter) & pmc_bitmask(pmc);
> +	pmc->emulated_counter = 0;
>  	pmc->is_paused = true;
>  }
> 
> @@ -452,6 +453,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
>  reprogram_complete:
>  	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
>  	pmc->prev_counter = 0;

I don't see any reason to keep kvm_pmc.prev_counter.  reprogram_counter() is the
only caller of pmc_pause_counter(), and so is effectively the only writer and the
only reader.  I.e. prev_counter can just be a local variable in reprogram_counter(),
no?
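
E.g. something like this (completely untested, just to show the idea; the
overflow check could live here or stay in reprogram_counter() with a local):

static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/* Snapshot the count before folding in emulated events... */
	prev_counter = counter & pmc_bitmask(pmc);

	pmc->counter = (counter + pmc->emulated_counter) & pmc_bitmask(pmc);
	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	/* ...so that overflow due to emulated events is detected right here. */
	if (pmc->counter < prev_counter)
		__kvm_perf_overflow(pmc, false);
}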
Sean Christopherson Sept. 22, 2023, 11 p.m. UTC | #17
On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> So yes, they could be put together and they could be put separately.
> But I don't see why they _cannot_ be together or cause confusion.

Because they don't need to be put together.  Roman's patch kinda sorta overlaps
with the prev_counter mess, but Jim's fixes are entirely orthogonal.

If one person initially posted such a series with everything together I probably
wouldn't care *too* much, but combining patches and/or series that aren't tightly
coupled or dependent in some way usually does more harm than good.  E.g. if a
maintainer has complaints against only one or two patches in a series of unrelated
patches, then grabbing the "good" patches is unnecessarily difficult.  It's not
truly hard on the maintainer's end, but little bits of avoidable friction in the
process adds up across hundreds and thousands of patches.

FWIW, my plan is to apply Roman's patch pretty much as-is, grab v2 from Jim, and
post my cleanups as a separate series on top (maybe two series, really haven't
thought about it yet).  The only reason I have them all in a single branch is
because there are code conflicts and I know I will apply the patches from Roman
and Jim first, i.e. I didn't want to develop on a base that I knew would become
stale.

> So, I would like to put them together in the same context with a cover letter
> fully describing the details.

I certainly won't object to a thorough bug report/analysis, but I'd prefer that
Jim's series be posted separately (though I don't care if it's you or Jim that
posts it).
Mingwei Zhang Sept. 25, 2023, 6 a.m. UTC | #18
Hi Sean,

On Fri, Sep 22, 2023 at 3:44 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > My initial testing on both QEMU and our GCP testing environment shows no
> > "Uhhuh..." dmesg in guest.
> >
> > Please take a look...
>
> And now I'm extra confused, I thought the plan was for me to post a proper series
> for the emulated_counter idea[*], get the code base healthy/functional, and then
> build on top, e.g. improve performance and whatnot.
>
> The below is just a stripped down version of that, and doesn't look quite right.
> Specifically, pmc_write_counter() needs to purge emulated_count (my pseudo-patch
> subtly handled that by pausing the counter).
>
> I totally realize I'm moving sloooow, but I pinky swear I'm getting there.  My
> compile-tested-only branch can be found at
>
>   https://github.com/sean-jc/linux x86/pmu_refactor
>
> There's a lot more in there, e.g. it has fixes from Roman and Jim, along with
> some other found-by-inspection cleanups.

Sean, thanks for the work you have done on the thread:
https://lore.kernel.org/all/ZJ9IaskpbIK9q4rt@google.com

I think the diff you posted helped quite a lot. In fact, Jim also
asked me to try using emulated_counter, and I thought that would fix
the issue. I tried my own version as well as yours; however, neither
could fix this problem at that time. So Jim took a further look at
the lower level, and I was stuck on the performance analysis until
recently, when I came back and discovered the real fix for this.

Your diff (or I should say your patch) covers lots of things, including
adding emulated_counter, some refactoring of PMU reset, and
pause-on-wrmsr. In comparison, my code just focuses on fixing the
duplicate PMI bug, since that's what I care about for production.
Although the code looks somewhat similar, the thought process and
intention are quite different.

Sorry for the confusion. To resolve this, I am wondering if adding
you as "Co-developed-by:" would be a valid choice? Or the other way
around, adding me as a co-developer on one patch of your series.

In addition, I have no interest in further refactoring the existing
vPMU code. So for that I will definitely step aside.

Update: it looks like both my patch and Jim's patches (applied
separately) break the kvm-unit-tests pmu test with the following
failures on SPR:

FAIL: Intel: emulated instruction: instruction counter overflow
FAIL: Intel: full-width writes: emulated instruction: instruction counter overflow

Not sure whether it is a test error or not.

>
> I dropped the "pause on WRMSR" proposal.  I still don't love the offset approach,
> but I agree that pausing and reprogramming counters on writes could introduce an
> entirely new set of problems.
>
> I'm logging off for the weekend, but I'll pick this back up next (it's at the
> top of my priority list, assuming guest_memfd doesn't somehow catch fire.
>
> [*] https://lore.kernel.org/all/ZJ9IaskpbIK9q4rt@google.com
>
> > diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> > index edb89b51b383..47acf3a2b077 100644
> > --- a/arch/x86/kvm/pmu.c
> > +++ b/arch/x86/kvm/pmu.c
> > @@ -240,12 +240,13 @@ static void pmc_pause_counter(struct kvm_pmc *pmc)
> >  {
> >       u64 counter = pmc->counter;
> >
> > -     if (!pmc->perf_event || pmc->is_paused)
> > -             return;
> > -
> >       /* update counter, reset event value to avoid redundant accumulation */
> > -     counter += perf_event_pause(pmc->perf_event, true);
> > -     pmc->counter = counter & pmc_bitmask(pmc);
> > +     if (pmc->perf_event && !pmc->is_paused)
> > +             counter += perf_event_pause(pmc->perf_event, true);
> > +
> > +     pmc->prev_counter = counter & pmc_bitmask(pmc);
>
> Honest question, is it actually correct/necessary to mask the counter at the
> intermediate steps?  Conceptually, the hardware count and the emulated count are
> the same thing, just tracked separately.

It is valid because the counter has stopped, and now pmc->counter
contains the valid value, so it can be moved into different variables
for the comparison. Regarding the intermediate steps, I don't think
they are visible to the guest, no? Otherwise, we may have to use tmp
variables and then make the assignment atomic, although I doubt
that is needed.

>
> > +     pmc->counter = (counter + pmc->emulated_counter) & pmc_bitmask(pmc);
> > +     pmc->emulated_counter = 0;
> >       pmc->is_paused = true;
> >  }
> >
> > @@ -452,6 +453,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
> >  reprogram_complete:
> >       clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
> >       pmc->prev_counter = 0;
>
> I don't see any reason to keep kvm_pmc.prev_counter.  reprogram_counter() is the
> only caller of pmc_pause_counter(), and so is effectively the only writer and the
> only reader.  I.e. prev_counter can just be a local variable in reprogram_counter(),
> no?

You are right, I will update in the next version.
Mingwei Zhang Sept. 25, 2023, 6:09 a.m. UTC | #19
Hi Sean,

On Fri, Sep 22, 2023 at 4:00 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > So yes, they could be put together and they could be put separately.
> > But I don't see why they _cannot_ be together or cause confusion.
>
> Because they don't need to be put together.  Roman's patch kinda sorta overlaps
> with the prev_counter mess, but Jim's fixes are entirely orthogonal.
>
> If one person initially posted such a series with everything together I probably
> wouldn't care *too* much, but combining patches and/or series that aren't tightly
> coupled or dependent in some way usually does more harm than good.  E.g. if a
> maintainer has complaints against only one or two patches in series of unrelated
> patches, then grabbing the "good" patches is unnecessarily difficult.  It's not
> truly hard on the maintainer's end, but little bits of avoidable friction in the
> process adds up across hundreds and thousands of patches.
>
> FWIW, my plan is to apply Roman's patch pretty much as-is, grab v2 from Jim, and
> post my cleanups as a separate series on top (maybe two series, really haven't
> thought about it yet).  The only reason I have them all in a single branch is
> because there are code conflicts and I know I will apply the patches from Roman
> and Jim first, i.e. I didn't want to develop on a base that I knew would become
> stale.
>
> > So, I would like to put them together in the same context with a cover letter
> > fully describing the details.
>
> I certainly won't object to a thorough bug report/analysis, but I'd prefer that
> Jim's series be posted separately (though I don't care if it's you or Jim that
> posts it).

Thanks for agreeing to put things together. In fact, "everything
together" means all relevant fix patches for the same bug need to be
together. But I will explicitly mark my patch as _optional_
in the cover letter.

If the series causes inconvenience, please accept my apology. Out of a
sense of responsibility, I think I could just use this opportunity to
send my updated version with your comments addressed. I will also use this
chance to fold your fixes into Jim's patches.

One last thing: the kvm-unit-test/pmu breakage still surprises me.
Please test it again when you have a chance, maybe adding more fixes
on top. Once the series is sent, I will hand it over to you.

Thanks.
-Mingwei
Like Xu Sept. 25, 2023, 7:06 a.m. UTC | #20
On 23/9/2023 6:42 am, Mingwei Zhang wrote:
> On Fri, Sep 22, 2023 at 2:06 PM Sean Christopherson <seanjc@google.com> wrote:
>>
>> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
>>> On Fri, Sep 22, 2023 at 1:34 PM Sean Christopherson <seanjc@google.com> wrote:
>>>>
>>>> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
>>>>> Agree. Both patches are fixing the general potential buggy situation
>>>>> of multiple PMI injection on one vm entry: one software level defense
>>>>> (forcing the usage of KVM_REQ_PMI) and one hardware level defense
>>>>> (preventing PMI injection using mask).
>>>>>
>>>>> Although neither patch in this series is fixing the root cause of this
>>>>> specific double PMI injection bug, I don't see a reason why we cannot
>>>>> add a "fixes" tag to them, since we may fix it and create it again.
>>>>>
>>>>> I am currently working on it and testing my patch. Please give me some
>>>>> time, I think I could try sending out one version today. Once that is
>>>>> done, I will combine mine with the existing patch and send it out as a
>>>>> series.
>>>>
>>>> Me confused, what patch?  And what does this patch have to do with Jim's series?
>>>> Unless I've missed something, Jim's patches are good to go with my nits addressed.
>>>
>>> Let me step back.
>>>
>>> We have the following problem when we run perf inside guest:
>>>
>>> [ 1437.487320] Uhhuh. NMI received for unknown reason 20 on CPU 3.
>>> [ 1437.487330] Dazed and confused, but trying to continue
>>>
>>> This means there are more NMIs that guest PMI could understand. So
>>> there are potentially two approaches to solve the problem: 1) fix the
>>> PMI injection issue: only one can be injected; 2) fix the code that
>>> causes the (incorrect) multiple PMI injection.
>>
>> No, because the LVTPC masking fix isn't optional, the current KVM behavior is a
>> clear violation of the SDM.  And I'm struggling to think of a sane way to fix the
>> IRQ work bug, e.g. KVM would have to busy on the work finishing before resuming
>> the guest, which is rather crazy.
>>
>> I'm not saying there isn't more work to be done, nor am I saying that we shouldn't
>> further harden KVM against double-injection.  I'm just truly confused as to what
>> that has to do with Jim's fixes.
>>
> hmm, I will take the "two approaches" back. You are right on that.
> "two directions" is what I mean.
> 
> Oh, I think I did not elaborate the full context to you maybe. That
> might cause confusion and sorry about that.
> 
> The context of Jim's patches is to fix the multiple PMI injections
> when using perf, starting from
> https://lore.kernel.org/all/ZJ7y9DuedQyBb9eU@u40bc5e070a0153.ant.amazon.com/
> 
> So, regarding the fix, there are multiple layers and they may or may
> not be logically connected closely, but we are solving the same
> problem. In fact, I was asking Jim to help me with this specific issue
> :)
> 
> So yes, they could be put together and they could be put separately.
> But I don't see why they _cannot_ be together or cause confusion. So,
> I would like to put them together in the same context with a cover
> letter fully describing the details.
> 
> FYI for reviewers: to reproduce the spurious PMI issue in the guest
> VM, you need to let KVM emulate some instructions during the runtime,
> so the function kvm_pmu_incr_counter() will be triggered more. One
> option is to add a kernel cmdline like "idle=nomwait" to your guest
> kernel. Regarding the workload in guest vm, please run the perf
> command specified in
> https://lore.kernel.org/all/ZKCD30QrE5g9XGIh@google.com/

Hi Mingwei,

I would encourage you to convert this perf use case into a sequence of MSR
accesses so that it's easier to understand where the emulation fails in KVM.

> 
> Thanks.
> -Mingwei
> 
> 
> 
> -Mingwei
>
Like Xu Sept. 25, 2023, 7:33 a.m. UTC | #21
On 23/9/2023 3:21 am, Sean Christopherson wrote:
> On Fri, Sep 22, 2023, Jim Mattson wrote:
>> On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
>>>
>>> On Fri, Sep 01, 2023, Jim Mattson wrote:
>>>> When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
>>>> VM-exit that also invokes __kvm_perf_overflow() as a result of
>>>> instruction emulation, kvm_pmu_deliver_pmi() will be called twice
>>>> before the next VM-entry.
>>>>
>>>> That shouldn't be a problem. The local APIC is supposed to
>>>> automatically set the mask flag in LVTPC when it handles a PMI, so the
>>>> second PMI should be inhibited. However, KVM's local APIC emulation
>>>> fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
>>>> are delivered via the local APIC. In the common case, where LVTPC is
>>>> configured to deliver an NMI, the first NMI is vectored through the
>>>> guest IDT, and the second one is held pending. When the NMI handler
>>>> returns, the second NMI is vectored through the IDT. For Linux guests,
>>>> this results in the "dazed and confused" spurious NMI message.
>>>>
>>>> Though the obvious fix is to set the mask flag in LVTPC when handling
>>>> a PMI, KVM's logic around synthesizing a PMI is unnecessarily
>>>> convoluted.
>>>
>>> To address Like's question about whether not this is necessary, I think we should
>>> rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
>>> masking thing.
>>>
>>> And I think it makes sense to swap the order of the two patches.  The LVTPC masking
>>> fix is a clearcut architectural violation.  This is a bit more of a grey area,
>>> though still blatantly buggy.
>>
>> The reason I ordered the patches as I did is that when this patch
>> comes first, it actually fixes the problem that was introduced in
>> commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
>> instructions"). If this patch comes second, it's less clear that it
>> fixes a bug, since the other patch renders this one essentially moot.
> 
> Yeah, but as Like pointed out, the way the changelog is worded just raises the
> question of why this change is necessary.
> 
> I think we should tag them both for stable.  They're both bug fixes, regardless
> of the ordering.
> 

Under the semantics of "at most one PMI per VM-exit", what if the PMI-caused
VM-exit itself triggers a guest counter overflow and thus another vPMI (for
example, when the L1 guest is counting the number of VM-exit events from the
L2 guest)?  Will the latter interrupt be swallowed by L0 KVM?  What is the
correct expectation?  It may be different on Intel and AMD.
Mingwei Zhang Sept. 25, 2023, 4:22 p.m. UTC | #22
On Sun, Sep 24, 2023 at 11:09 PM Mingwei Zhang <mizhang@google.com> wrote:
>
> Hi Sean,
>
> On Fri, Sep 22, 2023 at 4:00 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > > So yes, they could be put together and they could be put separately.
> > > But I don't see why they _cannot_ be together or cause confusion.
> >
> > Because they don't need to be put together.  Roman's patch kinda sorta overlaps
> > with the prev_counter mess, but Jim's fixes are entirely orthogonal.
> >
> > If one person initially posted such a series with everything together I probably
> > wouldn't care *too* much, but combining patches and/or series that aren't tightly
> > coupled or dependent in some way usually does more harm than good.  E.g. if a
> > maintainer has complaints against only one or two patches in series of unrelated
> > patches, then grabbing the "good" patches is unnecessarily difficult.  It's not
> > truly hard on the maintainer's end, but little bits of avoidable friction in the
> > process adds up across hundreds and thousands of patches.
> >
> > FWIW, my plan is to apply Roman's patch pretty much as-is, grab v2 from Jim, and
> > post my cleanups as a separate series on top (maybe two series, really haven't
> > thought about it yet).  The only reason I have them all in a single branch is
> > because there are code conflicts and I know I will apply the patches from Roman
> > and Jim first, i.e. I didn't want to develop on a base that I knew would become
> > stale.
> >
> > > So, I would like to put them together in the same context with a cover letter
> > > fully describing the details.
> >
> > I certainly won't object to a thorough bug report/analysis, but I'd prefer that
> > Jim's series be posted separately (though I don't care if it's you or Jim that
> > posts it).
>
> Thanks for agreeing to put things together. In fact, everything
> together means all relevant fix patches for the same bug need to be
> together. But I will put my patch explicitly as _optional_ mentioned
> in the cover letter.
>
> If the series causes inconvenience, please accept my apology. For the
> sense of responsibility, I think I could just use this opportunity to
> send my updated version with your comment fixed. I will also use this
> chance to update your fix to Jim's patches.
>
> One last thing, breaking the kvm-unit-test/pmu still surprises me.
> Please test it again when you have a chance. Maybe adding more fixes
> on top. With the series sent, I will hand it over to you.
>

Never mind, this is a test failure that we already solved internally.
Applying the following fix to kvm-unit-tests/pmu removes the failures:

diff --git a/x86/pmu.c b/x86/pmu.c
index 0def2869..667e6233 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -68,6 +68,7 @@ volatile uint64_t irq_received;
 static void cnt_overflow(isr_regs_t *regs)
 {
        irq_received++;
+       apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
        apic_write(APIC_EOI, 0);
 }

Since KVM now sets the LVTPC mask bit when injecting the PMI, it is the
responsibility of the guest PMI handler to clear the mask and allow
subsequent PMIs to be delivered.
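
For reference, this is also roughly what the bare-metal Linux handler does at
the end of servicing a PMI (quoting from memory, so treat it as illustrative
rather than exact):

	/* Re-arm the PMI; writing LVTPC with only the NMI delivery mode
	 * also leaves the mask bit clear, undoing the mask set on delivery. */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

The kvm-unit-tests handler above is registered on a regular vector rather
than as an NMI (note the EOI write), so clearing the mask bit explicitly is
the appropriate re-arm there.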

We should upstream the above fix some time.

Thanks.
-Mingwei
Sean Christopherson Sept. 25, 2023, 5:06 p.m. UTC | #23
On Mon, Sep 25, 2023, Mingwei Zhang wrote:
> On Sun, Sep 24, 2023 at 11:09 PM Mingwei Zhang <mizhang@google.com> wrote:
> >
> > Hi Sean,
> >
> > On Fri, Sep 22, 2023 at 4:00 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > > > So yes, they could be put together and they could be put separately.
> > > > But I don't see why they _cannot_ be together or cause confusion.
> > >
> > > Because they don't need to be put together.  Roman's patch kinda sorta overlaps
> > > with the prev_counter mess, but Jim's fixes are entirely orthogonal.
> > >
> > > If one person initially posted such a series with everything together I probably
> > > wouldn't care *too* much, but combining patches and/or series that aren't tightly
> > > coupled or dependent in some way usually does more harm than good.  E.g. if a
> > > maintainer has complaints against only one or two patches in series of unrelated
> > > patches, then grabbing the "good" patches is unnecessarily difficult.  It's not
> > > truly hard on the maintainer's end, but little bits of avoidable friction in the
> > > process adds up across hundreds and thousands of patches.
> > >
> > > FWIW, my plan is to apply Roman's patch pretty much as-is, grab v2 from Jim, and
> > > post my cleanups as a separate series on top (maybe two series, really haven't
> > > thought about it yet).  The only reason I have them all in a single branch is
> > > because there are code conflicts and I know I will apply the patches from Roman
> > > and Jim first, i.e. I didn't want to develop on a base that I knew would become
> > > stale.
> > >
> > > > So, I would like to put them together in the same context with a cover letter
> > > > fully describing the details.
> > >
> > > I certainly won't object to a thorough bug report/analysis, but I'd prefer that
> > > Jim's series be posted separately (though I don't care if it's you or Jim that
> > > posts it).
> >
> > Thanks for agreeing to put things together. In fact, everything
> > together means all relevant fix patches for the same bug need to be
> > together. But I will put my patch explicitly as _optional_ mentioned
> > in the cover letter.

No, please do not post your version of the emulated_counter patch.  I am more
than happy to give you primary author credit (though I need your SoB); all I care
about is minimizing the amount of effort and overhead on my end.  At this point,
posting your version will only generate more noise and make my job harder.  To
tie everything together in the cover letter, just include lore links to the
relevant pseudo-patches.

Assuming you are taking over Jim's series, please post v2 asap.  I want to get
the critical fixes applied sooner than later.

> > If the series causes inconvenience, please accept my apology. For the
> > sense of responsibility, I think I could just use this opportunity to
> > send my updated version with your comment fixed. I will also use this
> > chance to update your fix to Jim's patches.
> >
> > One last thing, breaking the kvm-unit-test/pmu still surprises me.
> > Please test it again when you have a chance. Maybe adding more fixes
> > on top. With the series sent, I will hand it over to you.
> >
> 
> Never, this is a test failure that we already solved internally.
> Applying the following fix to kvm-unit-tests/pmu remove the failures:
> 
> diff --git a/x86/pmu.c b/x86/pmu.c
> index 0def2869..667e6233 100644
> --- a/x86/pmu.c
> +++ b/x86/pmu.c
> @@ -68,6 +68,7 @@ volatile uint64_t irq_received;
>  static void cnt_overflow(isr_regs_t *regs)
>  {
>         irq_received++;
> +       apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
>         apic_write(APIC_EOI, 0);
>  }
> 
> Since KVM vPMU adds a mask when injecting the PMI, it is the
> responsibility of the guest PMI handler to remove the mask and allow
> subsequent PMIs delivered.
> 
> We should upstream the above fix some time.

Please post the above asap.  And give pmu_pebs.c's cnt_overflow() the same
treatment when you do.  Or just give me your SoB and I'll write the changelog.
Mingwei Zhang Sept. 25, 2023, 7:54 p.m. UTC | #24
On Fri, Sep 22, 2023 at 2:02 PM Mingwei Zhang <mizhang@google.com> wrote:
>
> On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > On Fri, Sep 22, 2023 at 1:34 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, Sep 22, 2023, Mingwei Zhang wrote:
> > > > On Fri, Sep 22, 2023 at 12:21 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > >
> > > > > On Fri, Sep 22, 2023, Jim Mattson wrote:
> > > > > > On Fri, Sep 22, 2023 at 11:46 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > > >
> > > > > > > On Fri, Sep 01, 2023, Jim Mattson wrote:
> > > > > > > > When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a
> > > > > > > > VM-exit that also invokes __kvm_perf_overflow() as a result of
> > > > > > > > instruction emulation, kvm_pmu_deliver_pmi() will be called twice
> > > > > > > > before the next VM-entry.
> > > > > > > >
> > > > > > > > That shouldn't be a problem. The local APIC is supposed to
> > > > > > > > automatically set the mask flag in LVTPC when it handles a PMI, so the
> > > > > > > > second PMI should be inhibited. However, KVM's local APIC emulation
> > > > > > > > fails to set the mask flag in LVTPC when it handles a PMI, so two PMIs
> > > > > > > > are delivered via the local APIC. In the common case, where LVTPC is
> > > > > > > > configured to deliver an NMI, the first NMI is vectored through the
> > > > > > > > guest IDT, and the second one is held pending. When the NMI handler
> > > > > > > > returns, the second NMI is vectored through the IDT. For Linux guests,
> > > > > > > > this results in the "dazed and confused" spurious NMI message.
> > > > > > > >
> > > > > > > > Though the obvious fix is to set the mask flag in LVTPC when handling
> > > > > > > > a PMI, KVM's logic around synthesizing a PMI is unnecessarily
> > > > > > > > convoluted.
> > > > > > >
> > > > > > > To address Like's question about whether not this is necessary, I think we should
> > > > > > > rephrase this to explicitly state this is a bug irrespective of the whole LVTPC
> > > > > > > masking thing.
> > > > > > >
> > > > > > > And I think it makes sense to swap the order of the two patches.  The LVTPC masking
> > > > > > > fix is a clearcut architectural violation.  This is a bit more of a grey area,
> > > > > > > though still blatantly buggy.
> > > > > >
> > > > > > The reason I ordered the patches as I did is that when this patch
> > > > > > comes first, it actually fixes the problem that was introduced in
> > > > > > commit 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring
> > > > > > instructions"). If this patch comes second, it's less clear that it
> > > > > > fixes a bug, since the other patch renders this one essentially moot.
> > > > >
> > > > > Yeah, but as Like pointed out, the way the changelog is worded just raises the
> > > > > question of why this change is necessary.
> > > > >
> > > > > I think we should tag them both for stable.  They're both bug fixes, regardless
> > > > > of the ordering.
> > > >
> > > > Agree. Both patches are fixing the general potential buggy situation
> > > > of multiple PMI injection on one vm entry: one software level defense
> > > > (forcing the usage of KVM_REQ_PMI) and one hardware level defense
> > > > (preventing PMI injection using mask).
> > > >
> > > > Although neither patch in this series is fixing the root cause of this
> > > > specific double PMI injection bug, I don't see a reason why we cannot
> > > > add a "fixes" tag to them, since we may fix it and create it again.
> > > >
> > > > I am currently working on it and testing my patch. Please give me some
> > > > time, I think I could try sending out one version today. Once that is
> > > > done, I will combine mine with the existing patch and send it out as a
> > > > series.
> > >
> > > Me confused, what patch?  And what does this patch have to do with Jim's series?
> > > Unless I've missed something, Jim's patches are good to go with my nits addressed.
> >
> > Let me step back.
> >
> > We have the following problem when we run perf inside guest:
> >
> > [ 1437.487320] Uhhuh. NMI received for unknown reason 20 on CPU 3.
> > [ 1437.487330] Dazed and confused, but trying to continue
> >
> > This means there are more NMIs that guest PMI could understand. So
> > there are potentially two approaches to solve the problem: 1) fix the
> > PMI injection issue: only one can be injected; 2) fix the code that
> > causes the (incorrect) multiple PMI injection.
> >
> > I am working on the 2nd one. So, the property of the 2nd one is:
> > without patches in 1) (Jim's patches), we could still avoid the above
> > warning messages.
> >
> > Thanks.
> > -Mingwei
>
> This is my draft version. If you don't have full-width counter support, this
> patch needs be placed on top of this one:
> https://lore.kernel.org/all/20230504120042.785651-1-rkagan@amazon.de/
>
> My initial testing on both QEMU and our GCP testing environment shows no
> "Uhhuh..." dmesg in guest.
>
> Please take a look...
>
> From 47e629269d8b0ff65c242334f068300216cb7f91 Mon Sep 17 00:00:00 2001
> From: Mingwei Zhang <mizhang@google.com>
> Date: Fri, 22 Sep 2023 17:13:55 +0000
> Subject: [PATCH] KVM: x86/pmu: Fix emulated counter increment due to
>  instruction emulation
>
> Fix KVM emulated counter increment due to instruction emulation. KVM
> pmc->counter is always a snapshot value when counter is running. Therefore,
> the value does not represent the actual value of counter. Thus it is
> inappropriate to compare it with other counter values. In existing code
> KVM directly compares pmc->prev_counter and pmc->counter directly. However,
> pmc->prev_counter is a snaphot value assigned from pmc->counter when
> counter may still be running.  So this comparison logic in
> reprogram_counter() will generate incorrect invocations to
> __kvm_perf_overflow(in_pmi=false) and generate duplicated PMI injection
> requests.
>
> Fix this issue by adding emulated_counter field and only the do the counter
> calculation after we pause
>
> Change-Id: I2d59e68557fd35f7bbcfe09ea42ad81bd36776b7
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/pmu.c              | 15 ++++++++-------
>  arch/x86/kvm/svm/pmu.c          |  1 +
>  arch/x86/kvm/vmx/pmu_intel.c    |  2 ++
>  4 files changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1a4def36d5bb..47bbfbc0aa35 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -494,6 +494,7 @@ struct kvm_pmc {
>         bool intr;
>         u64 counter;
>         u64 prev_counter;
> +       u64 emulated_counter;
>         u64 eventsel;
>         struct perf_event *perf_event;
>         struct kvm_vcpu *vcpu;
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index edb89b51b383..47acf3a2b077 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -240,12 +240,13 @@ static void pmc_pause_counter(struct kvm_pmc *pmc)
>  {
>         u64 counter = pmc->counter;
>
> -       if (!pmc->perf_event || pmc->is_paused)
> -               return;
> -
>         /* update counter, reset event value to avoid redundant accumulation */
> -       counter += perf_event_pause(pmc->perf_event, true);
> -       pmc->counter = counter & pmc_bitmask(pmc);
> +       if (pmc->perf_event && !pmc->is_paused)
> +               counter += perf_event_pause(pmc->perf_event, true);
> +
> +       pmc->prev_counter = counter & pmc_bitmask(pmc);
> +       pmc->counter = (counter + pmc->emulated_counter) & pmc_bitmask(pmc);
> +       pmc->emulated_counter = 0;
>         pmc->is_paused = true;
>  }
>
> @@ -452,6 +453,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
>  reprogram_complete:
>         clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
>         pmc->prev_counter = 0;
> +       pmc->emulated_counter = 0;
>  }
>
>  void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
> @@ -725,8 +727,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
>
>  static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
>  {
> -       pmc->prev_counter = pmc->counter;
> -       pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
> +       pmc->emulated_counter += 1;
>         kvm_pmu_request_counter_reprogram(pmc);
>  }
>
> diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
> index a25b91ff9aea..b88fab4ae1d7 100644
> --- a/arch/x86/kvm/svm/pmu.c
> +++ b/arch/x86/kvm/svm/pmu.c
> @@ -243,6 +243,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
>
>                 pmc_stop_counter(pmc);
>                 pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
> +               pmc->emulated_counter = 0;
>         }
>
>         pmu->global_ctrl = pmu->global_status = 0;
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 626df5fdf542..d03c4ec7273d 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -641,6 +641,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
>
>                 pmc_stop_counter(pmc);
>                 pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
> +               pmc->emulated_counter = 0;
>         }
>
>         for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
> @@ -648,6 +649,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
>
>                 pmc_stop_counter(pmc);
>                 pmc->counter = pmc->prev_counter = 0;
> +               pmc->emulated_counter = 0;
>         }
>
>         pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
> --
> 2.42.0.515.g380fc7ccd1-goog

Signed-off-by: Mingwei Zhang <mizhang@google.com>
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3bc146dfd38d..f6b9e3ae08bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -528,7 +528,6 @@  struct kvm_pmu {
 	u64 raw_event_mask;
 	struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
 	struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
-	struct irq_work irq_work;
 
 	/*
 	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index bf653df86112..0c117cd24077 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -93,14 +93,6 @@  void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
 #undef __KVM_X86_PMU_OP
 }
 
-static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
-{
-	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
-	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
-
-	kvm_pmu_deliver_pmi(vcpu);
-}
-
 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -124,20 +116,7 @@  static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
 	}
 
-	if (!pmc->intr || skip_pmi)
-		return;
-
-	/*
-	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
-	 * can be ejected on a guest mode re-entry. Otherwise we can't
-	 * be sure that vcpu wasn't executing hlt instruction at the
-	 * time of vmexit and is not going to re-enter guest mode until
-	 * woken up. So we should wake it, but this is impossible from
-	 * NMI context. Do it from irq work instead.
-	 */
-	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
-		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
-	else
+	if (pmc->intr && !skip_pmi)
 		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
 }
 
@@ -677,9 +656,6 @@  void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 
 void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 {
-	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
-
-	irq_work_sync(&pmu->irq_work);
 	static_call(kvm_x86_pmu_reset)(vcpu);
 }
 
@@ -689,7 +665,6 @@  void kvm_pmu_init(struct kvm_vcpu *vcpu)
 
 	memset(pmu, 0, sizeof(*pmu));
 	static_call(kvm_x86_pmu_init)(vcpu);
-	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
 	pmu->event_count = 0;
 	pmu->need_cleanup = false;
 	kvm_pmu_refresh(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c381770bcbf1..0732c09fbd2d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12841,6 +12841,9 @@  static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 		return true;
 #endif
 
+	if (kvm_test_request(KVM_REQ_PMI, vcpu))
+		return true;
+
 	if (kvm_arch_interrupt_allowed(vcpu) &&
 	    (kvm_cpu_has_interrupt(vcpu) ||
 	    kvm_guest_apic_has_interrupt(vcpu)))