
[v2,4/6] KVM: x86/pmu: Add pmc->intr to refactor kvm_perf_overflow{_intr}()

Message ID 20211130074221.93635-5-likexu@tencent.com (mailing list archive)
State New, archived
Series KVM: x86/pmu: Count two basic events for emulated instructions

Commit Message

Like Xu Nov. 30, 2021, 7:42 a.m. UTC
From: Like Xu <likexu@tencent.com>

Depending on whether an interrupt should be triggered, KVM registers one
of two different event overflow callbacks in the perf_event context.

The code skeletons of these two functions are very similar, so the
interrupt flag can be stored in pmc->intr by pmc_reprogram_counter(),
which yields a smaller instruction footprint and is friendlier to the
micro-architectural branch predictor.

__kvm_perf_overflow() can also be called in non-NMI contexts, so a flag
is needed to distinguish the caller context and thus avoid a check on
kvm_is_in_guest(); otherwise we might get warnings from suspicious RCU
or check_preemption_disabled().

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Like Xu <likexu@tencent.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/pmu.c              | 58 ++++++++++++++++-----------------
 2 files changed, 29 insertions(+), 30 deletions(-)
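
For context, a minimal sketch of how the refactored helper might be used
from a non-NMI path (loosely following the shape of the later patch in this
series that counts emulated instructions; the function below is an
illustrative assumption, not part of this patch):

/*
 * Hypothetical non-NMI user of the new helper: a software increment on
 * behalf of an emulated instruction.  Passing in_pmi=false tells
 * __kvm_perf_overflow() it may request the PMI directly instead of
 * deferring it via irq_work.
 */
static void kvm_pmc_emulated_event(struct kvm_pmc *pmc)
{
	pmc->counter++;

	/* On wrap-around of the masked counter, raise the overflow. */
	if (!(pmc->counter & pmc_bitmask(pmc)))
		__kvm_perf_overflow(pmc, false);
}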

Comments

Jim Mattson Dec. 9, 2021, 4:25 a.m. UTC | #1
On Mon, Nov 29, 2021 at 11:42 PM Like Xu <like.xu.linux@gmail.com> wrote:
>
> From: Like Xu <likexu@tencent.com>
>
> Depending on whether an interrupt should be triggered, KVM registers one
> of two different event overflow callbacks in the perf_event context.
>
> The code skeletons of these two functions are very similar, so the
> interrupt flag can be stored in pmc->intr by pmc_reprogram_counter(),
> which yields a smaller instruction footprint and is friendlier to the
> micro-architectural branch predictor.
>
> __kvm_perf_overflow() can also be called in non-NMI contexts, so a flag
> is needed to distinguish the caller context and thus avoid a check on
> kvm_is_in_guest(); otherwise we might get warnings from suspicious RCU
> or check_preemption_disabled().
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Like Xu <likexu@tencent.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/pmu.c              | 58 ++++++++++++++++-----------------
>  2 files changed, 29 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e41ad1ead721..6c2b2331ffeb 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -495,6 +495,7 @@ struct kvm_pmc {
>          */
>         u64 current_config;
>         bool is_paused;
> +       bool intr;
>  };
>
>  struct kvm_pmu {
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index b7a1ae28ab87..a20207ee4014 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -55,43 +55,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
>         kvm_pmu_deliver_pmi(vcpu);
>  }
>
> -static void kvm_perf_overflow(struct perf_event *perf_event,
> -                             struct perf_sample_data *data,
> -                             struct pt_regs *regs)
> +static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>  {
> -       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
>         struct kvm_pmu *pmu = pmc_to_pmu(pmc);
>
> -       if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
> -               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> -               kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> -       }
> +       /* Ignore counters that have been reprogrammed already. */
> +       if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
> +               return;
> +
> +       __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> +       kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> +
> +       if (!pmc->intr)
> +               return;
> +
> +       /*
> +        * Inject PMI. If vcpu was in a guest mode during NMI PMI
> +        * can be ejected on a guest mode re-entry. Otherwise we can't
> +        * be sure that vcpu wasn't executing hlt instruction at the
> +        * time of vmexit and is not going to re-enter guest mode until
> +        * woken up. So we should wake it, but this is impossible from
> +        * NMI context. Do it from irq work instead.
> +        */
> +       if (in_pmi && !kvm_is_in_guest())
> +               irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> +       else
> +               kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
>  }
>
> -static void kvm_perf_overflow_intr(struct perf_event *perf_event,
> -                                  struct perf_sample_data *data,
> -                                  struct pt_regs *regs)
> +static void kvm_perf_overflow(struct perf_event *perf_event,
> +                             struct perf_sample_data *data,
> +                             struct pt_regs *regs)
>  {
>         struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> -       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> -
> -       if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
> -               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> -               kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
>
> -               /*
> -                * Inject PMI. If vcpu was in a guest mode during NMI PMI
> -                * can be ejected on a guest mode re-entry. Otherwise we can't
> -                * be sure that vcpu wasn't executing hlt instruction at the
> -                * time of vmexit and is not going to re-enter guest mode until
> -                * woken up. So we should wake it, but this is impossible from
> -                * NMI context. Do it from irq work instead.
> -                */
> -               if (!kvm_is_in_guest())
> -                       irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> -               else
> -                       kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
> -       }
> +       __kvm_perf_overflow(pmc, true);
>  }
>
>  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> @@ -126,7 +124,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>         }
>
>         event = perf_event_create_kernel_counter(&attr, -1, current,
> -                                                intr ? kvm_perf_overflow_intr :
>                                                  kvm_perf_overflow, pmc);

Not your change, but if the event is counting anything based on
cycles, and the guest TSC is scaled to run at a different rate from
the host TSC, doesn't the initial value of the underlying hardware
counter have to be adjusted as well, so that the interrupt arrives
when the guest's counter overflows rather than when the host's counter
overflows?
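
To make that adjustment concrete, here is a minimal sketch of rescaling a
programmed sample period from guest cycles to host cycles using the vCPU's
TSC scaling ratio (an assumption about how such a fix could look, not code
from this patch or series; "multiplier" and "frac_bits" stand in for the
vCPU's TSC multiplier and KVM's fractional-bit count):

#include <linux/math64.h>

/*
 * If the guest expects an overflow after "period" guest cycles, and guest
 * cycles advance at ratio = multiplier / 2^frac_bits relative to host
 * cycles, then the host counter must be programmed for period / ratio
 * host cycles so that the interrupt fires when the guest's count overflows.
 */
static u64 scale_sample_period_to_host(u64 period, u64 multiplier,
				       unsigned int frac_bits)
{
	if (multiplier == (1ULL << frac_bits))	/* no scaling in effect */
		return period;

	/* period * 2^frac_bits / multiplier, without intermediate overflow */
	return mul_u64_u64_div_u64(period, 1ULL << frac_bits, multiplier);
}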

Reviewed-by: Jim Mattson <jmattson@google.com>
Like Xu Dec. 9, 2021, 8:28 a.m. UTC | #2
On 9/12/2021 12:25 pm, Jim Mattson wrote:
> 
> Not your change, but if the event is counting anything based on
> cycles, and the guest TSC is scaled to run at a different rate from
> the host TSC, doesn't the initial value of the underlying hardware
> counter have to be adjusted as well, so that the interrupt arrives
> when the guest's counter overflows rather than when the host's counter
> overflows?

I've thought about this issue too and at least the Intel Specification
did not let me down on this detail:

	"The counter changes in the VMX non-root mode will follow
	VMM's use of the TSC offset or TSC scaling VMX controls"

I don't know whether AMD or real-world hardware
will live up to this expectation, and I'm pessimistic.

cc Andi and Kim.
Jim Mattson Dec. 10, 2021, 12:54 a.m. UTC | #3
On Thu, Dec 9, 2021 at 12:28 AM Like Xu <like.xu.linux@gmail.com> wrote:
>
> On 9/12/2021 12:25 pm, Jim Mattson wrote:
> >
> > Not your change, but if the event is counting anything based on
> > cycles, and the guest TSC is scaled to run at a different rate from
> > the host TSC, doesn't the initial value of the underlying hardware
> > counter have to be adjusted as well, so that the interrupt arrives
> > when the guest's counter overflows rather than when the host's counter
> > overflows?
>
> I've thought about this issue too and at least the Intel Specification
> did not let me down on this detail:
>
>         "The counter changes in the VMX non-root mode will follow
>         VMM's use of the TSC offset or TSC scaling VMX controls"

Where do you see this? I see similar text regarding TSC packets in the
section on Intel Processor Trace, but nothing about PMU counters
advancing at a scaled TSC frequency.

> I don't know whether AMD or real-world hardware
> will live up to this expectation, and I'm pessimistic.
>
> cc Andi and Kim.
>
Paolo Bonzini Dec. 10, 2021, 9:35 a.m. UTC | #4
On 12/10/21 01:54, Jim Mattson wrote:
> On Thu, Dec 9, 2021 at 12:28 AM Like Xu <like.xu.linux@gmail.com> wrote:
>>
>> On 9/12/2021 12:25 pm, Jim Mattson wrote:
>>>
>>> Not your change, but if the event is counting anything based on
>>> cycles, and the guest TSC is scaled to run at a different rate from
>>> the host TSC, doesn't the initial value of the underlying hardware
>>> counter have to be adjusted as well, so that the interrupt arrives
>>> when the guest's counter overflows rather than when the host's counter
>>> overflows?
>>
>> I've thought about this issue too and at least the Intel Specification
>> did not let me down on this detail:
>>
>>          "The counter changes in the VMX non-root mode will follow
>>          VMM's use of the TSC offset or TSC scaling VMX controls"
> 
> Where do you see this? I see similar text regarding TSC packets in the
> section on Intel Processor Trace, but nothing about PMU counters
> advancing at a scaled TSC frequency.

Indeed it seems quite unlikely that PMU counters can count fractionally.

Even for tracing the SDM says "Like the value returned by RDTSC, TSC 
packets will include these adjustments, but other timing packets (such 
as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone 
TSC packets are typically generated only when generation of other timing 
packets (MTCs and CYCs) has ceased for a period of time", I'm not even 
sure it's a good thing that the values in TSC packets are scaled and offset.

Back to the PMU, for non-architectural counters it's not really possible 
to know if they count in cycles or not.  So it may not be a good idea to 
special case the architectural counters.

Paolo
Like Xu Dec. 10, 2021, 10:11 a.m. UTC | #5
On 10/12/2021 5:35 pm, Paolo Bonzini wrote:
> On 12/10/21 01:54, Jim Mattson wrote:
>> On Thu, Dec 9, 2021 at 12:28 AM Like Xu <like.xu.linux@gmail.com> wrote:
>>>
>>> On 9/12/2021 12:25 pm, Jim Mattson wrote:
>>>>
>>>> Not your change, but if the event is counting anything based on
>>>> cycles, and the guest TSC is scaled to run at a different rate from
>>>> the host TSC, doesn't the initial value of the underlying hardware
>>>> counter have to be adjusted as well, so that the interrupt arrives
>>>> when the guest's counter overflows rather than when the host's counter
>>>> overflows?
>>>
>>> I've thought about this issue too and at least the Intel Specification
>>> did not let me down on this detail:
>>>
>>>          "The counter changes in the VMX non-root mode will follow
>>>          VMM's use of the TSC offset or TSC scaling VMX controls"
>>

Emm, before I left Intel to work on AMD, my hardware architect gave
me a verbal yes about any reported TSC values in VMX non-root mode
(including Timed LBR, PEBS records, or PT packets), as long as
we enable the relevant VM-execution control bits.

Not sure if it's true for legacy platforms.

>> Where do you see this? I see similar text regarding TSC packets in the
>> section on Intel Processor Trace, but nothing about PMU counters
>> advancing at a scaled TSC frequency.
> 
> Indeed it seems quite unlikely that PMU counters can count fractionally.
> 
> Even for tracing the SDM says "Like the value returned by RDTSC, TSC packets 
> will include these adjustments, but other timing packets (such as MTC, CYC, and 
> CBR) are not impacted".  Considering that "stand-alone TSC packets are typically 
> generated only when generation of other timing packets (MTCs and CYCs) has 
> ceased for a period of time", I'm not even sure it's a good thing that the 
> values in TSC packets are scaled and offset.

There are some discussions that cannot be made public.

We recommend (as software developers) that any PMU-enabled guest
keep the host and guest TSC advancing in lockstep for performance
tuning, since the guest doesn't have the APERF/MPERF capability.

> 
> Back to the PMU, for non-architectural counters it's not really possible to know 
> if they count in cycles or not.  So it may not be a good idea to special case 
> the architectural counters.

Yes, captain.
Let's see if real-world challenges or bugs push us to explore this detail further.

> 
> Paolo
>
Jim Mattson Dec. 10, 2021, 10:55 p.m. UTC | #6
On Fri, Dec 10, 2021 at 1:35 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 12/10/21 01:54, Jim Mattson wrote:
> > On Thu, Dec 9, 2021 at 12:28 AM Like Xu <like.xu.linux@gmail.com> wrote:
> >>
> >> On 9/12/2021 12:25 pm, Jim Mattson wrote:
> >>>
> >>> Not your change, but if the event is counting anything based on
> >>> cycles, and the guest TSC is scaled to run at a different rate from
> >>> the host TSC, doesn't the initial value of the underlying hardware
> >>> counter have to be adjusted as well, so that the interrupt arrives
> >>> when the guest's counter overflows rather than when the host's counter
> >>> overflows?
> >>
> >> I've thought about this issue too and at least the Intel Specification
> >> did not let me down on this detail:
> >>
> >>          "The counter changes in the VMX non-root mode will follow
> >>          VMM's use of the TSC offset or TSC scaling VMX controls"
> >
> > Where do you see this? I see similar text regarding TSC packets in the
> > section on Intel Processor Trace, but nothing about PMU counters
> > advancing at a scaled TSC frequency.
>
> Indeed it seems quite unlikely that PMU counters can count fractionally.
>
> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
> packets will include these adjustments, but other timing packets (such
> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
> TSC packets are typically generated only when generation of other timing
> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
> sure it's a good thing that the values in TSC packets are scaled and offset.
>
> Back to the PMU, for non-architectural counters it's not really possible
> to know if they count in cycles or not.  So it may not be a good idea to
> special case the architectural counters.

In that case, what we're doing with the guest PMU is not
virtualization. I don't know what it is, but it's not virtualization.

Exposing non-architectural events is questionable with live migration,
and TSC scaling is unnecessary without live migration. I suppose you
could have a migration pool with different SKUs of the same generation
with 'seemingly compatible' PMU events but different TSC frequencies,
in which case it might be reasonable to expose non-architectural
events, but I would argue that any of those 'seemingly compatible'
events are actually not compatible if they count in cycles.

Unless, of course, Like is right, and the PMU counters do count fractionally.
Paolo Bonzini Dec. 10, 2021, 10:59 p.m. UTC | #7
On 12/10/21 23:55, Jim Mattson wrote:
>>
>> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
>> packets will include these adjustments, but other timing packets (such
>> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
>> TSC packets are typically generated only when generation of other timing
>> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
>> sure it's a good thing that the values in TSC packets are scaled and offset.
>>
>> Back to the PMU, for non-architectural counters it's not really possible
>> to know if they count in cycles or not.  So it may not be a good idea to
>> special case the architectural counters.
>
> In that case, what we're doing with the guest PMU is not
> virtualization. I don't know what it is, but it's not virtualization.

It is virtualization even if it is incompatible with live migration to a 
different SKU (where, as you point out below, multiple TSC frequencies 
might also count as multiple SKUs).  But yeah, it's virtualization with 
more caveats than usual.

> Exposing non-architectural events is questionable with live migration,
> and TSC scaling is unnecessary without live migration. I suppose you
> could have a migration pool with different SKUs of the same generation
> with 'seemingly compatible' PMU events but different TSC frequencies,
> in which case it might be reasonable to expose non-architectural
> events, but I would argue that any of those 'seemingly compatible'
> events are actually not compatible if they count in cycles.

I agree.  Support for marshaling/unmarshaling PMU state exists but it's 
more useful for intra-host updates than for actual live migration, since 
these days most live migration will use TSC scaling on the destination.

Paolo

> 
> Unless, of course, Like is right, and the PMU counters do count fractionally.
>
Jim Mattson Dec. 10, 2021, 11:31 p.m. UTC | #8
On Fri, Dec 10, 2021 at 2:59 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 12/10/21 23:55, Jim Mattson wrote:
> >>
> >> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
> >> packets will include these adjustments, but other timing packets (such
> >> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
> >> TSC packets are typically generated only when generation of other timing
> >> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
> >> sure it's a good thing that the values in TSC packets are scaled and offset.
> >>
> >> Back to the PMU, for non-architectural counters it's not really possible
> >> to know if they count in cycles or not.  So it may not be a good idea to
> >> special case the architectural counters.
> >
> > In that case, what we're doing with the guest PMU is not
> > virtualization. I don't know what it is, but it's not virtualization.
>
> It is virtualization even if it is incompatible with live migration to a
> different SKU (where, as you point out below, multiple TSC frequencies
> might also count as multiple SKUs).  But yeah, it's virtualization with
> more caveats than usual.

It's not virtualization if the counters don't count at the rate the
guest expects them to count.

> > Exposing non-architectural events is questionable with live migration,
> > and TSC scaling is unnecessary without live migration. I suppose you
> > could have a migration pool with different SKUs of the same generation
> > with 'seemingly compatible' PMU events but different TSC frequencies,
> > in which case it might be reasonable to expose non-architectural
> > events, but I would argue that any of those 'seemingly compatible'
> > events are actually not compatible if they count in cycles.
> I agree.  Support for marshaling/unmarshaling PMU state exists but it's
> more useful for intra-host updates than for actual live migration, since
> these days most live migration will use TSC scaling on the destination.
>
> Paolo
>
> >
> > Unless, of course, Like is right, and the PMU counters do count fractionally.
> >
>
Jim Mattson Dec. 12, 2021, 4:56 a.m. UTC | #9
On Fri, Dec 10, 2021 at 3:31 PM Jim Mattson <jmattson@google.com> wrote:
>
> On Fri, Dec 10, 2021 at 2:59 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> > On 12/10/21 23:55, Jim Mattson wrote:
> > >>
> > >> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
> > >> packets will include these adjustments, but other timing packets (such
> > >> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
> > >> TSC packets are typically generated only when generation of other timing
> > >> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
> > >> sure it's a good thing that the values in TSC packets are scaled and offset.
> > >>
> > >> Back to the PMU, for non-architectural counters it's not really possible
> > >> to know if they count in cycles or not.  So it may not be a good idea to
> > >> special case the architectural counters.
> > >
> > > In that case, what we're doing with the guest PMU is not
> > > virtualization. I don't know what it is, but it's not virtualization.
> >
> > It is virtualization even if it is incompatible with live migration to a
> > different SKU (where, as you point out below, multiple TSC frequencies
> > might also count as multiple SKUs).  But yeah, it's virtualization with
> > more caveats than usual.
>
> It's not virtualization if the counters don't count at the rate the
> guest expects them to count.

Per the SDM, unhalted reference cycles count at "a fixed frequency."
If the frequency changes on migration, then the value of this event is
questionable at best. For unhalted core cycles, on the other hand, the
SDM says, "The performance counter for this event counts across
performance state transitions using different core clock frequencies."
That does seem to permit frequency changes on migration, but I suspect
that software expects the event to count at a fixed frequency if
INVARIANT_TSC is set.

I'm not sure that I buy your argument regarding consistency. In
general, I would expect the hypervisor to exclude non-architected
events from the allow-list for any VM instances running in a
heterogeneous migration pool. Certainly, those events could be allowed
in a heterogeneous migration pool consisting of multiple SKUs of the
same microarchitecture running at different clock frequencies, but
that seems like a niche case.


> > > Exposing non-architectural events is questionable with live migration,
> > > and TSC scaling is unnecessary without live migration. I suppose you
> > > could have a migration pool with different SKUs of the same generation
> > > with 'seemingly compatible' PMU events but different TSC frequencies,
> > > in which case it might be reasonable to expose non-architectural
> > > events, but I would argue that any of those 'seemingly compatible'
> > > events are actually not compatible if they count in cycles.
> > I agree.  Support for marshaling/unmarshaling PMU state exists but it's
> > more useful for intra-host updates than for actual live migration, since
> > these days most live migration will use TSC scaling on the destination.
> >
> > Paolo
> >
> > >
> > > Unless, of course, Like is right, and the PMU counters do count fractionally.
> > >
> >
Jim Mattson Dec. 13, 2021, 6:37 a.m. UTC | #10
On Sat, Dec 11, 2021 at 8:56 PM Jim Mattson <jmattson@google.com> wrote:
>
> On Fri, Dec 10, 2021 at 3:31 PM Jim Mattson <jmattson@google.com> wrote:
> >
> > On Fri, Dec 10, 2021 at 2:59 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > >
> > > On 12/10/21 23:55, Jim Mattson wrote:
> > > >>
> > > >> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
> > > >> packets will include these adjustments, but other timing packets (such
> > > >> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
> > > >> TSC packets are typically generated only when generation of other timing
> > > >> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
> > > >> sure it's a good thing that the values in TSC packets are scaled and offset.
> > > >>
> > > >> Back to the PMU, for non-architectural counters it's not really possible
> > > >> to know if they count in cycles or not.  So it may not be a good idea to
> > > >> special case the architectural counters.
> > > >
> > > > In that case, what we're doing with the guest PMU is not
> > > > virtualization. I don't know what it is, but it's not virtualization.
> > >
> > > It is virtualization even if it is incompatible with live migration to a
> > > different SKU (where, as you point out below, multiple TSC frequencies
> > > might also count as multiple SKUs).  But yeah, it's virtualization with
> > > more caveats than usual.
> >
> > It's not virtualization if the counters don't count at the rate the
> > guest expects them to count.
>
> Per the SDM, unhalted reference cycles count at "a fixed frequency."
> If the frequency changes on migration, then the value of this event is
> questionable at best. For unhalted core cycles, on the other hand, the
> SDM says, "The performance counter for this event counts across
> performance state transitions using different core clock frequencies."
> That does seem to permit frequency changes on migration, but I suspect
> that software expects the event to count at a fixed frequency if
> INVARIANT_TSC is set.

Actually, I now realize that unhalted reference cycles is independent
of the host or guest TSC, so it is not affected by TSC scaling.
However, we still have to decide on a specific fixed frequency to
virtualize so that the frequency doesn't change on migration. As a
practical matter, it may be the case that the reference cycles
frequency is the same on all processors in a migration pool, and we
don't have to do anything.


> I'm not sure that I buy your argument regarding consistency. In
> general, I would expect the hypervisor to exclude non-architected
> events from the allow-list for any VM instances running in a
> heterogeneous migration pool. Certainly, those events could be allowed
> in a heterogeneous migration pool consisting of multiple SKUs of the
> same microarchitecture running at different clock frequencies, but
> that seems like a niche case.
>
>
> > > > Exposing non-architectural events is questionable with live migration,
> > > > and TSC scaling is unnecessary without live migration. I suppose you
> > > > could have a migration pool with different SKUs of the same generation
> > > > with 'seemingly compatible' PMU events but different TSC frequencies,
> > > > in which case it might be reasonable to expose non-architectural
> > > > events, but I would argue that any of those 'seemingly compatible'
> > > > events are actually not compatible if they count in cycles.
> > > I agree.  Support for marshaling/unmarshaling PMU state exists but it's
> > > more useful for intra-host updates than for actual live migration, since
> > > these days most live migration will use TSC scaling on the destination.
> > >
> > > Paolo
> > >
> > > >
> > > > Unless, of course, Like is right, and the PMU counters do count fractionally.
> > > >
> > >
Like Xu Dec. 16, 2021, 9:57 a.m. UTC | #11
On 13/12/2021 2:37 pm, Jim Mattson wrote:
> On Sat, Dec 11, 2021 at 8:56 PM Jim Mattson <jmattson@google.com> wrote:
>>
>> On Fri, Dec 10, 2021 at 3:31 PM Jim Mattson <jmattson@google.com> wrote:
>>>
>>> On Fri, Dec 10, 2021 at 2:59 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>>
>>>> On 12/10/21 23:55, Jim Mattson wrote:
>>>>>>
>>>>>> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
>>>>>> packets will include these adjustments, but other timing packets (such
>>>>>> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
>>>>>> TSC packets are typically generated only when generation of other timing
>>>>>> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
>>>>>> sure it's a good thing that the values in TSC packets are scaled and offset.
>>>>>>
>>>>>> Back to the PMU, for non-architectural counters it's not really possible
>>>>>> to know if they count in cycles or not.  So it may not be a good idea to
>>>>>> special case the architectural counters.
>>>>>
>>>>> In that case, what we're doing with the guest PMU is not
>>>>> virtualization. I don't know what it is, but it's not virtualization.

It's a case of profiling the guest from the host side, like "perf kvm"; in that
case, we need to convert the guest's TSC values to the host's view, taking
the guest's TSC scaling into account.

>>>>
>>>> It is virtualization even if it is incompatible with live migration to a
>>>> different SKU (where, as you point out below, multiple TSC frequencies
>>>> might also count as multiple SKUs).  But yeah, it's virtualization with
>>>> more caveats than usual.
>>>
>>> It's not virtualization if the counters don't count at the rate the
>>> guest expects them to count.

We do have "Use TSC scaling" bit in the "Secondary Processor-Based VM-Execution 
Controls".

>>
>> Per the SDM, unhalted reference cycles count at "a fixed frequency."
>> If the frequency changes on migration, then the value of this event is
>> questionable at best. For unhalted core cycles, on the other hand, the
>> SDM says, "The performance counter for this event counts across
>> performance state transitions using different core clock frequencies."
>> That does seem to permit frequency changes on migration, but I suspect
>> that software expects the event to count at a fixed frequency if
>> INVARIANT_TSC is set.

Yes, I may propose that pmu be used in conjunction with INVARIANT_TSC.

> 
> Actually, I now realize that unhalted reference cycles is independent
> of the host or guest TSC, so it is not affected by TSC scaling.

I doubt it.

> However, we still have to decide on a specific fixed frequency to
> virtualize so that the frequency doesn't change on migration. As a
> practical matter, it may be the case that the reference cycles
> frequency is the same on all processors in a migration pool, and we
> don't have to do anything.

Yes, someone is already doing this in a production environment.

> 
> 
>> I'm not sure that I buy your argument regarding consistency. In
>> general, I would expect the hypervisor to exclude non-architected
>> events from the allow-list for any VM instances running in a
>> heterogeneous migration pool. Certainly, those events could be allowed
>> in a heterogeneous migration pool consisting of multiple SKUs of the
>> same microarchitecture running at different clock frequencies, but
>> that seems like a niche case.

IMO, if there are users who want to use the guest PMU, they definitely
want non-architectural events, even without live migration support.

Another input is that we actually have no problem reporting erratic
performance data during live migration or host power-state transitions,
and there are situations where users want to know that these kinds of
things are happening under the hood.

The software performance tuners would not trust the perf data from
a single trial, relying more on statistical conclusions.

>>
>>
>>>>> Exposing non-architectural events is questionable with live migration,
>>>>> and TSC scaling is unnecessary without live migration. I suppose you
>>>>> could have a migration pool with different SKUs of the same generation
>>>>> with 'seemingly compatible' PMU events but different TSC frequencies,
>>>>> in which case it might be reasonable to expose non-architectural
>>>>> events, but I would argue that any of those 'seemingly compatible'
>>>>> events are actually not compatible if they count in cycles.
>>>> I agree.  Support for marshaling/unmarshaling PMU state exists but it's
>>>> more useful for intra-host updates than for actual live migration, since
>>>> these days most live migration will use TSC scaling on the destination.
>>>>
>>>> Paolo
>>>>
>>>>>
>>>>> Unless, of course, Like is right, and the PMU counters do count fractionally.
>>>>>
>>>>
>
Jim Mattson Dec. 16, 2021, 5:52 p.m. UTC | #12
On Thu, Dec 16, 2021 at 1:57 AM Like Xu <like.xu.linux@gmail.com> wrote:
>
> On 13/12/2021 2:37 pm, Jim Mattson wrote:
> > On Sat, Dec 11, 2021 at 8:56 PM Jim Mattson <jmattson@google.com> wrote:
> >>
> >> On Fri, Dec 10, 2021 at 3:31 PM Jim Mattson <jmattson@google.com> wrote:
> >>>
> >>> On Fri, Dec 10, 2021 at 2:59 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> >>>>
> >>>> On 12/10/21 23:55, Jim Mattson wrote:
> >>>>>>
> >>>>>> Even for tracing the SDM says "Like the value returned by RDTSC, TSC
> >>>>>> packets will include these adjustments, but other timing packets (such
> >>>>>> as MTC, CYC, and CBR) are not impacted".  Considering that "stand-alone
> >>>>>> TSC packets are typically generated only when generation of other timing
> >>>>>> packets (MTCs and CYCs) has ceased for a period of time", I'm not even
> >>>>>> sure it's a good thing that the values in TSC packets are scaled and offset.
> >>>>>>
> >>>>>> Back to the PMU, for non-architectural counters it's not really possible
> >>>>>> to know if they count in cycles or not.  So it may not be a good idea to
> >>>>>> special case the architectural counters.
> >>>>>
> >>>>> In that case, what we're doing with the guest PMU is not
> >>>>> virtualization. I don't know what it is, but it's not virtualization.
>
> It's a case of profiling the guest from the host side, like "perf kvm"; in that
> case, we need to convert the guest's TSC values to the host's view, taking
> the guest's TSC scaling into account.

I'm not sure if you are agreeing with me or disagreeing. Basically, my
argument is that the guest should observe a PMU counter programmed
with the "unhalted core cycles" event to be in sync with the guest's
time stamp counter. (If FREEZE_WHILE_SMM or Freeze_PerfMon_On_PMI is
set, the PMU counter may lag behind the time stamp counter, but it
should never get ahead of it.)

> >>>>
> >>>> It is virtualization even if it is incompatible with live migration to a
> >>>> different SKU (where, as you point out below, multiple TSC frequencies
> >>>> might also count as multiple SKUs).  But yeah, it's virtualization with
> >>>> more caveats than usual.
> >>>
> >>> It's not virtualization if the counters don't count at the rate the
> >>> guest expects them to count.
>
> We do have "Use TSC scaling" bit in the "Secondary Processor-Based VM-Execution
> Controls".

Yes, we do. That's what this discussion has been about. That
VM-execution control is documented as follows:

This control determines whether executions of RDTSC, executions of
RDTSCP, and executions of RDMSR that read from the
IA32_TIME_STAMP_COUNTER MSR return a value modified by the TSC
multiplier field (see Section 23.6.5 and Section 24.3).

The SDM is quite specific about what this VM-execution control bit
does, and it makes no mention of PMU events.
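
For reference, the transformation that this control does apply to guest TSC
reads is roughly the following (a sketch of the architectural formula only;
the helper name is illustrative, and the 48 matches the multiplier's
fractional bits):

#include <linux/math64.h>

/*
 * With "use TSC scaling" (and "use TSC offsetting") enabled, RDTSC,
 * RDTSCP and RDMSR of IA32_TIME_STAMP_COUNTER return roughly:
 *
 *	guest_tsc = ((host_tsc * tsc_multiplier) >> 48) + tsc_offset
 *
 * Nothing in this definition changes the rate at which PMU counters
 * tick, which is the gap being discussed in this thread.
 */
static u64 guest_tsc_from_host(u64 host_tsc, u64 tsc_multiplier, u64 tsc_offset)
{
	return mul_u64_u64_shr(host_tsc, tsc_multiplier, 48) + tsc_offset;
}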

> >>
> >> Per the SDM, unhalted reference cycles count at "a fixed frequency."
> >> If the frequency changes on migration, then the value of this event is
> >> questionable at best. For unhalted core cycles, on the other hand, the
> >> SDM says, "The performance counter for this event counts across
> >> performance state transitions using different core clock frequencies."
> >> That does seem to permit frequency changes on migration, but I suspect
> >> that software expects the event to count at a fixed frequency if
> >> INVARIANT_TSC is set.
>
> Yes, I may propose that pmu be used in conjunction with INVARIANT_TSC.
>
> >
> > Actually, I now realize that unhalted reference cycles is independent
> > of the host or guest TSC, so it is not affected by TSC scaling.
>
> I doubt it.

Well, it should be easy to prove, one way or the other. :-)

> > However, we still have to decide on a specific fixed frequency to
> > virtualize so that the frequency doesn't change on migration. As a
> > practical matter, it may be the case that the reference cycles
> > frequency is the same on all processors in a migration pool, and we
> > don't have to do anything.
>
> Yes, someone is already doing this in a production environment.

I'm sure they are. That doesn't mean PMU virtualization is bug-free.

> >
> >
> >> I'm not sure that I buy your argument regarding consistency. In
> >> general, I would expect the hypervisor to exclude non-architected
> >> events from the allow-list for any VM instances running in a
> >> heterogeneous migration pool. Certainly, those events could be allowed
> >> in a heterogeneous migration pool consisting of multiple SKUs of the
> >> same microarchitecture running at different clock frequencies, but
> >> that seems like a niche case.
>
> IMO, if there are users who want to use the guest PMU, they definitely
> want non-architectural events, even without live migration support.
>
There are two scenarios to support: (1) VMs that run on the same
microarchitecture as reported in the guest CPUID. (2) VMs that don't.

Paolo has argued against scaling the architected "unhalted core
cycles" event, because it is infeasible for KVM to recognize and scale
non-architected events that are also TSC based, and the inconsistency
is ugly.
However, in case (2), it is infeasible for KVM to offer any
non-architected events.

To clarify my earlier position, I am arguing that in case (1), TSC
scaling is not likely to be in use, so consistency is not an issue. In
case (2), I don't want to see the inconsistency that would arise every
time the TSC scaling factor changes.

I believe that KVM should be made capable of correctly virtualizing
the "unhalted core cycles" event in the presence of TSC scaling. I'm
happy to put this under a KVM_CAP if there are those who would prefer
that it not be enabled by default.

> Another input is that we actually have no problem reporting erratic
> performance data during live migration or host power-state transitions,
> and there are situations where users want to know that these kinds of
> things are happening under the hood.

I have no idea what you are saying.

> The software performance tuners would not trust the perf data from
> a single trial, relying more on statistical conclusions.

Software performance tuning is not the only use of the PMU.

> >>
> >>
> >>>>> Exposing non-architectural events is questionable with live migration,
> >>>>> and TSC scaling is unnecessary without live migration. I suppose you
> >>>>> could have a migration pool with different SKUs of the same generation
> >>>>> with 'seemingly compatible' PMU events but different TSC frequencies,
> >>>>> in which case it might be reasonable to expose non-architectural
> >>>>> events, but I would argue that any of those 'seemingly compatible'
> >>>>> events are actually not compatible if they count in cycles.
> >>>> I agree.  Support for marshaling/unmarshaling PMU state exists but it's
> >>>> more useful for intra-host updates than for actual live migration, since
> >>>> these days most live migration will use TSC scaling on the destination.
> >>>>
> >>>> Paolo
> >>>>
> >>>>>
> >>>>> Unless, of course, Like is right, and the PMU counters do count fractionally.
> >>>>>
> >>>>
> >

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e41ad1ead721..6c2b2331ffeb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -495,6 +495,7 @@  struct kvm_pmc {
 	 */
 	u64 current_config;
 	bool is_paused;
+	bool intr;
 };
 
 struct kvm_pmu {
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index b7a1ae28ab87..a20207ee4014 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -55,43 +55,41 @@  static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
 	kvm_pmu_deliver_pmi(vcpu);
 }
 
-static void kvm_perf_overflow(struct perf_event *perf_event,
-			      struct perf_sample_data *data,
-			      struct pt_regs *regs)
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 {
-	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 
-	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
-		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
-		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
-	}
+	/* Ignore counters that have been reprogrammed already. */
+	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
+		return;
+
+	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
+	if (!pmc->intr)
+		return;
+
+	/*
+	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
+	 * can be ejected on a guest mode re-entry. Otherwise we can't
+	 * be sure that vcpu wasn't executing hlt instruction at the
+	 * time of vmexit and is not going to re-enter guest mode until
+	 * woken up. So we should wake it, but this is impossible from
+	 * NMI context. Do it from irq work instead.
+	 */
+	if (in_pmi && !kvm_is_in_guest())
+		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
+	else
+		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
 }
 
-static void kvm_perf_overflow_intr(struct perf_event *perf_event,
-				   struct perf_sample_data *data,
-				   struct pt_regs *regs)
+static void kvm_perf_overflow(struct perf_event *perf_event,
+			      struct perf_sample_data *data,
+			      struct pt_regs *regs)
 {
 	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
-	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
-
-	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
-		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
-		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 
-		/*
-		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
-		 * can be ejected on a guest mode re-entry. Otherwise we can't
-		 * be sure that vcpu wasn't executing hlt instruction at the
-		 * time of vmexit and is not going to re-enter guest mode until
-		 * woken up. So we should wake it, but this is impossible from
-		 * NMI context. Do it from irq work instead.
-		 */
-		if (!kvm_is_in_guest())
-			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
-		else
-			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
-	}
+	__kvm_perf_overflow(pmc, true);
 }
 
 static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
@@ -126,7 +124,6 @@  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 	}
 
 	event = perf_event_create_kernel_counter(&attr, -1, current,
-						 intr ? kvm_perf_overflow_intr :
 						 kvm_perf_overflow, pmc);
 	if (IS_ERR(event)) {
 		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
@@ -138,6 +135,7 @@  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 	pmc_to_pmu(pmc)->event_count++;
 	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
 	pmc->is_paused = false;
+	pmc->intr = intr;
 }
 
 static void pmc_pause_counter(struct kvm_pmc *pmc)