diff mbox series

KVM: x86/pmu: omit "impossible" Intel counter MSRs from MSR list

Message ID 20220906081604.24035-1-likexu@tencent.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86/pmu: omit "impossible" Intel counter MSRs from MSR list | expand

Commit Message

Like Xu Sept. 6, 2022, 8:16 a.m. UTC
From: Like Xu <likexu@tencent.com>

According to Intel April 2022 SDM - Table 2-2. IA-32 Architectural MSRs,
combined with the address reservation ranges of PERFCTRx, EVENTSELy, and
MSR_IA32_PMCz, the theoretical effective maximum value of the Intel GP
counters is 14, instead of 18:

  14 = 0xE = min (
    0xE = IA32_CORE_CAPABILITIES (0xCF) - IA32_PMC0 (0xC1),
    0xF = IA32_OVERCLOCKING_STATUS (0x195) - IA32_PERFEVTSEL0 (0x186),
    0xF = IA32_MCG_EXT_CTL (0x4D0) - IA32_A_PMC0 (0x4C1)
  )

the source of the incorrect number may be:
  18 = 0x12 = IA32_PERF_STATUS (0x198) - IA32_PERFEVTSEL0 (0x186)
but the range covers IA32_OVERCLOCKING_STATUS, which is also architectural.

Cut the list to 14 entries to avoid false positives.

Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Jim Mattson <jamttson@google.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")
Signed-off-by: Like Xu <likexu@tencent.com>
---
 arch/x86/kvm/x86.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

Comments

Jim Mattson Sept. 7, 2022, 12:37 a.m. UTC | #1
On Tue, Sep 6, 2022 at 1:16 AM Like Xu <like.xu.linux@gmail.com> wrote:
>
> From: Like Xu <likexu@tencent.com>
>
> According to Intel April 2022 SDM - Table 2-2. IA-32 Architectural MSRs,
> combined with the address reservation ranges of PERFCTRx, EVENTSELy, and
> MSR_IA32_PMCz, the theoretical effective maximum value of the Intel GP
> counters is 14, instead of 18:
>
>   14 = 0xE = min (
>     0xE = IA32_CORE_CAPABILITIES (0xCF) - IA32_PMC0 (0xC1),
>     0xF = IA32_OVERCLOCKING_STATUS (0x195) - IA32_PERFEVTSEL0 (0x186),
>     0xF = IA32_MCG_EXT_CTL (0x4D0) - IA32_A_PMC0 (0x4C1)
>   )
>
> the source of the incorrect number may be:
>   18 = 0x12 = IA32_PERF_STATUS (0x198) - IA32_PERFEVTSEL0 (0x186)
> but the range covers IA32_OVERCLOCKING_STATUS, which is also architectural.
> Cut the list to 14 entries to avoid false positives.
>
> Cc: Kan Liang <kan.liang@linux.intel.com>
> Cc: Jim Mattson <jamttson@google.com>

That should be 'jmattson.'

> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
> Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")

I'm not sure I completely agree with the "Fixes," since
IA32_OVERCLOCKING_STATUS didn't exist back then. However, Paolo did
make the incorrect assumption that Intel wouldn't cut the range even
further with the introduction of new MSRs.

To that point, aren't you setting yourself up for a future "Fixes"
referencing this change?

We should probably stop at the maximum number of GP PMCs supported
today (8, I think).

If Intel doubles the number of PMCs to remain competitive with AMD,
they'll probably put PMCs 8-15 in a completely different range of MSR
indices.

> Signed-off-by: Like Xu <likexu@tencent.com>
> ---
>  arch/x86/kvm/x86.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 43a6a7efc6ec..98cdd4221447 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1431,8 +1431,6 @@ static const u32 msrs_to_save_all[] = {
>         MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
>         MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
>         MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
> -       MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
> -       MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
>         MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
>         MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
>         MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
> @@ -1440,8 +1438,6 @@ static const u32 msrs_to_save_all[] = {
>         MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
>         MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
>         MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
> -       MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
> -       MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
>         MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
>
>         MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
> @@ -6943,12 +6939,12 @@ static void kvm_init_msr_list(void)
>                                 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
>                                 continue;
>                         break;
> -               case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
> +               case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 13:
>                         if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
>                             min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
>                                 continue;
>                         break;
> -               case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
> +               case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 13:
>                         if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
>                             min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
>                                 continue;
> --
> 2.37.3
>
Like Xu Sept. 7, 2022, 3:25 a.m. UTC | #2
On 7/9/2022 8:37 am, Jim Mattson wrote:
> On Tue, Sep 6, 2022 at 1:16 AM Like Xu <like.xu.linux@gmail.com> wrote:
>>
>> From: Like Xu <likexu@tencent.com>
>>
>> According to Intel April 2022 SDM - Table 2-2. IA-32 Architectural MSRs,
>> combined with the address reservation ranges of PERFCTRx, EVENTSELy, and
>> MSR_IA32_PMCz, the theoretical effective maximum value of the Intel GP
>> counters is 14, instead of 18:
>>
>>    14 = 0xE = min (
>>      0xE = IA32_CORE_CAPABILITIES (0xCF) - IA32_PMC0 (0xC1),
>>      0xF = IA32_OVERCLOCKING_STATUS (0x195) - IA32_PERFEVTSEL0 (0x186),
>>      0xF = IA32_MCG_EXT_CTL (0x4D0) - IA32_A_PMC0 (0x4C1)
>>    )
>>
>> the source of the incorrect number may be:
>>    18 = 0x12 = IA32_PERF_STATUS (0x198) - IA32_PERFEVTSEL0 (0x186)
>> but the range covers IA32_OVERCLOCKING_STATUS, which is also architectural.
>> Cut the list to 14 entries to avoid false positives.
>>
>> Cc: Kan Liang <kan.liang@linux.intel.com>
>> Cc: Jim Mattson <jamttson@google.com>
> 
> That should be 'jmattson.'

Oops, my fault.

> 
>> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
>> Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")
> 
> I'm not sure I completely agree with the "Fixes," since
> IA32_OVERCLOCKING_STATUS didn't exist back then. However, Paolo did
> make the incorrect assumption that Intel wouldn't cut the range even
> further with the introduction of new MSRs.

This new MSR was added in April 2022.

Driver-like software has to keep up with real hardware changes and,
speculatively, with potential predictable hardware changes until failure.

> 
> To that point, aren't you setting yourself up for a future "Fixes"
> referencing this change?

(1) We have precedents like be4f3b3f8227;
(2) Fixes tags were introduced to help stable trees' maintainers (and their
robot selectors) absorb suitable patches like this one. We can expect similar
issues with stable trees running on new hardware without this fix.
(3) Fixing the tags does not feather the developer's nest; on the contrary, the
upstream code itself, as a vehicle for our group knowledge, is reinforced.

> 
> We should probably stop at the maximum number of GP PMCs supported
> today (8, I think).

I actually thought that at first, until I saw the speculative offset +17 :D.

> 
> If Intel doubles the number of PMCs to remain competitive with AMD,
> they'll probably put PMCs 8-15 in a completely different range of MSR
> indices.

I'll do a little cleanup work as the next version, stopping the number at 8.

> 
>> Signed-off-by: Like Xu <likexu@tencent.com>
>> ---
>>   arch/x86/kvm/x86.c | 8 ++------
>>   1 file changed, 2 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 43a6a7efc6ec..98cdd4221447 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1431,8 +1431,6 @@ static const u32 msrs_to_save_all[] = {
>>          MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
>>          MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
>>          MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
>> -       MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
>> -       MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
>>          MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
>>          MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
>>          MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
>> @@ -1440,8 +1438,6 @@ static const u32 msrs_to_save_all[] = {
>>          MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
>>          MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
>>          MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
>> -       MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
>> -       MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
>>          MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
>>
>>          MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
>> @@ -6943,12 +6939,12 @@ static void kvm_init_msr_list(void)
>>                                  intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
>>                                  continue;
>>                          break;
>> -               case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
>> +               case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 13:
>>                          if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
>>                              min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
>>                                  continue;
>>                          break;
>> -               case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
>> +               case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 13:
>>                          if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
>>                              min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
>>                                  continue;
>> --
>> 2.37.3
>>
Jim Mattson Sept. 7, 2022, 5:06 a.m. UTC | #3
On Tue, Sep 6, 2022 at 8:25 PM Like Xu <like.xu.linux@gmail.com> wrote:
>
> On 7/9/2022 8:37 am, Jim Mattson wrote:
> > On Tue, Sep 6, 2022 at 1:16 AM Like Xu <like.xu.linux@gmail.com> wrote:
> >>
> >> From: Like Xu <likexu@tencent.com>
> >>
> >> According to Intel April 2022 SDM - Table 2-2. IA-32 Architectural MSRs,
> >> combined with the address reservation ranges of PERFCTRx, EVENTSELy, and
> >> MSR_IA32_PMCz, the theoretical effective maximum value of the Intel GP
> >> counters is 14, instead of 18:
> >>
> >>    14 = 0xE = min (
> >>      0xE = IA32_CORE_CAPABILITIES (0xCF) - IA32_PMC0 (0xC1),
> >>      0xF = IA32_OVERCLOCKING_STATUS (0x195) - IA32_PERFEVTSEL0 (0x186),
> >>      0xF = IA32_MCG_EXT_CTL (0x4D0) - IA32_A_PMC0 (0x4C1)
> >>    )
> >>
> >> the source of the incorrect number may be:
> >>    18 = 0x12 = IA32_PERF_STATUS (0x198) - IA32_PERFEVTSEL0 (0x186)
> >> but the range covers IA32_OVERCLOCKING_STATUS, which is also architectural.
> >> Cut the list to 14 entries to avoid false positives.
> >>
> >> Cc: Kan Liang <kan.liang@linux.intel.com>
> >> Cc: Jim Mattson <jamttson@google.com>
> >
> > That should be 'jmattson.'
>
> Oops, my fault.
>
> >
> >> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
> >> Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")
> >
> > I'm not sure I completely agree with the "Fixes," since
> > IA32_OVERCLOCKING_STATUS didn't exist back then. However, Paolo did
> > make the incorrect assumption that Intel wouldn't cut the range even
> > further with the introduction of new MSRs.
>
> This new MSR was added in April 2022.
>
> Driver-like software has to keep up with real hardware changes and,
> speculatively, with potential predictable hardware changes until failure.
>
> >
> > To that point, aren't you setting yourself up for a future "Fixes"
> > referencing this change?
>
> (1) We have precedents like be4f3b3f8227;
> (2) Fixes tags were introduced to help stable trees' maintainers (and their
> robot selectors) absorb suitable patches like this one. We can expect similar
> issues with stable trees running on new hardware without this fix.
> (3) Fixing the tags does not feather the developer's nest; on the contrary, the
> upstream code itself, as a vehicle for our group knowledge, is reinforced.
> >
> > We should probably stop at the maximum number of GP PMCs supported
> > today (8, I think).
>
> I actually thought that at first, until I saw the speculative offset +17 :D.

The root cause of all of this pain is commit a072738e04f0 ("perf, x86:
Implement initial P4 PMU driver"). It bumped X86_PMC_MAX_GENERIC from
8 to 32. That eventually mutated into INTEL_PMC_MAX_GENERIC, which is
what I consulted when I originally added the Intel PMU MSRs to
msrs_to_save[] in commit e2ada66ec418 ("kvm: x86: Add Intel PMU MSRs
to msrs_to_save[]"). My bad for just assuming that I knew what
INTEL_PMC_MAX_GENERIC meant, based solely on its name!

Paolo fixed my commit by reducing the list to 18 PMCs, because of the
known conflict at the time. (Note that the SDM says that there are
actually only 18 PMCs on the P4, but I don't think Paolo factored this
into his change.)

This is all the more reason *not* to put a static list of PMU MSRs
into msrs_to_save[], but to dynamically add the PMU MSRs supported on
the host. If you're on a P4, there will be 18 of them, but they range
from 0x300 to 0x311.
diff mbox series

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43a6a7efc6ec..98cdd4221447 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1431,8 +1431,6 @@  static const u32 msrs_to_save_all[] = {
 	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
 	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
 	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
-	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
-	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
 	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
 	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
 	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
@@ -1440,8 +1438,6 @@  static const u32 msrs_to_save_all[] = {
 	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
 	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
 	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
-	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
-	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
 	MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 
 	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
@@ -6943,12 +6939,12 @@  static void kvm_init_msr_list(void)
 				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
 				continue;
 			break;
-		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 13:
 			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
 			    min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
 				continue;
 			break;
-		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 13:
 			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
 			    min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
 				continue;