Message ID | 20190821182004.102768-1-jmattson@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | kvm: x86: Add Intel PMU MSRs to msrs_to_save[] | expand |
On Wed, Aug 21, 2019 at 11:20 AM Jim Mattson <jmattson@google.com> wrote: > > These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that > userspace knows that these MSRs may be part of the vCPU state. > > Signed-off-by: Jim Mattson <jmattson@google.com> > Reviewed-by: Eric Hankland <ehankland@google.com> > Reviewed-by: Peter Shier <pshier@google.com> > > --- > arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 41 insertions(+) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 93b0bd45ac73..ecaaa411538f 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { > MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, > MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, > MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, > + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, > + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, > + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, > + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, > + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, > + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, > + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, > + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, > + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, > + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, > + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, > + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, > + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, > + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, > + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, > + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, > + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, > + MSR_ARCH_PERFMON_PERFCTR0 + 26, 
MSR_ARCH_PERFMON_PERFCTR0 + 27, > + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, > + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, > + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, > + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, > + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, > + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, > + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, > + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, > + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, > + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, > + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, > + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, > + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, > + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, > + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, > + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, > + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, > + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, > }; > > static unsigned num_msrs_to_save; > @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) > u32 dummy[2]; > unsigned i, j; > > + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, > + "Please update the fixed PMCs in msrs_to_save[]"); > + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, > + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); > + > for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { > if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) > continue; > -- > 2.23.0.187.g17f5b7556c-goog Ping.
On Fri, Sep 6, 2019 at 12:59 PM Krish Sadhukhan <krish.sadhukhan@oracle.com> wrote: > > > On 9/6/19 9:48 AM, Jim Mattson wrote: > > On Wed, Aug 21, 2019 at 11:20 AM Jim Mattson <jmattson@google.com> wrote: > > These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that > userspace knows that these MSRs may be part of the vCPU state. > > Signed-off-by: Jim Mattson <jmattson@google.com> > Reviewed-by: Eric Hankland <ehankland@google.com> > Reviewed-by: Peter Shier <pshier@google.com> > > --- > arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 41 insertions(+) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 93b0bd45ac73..ecaaa411538f 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { > MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, > MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, > MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, > + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, > + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, > + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, > + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, > + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, > + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, > + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, > + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, > + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, > + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, > + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, > + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, > + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, > + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, > + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, > + MSR_ARCH_PERFMON_PERFCTR0 + 22, 
MSR_ARCH_PERFMON_PERFCTR0 + 23, > + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, > + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, > + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, > + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, > + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, > + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, > + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, > + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, > + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, > + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, > + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, > + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, > + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, > + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, > + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, > + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, > + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, > + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, > + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, > + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, > }; > > > Should we have separate #defines for the MSRs that are at offset from the base MSR? How about macros that take an offset argument, rather than a whole slew of new macros? 
> > static unsigned num_msrs_to_save; > @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) > u32 dummy[2]; > unsigned i, j; > > + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, > + "Please update the fixed PMCs in msrs_to_save[]"); > + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, > + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); > > > Just curious how the condition can ever become false because we are comparing two static numbers here. Someone just has to change the macros. In fact, I originally developed this change on a version of the kernel where INTEL_PMC_MAX_FIXED was 3, and so I had: > + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 3, > + "Please update the fixed PMCs in msrs_to_save[]") When I cherry-picked the change to Linux tip, the BUILD_BUG_ON fired, and I updated the fixed PMCs in msrs_to_save[]. > > + > for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { > if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) > continue; > -- > 2.23.0.187.g17f5b7556c-goog > > Ping. > > > Also, since these MSRs are Intel-specific, should these be enumerated via 'intel_pmu_ops' ? msrs_to_save[] is filtered to remove MSRs that aren't supported on the host. Or are you asking something else?
On 9/6/19 1:30 PM, Jim Mattson wrote: > On Fri, Sep 6, 2019 at 12:59 PM Krish Sadhukhan > <krish.sadhukhan@oracle.com> wrote: >> >> On 9/6/19 9:48 AM, Jim Mattson wrote: >> >> On Wed, Aug 21, 2019 at 11:20 AM Jim Mattson <jmattson@google.com> wrote: >> >> These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that >> userspace knows that these MSRs may be part of the vCPU state. >> >> Signed-off-by: Jim Mattson <jmattson@google.com> >> Reviewed-by: Eric Hankland <ehankland@google.com> >> Reviewed-by: Peter Shier <pshier@google.com> >> >> --- >> arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 41 insertions(+) >> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index 93b0bd45ac73..ecaaa411538f 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { >> MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, >> MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, >> MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, >> + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, >> + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, >> + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, >> + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, >> + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, >> + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, >> + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, >> + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, >> + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, >> + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, >> + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, >> + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, >> + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, >> + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, >> + MSR_ARCH_PERFMON_PERFCTR0 + 20, 
MSR_ARCH_PERFMON_PERFCTR0 + 21, >> + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, >> + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, >> + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, >> + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, >> + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, >> + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, >> + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, >> }; >> >> >> Should we have separate #defines for the MSRs that are at offset from the base MSR? > How about macros that take an offset argument, rather than a whole > slew of new macros? Yes, that works too. 
> >> static unsigned num_msrs_to_save; >> @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) >> u32 dummy[2]; >> unsigned i, j; >> >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, >> + "Please update the fixed PMCs in msrs_to_save[]"); >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, >> + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); >> >> >> Just curious how the condition can ever become false because we are comparing two static numbers here. > Someone just has to change the macros. In fact, I originally developed > this change on a version of the kernel where INTEL_PMC_MAX_FIXED was > 3, and so I had: > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 3, >> + "Please update the fixed PMCs in msrs_to_save[]") > When I cherry-picked the change to Linux tip, the BUILD_BUG_ON fired, > and I updated the fixed PMCs in msrs_to_save[]. > >> + >> for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { >> if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) >> continue; >> -- >> 2.23.0.187.g17f5b7556c-goog >> >> Ping. >> >> >> Also, since these MSRs are Intel-specific, should these be enumerated via 'intel_pmu_ops' ? > msrs_to_save[] is filtered to remove MSRs that aren't supported on the > host. Or are you asking something else? I am referring to the fact that we are enumerating Intel-specific MSRs in the generic KVM code. Should there be some sort of #define guard to not compile the code on AMD ?
On Fri, Sep 6, 2019 at 1:43 PM Krish Sadhukhan <krish.sadhukhan@oracle.com> wrote: > > > On 9/6/19 1:30 PM, Jim Mattson wrote: > > On Fri, Sep 6, 2019 at 12:59 PM Krish Sadhukhan > > <krish.sadhukhan@oracle.com> wrote: > >> > >> On 9/6/19 9:48 AM, Jim Mattson wrote: > >> > >> On Wed, Aug 21, 2019 at 11:20 AM Jim Mattson <jmattson@google.com> wrote: > >> > >> These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that > >> userspace knows that these MSRs may be part of the vCPU state. > >> > >> Signed-off-by: Jim Mattson <jmattson@google.com> > >> Reviewed-by: Eric Hankland <ehankland@google.com> > >> Reviewed-by: Peter Shier <pshier@google.com> > >> > >> --- > >> arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ > >> 1 file changed, 41 insertions(+) > >> > >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >> index 93b0bd45ac73..ecaaa411538f 100644 > >> --- a/arch/x86/kvm/x86.c > >> +++ b/arch/x86/kvm/x86.c > >> @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { > >> MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, > >> MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, > >> MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, > >> + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, > >> + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, > >> + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, > >> + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, > >> + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, > >> + 
MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, > >> + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, > >> + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, > >> }; > >> > >> > >> Should we have separate #defines for the MSRs that are at offset from the base MSR? > > How about macros that take an offset argument, rather than a whole > > slew of new macros? > > > Yes, that works too. 
> > > > > >> static unsigned num_msrs_to_save; > >> @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) > >> u32 dummy[2]; > >> unsigned i, j; > >> > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, > >> + "Please update the fixed PMCs in msrs_to_save[]"); > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, > >> + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); > >> > >> > >> Just curious how the condition can ever become false because we are comparing two static numbers here. > > Someone just has to change the macros. In fact, I originally developed > > this change on a version of the kernel where INTEL_PMC_MAX_FIXED was > > 3, and so I had: > > > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 3, > >> + "Please update the fixed PMCs in msrs_to_save[]") > > When I cherry-picked the change to Linux tip, the BUILD_BUG_ON fired, > > and I updated the fixed PMCs in msrs_to_save[]. > > > >> + > >> for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { > >> if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) > >> continue; > >> -- > >> 2.23.0.187.g17f5b7556c-goog > >> > >> Ping. > >> > >> > >> Also, since these MSRs are Intel-specific, should these be enumerated via 'intel_pmu_ops' ? > > msrs_to_save[] is filtered to remove MSRs that aren't supported on the > > host. Or are you asking something else? > > > I am referring to the fact that we are enumerating Intel-specific MSRs > in the generic KVM code. Should there be some sort of #define guard to > not compile the code on AMD ? No. msrs_to_save[] contains *all* MSRs that may be relevant on some platform. Notice that it already includes AMD-only MSRs (e.g. MSR_VM_HSAVE_PA) and Intel-only MSRs (e.g. MSR_IA32_BNDCFGS). The MSRs that are not relevant are filtered out in kvm_init_msr_list(). This list probably should include the AMD equivalents as well, but I haven't looked into the AMD vPMU yet.
On Fri, Sep 6, 2019 at 2:08 PM Jim Mattson <jmattson@google.com> wrote: > > On Fri, Sep 6, 2019 at 1:43 PM Krish Sadhukhan > <krish.sadhukhan@oracle.com> wrote: > > > > > > On 9/6/19 1:30 PM, Jim Mattson wrote: > > > On Fri, Sep 6, 2019 at 12:59 PM Krish Sadhukhan > > > <krish.sadhukhan@oracle.com> wrote: > > >> > > >> On 9/6/19 9:48 AM, Jim Mattson wrote: > > >> > > >> On Wed, Aug 21, 2019 at 11:20 AM Jim Mattson <jmattson@google.com> wrote: > > >> > > >> These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that > > >> userspace knows that these MSRs may be part of the vCPU state. > > >> > > >> Signed-off-by: Jim Mattson <jmattson@google.com> > > >> Reviewed-by: Eric Hankland <ehankland@google.com> > > >> Reviewed-by: Peter Shier <pshier@google.com> > > >> > > >> --- > > >> arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ > > >> 1 file changed, 41 insertions(+) > > >> > > >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > >> index 93b0bd45ac73..ecaaa411538f 100644 > > >> --- a/arch/x86/kvm/x86.c > > >> +++ b/arch/x86/kvm/x86.c > > >> @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { > > >> MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, > > >> MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, > > >> MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, > > >> + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, > > >> + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, > > >> + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, > > >> + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, > > >> + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, > > 
>> + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, > > >> + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, > > >> + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, > > >> + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, > > >> }; > > >> > > >> > > >> Should we have separate #defines for the MSRs that are at 
offset from the base MSR? > > > How about macros that take an offset argument, rather than a whole > > > slew of new macros? > > > > > > Yes, that works too. > > > > > > > > > >> static unsigned num_msrs_to_save; > > >> @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) > > >> u32 dummy[2]; > > >> unsigned i, j; > > >> > > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, > > >> + "Please update the fixed PMCs in msrs_to_save[]"); > > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, > > >> + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); > > >> > > >> > > >> Just curious how the condition can ever become false because we are comparing two static numbers here. > > > Someone just has to change the macros. In fact, I originally developed > > > this change on a version of the kernel where INTEL_PMC_MAX_FIXED was > > > 3, and so I had: > > > > > >> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 3, > > >> + "Please update the fixed PMCs in msrs_to_save[]") > > > When I cherry-picked the change to Linux tip, the BUILD_BUG_ON fired, > > > and I updated the fixed PMCs in msrs_to_save[]. > > > > > >> + > > >> for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { > > >> if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) > > >> continue; > > >> -- > > >> 2.23.0.187.g17f5b7556c-goog > > >> > > >> Ping. > > >> > > >> > > >> Also, since these MSRs are Intel-specific, should these be enumerated via 'intel_pmu_ops' ? > > > msrs_to_save[] is filtered to remove MSRs that aren't supported on the > > > host. Or are you asking something else? > > > > > > I am referring to the fact that we are enumerating Intel-specific MSRs > > in the generic KVM code. Should there be some sort of #define guard to > > not compile the code on AMD ? > > No. msrs_to_save[] contains *all* MSRs that may be relevant on some > platform. Notice that it already includes AMD-only MSRs (e.g. > MSR_VM_HSAVE_PA) and Intel-only MSRs (e.g. MSR_IA32_BNDCFGS). 
The MSRs > that are not relevant are filtered out in kvm_init_msr_list(). > > This list probably should include the AMD equivalents as well, but I > haven't looked into the AMD vPMU yet. Ping.
On 16/09/19 22:43, Jim Mattson wrote: > On Fri, Sep 6, 2019 at 2:08 PM Jim Mattson <jmattson@google.com> wrote: >> >> On Fri, Sep 6, 2019 at 1:43 PM Krish Sadhukhan >> <krish.sadhukhan@oracle.com> wrote: >>> >>> >>> On 9/6/19 1:30 PM, Jim Mattson wrote: >>>> On Fri, Sep 6, 2019 at 12:59 PM Krish Sadhukhan >>>> <krish.sadhukhan@oracle.com> wrote: >>>>> >>>>> On 9/6/19 9:48 AM, Jim Mattson wrote: >>>>> >>>>> On Wed, Aug 21, 2019 at 11:20 AM Jim Mattson <jmattson@google.com> wrote: >>>>> >>>>> These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that >>>>> userspace knows that these MSRs may be part of the vCPU state. >>>>> >>>>> Signed-off-by: Jim Mattson <jmattson@google.com> >>>>> Reviewed-by: Eric Hankland <ehankland@google.com> >>>>> Reviewed-by: Peter Shier <pshier@google.com> >>>>> >>>>> --- >>>>> arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ >>>>> 1 file changed, 41 insertions(+) >>>>> >>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>>>> index 93b0bd45ac73..ecaaa411538f 100644 >>>>> --- a/arch/x86/kvm/x86.c >>>>> +++ b/arch/x86/kvm/x86.c >>>>> @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { >>>>> MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, >>>>> MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, >>>>> MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, >>>>> + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, >>>>> + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, >>>>> + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, >>>>> + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, >>>>> + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 
11, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, >>>>> + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, >>>>> + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, >>>>> }; >>>>> >>>>> >>>>> Should we have separate #defines for the MSRs that are at offset from the base 
MSR? >>>> How about macros that take an offset argument, rather than a whole >>>> slew of new macros? >>> >>> >>> Yes, that works too. >>> >>> >>>> >>>>> static unsigned num_msrs_to_save; >>>>> @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) >>>>> u32 dummy[2]; >>>>> unsigned i, j; >>>>> >>>>> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, >>>>> + "Please update the fixed PMCs in msrs_to_save[]"); >>>>> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, >>>>> + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); >>>>> >>>>> >>>>> Just curious how the condition can ever become false because we are comparing two static numbers here. >>>> Someone just has to change the macros. In fact, I originally developed >>>> this change on a version of the kernel where INTEL_PMC_MAX_FIXED was >>>> 3, and so I had: >>>> >>>>> + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 3, >>>>> + "Please update the fixed PMCs in msrs_to_save[]") >>>> When I cherry-picked the change to Linux tip, the BUILD_BUG_ON fired, >>>> and I updated the fixed PMCs in msrs_to_save[]. >>>> >>>>> + >>>>> for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { >>>>> if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) >>>>> continue; >>>>> -- >>>>> 2.23.0.187.g17f5b7556c-goog >>>>> >>>>> Ping. >>>>> >>>>> >>>>> Also, since these MSRs are Intel-specific, should these be enumerated via 'intel_pmu_ops' ? >>>> msrs_to_save[] is filtered to remove MSRs that aren't supported on the >>>> host. Or are you asking something else? >>> >>> >>> I am referring to the fact that we are enumerating Intel-specific MSRs >>> in the generic KVM code. Should there be some sort of #define guard to >>> not compile the code on AMD ? >> >> No. msrs_to_save[] contains *all* MSRs that may be relevant on some >> platform. Notice that it already includes AMD-only MSRs (e.g. >> MSR_VM_HSAVE_PA) and Intel-only MSRs (e.g. MSR_IA32_BNDCFGS). The MSRs >> that are not relevant are filtered out in kvm_init_msr_list(). 
>> >> This list probably should include the AMD equivalents as well, but I >> haven't looked into the AMD vPMU yet. > > Ping. > Queued, thanks. Paolo
Paolo Bonzini <pbonzini@redhat.com> writes:
> Queued, thanks.
I'm sorry for the late feedback, but this commit seems to be causing
selftest failures for me, e.g.:
# ./x86_64/state_test
Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages
Guest physical address width detected: 46
==== Test Assertion Failure ====
lib/x86_64/processor.c:1089: r == nmsrs
pid=14431 tid=14431 - Argument list too long
1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3)
2 0x00000000004010e3: main at state_test.c:171 (discriminator 4)
3 0x00007f881eb453d4: ?? ??:0
4 0x0000000000401287: _start at ??:?
Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194)
Is this something known already or should I investigate?
On 27/09/19 15:53, Vitaly Kuznetsov wrote: > Paolo Bonzini <pbonzini@redhat.com> writes: > >> Queued, thanks. > > I'm sorry for late feedback but this commit seems to be causing > selftests failures for me, e.g.: > > # ./x86_64/state_test > Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages > Guest physical address width detected: 46 > ==== Test Assertion Failure ==== > lib/x86_64/processor.c:1089: r == nmsrs > pid=14431 tid=14431 - Argument list too long > 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) > 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) > 3 0x00007f881eb453d4: ?? ??:0 > 4 0x0000000000401287: _start at ??:? > Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) > > Is this something known already or should I investigate? No, I didn't know about it, it works here. Paolo
Paolo Bonzini <pbonzini@redhat.com> writes: > On 27/09/19 15:53, Vitaly Kuznetsov wrote: >> Paolo Bonzini <pbonzini@redhat.com> writes: >> >>> Queued, thanks. >> >> I'm sorry for late feedback but this commit seems to be causing >> selftests failures for me, e.g.: >> >> # ./x86_64/state_test >> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages >> Guest physical address width detected: 46 >> ==== Test Assertion Failure ==== >> lib/x86_64/processor.c:1089: r == nmsrs >> pid=14431 tid=14431 - Argument list too long >> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) >> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) >> 3 0x00007f881eb453d4: ?? ??:0 >> 4 0x0000000000401287: _start at ??:? >> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) >> >> Is this something known already or should I investigate? > > No, I didn't know about it, it works here. > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks it against nr_arch_gp_counters and returns a failure. Oh, btw, this is Intel(R) Xeon(R) CPU E5-2603 v3 So apparently rdmsr_safe() check in kvm_init_msr_list() is not enough, for PMU MSRs we need to do extra.
On 27/09/19 16:40, Vitaly Kuznetsov wrote: > Paolo Bonzini <pbonzini@redhat.com> writes: > >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: >>> Paolo Bonzini <pbonzini@redhat.com> writes: >>> >>>> Queued, thanks. >>> >>> I'm sorry for late feedback but this commit seems to be causing >>> selftests failures for me, e.g.: >>> >>> # ./x86_64/state_test >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages >>> Guest physical address width detected: 46 >>> ==== Test Assertion Failure ==== >>> lib/x86_64/processor.c:1089: r == nmsrs >>> pid=14431 tid=14431 - Argument list too long >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) >>> 3 0x00007f881eb453d4: ?? ??:0 >>> 4 0x0000000000401287: _start at ??:? >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) >>> >>> Is this something known already or should I investigate? >> >> No, I didn't know about it, it works here. >> > > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > it against nr_arch_gp_counters and returns a failure. Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need to be checked against CPUID before allowing them. Paolo > Oh, btw, this is Intel(R) Xeon(R) CPU E5-2603 v3 > > So apparently rdmsr_safe() check in kvm_init_msr_list() is not enough, > for PMU MSRs we need to do extra.
On Fri, Sep 27, 2019 at 05:19:25PM +0200, Paolo Bonzini wrote: > On 27/09/19 16:40, Vitaly Kuznetsov wrote: > > Paolo Bonzini <pbonzini@redhat.com> writes: > > > >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: > >>> Paolo Bonzini <pbonzini@redhat.com> writes: > >>> > >>>> Queued, thanks. > >>> > >>> I'm sorry for late feedback but this commit seems to be causing > >>> selftests failures for me, e.g.: > >>> > >>> # ./x86_64/state_test > >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages > >>> Guest physical address width detected: 46 > >>> ==== Test Assertion Failure ==== > >>> lib/x86_64/processor.c:1089: r == nmsrs > >>> pid=14431 tid=14431 - Argument list too long > >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) > >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) > >>> 3 0x00007f881eb453d4: ?? ??:0 > >>> 4 0x0000000000401287: _start at ??:? > >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) That "failed at %x" print line should really be updated to make it clear that it's printing hex... > >>> Is this something known already or should I investigate? > >> > >> No, I didn't know about it, it works here. > >> > > > > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > > it against nr_arch_gp_counters and returns a failure. > > Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need > to be checked against CPUID before allowing them. My vote would be to programmatically generate the MSRs using CPUID and the base MSR, as opposed to dumping them into the list and cross-referencing them against CPUID. E.g. there should also be some form of check that the architectural PMUs are even supported.
Sean Christopherson <sean.j.christopherson@intel.com> writes: > On Fri, Sep 27, 2019 at 05:19:25PM +0200, Paolo Bonzini wrote: >> On 27/09/19 16:40, Vitaly Kuznetsov wrote: >> > Paolo Bonzini <pbonzini@redhat.com> writes: >> > >> >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: >> >>> Paolo Bonzini <pbonzini@redhat.com> writes: >> >>> >> >>>> Queued, thanks. >> >>> >> >>> I'm sorry for late feedback but this commit seems to be causing >> >>> selftests failures for me, e.g.: >> >>> >> >>> # ./x86_64/state_test >> >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages >> >>> Guest physical address width detected: 46 >> >>> ==== Test Assertion Failure ==== >> >>> lib/x86_64/processor.c:1089: r == nmsrs >> >>> pid=14431 tid=14431 - Argument list too long >> >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) >> >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) >> >>> 3 0x00007f881eb453d4: ?? ??:0 >> >>> 4 0x0000000000401287: _start at ??:? >> >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) > > That "failed at %x" print line should really be updated to make it clear > that it's printing hex... > Yea, I also wasn't sure and had to look in the code. Will send a patch if no one beats me to it. >> >>> Is this something known already or should I investigate? >> >> >> >> No, I didn't know about it, it works here. >> >> >> > >> > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + >> > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, >> > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails >> > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks >> > it against nr_arch_gp_counters and returns a failure. >> >> Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need >> to be checked against CPUID before allowing them. 
> > My vote would be to programmatically generate the MSRs using CPUID and the > base MSR, as opposed to dumping them into the list and cross-referencing > them against CPUID. E.g. there should also be some form of check that the > architectural PMUs are even supported. Yes. The problem appears to be that msrs_to_save[] and emulated_msrs[] are global and for the MSRs in question we check kvm_find_cpuid_entry(vcpu, 0xa, ) to find out how many of them are available so this can be different for different VMs (and even vCPUs :-) However, "KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list varies by kvm version and host processor, but does not change otherwise." So it seems that PMU MSRs just can't be there. Revert?
On Fri, Sep 27, 2019 at 8:47 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > > Sean Christopherson <sean.j.christopherson@intel.com> writes: > > > On Fri, Sep 27, 2019 at 05:19:25PM +0200, Paolo Bonzini wrote: > >> On 27/09/19 16:40, Vitaly Kuznetsov wrote: > >> > Paolo Bonzini <pbonzini@redhat.com> writes: > >> > > >> >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: > >> >>> Paolo Bonzini <pbonzini@redhat.com> writes: > >> >>> > >> >>>> Queued, thanks. > >> >>> > >> >>> I'm sorry for late feedback but this commit seems to be causing > >> >>> selftests failures for me, e.g.: > >> >>> > >> >>> # ./x86_64/state_test > >> >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages > >> >>> Guest physical address width detected: 46 > >> >>> ==== Test Assertion Failure ==== > >> >>> lib/x86_64/processor.c:1089: r == nmsrs > >> >>> pid=14431 tid=14431 - Argument list too long > >> >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) > >> >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) > >> >>> 3 0x00007f881eb453d4: ?? ??:0 > >> >>> 4 0x0000000000401287: _start at ??:? > >> >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) > > > > That "failed at %x" print line should really be updated to make it clear > > that it's printing hex... > > > > Yea, I also wasn't sure and had to look in the code. Will send a patch > if no one beats me to it. > > >> >>> Is this something known already or should I investigate? > >> >> > >> >> No, I didn't know about it, it works here. > >> >> > >> > > >> > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > >> > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > >> > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > >> > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > >> > it against nr_arch_gp_counters and returns a failure. > >> > >> Huh, 194h apparently is a "FLEX_RATIO" MSR. 
I agree that PMU MSRs need > >> to be checked against CPUID before allowing them. > > > > My vote would be to programmatically generate the MSRs using CPUID and the > > base MSR, as opposed to dumping them into the list and cross-referencing > > them against CPUID. E.g. there should also be some form of check that the > > architectural PMUs are even supported. > > Yes. The problem appears to be that msrs_to_save[] and emulated_msrs[] > are global and for the MSRs in question we check > kvm_find_cpuid_entry(vcpu, 0xa, ) to find out how many of them are > available so this can be different for different VMs (and even vCPUs :-) > However, > > "KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list > varies by kvm version and host processor, but does not change otherwise." > > So it seems that PMU MSRs just can't be there. Revert? The API design is unfortunate, but I would argue that any MSR that a guest *might* support has to be in this list for live migration to work with the vPMU enabled. I don't know about qemu, but Google's userspace will only save/restore MSRs that are in this list
On Fri, 2019-09-27 at 17:46 +0200, Vitaly Kuznetsov wrote: > > > > > > Is this something known already or should I investigate? > > > > > > > > > > No, I didn't know about it, it works here. > > > > > > > > > > > > > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > > > > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > > > > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > > > > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > > > > it against nr_arch_gp_counters and returns a failure. > > > > > > Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need > > > to be checked against CPUID before allowing them. > > > > My vote would be to programmatically generate the MSRs using CPUID and the > > base MSR, as opposed to dumping them into the list and cross-referencing > > them against CPUID. E.g. there should also be some form of check that the > > architectural PMUs are even supported. > > Yes. The problem appears to be that msrs_to_save[] and emulated_msrs[] > are global and for the MSRs in question we check > kvm_find_cpuid_entry(vcpu, 0xa, ) to find out how many of them are > available so this can be different for different VMs (and even vCPUs :-) > However, > > "KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list > varies by kvm version and host processor, but does not change otherwise." > Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and they are free from different guest configuration since they're initialized when kvm module is loaded. Even though some MSRs are not exposed to guest by clear their related cpuid bits, they are still saved/restored by QEMU in the same fashion. I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > So it seems that PMU MSRs just can't be there. Revert? >
On Fri, Sep 27, 2019 at 8:55 AM Jim Mattson <jmattson@google.com> wrote: > > On Fri, Sep 27, 2019 at 8:47 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > > > > Sean Christopherson <sean.j.christopherson@intel.com> writes: > > > > > On Fri, Sep 27, 2019 at 05:19:25PM +0200, Paolo Bonzini wrote: > > >> On 27/09/19 16:40, Vitaly Kuznetsov wrote: > > >> > Paolo Bonzini <pbonzini@redhat.com> writes: > > >> > > > >> >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: > > >> >>> Paolo Bonzini <pbonzini@redhat.com> writes: > > >> >>> > > >> >>>> Queued, thanks. > > >> >>> > > >> >>> I'm sorry for late feedback but this commit seems to be causing > > >> >>> selftests failures for me, e.g.: > > >> >>> > > >> >>> # ./x86_64/state_test > > >> >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages > > >> >>> Guest physical address width detected: 46 > > >> >>> ==== Test Assertion Failure ==== > > >> >>> lib/x86_64/processor.c:1089: r == nmsrs > > >> >>> pid=14431 tid=14431 - Argument list too long > > >> >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) > > >> >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) > > >> >>> 3 0x00007f881eb453d4: ?? ??:0 > > >> >>> 4 0x0000000000401287: _start at ??:? > > >> >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) > > > > > > That "failed at %x" print line should really be updated to make it clear > > > that it's printing hex... > > > > > > > Yea, I also wasn't sure and had to look in the code. Will send a patch > > if no one beats me to it. > > > > >> >>> Is this something known already or should I investigate? > > >> >> > > >> >> No, I didn't know about it, it works here. > > >> >> > > >> > > > >> > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > > >> > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > > >> > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > > >> > for 0x18e..0x193!) so it stay in the list. 
get_gp_pmc(), however, checks > > >> > it against nr_arch_gp_counters and returns a failure. > > >> > > >> Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need > > >> to be checked against CPUID before allowing them. > > > > > > My vote would be to programmatically generate the MSRs using CPUID and the > > > base MSR, as opposed to dumping them into the list and cross-referencing > > > them against CPUID. E.g. there should also be some form of check that the > > > architectural PMUs are even supported. > > > > Yes. The problem appears to be that msrs_to_save[] and emulated_msrs[] > > are global and for the MSRs in question we check > > kvm_find_cpuid_entry(vcpu, 0xa, ) to find out how many of them are > > available so this can be different for different VMs (and even vCPUs :-) > > However, > > > > "KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list > > varies by kvm version and host processor, but does not change otherwise." > > > > So it seems that PMU MSRs just can't be there. Revert? > > The API design is unfortunate, but I would argue that any MSR that a > guest *might* support has to be in this list for live migration to > work with the vPMU enabled. I don't know about qemu, but Google's > userspace will only save/restore MSRs that are in this list Having said that, please revert the offending commit, and I'll take another shot at it (unless someone else wants to do it).
On 27/09/19 17:55, Jim Mattson wrote: >> "KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list >> varies by kvm version and host processor, but does not change otherwise." >> >> So it seems that PMU MSRs just can't be there. Revert? > > The API design is unfortunate, but I would argue that any MSR that a > guest *might* support has to be in this list for live migration to > work with the vPMU enabled. In theory yes, in practice this breaks any userspace that (such as state_test) blindly takes the list and passes it to KVM_GET_MSR. > I don't know about qemu, but Google's > userspace will only save/restore MSRs that are in this list Almost, there are a few MSRs that it saves/restores always (TSC, kvmclock, MTRR, PAT, sysenter CS/ESP/EIP, and on 64-bit machines CSTAR/KERNELGSBASE/FMASK/LSTAR) and some where it compiles the list fom KVM_GET_SUPPORTED_CPUID information (the PMU and processor tracing). Of these, PMU and processor tracing seem to be the only one that vary per VM. So yeah, it needs to be reverted I suppose. Paolo
On 27/09/19 17:58, Xiaoyao Li wrote: > Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > they are free from different guest configuration since they're initialized when > kvm module is loaded. > > Even though some MSRs are not exposed to guest by clear their related cpuid > bits, they are still saved/restored by QEMU in the same fashion. > > I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? We can add a per-VM version too, yes. Paolo
On Fri, Sep 27, 2019 at 9:05 AM Xiaoyao Li <xiaoyao.li@intel.com> wrote: > > On Fri, 2019-09-27 at 17:46 +0200, Vitaly Kuznetsov wrote: > > > > > > > Is this something known already or should I investigate? > > > > > > > > > > > > No, I didn't know about it, it works here. > > > > > > > > > > > > > > > > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > > > > > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > > > > > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > > > > > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > > > > > it against nr_arch_gp_counters and returns a failure. > > > > > > > > Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need > > > > to be checked against CPUID before allowing them. > > > > > > My vote would be to programmatically generate the MSRs using CPUID and the > > > base MSR, as opposed to dumping them into the list and cross-referencing > > > them against CPUID. E.g. there should also be some form of check that the > > > architectural PMUs are even supported. > > > > Yes. The problem appears to be that msrs_to_save[] and emulated_msrs[] > > are global and for the MSRs in question we check > > kvm_find_cpuid_entry(vcpu, 0xa, ) to find out how many of them are > > available so this can be different for different VMs (and even vCPUs :-) > > However, > > > > "KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list > > varies by kvm version and host processor, but does not change otherwise." > > > > Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > they are free from different guest configuration since they're initialized when > kvm module is loaded. > > Even though some MSRs are not exposed to guest by clear their related cpuid > bits, they are still saved/restored by QEMU in the same fashion. > > I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? Yes! 
Quoting Paolo from a few days ago, "If there's a complex or really weird behavior that userspace would most definitely get wrong, we should design the API to simplify its job." > > So it seems that PMU MSRs just can't be there. Revert? > > >
On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 27/09/19 17:58, Xiaoyao Li wrote: > > Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > > they are free from different guest configuration since they're initialized when > > kvm module is loaded. > > > > Even though some MSRs are not exposed to guest by clear their related cpuid > > bits, they are still saved/restored by QEMU in the same fashion. > > > > I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > > We can add a per-VM version too, yes. Should the system-wide version continue to list *some* supported MSRs and *some* unsupported MSRs, with no rhyme or reason? Or should we codify what that list contains?
On 27/09/19 18:10, Jim Mattson wrote: > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: >> >> On 27/09/19 17:58, Xiaoyao Li wrote: >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and >>> they are free from different guest configuration since they're initialized when >>> kvm module is loaded. >>> >>> Even though some MSRs are not exposed to guest by clear their related cpuid >>> bits, they are still saved/restored by QEMU in the same fashion. >>> >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? >> >> We can add a per-VM version too, yes. There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM ioctl. > Should the system-wide version continue to list *some* supported MSRs > and *some* unsupported MSRs, with no rhyme or reason? Or should we > codify what that list contains? The optimal thing would be for it to list only MSRs that are unconditionally supported by all VMs and are part of the runtime state. MSRs that are not part of the runtime state, such as the VMX capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. Unfortunately, that commit was done because userspace (QEMU) has a genuine need to detect whether KVM is new enough to support the IA32_VMX_VMFUNC MSR. Perhaps we can make all MSRs supported unconditionally if host_initiated. For unsupported performance counters it's easy to make them return 0, and allow setting them to 0, if host_initiated (BTW, how did you pick 32? is there any risk of conflicts with other MSRs?). I'm not sure of the best set of values to allow for VMX caps, especially with the default0/default1 stuff going on for execution controls. But perhaps that would be the simplest thing to do. One possibility would be to make a KVM_GET_MSR_INDEX_LIST variant that is a system ioctl and takes a CPUID vector. 
I'm worried that it would be tedious to get right and hardish to keep correct---so I'm not sure it's a good idea. Paolo
On Fri, Sep 27, 2019 at 06:32:27PM +0200, Paolo Bonzini wrote: > On 27/09/19 18:10, Jim Mattson wrote: > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > >> > >> On 27/09/19 17:58, Xiaoyao Li wrote: > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > >>> they are free from different guest configuration since they're initialized when > >>> kvm module is loaded. > >>> > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > >>> bits, they are still saved/restored by QEMU in the same fashion. > >>> > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > >> > >> We can add a per-VM version too, yes. > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > ioctl. > > > Should the system-wide version continue to list *some* supported MSRs > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > codify what that list contains? > > The optimal thing would be for it to list only MSRs that are > unconditionally supported by all VMs and are part of the runtime state. > MSRs that are not part of the runtime state, such as the VMX > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. > Unfortunately, that commit was done because userspace (QEMU) has a > genuine need to detect whether KVM is new enough to support the > IA32_VMX_VMFUNC MSR. > > Perhaps we can make all MSRs supported unconditionally if > host_initiated. For unsupported performance counters it's easy to make > them return 0, and allow setting them to 0, if host_initiated I don't think we need to go that far. Allowing any ol' MSR access seems like it would cause more problems than it would solve, e.g. userspace could completely botch something and never know. 
For the perf MSRs, could we enumerate all arch perf MSRs that are supported by hardware? That would also be the list of MSRs that host_initiated MSR accesses can touch regardless of guest support. Something like: case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0+INTEL_PMC_MAX_GENERIC: case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0+INTEL_PMC_MAX_GENERIC: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); else if (msr <= num_hw_counters) break; return 1; > (BTW, how did you pick 32? is there any risk of conflicts with other MSRs?). Presumably because INTEL_PMC_MAX_GENERIC is 32. > I'm not sure of the best set of values to allow for VMX caps, especially > with the default0/default1 stuff going on for execution controls. But > perhaps that would be the simplest thing to do. > > One possibility would be to make a KVM_GET_MSR_INDEX_LIST variant that > is a system ioctl and takes a CPUID vector. I'm worried that it would > be tedious to get right and hardish to keep correct---so I'm not sure > it's a good idea. > > Paolo
On 27/09/19 19:14, Sean Christopherson wrote: >> >> Perhaps we can make all MSRs supported unconditionally if >> host_initiated. For unsupported performance counters it's easy to make >> them return 0, and allow setting them to 0, if host_initiated > I don't think we need to go that far. Allowing any ol' MSR access seems > like it would cause more problems than it would solve, e.g. userspace > could completely botch something and never know. Well, I didn't mean really _all_ MSRs, only those returned by KVM_GET_MSR_INDEX_LIST. > For the perf MSRs, could we enumerate all arch perf MSRs that are supported > by hardware? That would also be the list of MSRs that host_initiated MSR > accesses can touch regardless of guest support. Yes, that is easy indeed. Any ideas about VMX? Paolo
On Fri, Sep 27, 2019 at 9:32 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 27/09/19 18:10, Jim Mattson wrote: > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > >> > >> On 27/09/19 17:58, Xiaoyao Li wrote: > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > >>> they are free from different guest configuration since they're initialized when > >>> kvm module is loaded. > >>> > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > >>> bits, they are still saved/restored by QEMU in the same fashion. > >>> > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > >> > >> We can add a per-VM version too, yes. > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > ioctl. > > > Should the system-wide version continue to list *some* supported MSRs > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > codify what that list contains? > > The optimal thing would be for it to list only MSRs that are > unconditionally supported by all VMs and are part of the runtime state. > MSRs that are not part of the runtime state, such as the VMX > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. > Unfortunately, that commit was done because userspace (QEMU) has a > genuine need to detect whether KVM is new enough to support the > IA32_VMX_VMFUNC MSR. > > Perhaps we can make all MSRs supported unconditionally if > host_initiated. For unsupported performance counters it's easy to make > them return 0, and allow setting them to 0, if host_initiated (BTW, how > did you pick 32? is there any risk of conflicts with other MSRs?). 32 comes from INTEL_PMC_MAX_GENERIC. There are definitely conflicts. (Sorry; this should have occurred to me earlier.) 
32 event selectors would occupy indices [0x186, 0x1a6). But on the architectural MSR list, only indices up through 0x197 are "reserved" (presumably for future event selectors). 32 GP counters would occupy indices [0xc1, 0xe1). But on the architectural MSR list, only indices up through 0xc8 are defined for GP counters. None are marked "reserved" for future expansion, but none in the range (0xc8, 0xe1) are defined either. Perhaps INTEL_MAX_PMC_GENERIC should be reduced to 18. If we removed event selectors and counters above 18, would my original approach work? > I'm not sure of the best set of values to allow for VMX caps, especially > with the default0/default1 stuff going on for execution controls. But > perhaps that would be the simplest thing to do. > > One possibility would be to make a KVM_GET_MSR_INDEX_LIST variant that > is a system ioctl and takes a CPUID vector. I'm worried that it would > be tedious to get right and hardish to keep correct---so I'm not sure > it's a good idea. > > Paolo
On Fri, Sep 27, 2019 at 07:19:14PM +0200, Paolo Bonzini wrote: > On 27/09/19 19:14, Sean Christopherson wrote: > >> > >> Perhaps we can make all MSRs supported unconditionally if > >> host_initiated. For unsupported performance counters it's easy to make > >> them return 0, and allow setting them to 0, if host_initiated > > I don't think we need to go that far. Allowing any ol' MSR access seems > > like it would cause more problems than it would solve, e.g. userspace > > could completely botch something and never know. > > Well, I didn't mean really _all_ MSRs, only those returned by > KVM_GET_MSR_INDEX_LIST. Ah, that makes way more sense :-) > > For the perf MSRs, could we enumerate all arch perf MSRs that are supported > > by hardware? That would also be the list of MSRs that host_initiated MSR > > accesses can touch regardless of guest support. > > Yes, that is easy indeed. Any ideas about VMX? Can you elaborate on the problem with VMX MSRs?
On Fri, Sep 27, 2019 at 9:32 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 27/09/19 18:10, Jim Mattson wrote: > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > >> > >> On 27/09/19 17:58, Xiaoyao Li wrote: > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > >>> they are free from different guest configuration since they're initialized when > >>> kvm module is loaded. > >>> > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > >>> bits, they are still saved/restored by QEMU in the same fashion. > >>> > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > >> > >> We can add a per-VM version too, yes. > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > ioctl. > > > Should the system-wide version continue to list *some* supported MSRs > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > codify what that list contains? > > The optimal thing would be for it to list only MSRs that are > unconditionally supported by all VMs and are part of the runtime state. > MSRs that are not part of the runtime state, such as the VMX > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. > Unfortunately, that commit was done because userspace (QEMU) has a > genuine need to detect whether KVM is new enough to support the > IA32_VMX_VMFUNC MSR. > > Perhaps we can make all MSRs supported unconditionally if > host_initiated. For unsupported performance counters it's easy to make > them return 0, and allow setting them to 0, if host_initiated (BTW, how > did you pick 32? is there any risk of conflicts with other MSRs?). > > I'm not sure of the best set of values to allow for VMX caps, especially > with the default0/default1 stuff going on for execution controls. 
But > perhaps that would be the simplest thing to do. > > One possibility would be to make a KVM_GET_MSR_INDEX_LIST variant that > is a system ioctl and takes a CPUID vector. I'm worried that it would > be tedious to get right and hardish to keep correct---so I'm not sure > it's a good idea. Even worse, CPUID alone isn't sufficient. For example, according to the SDM, "The IA32_VMX_VMFUNC MSR exists only on processors that support the 1-setting of the 'activate secondary controls' VM-execution control (only if bit 63 of the IA32_VMX_PROCBASED_CTLS MSR is 1) and the 1-setting of the 'enable VM functions' secondary processor-based VM-execution control (only if bit 45 of the IA32_VMX_PROCBASED_CTLS2 MSR is 1)."
On Fri, Sep 27, 2019 at 10:14 AM Sean Christopherson <sean.j.christopherson@intel.com> wrote: > > On Fri, Sep 27, 2019 at 06:32:27PM +0200, Paolo Bonzini wrote: > > On 27/09/19 18:10, Jim Mattson wrote: > > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > >> > > >> On 27/09/19 17:58, Xiaoyao Li wrote: > > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > > >>> they are free from different guest configuration since they're initialized when > > >>> kvm module is loaded. > > >>> > > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > > >>> bits, they are still saved/restored by QEMU in the same fashion. > > >>> > > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > > >> > > >> We can add a per-VM version too, yes. > > > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > > ioctl. > > > > > Should the system-wide version continue to list *some* supported MSRs > > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > > codify what that list contains? > > > > The optimal thing would be for it to list only MSRs that are > > unconditionally supported by all VMs and are part of the runtime state. > > MSRs that are not part of the runtime state, such as the VMX > > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. > > Unfortunately, that commit was done because userspace (QEMU) has a > > genuine need to detect whether KVM is new enough to support the > > IA32_VMX_VMFUNC MSR. > > > > Perhaps we can make all MSRs supported unconditionally if > > host_initiated. For unsupported performance counters it's easy to make > > them return 0, and allow setting them to 0, if host_initiated > > I don't think we need to go that far. 
Allowing any ol' MSR access seems > like it would cause more problems than it would solve, e.g. userspace > could completely botch something and never know. > > For the perf MSRs, could we enumerate all arch perf MSRs that are supported > by hardware? That would also be the list of MSRs that host_initiated MSR > accesses can touch regardless of guest support. > > Something like: > > case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0+INTEL_PMC_MAX_GENERIC: > case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0+INTEL_PMC_MAX_GENERIC: > if (kvm_pmu_is_valid_msr(vcpu, msr)) > return kvm_pmu_set_msr(vcpu, msr_info); > else if (msr <= num_hw_counters) > break; > return 1; That doesn't quite work, since you need a vcpu, and KVM_GET_MSR_INDEX_LIST is a system-wide ioctl, not a VCPU ioctl. > > (BTW, how did you pick 32? is there any risk of conflicts with other MSRs?). > > Presumably because INTEL_PMC_MAX_GENERIC is 32. > > > I'm not sure of the best set of values to allow for VMX caps, especially > > with the default0/default1 stuff going on for execution controls. But > > perhaps that would be the simplest thing to do. > > > > One possibility would be to make a KVM_GET_MSR_INDEX_LIST variant that > > is a system ioctl and takes a CPUID vector. I'm worried that it would > > be tedious to get right and hardish to keep correct---so I'm not sure > > it's a good idea. > > > > Paolo
On Fri, Sep 27, 2019 at 10:22:51AM -0700, Jim Mattson wrote: > On Fri, Sep 27, 2019 at 9:32 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > > > On 27/09/19 18:10, Jim Mattson wrote: > > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > >> > > >> On 27/09/19 17:58, Xiaoyao Li wrote: > > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > > >>> they are free from different guest configuration since they're initialized when > > >>> kvm module is loaded. > > >>> > > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > > >>> bits, they are still saved/restored by QEMU in the same fashion. > > >>> > > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > > >> > > >> We can add a per-VM version too, yes. > > > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > > ioctl. > > > > > Should the system-wide version continue to list *some* supported MSRs > > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > > codify what that list contains? > > > > The optimal thing would be for it to list only MSRs that are > > unconditionally supported by all VMs and are part of the runtime state. > > MSRs that are not part of the runtime state, such as the VMX > > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. > > Unfortunately, that commit was done because userspace (QEMU) has a > > genuine need to detect whether KVM is new enough to support the > > IA32_VMX_VMFUNC MSR. > > > > Perhaps we can make all MSRs supported unconditionally if > > host_initiated. For unsupported performance counters it's easy to make > > them return 0, and allow setting them to 0, if host_initiated (BTW, how > > did you pick 32? is there any risk of conflicts with other MSRs?). 
> > 32 comes from INTEL_PMC_MAX_GENERIC. There are definitely conflicts. > (Sorry; this should have occurred to me earlier.) 32 event selectors > would occupy indices [0x186, 0x1a6). But on the architectural MSR > list, only indices up through 0x197 are "reserved" (presumably for > future event selectors). 32 GP counters would occupy indices [0xc1, > 0xe1). But on the architectural MSR list, only indices up through 0xc8 > are defined for GP counters. None are marked "reserved" for future > expansion, but none in the range (0xc8, 0xe1) are defined either. > > Perhaps INTEL_PMC_MAX_GENERIC should be reduced to 18. If we removed > event selectors and counters above 18, would my original approach > work? Heh, VMX is technically available on P4 processors, which don't support the architectural PMU. Generating the list based on hardware CPUID seems both safer and easier.
On Fri, Sep 27, 2019 at 10:30:38AM -0700, Jim Mattson wrote: > On Fri, Sep 27, 2019 at 10:14 AM Sean Christopherson > <sean.j.christopherson@intel.com> wrote: > > > > On Fri, Sep 27, 2019 at 06:32:27PM +0200, Paolo Bonzini wrote: > > > On 27/09/19 18:10, Jim Mattson wrote: > > > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > > >> > > > >> On 27/09/19 17:58, Xiaoyao Li wrote: > > > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > > > >>> they are free from different guest configuration since they're initialized when > > > >>> kvm module is loaded. > > > >>> > > > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > > > >>> bits, they are still saved/restored by QEMU in the same fashion. > > > >>> > > > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > > > >> > > > >> We can add a per-VM version too, yes. > > > > > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > > > ioctl. > > > > > > > Should the system-wide version continue to list *some* supported MSRs > > > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > > > codify what that list contains? > > > > > > The optimal thing would be for it to list only MSRs that are > > > unconditionally supported by all VMs and are part of the runtime state. > > > MSRs that are not part of the runtime state, such as the VMX > > > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > > > > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > > > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. > > > Unfortunately, that commit was done because userspace (QEMU) has a > > > genuine need to detect whether KVM is new enough to support the > > > IA32_VMX_VMFUNC MSR. > > > > > > Perhaps we can make all MSRs supported unconditionally if > > > host_initiated. 
For unsupported performance counters it's easy to make > > > them return 0, and allow setting them to 0, if host_initiated > > > > I don't think we need to go that far. Allowing any ol' MSR access seems > > like it would cause more problems than it would solve, e.g. userspace > > could completely botch something and never know. > > > > For the perf MSRs, could we enumerate all arch perf MSRs that are supported > > by hardware? That would also be the list of MSRs that host_initiated MSR > > accesses can touch regardless of guest support. > > > > Something like: > > > > case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0+INTEL_PMC_MAX_GENERIC: > > case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0+INTEL_PMC_MAX_GENERIC: > > if (kvm_pmu_is_valid_msr(vcpu, msr)) > > return kvm_pmu_set_msr(vcpu, msr_info); > > else if (msr <= num_hw_counters) > > break; > > return 1; > > That doesn't quite work, since you need a vcpu, and > KVM_GET_MSR_INDEX_LIST is a system-wide ioctl, not a VCPU ioctl. That'd be for the {kvm,vmx}_set_msr() flow. The KVM_GET_MSR_INDEX_LIST flow would report all MSRs from 0..num_hw_counters, where num_hw_counters is pulled from CPUID.
On Fri, Sep 27, 2019 at 8:19 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 27/09/19 16:40, Vitaly Kuznetsov wrote: > > Paolo Bonzini <pbonzini@redhat.com> writes: > > > >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: > >>> Paolo Bonzini <pbonzini@redhat.com> writes: > >>> > >>>> Queued, thanks. > >>> > >>> I'm sorry for late feedback but this commit seems to be causing > >>> selftests failures for me, e.g.: > >>> > >>> # ./x86_64/state_test > >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages > >>> Guest physical address width detected: 46 > >>> ==== Test Assertion Failure ==== > >>> lib/x86_64/processor.c:1089: r == nmsrs > >>> pid=14431 tid=14431 - Argument list too long > >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) > >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) > >>> 3 0x00007f881eb453d4: ?? ??:0 > >>> 4 0x0000000000401287: _start at ??:? > >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) > >>> > >>> Is this something known already or should I investigate? > >> > >> No, I didn't know about it, it works here. > >> > > > > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > > it against nr_arch_gp_counters and returns a failure. > > Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need > to be checked against CPUID before allowing them. According to the SDM, volume 4, table 2-2, IA-32 Architectural MSRs, 194H is reserved. Sounds like your CPU is mis-architected. :-) > Paolo > > > Oh, btw, this is Intel(R) Xeon(R) CPU E5-2603 v3 > > > > So apparently rdmsr_safe() check in kvm_init_msr_list() is not enough, > > for PMU MSRs we need to do extra. > > >
On Fri, Sep 27, 2019 at 10:36:14AM -0700, Jim Mattson wrote: > On Fri, Sep 27, 2019 at 8:19 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > > > On 27/09/19 16:40, Vitaly Kuznetsov wrote: > > > Paolo Bonzini <pbonzini@redhat.com> writes: > > > > > >> On 27/09/19 15:53, Vitaly Kuznetsov wrote: > > >>> Paolo Bonzini <pbonzini@redhat.com> writes: > > >>> > > >>>> Queued, thanks. > > >>> > > >>> I'm sorry for late feedback but this commit seems to be causing > > >>> selftests failures for me, e.g.: > > >>> > > >>> # ./x86_64/state_test > > >>> Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages > > >>> Guest physical address width detected: 46 > > >>> ==== Test Assertion Failure ==== > > >>> lib/x86_64/processor.c:1089: r == nmsrs > > >>> pid=14431 tid=14431 - Argument list too long > > >>> 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) > > >>> 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) > > >>> 3 0x00007f881eb453d4: ?? ??:0 > > >>> 4 0x0000000000401287: _start at ??:? > > >>> Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) > > >>> > > >>> Is this something known already or should I investigate? > > >> > > >> No, I didn't know about it, it works here. > > >> > > > > > > Ok, this is a bit weird :-) '194' is 'MSR_ARCH_PERFMON_EVENTSEL0 + > > > 14'. In intel_pmu_refresh() nr_arch_gp_counters is set to '8', however, > > > rdmsr_safe() for this MSR passes in kvm_init_msr_list() (but it fails > > > for 0x18e..0x193!) so it stay in the list. get_gp_pmc(), however, checks > > > it against nr_arch_gp_counters and returns a failure. > > > > Huh, 194h apparently is a "FLEX_RATIO" MSR. I agree that PMU MSRs need > > to be checked against CPUID before allowing them. > > According to the SDM, volume 4, table 2-2, IA-32 Architectural MSRs, > 194H is reserved. Sounds like your CPU is mis-architected. 
:-) It's definitely supposed to be reserved, the footnote in that table even doubles down: Starting with Intel Core Duo processors, MSR addresses 180H-185H, 188H-197H are reserved. But looking at some internal definitions for MSR layouts, 194H is used for IA32_FLEX_RATIO MSR on quite literally every processor that is supposed to reserve 188H-197H.
On Fri, Sep 27, 2019 at 10:35 AM Sean Christopherson <sean.j.christopherson@intel.com> wrote: > > On Fri, Sep 27, 2019 at 10:30:38AM -0700, Jim Mattson wrote: > > On Fri, Sep 27, 2019 at 10:14 AM Sean Christopherson > > <sean.j.christopherson@intel.com> wrote: > > > > > > On Fri, Sep 27, 2019 at 06:32:27PM +0200, Paolo Bonzini wrote: > > > > On 27/09/19 18:10, Jim Mattson wrote: > > > > > On Fri, Sep 27, 2019 at 9:06 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > > > >> > > > > >> On 27/09/19 17:58, Xiaoyao Li wrote: > > > > >>> Indeed, "KVM_GET_MSR_INDEX_LIST" returns the guest msrs that KVM supports and > > > > >>> they are free from different guest configuration since they're initialized when > > > > >>> kvm module is loaded. > > > > >>> > > > > >>> Even though some MSRs are not exposed to guest by clear their related cpuid > > > > >>> bits, they are still saved/restored by QEMU in the same fashion. > > > > >>> > > > > >>> I wonder should we change "KVM_GET_MSR_INDEX_LIST" per VM? > > > > >> > > > > >> We can add a per-VM version too, yes. > > > > > > > > There is one problem with that: KVM_SET_CPUID2 is a vCPU ioctl, not a VM > > > > ioctl. > > > > > > > > > Should the system-wide version continue to list *some* supported MSRs > > > > > and *some* unsupported MSRs, with no rhyme or reason? Or should we > > > > > codify what that list contains? > > > > > > > > The optimal thing would be for it to list only MSRs that are > > > > unconditionally supported by all VMs and are part of the runtime state. > > > > MSRs that are not part of the runtime state, such as the VMX > > > > capabilities, should be returned by KVM_GET_MSR_FEATURE_INDEX_LIST. > > > > > > > > This also means that my own commit 95c5c7c77c06 ("KVM: nVMX: list VMX > > > > MSRs in KVM_GET_MSR_INDEX_LIST", 2019-07-02) was incorrect. 
> > > > Unfortunately, that commit was done because userspace (QEMU) has a > > > > genuine need to detect whether KVM is new enough to support the > > > > IA32_VMX_VMFUNC MSR. > > > > > > > > Perhaps we can make all MSRs supported unconditionally if > > > > host_initiated. For unsupported performance counters it's easy to make > > > > them return 0, and allow setting them to 0, if host_initiated > > > > > > I don't think we need to go that far. Allowing any ol' MSR access seems > > > like it would cause more problems than it would solve, e.g. userspace > > > could completely botch something and never know. > > > > > > For the perf MSRs, could we enumerate all arch perf MSRs that are supported > > > by hardware? That would also be the list of MSRs that host_initiated MSR > > > accesses can touch regardless of guest support. > > > > > > Something like: > > > > > > case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0+INTEL_PMC_MAX_GENERIC: > > > case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0+INTEL_PMC_MAX_GENERIC: > > > if (kvm_pmu_is_valid_msr(vcpu, msr)) > > > return kvm_pmu_set_msr(vcpu, msr_info); > > > else if (msr <= num_hw_counters) > > > break; > > > return 1; > > > > That doesn't quite work, since you need a vcpu, and > > KVM_GET_MSR_INDEX_LIST is a system-wide ioctl, not a VCPU ioctl. > > That'd be for the {kvm,vmx}_set_msr() flow. The KVM_GET_MSR_INDEX_LIST > flow would report all MSRs from 0..num_hw_counters, where num_hw_counters > is pulled from CPUID. The try-catch methodology used for all of the other msrs_to_save[] *should* have worked here, if (a) the static PMU MSR list stopped at the correct maximum PMC index (which is clearly less than INTEL_PMC_MAX_GENERIC), and (b) Intel had reserved all of the currently unused PMU MSRs so that accesses to them actually raised #GP. Alas, neither is the case. Revert the change. 
I propose moving these MSRs to the inaptly named emulated_msrs[], reducing the static list to the theoretical maximum of 18, and adding code to vmx_has_emulated_msr to filter based on host CPUID leaf 0AH.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 93b0bd45ac73..ecaaa411538f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1140,6 +1140,42 @@ static u32 msrs_to_save[] = { MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, + MSR_ARCH_PERFMON_EVENTSEL0 + 10, 
MSR_ARCH_PERFMON_EVENTSEL0 + 11, + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; static unsigned num_msrs_to_save; @@ -4989,6 +5025,11 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + "Please update the fixed PMCs in msrs_to_save[]"); + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue;