[v4,05/10] KVM/x86: expose MSR_IA32_PERF_CAPABILITIES to the guest

Message ID 1545816338-1171-6-git-send-email-wei.w.wang@intel.com
State New, archived
Series Guest LBR Enabling

Commit Message

Wang, Wei W Dec. 26, 2018, 9:25 a.m. UTC
Bits [5:0] of MSR_IA32_PERF_CAPABILITIES indicate the format of
the addresses stored in the LBR stack. Expose those bits to the guest
when the guest LBR feature is enabled.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/perf_event.h | 2 ++
 arch/x86/kvm/cpuid.c              | 2 +-
 arch/x86/kvm/vmx.c                | 9 +++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)
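
For context, the 6-bit LBR format field covered by X86_PERF_CAP_MASK_LBR_FMT
is decoded by the host perf code. The known encodings at the time of this
series (from arch/x86/events/perf_event.h, listed here from memory, so treat
them as indicative rather than authoritative):

	enum {
		LBR_FORMAT_32		= 0x00, /* 32-bit from/to addresses */
		LBR_FORMAT_LIP		= 0x01, /* last IP, linear address */
		LBR_FORMAT_EIP		= 0x02, /* effective IP */
		LBR_FORMAT_EIP_FLAGS	= 0x03, /* EIP plus mispredict flag */
		LBR_FORMAT_EIP_FLAGS2	= 0x04, /* EIP, flags, TSX info */
		LBR_FORMAT_INFO		= 0x05, /* separate LBR_INFO MSRs */
		LBR_FORMAT_TIME		= 0x06, /* EIP plus cycle time */
	};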

Comments

Jim Mattson Jan. 2, 2019, 11:40 p.m. UTC | #1
On Wed, Dec 26, 2018 at 2:01 AM Wei Wang <wei.w.wang@intel.com> wrote:
>
> Bits [5:0] of MSR_IA32_PERF_CAPABILITIES indicate the format of
> the addresses stored in the LBR stack. Expose those bits to the guest
> when the guest LBR feature is enabled.
>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/include/asm/perf_event.h | 2 ++
>  arch/x86/kvm/cpuid.c              | 2 +-
>  arch/x86/kvm/vmx.c                | 9 +++++++++
>  3 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 2f82795..eee09b7 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -87,6 +87,8 @@
>  #define ARCH_PERFMON_BRANCH_MISSES_RETIRED             6
>  #define ARCH_PERFMON_EVENTS_COUNT                      7
>
> +#define X86_PERF_CAP_MASK_LBR_FMT                      0x3f
> +
>  /*
>   * Intel "Architectural Performance Monitoring" CPUID
>   * detection/enumeration details:
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 7bcfa61..3b8a57b 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -365,7 +365,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>                 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
>                 0 /* DS-CPL, VMX, SMX, EST */ |
>                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
> -               F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
> +               F(FMA) | F(CX16) | 0 /* xTPR Update*/ | F(PDCM) |
>                 F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
>                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
>                 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 8d5d984..ee02967 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4161,6 +4161,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                         return 1;
>                 msr_info->data = vcpu->arch.ia32_xss;
>                 break;
> +       case MSR_IA32_PERF_CAPABILITIES:
> +               if (!boot_cpu_has(X86_FEATURE_PDCM))
> +                       return 1;
> +               msr_info->data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);

Since this isn't guarded by vcpu->kvm->arch.lbr_in_guest, it breaks
backwards compatibility, doesn't it?

> +               if (vcpu->kvm->arch.lbr_in_guest)
> +                       msr_info->data &= X86_PERF_CAP_MASK_LBR_FMT;
> +               break;
>         case MSR_TSC_AUX:
>                 if (!msr_info->host_initiated &&
>                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> @@ -4343,6 +4350,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                 else
>                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
>                 break;
> +       case MSR_IA32_PERF_CAPABILITIES:
> +               return 1; /* RO MSR */
>         case MSR_TSC_AUX:
>                 if (!msr_info->host_initiated &&
>                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> --
> 2.7.4
>
Wang, Wei W Jan. 3, 2019, 8 a.m. UTC | #2
On 01/03/2019 07:40 AM, Jim Mattson wrote:
> On Wed, Dec 26, 2018 at 2:01 AM Wei Wang <wei.w.wang@intel.com> wrote:
>> Bits [5:0] of MSR_IA32_PERF_CAPABILITIES indicate the format of
>> the addresses stored in the LBR stack. Expose those bits to the guest
>> when the guest LBR feature is enabled.
>>
>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Andi Kleen <ak@linux.intel.com>
>> ---
>>   arch/x86/include/asm/perf_event.h | 2 ++
>>   arch/x86/kvm/cpuid.c              | 2 +-
>>   arch/x86/kvm/vmx.c                | 9 +++++++++
>>   3 files changed, 12 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
>> index 2f82795..eee09b7 100644
>> --- a/arch/x86/include/asm/perf_event.h
>> +++ b/arch/x86/include/asm/perf_event.h
>> @@ -87,6 +87,8 @@
>>   #define ARCH_PERFMON_BRANCH_MISSES_RETIRED             6
>>   #define ARCH_PERFMON_EVENTS_COUNT                      7
>>
>> +#define X86_PERF_CAP_MASK_LBR_FMT                      0x3f
>> +
>>   /*
>>    * Intel "Architectural Performance Monitoring" CPUID
>>    * detection/enumeration details:
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index 7bcfa61..3b8a57b 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -365,7 +365,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>                  F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
>>                  0 /* DS-CPL, VMX, SMX, EST */ |
>>                  0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
>> -               F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
>> +               F(FMA) | F(CX16) | 0 /* xTPR Update*/ | F(PDCM) |
>>                  F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
>>                  F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
>>                  0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 8d5d984..ee02967 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -4161,6 +4161,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>                          return 1;
>>                  msr_info->data = vcpu->arch.ia32_xss;
>>                  break;
>> +       case MSR_IA32_PERF_CAPABILITIES:
>> +               if (!boot_cpu_has(X86_FEATURE_PDCM))
>> +                       return 1;
>> +               msr_info->data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
> Since this isn't guarded by vcpu->kvm->arch.lbr_in_guest, it breaks
> backwards compatibility, doesn't it?

Right, thanks. Probably better to change it to below:

msr_info->data = 0;
data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
if (vcpu->kvm->arch.lbr_in_guest)
     msr_info->data |= (data & X86_PERF_CAP_MASK_LBR_FMT);

Best,
Wei
Jim Mattson Jan. 3, 2019, 3:25 p.m. UTC | #3
On Wed, Jan 2, 2019 at 11:55 PM Wei Wang <wei.w.wang@intel.com> wrote:

> Right, thanks. Probably better to change it to below:
>
> msr_info->data = 0;
> data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
> if (vcpu->kvm->arch.lbr_in_guest)
>      msr_info->data |= (data & X86_PERF_CAP_MASK_LBR_FMT);
>

This still breaks backwards compatibility. Returning 0 and raising #GP
are not the same.
Wang, Wei W Jan. 7, 2019, 9:15 a.m. UTC | #4
On 01/03/2019 11:25 PM, Jim Mattson wrote:
> On Wed, Jan 2, 2019 at 11:55 PM Wei Wang <wei.w.wang@intel.com> wrote:
>
>> Right, thanks. Probably better to change it to below:
>>
>> msr_info->data = 0;
>> data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
>> if (vcpu->kvm->arch.lbr_in_guest)
>>       msr_info->data |= (data & X86_PERF_CAP_MASK_LBR_FMT);
>>
> This still breaks backwards compatibility. Returning 0 and raising #GP
> are not the same.

I'm not sure about raising #GP in this case.

The PERF_CAP MSR contains more than just the LBR format.
For example, a guest with the lbr=false option could read it to get
PEBS_FMT, which is PERF_CAP[11:8]. We should still offer those bits in
that case.

When lbr=false, the LBR feature is not usable by the guest, so I think
whatever value (0 or otherwise) we give the guest for the LBR_FMT bits
is not important.
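
A minimal sketch of that direction, reusing the names from this patch
(untested, just to illustrate the point above):

	u64 data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);

	/* Hide only the LBR format field when lbr=false; keep e.g. PEBS_FMT. */
	if (!vcpu->kvm->arch.lbr_in_guest)
		data &= ~X86_PERF_CAP_MASK_LBR_FMT;
	msr_info->data = data;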

Best,
Wei
Jim Mattson Jan. 7, 2019, 6:05 p.m. UTC | #5
On Mon, Jan 7, 2019 at 1:09 AM Wei Wang <wei.w.wang@intel.com> wrote:
>
> On 01/03/2019 11:25 PM, Jim Mattson wrote:
> > On Wed, Jan 2, 2019 at 11:55 PM Wei Wang <wei.w.wang@intel.com> wrote:
> >
> >> Right, thanks. Probably better to change it to below:
> >>
> >> msr_info->data = 0;
> >> data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
> >> if (vcpu->kvm->arch.lbr_in_guest)
> >>       msr_info->data |= (data & X86_PERF_CAP_MASK_LBR_FMT);
> >>
> > This still breaks backwards compatibility. Returning 0 and raising #GP
> > are not the same.
>
> I'm not sure about raising #GP in this case.
>
> The PERF_CAP MSR contains more than just the LBR format.
> For example, a guest with the lbr=false option could read it to get
> PEBS_FMT, which is PERF_CAP[11:8]. We should still offer those bits in
> that case.
>
> When lbr=false, the LBR feature is not usable by the guest, so I think
> whatever value (0 or otherwise) we give the guest for the LBR_FMT bits
> is not important.

The issue is compatibility. Prior to your change, reading this MSR
from a VM would raise #GP. After your change, it won't. That means
that if you have a VM migrating between hosts with kernel versions
before and after this change, the results will be inconsistent. In the
forward migration path, RDMSR will first raise #GP, and later will
return 0. In the backward migration path, RDMSR will first return 0,
and later it will raise #GP.

To avoid these inconsistencies, the new MSR behavior should be
explicitly enabled by an opt-in ioctl from userspace (not necessarily
the same ioctl that enables LBR).
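
A rough sketch of that opt-in (the capability name and the per-VM flag
below are hypothetical, purely to illustrate the mechanism):

	/* kvm_vm_ioctl_enable_cap(): hypothetical new capability */
	case KVM_CAP_X86_GUEST_PERF_CAPABILITIES:
		kvm->arch.perf_cap_exposed = true;	/* hypothetical flag */
		r = 0;
		break;

	/* vmx_get_msr(): behavior only changes once userspace opts in */
	case MSR_IA32_PERF_CAPABILITIES:
		if (!vcpu->kvm->arch.perf_cap_exposed)
			return 1;	/* preserve the old #GP behavior */
		msr_info->data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
		break;

With that, a VM migrated from an older kernel keeps raising #GP until its
userspace explicitly enables the capability.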
Andi Kleen Jan. 7, 2019, 6:20 p.m. UTC | #6
> The issue is compatibility. Prior to your change, reading this MSR
> from a VM would raise #GP. After your change, it won't. That means
> that if you have a VM migrating between hosts with kernel versions
> before and after this change, the results will be inconsistent. In the

No it will not be. All Linux kernel uses of this MSR are guarded
by a CPUID check.

-Andi
Jim Mattson Jan. 7, 2019, 6:48 p.m. UTC | #7
On Mon, Jan 7, 2019 at 10:20 AM Andi Kleen <ak@linux.intel.com> wrote:
>
> > The issue is compatibility. Prior to your change, reading this MSR
> > from a VM would raise #GP. After your change, it won't. That means
> > that if you have a VM migrating between hosts with kernel versions
> > before and after this change, the results will be inconsistent. In the
>
> No it will not be. All Linux kernel uses of this MSR are guarded
> by a CPUID check.

Linux usage is irrelevant to the architected behavior of the virtual
CPU. According to volume 4 of the SDM, this MSR is only supported when
CPUID.01H:ECX.PDCM [bit 15] is set. Therefore, kvm should raise #GP
whenever a guest tries to read this MSR and the guest's
CPUID.01H:ECX.PDCM [bit 15] is clear.
Andi Kleen Jan. 7, 2019, 8:14 p.m. UTC | #8
On Mon, Jan 07, 2019 at 10:48:38AM -0800, Jim Mattson wrote:
> On Mon, Jan 7, 2019 at 10:20 AM Andi Kleen <ak@linux.intel.com> wrote:
> >
> > > The issue is compatibility. Prior to your change, reading this MSR
> > > from a VM would raise #GP. After your change, it won't. That means
> > > that if you have a VM migrating between hosts with kernel versions
> > > before and after this change, the results will be inconsistent. In the
> >
> > No it will not be. All Linux kernel uses of this MSR are guarded
> > by a CPUID check.
> 
> Linux usage is irrelevant to the architected behavior of the virtual
> CPU. According to volume 4 of the SDM, this MSR is only supported when
> CPUID.01H:ECX.PDCM [bit 15] is set. Therefore, kvm should raise #GP
> whenever a guest tries to read this MSR and the guest's
> CPUID.01H:ECX.PDCM [bit 15] is clear.

That's not general practice in KVM. There are lots of MSRs
that don't fault even if their CPUID doesn't support them.

It's also not generally true for instructions, where it is even
impossible.

You could argue it should be done for MSRs, but that would
be a much larger patch for lots of MSRs. It seems pointless
to single out this particular one.

In practice I doubt it will make any difference for
real software either way.

-Andi
Jim Mattson Jan. 7, 2019, 9 p.m. UTC | #9
On Mon, Jan 7, 2019 at 12:14 PM Andi Kleen <ak@linux.intel.com> wrote:
>
> On Mon, Jan 07, 2019 at 10:48:38AM -0800, Jim Mattson wrote:
> > On Mon, Jan 7, 2019 at 10:20 AM Andi Kleen <ak@linux.intel.com> wrote:
> > >
> > > > The issue is compatibility. Prior to your change, reading this MSR
> > > > from a VM would raise #GP. After your change, it won't. That means
> > > > that if you have a VM migrating between hosts with kernel versions
> > > > before and after this change, the results will be inconsistent. In the
> > >
> > > No it will not be. All Linux kernel uses of this MSR are guarded
> > > by a CPUID check.
> >
> > Linux usage is irrelevant to the architected behavior of the virtual
> > CPU. According to volume 4 of the SDM, this MSR is only supported when
> > CPUID.01H:ECX.PDCM [bit 15] is set. Therefore, kvm should raise #GP
> > whenever a guest tries to read this MSR and the guest's
> > CPUID.01H:ECX.PDCM [bit 15] is clear.
>
> That's not general practice in KVM. There are lots of MSRs
> that don't fault even if their CPUID doesn't support them.

KVM doesn't really have a "general practice" in this regard. True,
there are lots of MSRs that don't fault even if unsupported. But there
are quite a few that do fault if unsupported. On the Intel side,
MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_BNDCFGS, and
MSR_TSC_AUX all check for the corresponding capabilities in guest
CPUID. In the "common" code, MSR_IA32_TSC_ADJUST,
MSR_AMD64_OSVW_ID_LENGTH, and MSR_AMD64_OSVW_STATUS all check for the
corresponding capabilities in guest CPUID.

> It's also not generally true for instructions, where it is even
> impossible.

Yes, sometimes it is impossible to #UD for instructions that are
supported on the host but not in the guest. However, sometimes it is
possible, and kvm sometimes does enforce this. Some examples that come
to mind are RDTSCP and INVPCID. In fact, kvm goes so far as to require
that the guest CPUID must enumerate PCID support in order to enumerate
INVPCID support, and INVPCID will raise #UD unless both capabilities
are enumerated!
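
For reference, the INVPCID handling in vmx.c's CPUID-update path looks
roughly like this (paraphrased, not a verbatim quote):

	bool invpcid_enabled =
		guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
		guest_cpuid_has(vcpu, X86_FEATURE_PCID);

	if (!invpcid_enabled) {
		/* Clearing the exec control makes INVPCID #UD in the guest. */
		secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
		guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
	}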

> You could argue it should be done for MSRs, but that would
> be a much larger patch for lots of MSRs. It seems pointless
> to single out this particular one.

Past sloppiness isn't really a compelling argument for continued
sloppiness going forward. And there are existing MSRs that do the
appropriate checks, so I'm not singling this one out.

> In practice I doubt it will make any difference for
> real software either way.

Perhaps, but when the correct behavior is so easy to implement, why
not just do it right in the first place?
Wang, Wei W Jan. 8, 2019, 7:53 a.m. UTC | #10
On 01/08/2019 02:48 AM, Jim Mattson wrote:
> On Mon, Jan 7, 2019 at 10:20 AM Andi Kleen <ak@linux.intel.com> wrote:
>>> The issue is compatibility. Prior to your change, reading this MSR
>>> from a VM would raise #GP. After your change, it won't. That means
>>> that if you have a VM migrating between hosts with kernel versions
>>> before and after this change, the results will be inconsistent. In the
>> No it will not be. All Linux kernel uses of this MSR are guarded
>> by a CPUID check.
> Linux usage is irrelevant to the architected behavior of the virtual
> CPU. According to volume 4 of the SDM, this MSR is only supported when
> CPUID.01H:ECX.PDCM [bit 15] is set. Therefore, kvm should raise #GP
> whenever a guest tries to read this MSR and the guest's
> CPUID.01H:ECX.PDCM [bit 15] is clear.
>

Probably one more check would be better:

if (!boot_cpu_has(X86_FEATURE_PDCM) ||
     !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
     return 1;

(The host isn't expected to read this MSR when PDCM is not supported
by the guest, so I didn't add "!msr_info->host_initiated" to the above.)

Best,
Wei
Jim Mattson Jan. 8, 2019, 5:19 p.m. UTC | #11
On Mon, Jan 7, 2019 at 11:48 PM Wei Wang <wei.w.wang@intel.com> wrote:
>
> On 01/08/2019 02:48 AM, Jim Mattson wrote:
> > On Mon, Jan 7, 2019 at 10:20 AM Andi Kleen <ak@linux.intel.com> wrote:
> >>> The issue is compatibility. Prior to your change, reading this MSR
> >>> from a VM would raise #GP. After your change, it won't. That means
> >>> that if you have a VM migrating between hosts with kernel versions
> >>> before and after this change, the results will be inconsistent. In the
> >> No it will not be. All Linux kernel uses of this MSR are guarded
> >> by a CPUID check.
> > Linux usage is irrelevant to the architected behavior of the virtual
> > CPU. According to volume 4 of the SDM, this MSR is only supported when
> > CPUID.01H:ECX.PDCM [bit 15] is set. Therefore, kvm should raise #GP
> > whenever a guest tries to read this MSR and the guest's
> > CPUID.01H:ECX.PDCM [bit 15] is clear.
> >
>
> Probably one more check would be better:
>
> if (!boot_cpu_has(X86_FEATURE_PDCM) ||
>      !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
>      return 1;

Thanks for that change!

> (The host isn't expected to read this MSR when PDCM is not supported
> by the guest, so I didn't add "!msr_info->host_initiated" to the above.)

In keeping with commit 44883f01fe6ae ("KVM: x86: ensure all MSRs can
always be KVM_GET/SET_MSR'd"), I think the host_initiated check should
be added.
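
Putting those two points together, the read side would then look something
like this (a sketch combining the suggestions above, not the final patch):

	case MSR_IA32_PERF_CAPABILITIES:
		if (!boot_cpu_has(X86_FEATURE_PDCM))
			return 1;	/* the host RDMSR below would fault */
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
			return 1;	/* #GP for the guest, per the SDM */
		msr_info->data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
		break;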

Patch

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 2f82795..eee09b7 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -87,6 +87,8 @@ 
 #define ARCH_PERFMON_BRANCH_MISSES_RETIRED		6
 #define ARCH_PERFMON_EVENTS_COUNT			7
 
+#define X86_PERF_CAP_MASK_LBR_FMT			0x3f
+
 /*
  * Intel "Architectural Performance Monitoring" CPUID
  * detection/enumeration details:
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7bcfa61..3b8a57b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -365,7 +365,7 @@  static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		F(FMA) | F(CX16) | 0 /* xTPR Update*/ | F(PDCM) |
 		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8d5d984..ee02967 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4161,6 +4161,13 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vcpu->arch.ia32_xss;
 		break;
+	case MSR_IA32_PERF_CAPABILITIES:
+		if (!boot_cpu_has(X86_FEATURE_PDCM))
+			return 1;
+		msr_info->data = native_read_msr(MSR_IA32_PERF_CAPABILITIES);
+		if (vcpu->kvm->arch.lbr_in_guest)
+			msr_info->data &= X86_PERF_CAP_MASK_LBR_FMT;
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
@@ -4343,6 +4350,8 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
 		break;
+	case MSR_IA32_PERF_CAPABILITIES:
+		return 1; /* RO MSR */
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))