KVM: x86: Provide a capability to disable APERF/MPERF read intercepts

Message ID: 20250225004708.1001320-1-jmattson@google.com
State: New
Series: KVM: x86: Provide a capability to disable APERF/MPERF read intercepts

Commit Message

Jim Mattson Feb. 25, 2025, 12:45 a.m. UTC
Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs
without interception.

The IA32_APERF and IA32_MPERF MSRs are not virtualized. Writes are not
handled at all. The MSR values are not zeroed on vCPU creation, saved
on suspend, or restored on resume. No accommodation is made for
processor migration or for sharing a logical processor with other
tasks. No adjustments are made for non-unit TSC multipliers. The MSRs
do not account for time the same way as the comparable PMU events,
whether the PMU is virtualized by the traditional emulation method or
the new mediated pass-through approach.
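
(For reference, the figure a Linux guest derives from these MSRs is the
ratio of the two counter deltas scaled by the base frequency, roughly:

  effective_freq = base_freq * delta_APERF / delta_MPERF

so anything that perturbs either counter, as described above, perturbs
the reported frequency.)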

Nonetheless, in a properly constrained environment, this capability
can be combined with a guest CPUID table that advertises support for
CPUID.6:ECX.APERFMPERF[bit 0] to induce a Linux guest to report the
effective physical CPU frequency in /proc/cpuinfo. Moreover, there is
no performance cost for this capability.
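
A minimal sketch of the intended userspace usage (vm_fd and the vCPU's
CPUID table are assumed to already exist, and find_cpuid_entry() is a
hypothetical helper, not a KVM API):

  #include <err.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  struct kvm_enable_cap cap = {
          .cap = KVM_CAP_X86_DISABLE_EXITS,
          .args[0] = KVM_X86_DISABLE_EXITS_APERFMPERF,
  };

  /* Like the other DISABLE_EXITS bits, set this before creating vCPUs. */
  if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
          err(1, "KVM_ENABLE_CAP");

  /* Advertise CPUID.6:ECX.APERFMPERF[bit 0] in the guest CPUID table. */
  struct kvm_cpuid_entry2 *ent = find_cpuid_entry(cpuid2, 6, 0);
  ent->ecx |= 1;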

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 Documentation/virt/kvm/api.rst  | 1 +
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/svm/svm.c          | 7 +++++++
 arch/x86/kvm/svm/svm.h          | 2 +-
 arch/x86/kvm/vmx/vmx.c          | 4 ++++
 arch/x86/kvm/x86.c              | 7 +++++--
 arch/x86/kvm/x86.h              | 5 +++++
 include/uapi/linux/kvm.h        | 1 +
 tools/include/uapi/linux/kvm.h  | 4 +++-
 9 files changed, 28 insertions(+), 4 deletions(-)

Comments

Jim Mattson March 13, 2025, 8:40 p.m. UTC | #1
On Mon, Feb 24, 2025 at 4:47 PM Jim Mattson <jmattson@google.com> wrote:
>
> Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs
> without interception.
>
> ...

Ping.

Any thoughts?
Sean Christopherson March 14, 2025, 1:59 p.m. UTC | #2
On Thu, Mar 13, 2025, Jim Mattson wrote:
> On Mon, Feb 24, 2025 at 4:47 PM Jim Mattson <jmattson@google.com> wrote:
> >
> > Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs
> > without interception.
> >
> > The IA32_APERF and IA32_MPERF MSRs are not virtualized. Writes are not
> > handled at all. The MSR values are not zeroed on vCPU creation, saved
> > on suspend, or restored on resume. No accommodation is made for
> > processor migration or for sharing a logical processor with other
> > tasks. No adjustments are made for non-unit TSC multipliers. The MSRs
> > do not account for time the same way as the comparable PMU events,
> > whether the PMU is virtualized by the traditional emulation method or
> > the new mediated pass-through approach.
> >
> > Nonetheless, in a properly constrained environment, this capability
> > can be combined with a guest CPUID table that advertises support for
> > CPUID.6:ECX.APERFMPERF[bit 0] to induce a Linux guest to report the
> > effective physical CPU frequency in /proc/cpuinfo. Moreover, there is
> > no performance cost for this capability.
> >
> > Signed-off-by: Jim Mattson <jmattson@google.com>
> > ---

...

> Any thoughts?

It's absolutely absurd, but I like it.  I would much rather provide functionality
that is flawed in obvious ways, as opposed to functionality that is flawed in
subtle and hard-to-grok ways.  Especially when the former is orders of magnitude
less complex.

I have no objections, so long as we add very explicit disclaimers in the docs.

FWIW, the only reason my response was delayed is because I was trying to figure
out if there's a clean way to avoid adding a large number of capabilities for
things like this.  E.g. if we can add generic uAPI to let userspace disable MSR
interception.  But AFAICT, there aren't very many MSRs where it would be sane to
let the guest read unadulterated values, so it's probably not worth the complexity.
Paolo Bonzini March 14, 2025, 3:07 p.m. UTC | #3
On 3/14/25 14:59, Sean Christopherson wrote:
> On Thu, Mar 13, 2025, Jim Mattson wrote:
>> On Mon, Feb 24, 2025 at 4:47 PM Jim Mattson <jmattson@google.com> wrote:
>>>
>>> Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs
>>> without interception.
>>>
>>> ...
> 
> ...
> 
>> Any thoughts?
> 
> It's absolutely absurd, but I like it.  I would much rather provide functionality
> that is flawed in obvious ways, as opposed to functionality that is flawed in
> subtle and hard-to-grok ways.  Especially when the former is orders of magnitude
> less complex.
> 
> I have no objections, so long as we add very explicit disclaimers in the docs.
> 
> FWIW, the only reason my response was delayed is because I was trying to figure
> out if there's a clean way to avoid adding a large number of capabilities for
> things like this.

True, but it's not even a new capability; it's just a new bit in the existing
KVM_CAP_X86_DISABLE_EXITS.

Just one question:

> -       u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
> +       u64 r = KVM_X86_DISABLE_EXITS_PAUSE | KVM_X86_DISABLE_EXITS_APERFMPERF;

Should it be conditional on the host having the APERFMPERF feature 
itself?  As is, the patch _does_ do something sensible, i.e. #GP, but
this puts the burden on userspace of checking the host CPUID and 
figuring out whether it makes sense to expose the feature to the guest. 
It would be simpler for userspace to be able to say "if the bit is there 
then enable it and make it visible through CPUID".
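
Concretely, the host-side check in question is just CPUID.6:ECX[0],
e.g. (a sketch using the compiler's <cpuid.h>; not part of the patch):

  #include <cpuid.h>
  #include <stdbool.h>

  /* True iff the host itself advertises CPUID.6:ECX.APERFMPERF[bit 0]. */
  static bool host_has_aperfmperf(void)
  {
          unsigned int eax, ebx, ecx, edx;

          return __get_cpuid(6, &eax, &ebx, &ecx, &edx) && (ecx & 1);
  }

Every userspace would have to duplicate that check, whereas KVM could
simply mask the bit out of kvm_get_allowed_disable_exits() when the
host lacks the feature.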

Paolo
Jim Mattson March 14, 2025, 3:33 p.m. UTC | #4
On Fri, Mar 14, 2025 at 8:07 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 3/14/25 14:59, Sean Christopherson wrote:
> > On Thu, Mar 13, 2025, Jim Mattson wrote:
> >> On Mon, Feb 24, 2025 at 4:47 PM Jim Mattson <jmattson@google.com> wrote:
> >>>
> >>> Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs
> >>> without interception.
> >>>
> >>> ...
> >
> > ...
>
> True, but it's not even a new capability; it's just a new bit in the existing
> KVM_CAP_X86_DISABLE_EXITS.
>
> Just one question:
>
> > -       u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
> > +       u64 r = KVM_X86_DISABLE_EXITS_PAUSE | KVM_X86_DISABLE_EXITS_APERFMPERF;
>
> Should it be conditional on the host having the APERFMPERF feature
> itself?  As is the patch _does_ do something sensible, i.e. #GP, but
> this puts the burden on userspace of checking the host CPUID and
> figuring out whether it makes sense to expose the feature to the guest.
> It would be simpler for userspace to be able to say "if the bit is there
> then enable it and make it visible through CPUID".

Good point. I'll take care of that in v2.

I feel like I am abandoning my principles with this patch, but as long
as you and Sean are on board, I will do what needs to be done.
Paolo Bonzini March 14, 2025, 3:34 p.m. UTC | #5
On 3/14/25 16:33, Jim Mattson wrote:
>> It would be simpler for userspace to be able to say "if the bit is there
>> then enable it and make it visible through CPUID".
> Good point. I'll take care of that in v2.
> 
> I feel like I am abandoning my principles with this patch, but as long
> as you and Sean are on board, I will do what needs to be done.

True, but there's a time for that as well.  As long as it's not enabled 
by default, stuff like this (or even quirks that _are_ enabled by 
default) has its place.

Paolo

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 2b52eb77e29c..6431cd33f06a 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7684,6 +7684,7 @@  Valid bits in args[0] are::
   #define KVM_X86_DISABLE_EXITS_HLT              (1 << 1)
   #define KVM_X86_DISABLE_EXITS_PAUSE            (1 << 2)
   #define KVM_X86_DISABLE_EXITS_CSTATE           (1 << 3)
+  #define KVM_X86_DISABLE_EXITS_APERFMPERF       (1 << 4)
 
 Enabling this capability on a VM provides userspace with a way to no
 longer intercept some instructions for improved latency in some
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0b7af5902ff7..53de91fccc20 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1380,6 +1380,7 @@  struct kvm_arch {
 	bool hlt_in_guest;
 	bool pause_in_guest;
 	bool cstate_in_guest;
+	bool aperfmperf_in_guest;
 
 	unsigned long irq_sources_bitmap;
 	s64 kvmclock_offset;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a713c803a3a3..5ebcbff341bc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -111,6 +111,8 @@  static const struct svm_direct_access_msrs {
 	{ .index = MSR_IA32_CR_PAT,			.always = false },
 	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
 	{ .index = MSR_TSC_AUX,				.always = false },
+	{ .index = MSR_IA32_APERF,			.always = false },
+	{ .index = MSR_IA32_MPERF,			.always = false },
 	{ .index = X2APIC_MSR(APIC_ID),			.always = false },
 	{ .index = X2APIC_MSR(APIC_LVR),		.always = false },
 	{ .index = X2APIC_MSR(APIC_TASKPRI),		.always = false },
@@ -1359,6 +1361,11 @@  static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
 
+	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_APERF, 1, 0);
+		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_MPERF, 1, 0);
+	}
+
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9d7cdb8fbf87..3ee2b7e07395 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,7 +44,7 @@  static inline struct page *__sme_pa_to_page(unsigned long pa)
 #define	IOPM_SIZE PAGE_SIZE * 3
 #define	MSRPM_SIZE PAGE_SIZE * 2
 
-#define MAX_DIRECT_ACCESS_MSRS	48
+#define MAX_DIRECT_ACCESS_MSRS	50
 #define MSRPM_OFFSETS	32
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6c56d5235f0f..88a555328932 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7597,6 +7597,10 @@  int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
 		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
 	}
+	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+	}
 
 	vmx->loaded_vmcs = &vmx->vmcs01;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02159c967d29..98f3df24ac9a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4533,7 +4533,7 @@  static inline bool kvm_can_mwait_in_guest(void)
 
 static u64 kvm_get_allowed_disable_exits(void)
 {
-	u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+	u64 r = KVM_X86_DISABLE_EXITS_PAUSE | KVM_X86_DISABLE_EXITS_APERFMPERF;
 
 	if (!mitigate_smt_rsb) {
 		r |= KVM_X86_DISABLE_EXITS_HLT |
@@ -6543,7 +6543,8 @@  int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 
 		if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
 		    cpu_smt_possible() &&
-		    (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+		    (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+				      KVM_X86_DISABLE_EXITS_APERFMPERF)))
 			pr_warn_once(SMT_RSB_MSG);
 
 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
@@ -6554,6 +6555,8 @@  int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			kvm->arch.hlt_in_guest = true;
 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
 			kvm->arch.cstate_in_guest = true;
+		if (cap->args[0] & KVM_X86_DISABLE_EXITS_APERFMPERF)
+			kvm->arch.aperfmperf_in_guest = true;
 		r = 0;
 disable_exits_unlock:
 		mutex_unlock(&kvm->lock);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 91e50a513100..0c3ac99454e5 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -488,6 +488,11 @@  static inline bool kvm_cstate_in_guest(struct kvm *kvm)
 	return kvm->arch.cstate_in_guest;
 }
 
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+	return kvm->arch.aperfmperf_in_guest;
+}
+
 static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
 {
 	return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 45e6d8fca9b9..b4a4eb52f6df 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -617,6 +617,7 @@  struct kvm_ioeventfd {
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
 #define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
+#define KVM_X86_DISABLE_EXITS_APERFMPERF     (1 << 4)
 
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 502ea63b5d2e..9b60f0509cdc 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -617,10 +617,12 @@  struct kvm_ioeventfd {
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
 #define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
+#define KVM_X86_DISABLE_EXITS_APERFMPERF     (1 << 4)
 #define KVM_X86_DISABLE_VALID_EXITS          (KVM_X86_DISABLE_EXITS_MWAIT | \
                                               KVM_X86_DISABLE_EXITS_HLT | \
                                               KVM_X86_DISABLE_EXITS_PAUSE | \
-                                              KVM_X86_DISABLE_EXITS_CSTATE)
+					      KVM_X86_DISABLE_EXITS_CSTATE | \
+					      KVM_X86_DISABLE_EXITS_APERFMPERF)
 
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {