diff mbox series

[v2,04/23] KVM: x86: Inhibit AVIC SPTEs if any vCPU enables x2APIC

Message ID 20220903002254.2411750-5-seanjc@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: AVIC and local APIC fixes+cleanups | expand

Commit Message

Sean Christopherson Sept. 3, 2022, 12:22 a.m. UTC
Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
to fix a bug where the APIC access page is visible to vCPUs that have
x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.

On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
enabled even without x2AVIC support, the bug occurs any time AVIC is
enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
that the guest is operating in x2APIC mode.

Opportunistically drop the "can" while updating avic_activate_vmcb()'s
comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
the "Note:" down a line to conform to preferred kernel/KVM multi-line
comment style.

Leave Intel as-is for now to avoid a subtle performance regression, even
though Intel likely suffers from the same bug.  On Intel, in theory the
bug rears its head only when vCPUs share host page tables (extremely
likely) and x2APIC enabling is not consistent within the guest, i.e. if
some vCPUs have x2APIC enabled and other does do not (unlikely to occur
except in certain situations, e.g. bringing up APs).

Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode")
Cc: stable@vger.kernel.org
Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h | 10 ++++++++++
 arch/x86/kvm/lapic.c            |  4 +++-
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/svm/avic.c         | 15 +++++++-------
 arch/x86/kvm/x86.c              | 35 +++++++++++++++++++++++++++++----
 5 files changed, 53 insertions(+), 13 deletions(-)

Comments

Paolo Bonzini Sept. 5, 2022, 10:02 p.m. UTC | #1
On 9/3/22 02:22, Sean Christopherson wrote:
> Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> to fix a bug where the APIC access page is visible to vCPUs that have
> x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> 
> On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> enabled even without x2AVIC support, the bug occurs any time AVIC is
> enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> that the guest is operating in x2APIC mode.
> 
> Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> the "Note:" down a line to conform to preferred kernel/KVM multi-line
> comment style.
> 
> Leave Intel as-is for now to avoid a subtle performance regression, even
> though Intel likely suffers from the same bug.  On Intel, in theory the
> bug rears its head only when vCPUs share host page tables (extremely
> likely) and x2APIC enabling is not consistent within the guest, i.e. if
> some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> except in certain situations, e.g. bringing up APs).
> 
> Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode")
> Cc: stable@vger.kernel.org
> Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/include/asm/kvm_host.h | 10 ++++++++++
>   arch/x86/kvm/lapic.c            |  4 +++-
>   arch/x86/kvm/mmu/mmu.c          |  2 +-
>   arch/x86/kvm/svm/avic.c         | 15 +++++++-------
>   arch/x86/kvm/x86.c              | 35 +++++++++++++++++++++++++++++----
>   5 files changed, 53 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2c96c43c313a..1fd1b66ceeb6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1132,6 +1132,15 @@ enum kvm_apicv_inhibit {
>   	 * AVIC is disabled because SEV doesn't support it.
>   	 */
>   	APICV_INHIBIT_REASON_SEV,
> +
> +	/*
> +	 * Due to sharing page tables across vCPUs, the xAPIC memslot must be
> +	 * inhibited if any vCPU has x2APIC enabled.  Note, this is a "partial"
> +	 * inhibit; APICv can still be activated, but KVM mustn't retain/create
> +	 * SPTEs for the APIC access page.  Like the APIC ID and APIC base
> +	 * inhibits, this is sticky for simplicity.
> +	 */
> +	APICV_INHIBIT_REASON_X2APIC,
>   };
>   
>   struct kvm_arch {
> @@ -1903,6 +1912,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
>   gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
>   				struct x86_exception *exception);
>   
> +bool kvm_apicv_memslot_activated(struct kvm *kvm);
>   bool kvm_apicv_activated(struct kvm *kvm);
>   bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
>   void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 38e9b8e5278c..d956cd37908e 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2394,8 +2394,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>   		}
>   	}
>   
> -	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
> +	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
>   		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
> +		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
> +	}
>   
>   	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
>   		kvm_vcpu_update_apicv(vcpu);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e418ef3ecfcb..cea25552869f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4150,7 +4150,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>   		 * when the AVIC is re-enabled.
>   		 */
>   		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
> -		    !kvm_apicv_activated(vcpu->kvm))
> +		    !kvm_apicv_memslot_activated(vcpu->kvm))
>   			return RET_PF_EMULATE;
>   	}
>   
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 6a3d225eb02c..19be5f1afaac 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -72,12 +72,12 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
>   
>   	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
>   
> -	/* Note:
> -	 * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
> -	 * MSR accesses, while interrupt injection to a running vCPU
> -	 * can be achieved using AVIC doorbell. The AVIC hardware still
> -	 * accelerate MMIO accesses, but this does not cause any harm
> -	 * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
> +	/*
> +	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
> +	 * accesses, while interrupt injection to a running vCPU can be
> +	 * achieved using AVIC doorbell.  KVM disables the APIC access page
> +	 * (prevents mapping it into the guest) if any vCPU has x2APIC enabled,
> +	 * thus enabling AVIC activates only the doorbell mechanism.
>   	 */
>   	if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
>   	    avic_mode == AVIC_MODE_X2) {
> @@ -1014,7 +1014,8 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
>   			  BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
>   			  BIT(APICV_INHIBIT_REASON_SEV)      |
>   			  BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
> -			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
> +			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) |
> +			  BIT(APICV_INHIBIT_REASON_X2APIC);
>   
>   	return supported & BIT(reason);
>   }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d7374d768296..6ab9088c2531 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9379,15 +9379,29 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
>   	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
>   }
>   
> -bool kvm_apicv_activated(struct kvm *kvm)
> +bool kvm_apicv_memslot_activated(struct kvm *kvm)
>   {
>   	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
>   }
> +
> +static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
> +{
> +	/*
> +	 * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
> +	 * APICv can continue to be utilized.
> +	 */
> +	return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;
> +}
> +
> +bool kvm_apicv_activated(struct kvm *kvm)
> +{
> +	return !kvm_apicv_get_inhibit_reasons(kvm);
> +}
>   EXPORT_SYMBOL_GPL(kvm_apicv_activated);
>   
>   bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
>   {
> -	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
> +	ulong vm_reasons = kvm_apicv_get_inhibit_reasons(vcpu->kvm);
>   	ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
>   
>   	return (vm_reasons | vcpu_reasons) == 0;
> @@ -10122,7 +10136,15 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
>   
>   	set_or_clear_apicv_inhibit(&new, reason, set);
>   
> -	if (!!old != !!new) {
> +	/*
> +	 * If the overall "is APICv activated" status is unchanged, simply add
> +	 * or remove the inihbit from the pile.  x2APIC is an exception, as it
> +	 * is a partial inhibit (only blocks SPTEs for the APIC access page).
> +	 * If x2APIC is the only inhibit in either the old or the new set, then
> +	 * vCPUs need to be kicked to transition between partially-inhibited
> +	 * and fully-inhibited.
> +	 */
> +	if ((!!old != !!new) || old == X2APIC_ENABLE || new == X2APIC_ENABLE) {
>   		/*
>   		 * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
>   		 * false positives in the sanity check WARN in svm_vcpu_run().
> @@ -10137,7 +10159,12 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
>   		 */
>   		kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
>   		kvm->arch.apicv_inhibit_reasons = new;
> -		if (new) {
> +
> +		/*
> +		 * Zap SPTEs for the APIC access page if APICv is newly
> +		 * inhibited (partially or fully).
> +		 */
> +		if (new && !old) {
>   			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
>   			kvm_zap_gfn_range(kvm, gfn, gfn+1);
>   		}

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Suthikulpanit, Suravee Sept. 13, 2022, 7:52 p.m. UTC | #2
Sean,

This patch inhibits VM running in x2APIC mode on system w/ x2AVIC support.

On 9/2/2022 7:22 PM, Sean Christopherson wrote:
> Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> to fix a bug where the APIC access page is visible to vCPUs that have
> x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> 
> On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> enabled even without x2AVIC support, the bug occurs any time AVIC is
> enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> that the guest is operating in x2APIC mode.
> 
> Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> the "Note:" down a line to conform to preferred kernel/KVM multi-line
> comment style.
> 
> Leave Intel as-is for now to avoid a subtle performance regression, even
> though Intel likely suffers from the same bug.  On Intel, in theory the
> bug rears its head only when vCPUs share host page tables (extremely
> likely) and x2APIC enabling is not consistent within the guest, i.e. if
> some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> except in certain situations, e.g. bringing up APs).
> 
> Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode")
> Cc: stable@vger.kernel.org
> Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/include/asm/kvm_host.h | 10 ++++++++++
>   arch/x86/kvm/lapic.c            |  4 +++-
>   arch/x86/kvm/mmu/mmu.c          |  2 +-
>   arch/x86/kvm/svm/avic.c         | 15 +++++++-------
>   arch/x86/kvm/x86.c              | 35 +++++++++++++++++++++++++++++----
>   5 files changed, 53 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2c96c43c313a..1fd1b66ceeb6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1132,6 +1132,15 @@ enum kvm_apicv_inhibit {
>   	 * AVIC is disabled because SEV doesn't support it.
>   	 */
>   	APICV_INHIBIT_REASON_SEV,
> +
> +	/*
> +	 * Due to sharing page tables across vCPUs, the xAPIC memslot must be
> +	 * inhibited if any vCPU has x2APIC enabled.  Note, this is a "partial"
> +	 * inhibit; APICv can still be activated, but KVM mustn't retain/create
> +	 * SPTEs for the APIC access page.  Like the APIC ID and APIC base
> +	 * inhibits, this is sticky for simplicity.
> +	 */
> +	APICV_INHIBIT_REASON_X2APIC,

Actually, shouldn't the APICV_INHIBIT_REASON_X2APIC is set only when 
vCPU has x2APIC enabled on the system with _NO x2AVIC support_ ? For 
example, .....

>   };
>   
>   struct kvm_arch {
> @@ -1903,6 +1912,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
>   gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
>   				struct x86_exception *exception);
>   
> +bool kvm_apicv_memslot_activated(struct kvm *kvm);
>   bool kvm_apicv_activated(struct kvm *kvm);
>   bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
>   void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 38e9b8e5278c..d956cd37908e 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2394,8 +2394,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>   		}
>   	}
>   
> -	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
> +	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
>   		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
> +		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
> +	}

.... Here, since we do not want to inhibit APICV/AVIC on system that can 
support x2AVIC, this should be set in the vendor-specific call-back 
function, where appropriate checks can be made.

>   
>   	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
>   		kvm_vcpu_update_apicv(vcpu);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e418ef3ecfcb..cea25552869f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4150,7 +4150,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>   		 * when the AVIC is re-enabled.
>   		 */
>   		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
> -		    !kvm_apicv_activated(vcpu->kvm))
> +		    !kvm_apicv_memslot_activated(vcpu->kvm))
>   			return RET_PF_EMULATE;
>   	}
>   
> ....
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d7374d768296..6ab9088c2531 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9379,15 +9379,29 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
>   	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
>   }
>   
> -bool kvm_apicv_activated(struct kvm *kvm)
> +bool kvm_apicv_memslot_activated(struct kvm *kvm)
>   {
>   	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
>   }
> +
> +static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
> +{
> +	/*
> +	 * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
> +	 * APICv can continue to be utilized.
> +	 */
> +	return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;
> +}
> +
> +bool kvm_apicv_activated(struct kvm *kvm)
> +{
> +	return !kvm_apicv_get_inhibit_reasons(kvm);
> +}
>   EXPORT_SYMBOL_GPL(kvm_apicv_activated);
>   
>   bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
>   {
> -	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
> +	ulong vm_reasons = kvm_apicv_get_inhibit_reasons(vcpu->kvm);
>   	ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
>   
>   	return (vm_reasons | vcpu_reasons) == 0;
> @@ -10122,7 +10136,15 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
>   
>   	set_or_clear_apicv_inhibit(&new, reason, set);
>   
> -	if (!!old != !!new) {
> +	/*
> +	 * If the overall "is APICv activated" status is unchanged, simply add
> +	 * or remove the inihbit from the pile.  x2APIC is an exception, as it
> +	 * is a partial inhibit (only blocks SPTEs for the APIC access page).
> +	 * If x2APIC is the only inhibit in either the old or the new set, then
> +	 * vCPUs need to be kicked to transition between partially-inhibited
> +	 * and fully-inhibited.
> +	 */
> +	if ((!!old != !!new) || old == X2APIC_ENABLE || new == X2APIC_ENABLE) {

Why are we comparing APICV inhibit reasons (old, new) with X2APIC_ENABLE 
here? Do you mean to compare with APICV_INHIBIT_REASON_X2APIC?

Thanks,
Suravee

>   		/*
>   		 * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
>   		 * false positives in the sanity check WARN in svm_vcpu_run().
> @@ -10137,7 +10159,12 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
>   		 */
>   		kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
>   		kvm->arch.apicv_inhibit_reasons = new;
> -		if (new) {
> +
> +		/*
> +		 * Zap SPTEs for the APIC access page if APICv is newly
> +		 * inhibited (partially or fully).
> +		 */
> +		if (new && !old) {
>   			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
>   			kvm_zap_gfn_range(kvm, gfn, gfn+1);
>   		}
Sean Christopherson Sept. 14, 2022, 7:39 a.m. UTC | #3
On Tue, Sep 13, 2022, Suthikulpanit, Suravee wrote:
> Sean,
> 
> This patch inhibits VM running in x2APIC mode on system w/ x2AVIC support.

The intent is that it only inhibits the MMIO page.

> On 9/2/2022 7:22 PM, Sean Christopherson wrote:
> > Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> > to fix a bug where the APIC access page is visible to vCPUs that have
> > x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> > 
> > On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> > enabled even without x2AVIC support, the bug occurs any time AVIC is
> > enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> > that the guest is operating in x2APIC mode.
> > 
> > Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> > comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> > the "Note:" down a line to conform to preferred kernel/KVM multi-line
> > comment style.
> > 
> > Leave Intel as-is for now to avoid a subtle performance regression, even
> > though Intel likely suffers from the same bug.  On Intel, in theory the
> > bug rears its head only when vCPUs share host page tables (extremely
> > likely) and x2APIC enabling is not consistent within the guest, i.e. if
> > some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> > except in certain situations, e.g. bringing up APs).
> > 
> > Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode")
> > Cc: stable@vger.kernel.org
> > Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >   arch/x86/include/asm/kvm_host.h | 10 ++++++++++
> >   arch/x86/kvm/lapic.c            |  4 +++-
> >   arch/x86/kvm/mmu/mmu.c          |  2 +-
> >   arch/x86/kvm/svm/avic.c         | 15 +++++++-------
> >   arch/x86/kvm/x86.c              | 35 +++++++++++++++++++++++++++++----
> >   5 files changed, 53 insertions(+), 13 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2c96c43c313a..1fd1b66ceeb6 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1132,6 +1132,15 @@ enum kvm_apicv_inhibit {
> >   	 * AVIC is disabled because SEV doesn't support it.
> >   	 */
> >   	APICV_INHIBIT_REASON_SEV,
> > +
> > +	/*
> > +	 * Due to sharing page tables across vCPUs, the xAPIC memslot must be
> > +	 * inhibited if any vCPU has x2APIC enabled.  Note, this is a "partial"
> > +	 * inhibit; APICv can still be activated, but KVM mustn't retain/create
> > +	 * SPTEs for the APIC access page.  Like the APIC ID and APIC base
> > +	 * inhibits, this is sticky for simplicity.
> > +	 */
> > +	APICV_INHIBIT_REASON_X2APIC,
> 
> Actually, shouldn't the APICV_INHIBIT_REASON_X2APIC is set only when vCPU
> has x2APIC enabled on the system with _NO x2AVIC support_ ? For example,
> .....
> 
> >   };
> >   struct kvm_arch {
> > @@ -1903,6 +1912,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
> >   gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
> >   				struct x86_exception *exception);
> > +bool kvm_apicv_memslot_activated(struct kvm *kvm);
> >   bool kvm_apicv_activated(struct kvm *kvm);
> >   bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
> >   void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 38e9b8e5278c..d956cd37908e 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -2394,8 +2394,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> >   		}
> >   	}
> > -	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
> > +	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
> >   		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
> > +		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
> > +	}
> 
> .... Here, since we do not want to inhibit APICV/AVIC on system that can
> support x2AVIC, this should be set in the vendor-specific call-back
> function, where appropriate checks can be made.

No, again the intent is to inhibit only the MMIO page.  The x2APIC inhibit is
ignored when determining whether or not APICv is inhibited, but is included when
checking if the memslot is inhibited.

bool kvm_apicv_memslot_activated(struct kvm *kvm)
{
	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
}

static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
{
	/*
	 * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
	 * APICv can continue to be utilized.
	 */
	return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;
}

> > @@ -10122,7 +10136,15 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
> >   	set_or_clear_apicv_inhibit(&new, reason, set);
> > -	if (!!old != !!new) {
> > +	/*
> > +	 * If the overall "is APICv activated" status is unchanged, simply add
> > +	 * or remove the inihbit from the pile.  x2APIC is an exception, as it
> > +	 * is a partial inhibit (only blocks SPTEs for the APIC access page).
> > +	 * If x2APIC is the only inhibit in either the old or the new set, then
> > +	 * vCPUs need to be kicked to transition between partially-inhibited
> > +	 * and fully-inhibited.
> > +	 */
> > +	if ((!!old != !!new) || old == X2APIC_ENABLE || new == X2APIC_ENABLE) {
> 
> Why are we comparing APICV inhibit reasons (old, new) with X2APIC_ENABLE
> here? Do you mean to compare with APICV_INHIBIT_REASON_X2APIC?

Yeaaaaah.
Suthikulpanit, Suravee Sept. 14, 2022, 5:41 p.m. UTC | #4
Sean

On 9/14/2022 2:39 AM, Sean Christopherson wrote:
>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>> index 38e9b8e5278c..d956cd37908e 100644
>>> --- a/arch/x86/kvm/lapic.c
>>> +++ b/arch/x86/kvm/lapic.c
>>> @@ -2394,8 +2394,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>>>    		}
>>>    	}
>>> -	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
>>> +	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
>>>    		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
>>> +		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
>>> +	}
>> .... Here, since we do not want to inhibit APICV/AVIC on system that can
>> support x2AVIC, this should be set in the vendor-specific call-back
>> function, where appropriate checks can be made.
> No, again the intent is to inhibit only the MMIO page.  The x2APIC inhibit is
> ignored when determining whether or not APICv is inhibited, but is included when
> checking if the memslot is inhibited.
> 
> bool kvm_apicv_memslot_activated(struct kvm *kvm)
> {
> 	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
> }
> 
> static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
> {
> 	/*
> 	 * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
> 	 * APICv can continue to be utilized.
> 	 */
> 	return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;

Also, this should be:

return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~(1UL << 
APICV_INHIBIT_REASON_X2APIC);

Suravee
Sean Christopherson Sept. 16, 2022, 5:47 p.m. UTC | #5
On Wed, Sep 14, 2022, Suthikulpanit, Suravee wrote:
> Sean
> 
> On 9/14/2022 2:39 AM, Sean Christopherson wrote:
> > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > index 38e9b8e5278c..d956cd37908e 100644
> > > > --- a/arch/x86/kvm/lapic.c
> > > > +++ b/arch/x86/kvm/lapic.c
> > > > @@ -2394,8 +2394,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> > > >    		}
> > > >    	}
> > > > -	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
> > > > +	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
> > > >    		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
> > > > +		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
> > > > +	}
> > > .... Here, since we do not want to inhibit APICV/AVIC on system that can
> > > support x2AVIC, this should be set in the vendor-specific call-back
> > > function, where appropriate checks can be made.
> > No, again the intent is to inhibit only the MMIO page.  The x2APIC inhibit is
> > ignored when determining whether or not APICv is inhibited, but is included when
> > checking if the memslot is inhibited.
> > 
> > bool kvm_apicv_memslot_activated(struct kvm *kvm)
> > {
> > 	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
> > }
> > 
> > static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
> > {
> > 	/*
> > 	 * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
> > 	 * APICv can continue to be utilized.
> > 	 */
> > 	return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;
> 
> Also, this should be:
> 
> return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~(1UL <<
> APICV_INHIBIT_REASON_X2APIC);

Ugh, I found and fixed this locally when testing, but lost the change when shuffling
code between systems.

Thanks!
Sean Christopherson Sept. 16, 2022, 7:10 p.m. UTC | #6
On Tue, Sep 13, 2022, Suthikulpanit, Suravee wrote:
> > @@ -10122,7 +10136,15 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
> >   	set_or_clear_apicv_inhibit(&new, reason, set);
> > -	if (!!old != !!new) {
> > +	/*
> > +	 * If the overall "is APICv activated" status is unchanged, simply add
> > +	 * or remove the inihbit from the pile.  x2APIC is an exception, as it
> > +	 * is a partial inhibit (only blocks SPTEs for the APIC access page).
> > +	 * If x2APIC is the only inhibit in either the old or the new set, then
> > +	 * vCPUs need to be kicked to transition between partially-inhibited
> > +	 * and fully-inhibited.
> > +	 */
> > +	if ((!!old != !!new) || old == X2APIC_ENABLE || new == X2APIC_ENABLE) {
> 
> Why are we comparing APICV inhibit reasons (old, new) with X2APIC_ENABLE
> here? Do you mean to compare with APICV_INHIBIT_REASON_X2APIC?

Heh, the truly hilarious part about this is that the code actually works, because
by pure coincidence, X2APIC_ENABLE == BIT(APICV_INHIBIT_REASON_X2APIC).  Obviously
still needs to be changed, just found it amusing.
Maxim Levitsky Sept. 20, 2022, 1:07 p.m. UTC | #7
On Sat, 2022-09-03 at 00:22 +0000, Sean Christopherson wrote:
> Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> to fix a bug where the APIC access page is visible to vCPUs that have
> x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> 
> On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> enabled even without x2AVIC support, the bug occurs any time AVIC is
> enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> that the guest is operating in x2APIC mode.
> 
> Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> the "Note:" down a line to conform to preferred kernel/KVM multi-line
> comment style.
> 
> Leave Intel as-is for now to avoid a subtle performance regression, even
> though Intel likely suffers from the same bug.  On Intel, in theory the
> bug rears its head only when vCPUs share host page tables (extremely
> likely) and x2APIC enabling is not consistent within the guest, i.e. if
> some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> except in certain situations, e.g. bringing up APs).

Are you sure about this?

This is what I am thinking will happen, I might be wrong but I am not sure:

Due to kvm internal memory slot at 0xfee00000 userspace is very unlikely
to put a memslot over this range - let assume that userspace didn't map
RAM over the 0xfee00000 page.

In fact if userspace did try to put the memory slot either first vCPU creation
will fail because this is where we create the private memslot,
or userspace memslot will fail if it was added later.

So assuming no guest RAM in the area, the correct x86 behavier should be:

xapic enabled - 0xfee00xxx acts as APIC
x2apic enabled - 0xfee00xxx is unmapped mmio, thus writes are dropped, reads return 0xFFFFFFFF.

However currently what will happen with APICv/AVIC (regardless of the hybrid mode):

1. we start with xapic, the guest accesses it once, and that populates our SPTE entry
for 0xfee00000, but since xapic is enabled, read/writes to it are redirected to apic backing page
(the spte is ignored)

2. guest changes mode to x2apic (even on all vCPUS).
That doesn't currently change any inhibits or so,
thus I *think* only vmx_set_virtual_apic_mode is called, and it doesn't flush the SPTE
for 0xfee00000, it remains present and writable.

3. guest accesses the 0xfee00xxx, assuming APICv/x2avic, the hardware won't redirect
the access to apic backing page, but will instead just use that SPTE and let the guest
read/write the private page that we mapped in the range, which is wrong.

Am I missing something?

Also I somewhat doen't like the partial inhibit - it is to some extent misleading.
I don't have a very strong option on using the inhibit, but its meaning just feels a bit overloaded.

So why not to do it this way:

1. zap the SPTE always when switching apic mode (e.g move the code in 
__kvm_set_or_clear_apicv_inhibit to a common funtion)

2. make kvm_faultin_pfn check a flag 'any vcpu enabled x2apic' and refuse
to re-install that spte?

Something like that (only compile tested, and likely misses memory barriers):


commit 0d00c3716d5073d58d018fab9e5a261600483e25
Author: Maxim Levitsky <mlevitsk@redhat.com>
Date:   Tue Sep 20 14:39:57 2022 +0300

    KVM: x86: hide APICv/AVIC mmio when guest enables x2apic
    
   
    TBD
    
    Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bd4b8c27129cc9..fe8ba17481fe1d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1356,6 +1356,9 @@ struct kvm_arch {
 	 */
 #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
 	struct kvm_mmu_memory_cache split_desc_cache;
+
+
+	bool apicv_mmio_hidden;
 };
 
 struct kvm_vm_stat {
@@ -1914,6 +1917,8 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 				    enum kvm_apicv_inhibit reason, bool set);
 
+void kvm_update_virtual_apic_mode(struct kvm_vcpu *vcpu);
+
 static inline void kvm_set_apicv_inhibit(struct kvm *kvm,
 					 enum kvm_apicv_inhibit reason)
 {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c448b6e4b3ad33..c18094b95c4ba9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2442,10 +2442,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
 		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
 
-	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
-		kvm_vcpu_update_apicv(vcpu);
-		static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
-	}
+	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
+		kvm_update_virtual_apic_mode(vcpu);
 
 	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 86fe9a7ced1700..4ef3c87662742d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4153,9 +4153,9 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * MMIO SPTE.  That way the cache doesn't need to be purged
 		 * when the AVIC is re-enabled.
 		 */
-		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
-		    !kvm_apicv_activated(vcpu->kvm))
-			return RET_PF_EMULATE;
+		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+			if (!kvm_apicv_activated(vcpu->kvm) || vcpu->kvm->arch.apicv_mmio_hidden)
+				return RET_PF_EMULATE;
 	}
 
 	async = false;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0bc974cdcced10..fc0c6ec79ad194 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10127,6 +10127,13 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
+static void kvm_zap_apicv_memslot(struct kvm *kvm)
+{
+	unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+
+	kvm_zap_gfn_range(kvm, gfn, gfn+1);
+}
+
 void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 				      enum kvm_apicv_inhibit reason, bool set)
 {
@@ -10156,10 +10163,8 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 		 */
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
 		kvm->arch.apicv_inhibit_reasons = new;
-		if (new) {
-			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
-			kvm_zap_gfn_range(kvm, gfn, gfn+1);
-		}
+		if (new)
+			kvm_zap_apicv_memslot(kvm);
 	} else {
 		kvm->arch.apicv_inhibit_reasons = new;
 	}
@@ -10177,6 +10182,31 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
 
+void kvm_update_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	if (!enable_apicv)
+		return;
+
+	if (apic_x2apic_mode(vcpu->arch.apic) && !READ_ONCE(kvm->arch.apicv_mmio_hidden)) {
+		/*
+		 * Permanently hide APICv/AVIC private KVM memory slot when one of
+		 * vCPUs enables the X2APIC for the first time.
+		 *
+		 * That ensures that no vCPU could access this memory slot
+		 * in case of hardware supported x2apic acceleration
+		 * (apicv/x2avic), and also that in case of hybrid AVIC mode,
+		 * the AVIC doesn't continue to accelerate this MMIO.
+		 */
+		WRITE_ONCE(kvm->arch.apicv_mmio_hidden, true);
+		kvm_zap_apicv_memslot(kvm);
+	}
+
+	kvm_vcpu_update_apicv(vcpu);
+	static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
+}
+
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	if (!kvm_apic_present(vcpu))



Best regards,
	Maxim Levitsky


> 
> Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode")
> Cc: stable@vger.kernel.org
> Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 10 ++++++++++
>  arch/x86/kvm/lapic.c            |  4 +++-
>  arch/x86/kvm/mmu/mmu.c          |  2 +-
>  arch/x86/kvm/svm/avic.c         | 15 +++++++-------
>  arch/x86/kvm/x86.c              | 35 +++++++++++++++++++++++++++++----
>  5 files changed, 53 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2c96c43c313a..1fd1b66ceeb6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1132,6 +1132,15 @@ enum kvm_apicv_inhibit {
>          * AVIC is disabled because SEV doesn't support it.
>          */
>         APICV_INHIBIT_REASON_SEV,
> +
> +       /*
> +        * Due to sharing page tables across vCPUs, the xAPIC memslot must be
> +        * inhibited if any vCPU has x2APIC enabled.  Note, this is a "partial"
> +        * inhibit; APICv can still be activated, but KVM mustn't retain/create
> +        * SPTEs for the APIC access page.  Like the APIC ID and APIC base
> +        * inhibits, this is sticky for simplicity.
> +        */
> +       APICV_INHIBIT_REASON_X2APIC,
>  };
>  
>  struct kvm_arch {
> @@ -1903,6 +1912,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
>  gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
>                                 struct x86_exception *exception);
>  
> +bool kvm_apicv_memslot_activated(struct kvm *kvm);
>  bool kvm_apicv_activated(struct kvm *kvm);
>  bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
>  void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 38e9b8e5278c..d956cd37908e 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2394,8 +2394,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>                 }
>         }
>  
> -       if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
> +       if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
>                 kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
> +               kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
> +       }
>  
>         if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
>                 kvm_vcpu_update_apicv(vcpu);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e418ef3ecfcb..cea25552869f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4150,7 +4150,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                  * when the AVIC is re-enabled.
>                  */
>                 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
> -                   !kvm_apicv_activated(vcpu->kvm))
> +                   !kvm_apicv_memslot_activated(vcpu->kvm))
>                         return RET_PF_EMULATE;
>         }
>  
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 6a3d225eb02c..19be5f1afaac 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -72,12 +72,12 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
>  
>         vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
>  
> -       /* Note:
> -        * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
> -        * MSR accesses, while interrupt injection to a running vCPU
> -        * can be achieved using AVIC doorbell. The AVIC hardware still
> -        * accelerate MMIO accesses, but this does not cause any harm
> -        * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
> +       /*
> +        * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
> +        * accesses, while interrupt injection to a running vCPU can be
> +        * achieved using AVIC doorbell.  KVM disables the APIC access page
> +        * (prevents mapping it into the guest) if any vCPU has x2APIC enabled,
> +        * thus enabling AVIC activates only the doorbell mechanism.
>          */
>         if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
>             avic_mode == AVIC_MODE_X2) {
> @@ -1014,7 +1014,8 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
>                           BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
>                           BIT(APICV_INHIBIT_REASON_SEV)      |
>                           BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
> -                         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
> +                         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) |
> +                         BIT(APICV_INHIBIT_REASON_X2APIC);
>  
>         return supported & BIT(reason);
>  }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d7374d768296..6ab9088c2531 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9379,15 +9379,29 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
>         kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
>  }
>  
> -bool kvm_apicv_activated(struct kvm *kvm)
> +bool kvm_apicv_memslot_activated(struct kvm *kvm)
>  {
>         return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
>  }
> +
> +static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
> +{
> +       /*
> +        * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
> +        * APICv can continue to be utilized.
> +        */
> +       return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;
> +}
> +
> +bool kvm_apicv_activated(struct kvm *kvm)
> +{
> +       return !kvm_apicv_get_inhibit_reasons(kvm);
> +}
>  EXPORT_SYMBOL_GPL(kvm_apicv_activated);
>  
>  bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
>  {
> -       ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
> +       ulong vm_reasons = kvm_apicv_get_inhibit_reasons(vcpu->kvm);
>         ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
>  
>         return (vm_reasons | vcpu_reasons) == 0;
> @@ -10122,7 +10136,15 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
>  
>         set_or_clear_apicv_inhibit(&new, reason, set);
>  
> -       if (!!old != !!new) {
> +       /*
> +        * If the overall "is APICv activated" status is unchanged, simply add
> +        * or remove the inihbit from the pile.  x2APIC is an exception, as it
> +        * is a partial inhibit (only blocks SPTEs for the APIC access page).
> +        * If x2APIC is the only inhibit in either the old or the new set, then
> +        * vCPUs need to be kicked to transition between partially-inhibited
> +        * and fully-inhibited.
> +        */
> +       if ((!!old != !!new) || old == X2APIC_ENABLE || new == X2APIC_ENABLE) {
>                 /*
>                  * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
>                  * false positives in the sanity check WARN in svm_vcpu_run().
> @@ -10137,7 +10159,12 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
>                  */
>                 kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
>                 kvm->arch.apicv_inhibit_reasons = new;
> -               if (new) {
> +
> +               /*
> +                * Zap SPTEs for the APIC access page if APICv is newly
> +                * inhibited (partially or fully).
> +                */
> +               if (new && !old) {
>                         unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
>                         kvm_zap_gfn_range(kvm, gfn, gfn+1);
>                 }
Sean Christopherson Sept. 20, 2022, 3:46 p.m. UTC | #8
On Tue, Sep 20, 2022, Maxim Levitsky wrote:
> On Sat, 2022-09-03 at 00:22 +0000, Sean Christopherson wrote:
> > Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> > to fix a bug where the APIC access page is visible to vCPUs that have
> > x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> > 
> > On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> > enabled even without x2AVIC support, the bug occurs any time AVIC is
> > enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> > that the guest is operating in x2APIC mode.
> > 
> > Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> > comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> > the "Note:" down a line to conform to preferred kernel/KVM multi-line
> > comment style.
> > 
> > Leave Intel as-is for now to avoid a subtle performance regression, even
> > though Intel likely suffers from the same bug.  On Intel, in theory the
> > bug rears its head only when vCPUs share host page tables (extremely
> > likely) and x2APIC enabling is not consistent within the guest, i.e. if
> > some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> > except in certain situations, e.g. bringing up APs).
> 
> Are you sure about this?

Ah, no.  The key on Intel is the separate VMCS control for virtualizing xAPIC
accesses.  As you note below, KVM will provide memory semantics, which is technically
wrong.

> This is what I am thinking will happen, I might be wrong but I am not sure:

...

> 3. guest accesses the 0xfee00xxx, assuming APICv/x2avic, the hardware won't redirect
> the access to apic backing page, but will instead just use that SPTE and let the guest
> read/write the private page that we mapped in the range, which is wrong.
> 
> Am I missing something?

No, I don't believe so.  I'm still hesitant to add the effetive inhibit to Intel,
though that's probably just pure paranoia at this point.  Probably makes sense to
just do it and verify that x2APIC virtualization still works.

> Also I somewhat doen't like the partial inhibit - it is to some extent
> misleading.  I don't have a very strong option on using the inhibit, but its
> meaning just feels a bit overloaded.
> 
> So why not to do it this way:
> 
> 1. zap the SPTE always when switching apic mode (e.g move the code in 
> __kvm_set_or_clear_apicv_inhibit to a common funtion)
> 
> 2. make kvm_faultin_pfn check a flag 'any vcpu enabled x2apic' and refuse
> to re-install that spte?
> 
> Something like that (only compile tested, and likely misses memory barriers):

Actually, since this is "sticky", we can go even further and just delete the
memslot.  Deleting the memslot is slightly complicated by the need to drop SRCU
if kvm_lapic_set_base() enables x2APIC during KVM_RUN, but that's enough enough
to handled by putting the disabling logic in vcpu_run() and using KVM_REQ_UNBLOCK
to ensure the memslot is deleted before the vCPU re-enters the guest.

Testing now...
Sean Christopherson Sept. 20, 2022, 4:50 p.m. UTC | #9
On Tue, Sep 20, 2022, Sean Christopherson wrote:
> On Tue, Sep 20, 2022, Maxim Levitsky wrote:
> > On Sat, 2022-09-03 at 00:22 +0000, Sean Christopherson wrote:
> > > Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> > > to fix a bug where the APIC access page is visible to vCPUs that have
> > > x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> > > 
> > > On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> > > enabled even without x2AVIC support, the bug occurs any time AVIC is
> > > enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> > > that the guest is operating in x2APIC mode.
> > > 
> > > Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> > > comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> > > the "Note:" down a line to conform to preferred kernel/KVM multi-line
> > > comment style.
> > > 
> > > Leave Intel as-is for now to avoid a subtle performance regression, even
> > > though Intel likely suffers from the same bug.  On Intel, in theory the
> > > bug rears its head only when vCPUs share host page tables (extremely
> > > likely) and x2APIC enabling is not consistent within the guest, i.e. if
> > > some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> > > except in certain situations, e.g. bringing up APs).
> > 
> > Are you sure about this?
> 
> Ah, no.  The key on Intel is the separate VMCS control for virtualizing xAPIC
> accesses.  As you note below, KVM will provide memory semantics, which is technically
> wrong.
> 
> > This is what I am thinking will happen, I might be wrong but I am not sure:
> 
> ...
> 
> > 3. guest accesses the 0xfee00xxx, assuming APICv/x2avic, the hardware won't redirect
> > the access to apic backing page, but will instead just use that SPTE and let the guest
> > read/write the private page that we mapped in the range, which is wrong.
> > 
> > Am I missing something?
> 
> No, I don't believe so.  I'm still hesitant to add the effetive inhibit to Intel,
> though that's probably just pure paranoia at this point.  Probably makes sense to
> just do it and verify that x2APIC virtualization still works.

Scratch that, Intel can end up with memory semantics irrespective of x2APIC, e.g.
if APICv is enabled but a vCPU disables its APIC.  We could force a bus error by
inhibiting APICv if any vCPU has its APIC hardware disabled, but IMO that's a bad
tradeoff as there are legitimate reasons to disable the APIC on select vCPUs.

So, I'll omit Intel from the x2APIC pseudo-inhibit, and maybe add a KVM erratum
to document that KVM may provide memory semantics for APIC_BASE when APICv/AVIC
is enabled.
Maxim Levitsky Sept. 23, 2022, 10:18 a.m. UTC | #10
On Tue, 2022-09-20 at 16:50 +0000, Sean Christopherson wrote:
> On Tue, Sep 20, 2022, Sean Christopherson wrote:
> > On Tue, Sep 20, 2022, Maxim Levitsky wrote:
> > > On Sat, 2022-09-03 at 00:22 +0000, Sean Christopherson wrote:
> > > > Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> > > > to fix a bug where the APIC access page is visible to vCPUs that have
> > > > x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> > > > 
> > > > On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> > > > enabled even without x2AVIC support, the bug occurs any time AVIC is
> > > > enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> > > > that the guest is operating in x2APIC mode.
> > > > 
> > > > Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> > > > comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> > > > the "Note:" down a line to conform to preferred kernel/KVM multi-line
> > > > comment style.
> > > > 
> > > > Leave Intel as-is for now to avoid a subtle performance regression, even
> > > > though Intel likely suffers from the same bug.  On Intel, in theory the
> > > > bug rears its head only when vCPUs share host page tables (extremely
> > > > likely) and x2APIC enabling is not consistent within the guest, i.e. if
> > > > some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> > > > except in certain situations, e.g. bringing up APs).
> > > 
> > > Are you sure about this?
> > 
> > Ah, no.  The key on Intel is the separate VMCS control for virtualizing xAPIC
> > accesses.  As you note below, KVM will provide memory semantics, which is technically
> > wrong.
> > 
> > > This is what I am thinking will happen, I might be wrong but I am not sure:
> > 
> > ...
> > 
> > > 3. guest accesses the 0xfee00xxx, assuming APICv/x2avic, the hardware won't redirect
> > > the access to apic backing page, but will instead just use that SPTE and let the guest
> > > read/write the private page that we mapped in the range, which is wrong.
> > > 
> > > Am I missing something?
> > 
> > No, I don't believe so.  I'm still hesitant to add the effetive inhibit to Intel,
> > though that's probably just pure paranoia at this point.  Probably makes sense to
> > just do it and verify that x2APIC virtualization still works.
> 
> Scratch that, Intel can end up with memory semantics irrespective of x2APIC, e.g.
> if APICv is enabled but a vCPU disables its APIC.  We could force a bus error by
> inhibiting APICv if any vCPU has its APIC hardware disabled, but IMO that's a bad
> tradeoff as there are legitimate reasons to disable the APIC on select vCPUs.
> 
> So, I'll omit Intel from the x2APIC pseudo-inhibit, and maybe add a KVM erratum
> to document that KVM may provide memory semantics for APIC_BASE when APICv/AVIC
> is enabled.
> 

I can agree with this - disabled APIC is indeed a valid use case, if a VM doesn't
use all the vCPUs it was assigned, and hotplugs them later.

Thus I can agree to leave it as is for Intel, although if our goal is to be as
close to x86 spec as possible as you said, then it is more correct to do it as I suggested,
because it is still better to be compliant to the x86 spec in one case of two,
that to be compliant in none of the cases.

I am not going to argue about this though.


Best regards,
	Maxim Levitsky
Maxim Levitsky Sept. 23, 2022, 10:19 a.m. UTC | #11
On Tue, 2022-09-20 at 15:46 +0000, Sean Christopherson wrote:
> On Tue, Sep 20, 2022, Maxim Levitsky wrote:
> > On Sat, 2022-09-03 at 00:22 +0000, Sean Christopherson wrote:
> > > Reintroduce APICV_INHIBIT_REASON_X2APIC as a "partial" inhibit for AMD
> > > to fix a bug where the APIC access page is visible to vCPUs that have
> > > x2APIC enabled, i.e. shouldn't be able to "see" the xAPIC MMIO region.
> > > 
> > > On AMD, due to its "hybrid" mode where AVIC is enabled when x2APIC is
> > > enabled even without x2AVIC support, the bug occurs any time AVIC is
> > > enabled as x2APIC is fully emulated by KVM.  I.e. hardware isn't aware
> > > that the guest is operating in x2APIC mode.
> > > 
> > > Opportunistically drop the "can" while updating avic_activate_vmcb()'s
> > > comment, i.e. to state that KVM _does_ support the hybrid mode.  Move
> > > the "Note:" down a line to conform to preferred kernel/KVM multi-line
> > > comment style.
> > > 
> > > Leave Intel as-is for now to avoid a subtle performance regression, even
> > > though Intel likely suffers from the same bug.  On Intel, in theory the
> > > bug rears its head only when vCPUs share host page tables (extremely
> > > likely) and x2APIC enabling is not consistent within the guest, i.e. if
> > > some vCPUs have x2APIC enabled and other does do not (unlikely to occur
> > > except in certain situations, e.g. bringing up APs).
> > 
> > Are you sure about this?
> 
> Ah, no.  The key on Intel is the separate VMCS control for virtualizing xAPIC
> accesses.  As you note below, KVM will provide memory semantics, which is technically
> wrong.
> 
> > This is what I am thinking will happen, I might be wrong but I am not sure:
> 
> ...
> 
> > 3. guest accesses the 0xfee00xxx, assuming APICv/x2avic, the hardware won't redirect
> > the access to apic backing page, but will instead just use that SPTE and let the guest
> > read/write the private page that we mapped in the range, which is wrong.
> > 
> > Am I missing something?
> 
> No, I don't believe so.  I'm still hesitant to add the effetive inhibit to Intel,
> though that's probably just pure paranoia at this point.  Probably makes sense to
> just do it and verify that x2APIC virtualization still works.
> 
> > Also I somewhat doen't like the partial inhibit - it is to some extent
> > misleading.  I don't have a very strong option on using the inhibit, but its
> > meaning just feels a bit overloaded.
> > 
> > So why not to do it this way:
> > 
> > 1. zap the SPTE always when switching apic mode (e.g move the code in 
> > __kvm_set_or_clear_apicv_inhibit to a common funtion)
> > 
> > 2. make kvm_faultin_pfn check a flag 'any vcpu enabled x2apic' and refuse
> > to re-install that spte?
> > 
> > Something like that (only compile tested, and likely misses memory barriers):
> 
> Actually, since this is "sticky", we can go even further and just delete the
> memslot.  Deleting the memslot is slightly complicated by the need to drop SRCU
> if kvm_lapic_set_base() enables x2APIC during KVM_RUN, but that's enough enough
> to handled by putting the disabling logic in vcpu_run() and using KVM_REQ_UNBLOCK
> to ensure the memslot is deleted before the vCPU re-enters the guest.

Yes, that is the elephant in the room - deleting the memslot makes all of the sense,
and I thought about doing it, except that it has a chance of letting 
the genie out of its bottle again - remember that mess we had with the fact that
the memslots are rcu protected?
 
If it works, I 100% support the idea.
 
Also I think you want to remove the KVM_REQ_UNBLOCK, in the other patch series
you just posted?
 
Best regards,
	Maxim Levitsky

> 
> Testing now...
>
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2c96c43c313a..1fd1b66ceeb6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1132,6 +1132,15 @@  enum kvm_apicv_inhibit {
 	 * AVIC is disabled because SEV doesn't support it.
 	 */
 	APICV_INHIBIT_REASON_SEV,
+
+	/*
+	 * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+	 * inhibited if any vCPU has x2APIC enabled.  Note, this is a "partial"
+	 * inhibit; APICv can still be activated, but KVM mustn't retain/create
+	 * SPTEs for the APIC access page.  Like the APIC ID and APIC base
+	 * inhibits, this is sticky for simplicity.
+	 */
+	APICV_INHIBIT_REASON_X2APIC,
 };
 
 struct kvm_arch {
@@ -1903,6 +1912,7 @@  gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
 				struct x86_exception *exception);
 
+bool kvm_apicv_memslot_activated(struct kvm *kvm);
 bool kvm_apicv_activated(struct kvm *kvm);
 bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 38e9b8e5278c..d956cd37908e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2394,8 +2394,10 @@  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		}
 	}
 
-	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
+	if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) {
 		kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_X2APIC);
+	}
 
 	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
 		kvm_vcpu_update_apicv(vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e418ef3ecfcb..cea25552869f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4150,7 +4150,7 @@  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * when the AVIC is re-enabled.
 		 */
 		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
-		    !kvm_apicv_activated(vcpu->kvm))
+		    !kvm_apicv_memslot_activated(vcpu->kvm))
 			return RET_PF_EMULATE;
 	}
 
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 6a3d225eb02c..19be5f1afaac 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -72,12 +72,12 @@  static void avic_activate_vmcb(struct vcpu_svm *svm)
 
 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
 
-	/* Note:
-	 * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
-	 * MSR accesses, while interrupt injection to a running vCPU
-	 * can be achieved using AVIC doorbell. The AVIC hardware still
-	 * accelerate MMIO accesses, but this does not cause any harm
-	 * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+	/*
+	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+	 * accesses, while interrupt injection to a running vCPU can be
+	 * achieved using AVIC doorbell.  KVM disables the APIC access page
+	 * (prevents mapping it into the guest) if any vCPU has x2APIC enabled,
+	 * thus enabling AVIC activates only the doorbell mechanism.
 	 */
 	if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
 	    avic_mode == AVIC_MODE_X2) {
@@ -1014,7 +1014,8 @@  bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
 			  BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
 			  BIT(APICV_INHIBIT_REASON_SEV)      |
 			  BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
-			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) |
+			  BIT(APICV_INHIBIT_REASON_X2APIC);
 
 	return supported & BIT(reason);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7374d768296..6ab9088c2531 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9379,15 +9379,29 @@  static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 }
 
-bool kvm_apicv_activated(struct kvm *kvm)
+bool kvm_apicv_memslot_activated(struct kvm *kvm)
 {
 	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
 }
+
+static unsigned long kvm_apicv_get_inhibit_reasons(struct kvm *kvm)
+{
+	/*
+	 * x2APIC only needs to "inhibit" the MMIO region, all other aspects of
+	 * APICv can continue to be utilized.
+	 */
+	return READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~APICV_INHIBIT_REASON_X2APIC;
+}
+
+bool kvm_apicv_activated(struct kvm *kvm)
+{
+	return !kvm_apicv_get_inhibit_reasons(kvm);
+}
 EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
 bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
 {
-	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+	ulong vm_reasons = kvm_apicv_get_inhibit_reasons(vcpu->kvm);
 	ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
 
 	return (vm_reasons | vcpu_reasons) == 0;
@@ -10122,7 +10136,15 @@  void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 
 	set_or_clear_apicv_inhibit(&new, reason, set);
 
-	if (!!old != !!new) {
+	/*
+	 * If the overall "is APICv activated" status is unchanged, simply add
+	 * or remove the inihbit from the pile.  x2APIC is an exception, as it
+	 * is a partial inhibit (only blocks SPTEs for the APIC access page).
+	 * If x2APIC is the only inhibit in either the old or the new set, then
+	 * vCPUs need to be kicked to transition between partially-inhibited
+	 * and fully-inhibited.
+	 */
+	if ((!!old != !!new) || old == X2APIC_ENABLE || new == X2APIC_ENABLE) {
 		/*
 		 * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
 		 * false positives in the sanity check WARN in svm_vcpu_run().
@@ -10137,7 +10159,12 @@  void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 		 */
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
 		kvm->arch.apicv_inhibit_reasons = new;
-		if (new) {
+
+		/*
+		 * Zap SPTEs for the APIC access page if APICv is newly
+		 * inhibited (partially or fully).
+		 */
+		if (new && !old) {
 			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
 			kvm_zap_gfn_range(kvm, gfn, gfn+1);
 		}