diff mbox series

[v2,03/10] KVM: x86: Re-split x2APIC ICR into ICR+ICR2 for AMD (x2AVIC)

Message ID 20240719235107.3023592-4-seanjc@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: Fix ICR handling when x2AVIC is active | expand

Commit Message

Sean Christopherson July 19, 2024, 11:51 p.m. UTC
Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's
IPI virtualization support, but only for AMD.  While not stated anywhere
in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs
store the 64-bit ICR as two separate 32-bit values in ICR and ICR2.  When
IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled,
KVM needs to match CPU behavior as some ICR ICR writes will be handled by
the CPU, not by KVM.

Add a kvm_x86_ops knob to control the underlying format used by the CPU to
store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether
or not x2AVIC is enabled.  If KVM is handling all ICR writes, the storage
format for x2APIC mode doesn't matter, and having the behavior follow AMD
versus Intel will provide better test coverage and ease debugging.

Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode")
Cc: stable@vger.kernel.org
Cc: Maxim Levitsky <mlevitsk@redhat.com>
Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/lapic.c            | 42 +++++++++++++++++++++++----------
 arch/x86/kvm/svm/svm.c          |  2 ++
 arch/x86/kvm/vmx/main.c         |  2 ++
 4 files changed, 36 insertions(+), 12 deletions(-)

Comments

Sean Christopherson Aug. 22, 2024, 6:44 p.m. UTC | #1
+Tom

Can someone from AMD confirm that this is indeed the behavior, and that for AMD
CPUs, it's the architectural behavior?

A sanity check on the code would be appreciated too, it'd be nice to get this
into v6.12.

Thanks!

On Fri, Jul 19, 2024, Sean Christopherson wrote:
> Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's
> IPI virtualization support, but only for AMD.  While not stated anywhere
> in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs
> store the 64-bit ICR as two separate 32-bit values in ICR and ICR2.  When
> IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled,
> KVM needs to match CPU behavior as some ICR ICR writes will be handled by
> the CPU, not by KVM.
> 
> Add a kvm_x86_ops knob to control the underlying format used by the CPU to
> store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether
> or not x2AVIC is enabled.  If KVM is handling all ICR writes, the storage
> format for x2APIC mode doesn't matter, and having the behavior follow AMD
> versus Intel will provide better test coverage and ease debugging.
> 
> Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode")
> Cc: stable@vger.kernel.org
> Cc: Maxim Levitsky <mlevitsk@redhat.com>
> Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/kvm/lapic.c            | 42 +++++++++++++++++++++++----------
>  arch/x86/kvm/svm/svm.c          |  2 ++
>  arch/x86/kvm/vmx/main.c         |  2 ++
>  4 files changed, 36 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 950a03e0181e..edc235521434 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1726,6 +1726,8 @@ struct kvm_x86_ops {
>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> +
> +	const bool x2apic_icr_is_split;
>  	const unsigned long required_apicv_inhibits;
>  	bool allow_apicv_in_x2apic_without_x2apic_virtualization;
>  	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index d14ef485b0bd..cc0a1008fae4 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2473,11 +2473,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
>  	data &= ~APIC_ICR_BUSY;
>  
>  	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
> -	kvm_lapic_set_reg64(apic, APIC_ICR, data);
> +	if (kvm_x86_ops.x2apic_icr_is_split) {
> +		kvm_lapic_set_reg(apic, APIC_ICR, data);
> +		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
> +	} else {
> +		kvm_lapic_set_reg64(apic, APIC_ICR, data);
> +	}
>  	trace_kvm_apic_write(APIC_ICR, data);
>  	return 0;
>  }
>  
> +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
> +{
> +	if (kvm_x86_ops.x2apic_icr_is_split)
> +		return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
> +		       (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
> +
> +	return kvm_lapic_get_reg64(apic, APIC_ICR);
> +}
> +
>  /* emulate APIC access in a trap manner */
>  void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
>  {
> @@ -2495,7 +2509,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
>  	 * maybe-unecessary write, and both are in the noise anyways.
>  	 */
>  	if (apic_x2apic_mode(apic) && offset == APIC_ICR)
> -		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)));
> +		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
>  	else
>  		kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
>  }
> @@ -3005,18 +3019,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
>  
>  		/*
>  		 * In x2APIC mode, the LDR is fixed and based on the id.  And
> -		 * ICR is internally a single 64-bit register, but needs to be
> -		 * split to ICR+ICR2 in userspace for backwards compatibility.
> +		 * if the ICR is _not_ split, ICR is internally a single 64-bit
> +		 * register, but needs to be split to ICR+ICR2 in userspace for
> +		 * backwards compatibility.
>  		 */
> -		if (set) {
> +		if (set)
>  			*ldr = kvm_apic_calc_x2apic_ldr(*id);
>  
> -			icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
> -			      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
> -			__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
> -		} else {
> -			icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
> -			__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
> +		if (!kvm_x86_ops.x2apic_icr_is_split) {
> +			if (set) {
> +				icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
> +				      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
> +				__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
> +			} else {
> +				icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
> +				__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
> +			}
>  		}
>  	}
>  
> @@ -3214,7 +3232,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
>  	u32 low;
>  
>  	if (reg == APIC_ICR) {
> -		*data = kvm_lapic_get_reg64(apic, APIC_ICR);
> +		*data = kvm_x2apic_icr_read(apic);
>  		return 0;
>  	}
>  
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index c115d26844f7..04c113386de6 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -5049,6 +5049,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  	.enable_nmi_window = svm_enable_nmi_window,
>  	.enable_irq_window = svm_enable_irq_window,
>  	.update_cr8_intercept = svm_update_cr8_intercept,
> +
> +	.x2apic_icr_is_split = true,
>  	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
>  	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
>  	.apicv_post_state_restore = avic_apicv_post_state_restore,
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 0bf35ebe8a1b..a70699665e11 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -89,6 +89,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>  	.enable_nmi_window = vmx_enable_nmi_window,
>  	.enable_irq_window = vmx_enable_irq_window,
>  	.update_cr8_intercept = vmx_update_cr8_intercept,
> +
> +	.x2apic_icr_is_split = false,
>  	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
>  	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
>  	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
> -- 
> 2.45.2.1089.g2a221341d9-goog
>
Tom Lendacky Aug. 22, 2024, 7:16 p.m. UTC | #2
On 8/22/24 13:44, Sean Christopherson wrote:
> +Tom
> 
> Can someone from AMD confirm that this is indeed the behavior, and that for AMD
> CPUs, it's the architectural behavior?

In section "16.11 Accessing x2APIC Register" of APM Vol 2, there is this
statement:

"For 64-bit x2APIC registers, the high-order bits (bits 63:32) are
mapped to EDX[31:0]"

and in section "16.11.1 x2APIC Register Address Space" of APM Vol 2,
there is this statement:

"The two 32-bit Interrupt Command Registers in APIC mode (MMIO offsets
300h and 310h) are merged into a single 64-bit x2APIC register at MSR
address 830h."

So I believe this isn't necessary. @Suravee, agree?

Are you seeing a bug related to this?

Thanks,
Tom

> 
> A sanity check on the code would be appreciated too, it'd be nice to get this
> into v6.12.
> 
> Thanks!
> 
> On Fri, Jul 19, 2024, Sean Christopherson wrote:
>> Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's
>> IPI virtualization support, but only for AMD.  While not stated anywhere
>> in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs
>> store the 64-bit ICR as two separate 32-bit values in ICR and ICR2.  When
>> IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled,
>> KVM needs to match CPU behavior as some ICR ICR writes will be handled by
>> the CPU, not by KVM.
>>
>> Add a kvm_x86_ops knob to control the underlying format used by the CPU to
>> store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether
>> or not x2AVIC is enabled.  If KVM is handling all ICR writes, the storage
>> format for x2APIC mode doesn't matter, and having the behavior follow AMD
>> versus Intel will provide better test coverage and ease debugging.
>>
>> Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode")
>> Cc: stable@vger.kernel.org
>> Cc: Maxim Levitsky <mlevitsk@redhat.com>
>> Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |  2 ++
>>  arch/x86/kvm/lapic.c            | 42 +++++++++++++++++++++++----------
>>  arch/x86/kvm/svm/svm.c          |  2 ++
>>  arch/x86/kvm/vmx/main.c         |  2 ++
>>  4 files changed, 36 insertions(+), 12 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 950a03e0181e..edc235521434 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1726,6 +1726,8 @@ struct kvm_x86_ops {
>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>> +
>> +	const bool x2apic_icr_is_split;
>>  	const unsigned long required_apicv_inhibits;
>>  	bool allow_apicv_in_x2apic_without_x2apic_virtualization;
>>  	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index d14ef485b0bd..cc0a1008fae4 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -2473,11 +2473,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
>>  	data &= ~APIC_ICR_BUSY;
>>  
>>  	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
>> -	kvm_lapic_set_reg64(apic, APIC_ICR, data);
>> +	if (kvm_x86_ops.x2apic_icr_is_split) {
>> +		kvm_lapic_set_reg(apic, APIC_ICR, data);
>> +		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
>> +	} else {
>> +		kvm_lapic_set_reg64(apic, APIC_ICR, data);
>> +	}
>>  	trace_kvm_apic_write(APIC_ICR, data);
>>  	return 0;
>>  }
>>  
>> +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
>> +{
>> +	if (kvm_x86_ops.x2apic_icr_is_split)
>> +		return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
>> +		       (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
>> +
>> +	return kvm_lapic_get_reg64(apic, APIC_ICR);
>> +}
>> +
>>  /* emulate APIC access in a trap manner */
>>  void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
>>  {
>> @@ -2495,7 +2509,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
>>  	 * maybe-unecessary write, and both are in the noise anyways.
>>  	 */
>>  	if (apic_x2apic_mode(apic) && offset == APIC_ICR)
>> -		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)));
>> +		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
>>  	else
>>  		kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
>>  }
>> @@ -3005,18 +3019,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
>>  
>>  		/*
>>  		 * In x2APIC mode, the LDR is fixed and based on the id.  And
>> -		 * ICR is internally a single 64-bit register, but needs to be
>> -		 * split to ICR+ICR2 in userspace for backwards compatibility.
>> +		 * if the ICR is _not_ split, ICR is internally a single 64-bit
>> +		 * register, but needs to be split to ICR+ICR2 in userspace for
>> +		 * backwards compatibility.
>>  		 */
>> -		if (set) {
>> +		if (set)
>>  			*ldr = kvm_apic_calc_x2apic_ldr(*id);
>>  
>> -			icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
>> -			      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
>> -			__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
>> -		} else {
>> -			icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
>> -			__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
>> +		if (!kvm_x86_ops.x2apic_icr_is_split) {
>> +			if (set) {
>> +				icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
>> +				      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
>> +				__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
>> +			} else {
>> +				icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
>> +				__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
>> +			}
>>  		}
>>  	}
>>  
>> @@ -3214,7 +3232,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
>>  	u32 low;
>>  
>>  	if (reg == APIC_ICR) {
>> -		*data = kvm_lapic_get_reg64(apic, APIC_ICR);
>> +		*data = kvm_x2apic_icr_read(apic);
>>  		return 0;
>>  	}
>>  
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index c115d26844f7..04c113386de6 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -5049,6 +5049,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>>  	.enable_nmi_window = svm_enable_nmi_window,
>>  	.enable_irq_window = svm_enable_irq_window,
>>  	.update_cr8_intercept = svm_update_cr8_intercept,
>> +
>> +	.x2apic_icr_is_split = true,
>>  	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
>>  	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
>>  	.apicv_post_state_restore = avic_apicv_post_state_restore,
>> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
>> index 0bf35ebe8a1b..a70699665e11 100644
>> --- a/arch/x86/kvm/vmx/main.c
>> +++ b/arch/x86/kvm/vmx/main.c
>> @@ -89,6 +89,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>>  	.enable_nmi_window = vmx_enable_nmi_window,
>>  	.enable_irq_window = vmx_enable_irq_window,
>>  	.update_cr8_intercept = vmx_update_cr8_intercept,
>> +
>> +	.x2apic_icr_is_split = false,
>>  	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
>>  	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
>>  	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
>> -- 
>> 2.45.2.1089.g2a221341d9-goog
>>
Sean Christopherson Aug. 22, 2024, 7:41 p.m. UTC | #3
On Thu, Aug 22, 2024, Tom Lendacky wrote:
> On 8/22/24 13:44, Sean Christopherson wrote:
> > +Tom
> > 
> > Can someone from AMD confirm that this is indeed the behavior, and that for AMD
> > CPUs, it's the architectural behavior?
> 
> In section "16.11 Accessing x2APIC Register" of APM Vol 2, there is this
> statement:
> 
> "For 64-bit x2APIC registers, the high-order bits (bits 63:32) are
> mapped to EDX[31:0]"
> 
> and in section "16.11.1 x2APIC Register Address Space" of APM Vol 2,
> there is this statement:
> 
> "The two 32-bit Interrupt Command Registers in APIC mode (MMIO offsets
> 300h and 310h) are merged into a single 64-bit x2APIC register at MSR
> address 830h."
> 
> So I believe this isn't necessary. @Suravee, agree?
> 
> Are you seeing a bug related to this?

Yep.  With APICv and x2APIC enabled, Intel CPUs use a single 64-bit value at
offset 300h for the backing storage.  This is what KVM currently implements,
e.g. when pulling state out of the vAPIC page for migration, and when emulating
a RDMSR(ICR).

With AVIC and x2APIC (a.k.a. x2AVIC enabled), Genoa uses the legacy MMIO offsets
for storage, at least AFAICT.  I.e. the single MSR at 830h is split into separate
32-bit values at 300h and 310h on WRMSR, and then reconstituted on RDMSR.

The APM doesn't actually clarify the layout of the backing storage, i.e. doesn't
explicitly say that the full 64-bit value is stored at 300h.  IIRC, Intel's SDM
doesn't say that either, i.e. KVM's behavior is fully based on throwing noodles
and seeing what sticks.

FWIW, AMD's behavior actually makes sense, at it provides a consistent layout
when switching between xAPIC and x2APIC.  Intel's does too, from the perspective
of being able to emit single loads/stores.
Tom Lendacky Aug. 22, 2024, 8:29 p.m. UTC | #4
On 8/22/24 14:41, Sean Christopherson wrote:
> On Thu, Aug 22, 2024, Tom Lendacky wrote:
>> On 8/22/24 13:44, Sean Christopherson wrote:
>>> +Tom
>>>
>>> Can someone from AMD confirm that this is indeed the behavior, and that for AMD
>>> CPUs, it's the architectural behavior?
>>
>> In section "16.11 Accessing x2APIC Register" of APM Vol 2, there is this
>> statement:
>>
>> "For 64-bit x2APIC registers, the high-order bits (bits 63:32) are
>> mapped to EDX[31:0]"
>>
>> and in section "16.11.1 x2APIC Register Address Space" of APM Vol 2,
>> there is this statement:
>>
>> "The two 32-bit Interrupt Command Registers in APIC mode (MMIO offsets
>> 300h and 310h) are merged into a single 64-bit x2APIC register at MSR
>> address 830h."
>>
>> So I believe this isn't necessary. @Suravee, agree?
>>
>> Are you seeing a bug related to this?
> 
> Yep.  With APICv and x2APIC enabled, Intel CPUs use a single 64-bit value at
> offset 300h for the backing storage.  This is what KVM currently implements,
> e.g. when pulling state out of the vAPIC page for migration, and when emulating
> a RDMSR(ICR).
> 
> With AVIC and x2APIC (a.k.a. x2AVIC enabled), Genoa uses the legacy MMIO offsets
> for storage, at least AFAICT.  I.e. the single MSR at 830h is split into separate
> 32-bit values at 300h and 310h on WRMSR, and then reconstituted on RDMSR.
> 
> The APM doesn't actually clarify the layout of the backing storage, i.e. doesn't
> explicitly say that the full 64-bit value is stored at 300h.  IIRC, Intel's SDM

Ah, for x2AVIC, yes, you have to do two writes to the backing page. One
at offset 0x300 and one at offset 0x310 (confirmed with the hardware
team). The order shouldn't matter since the guest vCPU isn't running
when you're writing the values, but you should do the IRC High write
first, followed by the ICR Low.

Thanks,
Tom

> doesn't say that either, i.e. KVM's behavior is fully based on throwing noodles
> and seeing what sticks.
> 
> FWIW, AMD's behavior actually makes sense, at it provides a consistent layout
> when switching between xAPIC and x2APIC.  Intel's does too, from the perspective
> of being able to emit single loads/stores.
Sean Christopherson Aug. 22, 2024, 11:10 p.m. UTC | #5
On Thu, Aug 22, 2024, Tom Lendacky wrote:
> On 8/22/24 14:41, Sean Christopherson wrote:
> > On Thu, Aug 22, 2024, Tom Lendacky wrote:
> >> On 8/22/24 13:44, Sean Christopherson wrote:
> >>> +Tom
> >>>
> >>> Can someone from AMD confirm that this is indeed the behavior, and that for AMD
> >>> CPUs, it's the architectural behavior?
> >>
> >> In section "16.11 Accessing x2APIC Register" of APM Vol 2, there is this
> >> statement:
> >>
> >> "For 64-bit x2APIC registers, the high-order bits (bits 63:32) are
> >> mapped to EDX[31:0]"
> >>
> >> and in section "16.11.1 x2APIC Register Address Space" of APM Vol 2,
> >> there is this statement:
> >>
> >> "The two 32-bit Interrupt Command Registers in APIC mode (MMIO offsets
> >> 300h and 310h) are merged into a single 64-bit x2APIC register at MSR
> >> address 830h."
> >>
> >> So I believe this isn't necessary. @Suravee, agree?
> >>
> >> Are you seeing a bug related to this?
> > 
> > Yep.  With APICv and x2APIC enabled, Intel CPUs use a single 64-bit value at
> > offset 300h for the backing storage.  This is what KVM currently implements,
> > e.g. when pulling state out of the vAPIC page for migration, and when emulating
> > a RDMSR(ICR).
> > 
> > With AVIC and x2APIC (a.k.a. x2AVIC enabled), Genoa uses the legacy MMIO offsets
> > for storage, at least AFAICT.  I.e. the single MSR at 830h is split into separate
> > 32-bit values at 300h and 310h on WRMSR, and then reconstituted on RDMSR.
> > 
> > The APM doesn't actually clarify the layout of the backing storage, i.e. doesn't
> > explicitly say that the full 64-bit value is stored at 300h.  IIRC, Intel's SDM
> 
> Ah, for x2AVIC, yes, you have to do two writes to the backing page. One
> at offset 0x300 and one at offset 0x310 (confirmed with the hardware
> team). The order shouldn't matter since the guest vCPU isn't running
> when you're writing the values, but you should do the IRC High write
> first, followed by the ICR Low.

Thanks Tom!
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 950a03e0181e..edc235521434 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1726,6 +1726,8 @@  struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+
+	const bool x2apic_icr_is_split;
 	const unsigned long required_apicv_inhibits;
 	bool allow_apicv_in_x2apic_without_x2apic_virtualization;
 	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d14ef485b0bd..cc0a1008fae4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2473,11 +2473,25 @@  int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 	data &= ~APIC_ICR_BUSY;
 
 	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
-	kvm_lapic_set_reg64(apic, APIC_ICR, data);
+	if (kvm_x86_ops.x2apic_icr_is_split) {
+		kvm_lapic_set_reg(apic, APIC_ICR, data);
+		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+	} else {
+		kvm_lapic_set_reg64(apic, APIC_ICR, data);
+	}
 	trace_kvm_apic_write(APIC_ICR, data);
 	return 0;
 }
 
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+	if (kvm_x86_ops.x2apic_icr_is_split)
+		return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+		       (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+	return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
 /* emulate APIC access in a trap manner */
 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 {
@@ -2495,7 +2509,7 @@  void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 	 * maybe-unecessary write, and both are in the noise anyways.
 	 */
 	if (apic_x2apic_mode(apic) && offset == APIC_ICR)
-		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)));
+		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
 	else
 		kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
 }
@@ -3005,18 +3019,22 @@  static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
 
 		/*
 		 * In x2APIC mode, the LDR is fixed and based on the id.  And
-		 * ICR is internally a single 64-bit register, but needs to be
-		 * split to ICR+ICR2 in userspace for backwards compatibility.
+		 * if the ICR is _not_ split, ICR is internally a single 64-bit
+		 * register, but needs to be split to ICR+ICR2 in userspace for
+		 * backwards compatibility.
 		 */
-		if (set) {
+		if (set)
 			*ldr = kvm_apic_calc_x2apic_ldr(*id);
 
-			icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
-			      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
-			__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
-		} else {
-			icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
-			__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+		if (!kvm_x86_ops.x2apic_icr_is_split) {
+			if (set) {
+				icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+				      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+				__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+			} else {
+				icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+				__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+			}
 		}
 	}
 
@@ -3214,7 +3232,7 @@  static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
 	u32 low;
 
 	if (reg == APIC_ICR) {
-		*data = kvm_lapic_get_reg64(apic, APIC_ICR);
+		*data = kvm_x2apic_icr_read(apic);
 		return 0;
 	}
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c115d26844f7..04c113386de6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5049,6 +5049,8 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.enable_nmi_window = svm_enable_nmi_window,
 	.enable_irq_window = svm_enable_irq_window,
 	.update_cr8_intercept = svm_update_cr8_intercept,
+
+	.x2apic_icr_is_split = true,
 	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
 	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
 	.apicv_post_state_restore = avic_apicv_post_state_restore,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0bf35ebe8a1b..a70699665e11 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -89,6 +89,8 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 	.enable_nmi_window = vmx_enable_nmi_window,
 	.enable_irq_window = vmx_enable_irq_window,
 	.update_cr8_intercept = vmx_update_cr8_intercept,
+
+	.x2apic_icr_is_split = false,
 	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
 	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,