diff mbox series

[v5,4/6] KVM: SVM: Add support to handle AP reset MSR protocol

Message ID 20211020124416.24523-5-joro@8bytes.org (mailing list archive)
State New, archived
Headers show
Series KVM: SVM: Add initial GHCB protocol version 2 support | expand

Commit Message

Joerg Roedel Oct. 20, 2021, 12:44 p.m. UTC
From: Tom Lendacky <thomas.lendacky@amd.com>

Add support for AP Reset Hold being invoked using the GHCB MSR protocol,
available in version 2 of the GHCB specification.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/svm/sev.c          | 52 ++++++++++++++++++++++++++-------
 arch/x86/kvm/svm/svm.h          |  8 +++++
 3 files changed, 49 insertions(+), 12 deletions(-)

Comments

Sean Christopherson Oct. 20, 2021, 5:40 p.m. UTC | #1
On Wed, Oct 20, 2021, Joerg Roedel wrote:
> From: Tom Lendacky <thomas.lendacky@amd.com>
> 
> Add support for AP Reset Hold being invoked using the GHCB MSR protocol,
> available in version 2 of the GHCB specification.

The changelog needs to explain how this is actually supposed to work.  Doesn't
need gory details, just a basic explanation of the sequence of events to wake a
vCPU that requested a reset hold.

I apologize in advance for the following rant(s).  There's some actionable feedback,
but a lot of it is just me complaining about the reset hold nonsense.

For the actual feedback, attached are two patches: patch 1 eliminates the
"first_sipi_received" hack, patch 2 is a (hopefully) fixed version of this patch
(but doesn't have an updated changelog).  Both are compile tested only.  There
will be a benign conflict with patch 05 of this series.

> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 -
>  arch/x86/kvm/svm/sev.c          | 52 ++++++++++++++++++++++++++-------
>  arch/x86/kvm/svm/svm.h          |  8 +++++
>  3 files changed, 49 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b67f550616cf..5c6b1469cc3b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -237,7 +237,6 @@ enum x86_intercept_stage;
>  	KVM_GUESTDBG_INJECT_DB | \
>  	KVM_GUESTDBG_BLOCKIRQ)
>  
> -

Spurious whitespace change.

>  #define PFERR_PRESENT_BIT 0
>  #define PFERR_WRITE_BIT 1
>  #define PFERR_USER_BIT 2
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 9afa71cb36e6..10af4ac83971 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -2246,6 +2246,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
>  
>  void sev_es_unmap_ghcb(struct vcpu_svm *svm)
>  {
> +	/* Clear any indication that the vCPU is in a type of AP Reset Hold */
> +	svm->reset_hold_type = AP_RESET_HOLD_NONE;
> +
>  	if (!svm->ghcb)
>  		return;
>  
> @@ -2405,14 +2408,21 @@ static u64 ghcb_msr_version_info(void)
>  	return msr;
>  }
>  
> -static int sev_emulate_ap_reset_hold(struct vcpu_svm *svm)
> +static int sev_emulate_ap_reset_hold(struct vcpu_svm *svm, enum ap_reset_hold_type type)
>  {
>  	int ret = kvm_skip_emulated_instruction(&svm->vcpu);
>  
> +	svm->reset_hold_type = type;
> +
>  	return __kvm_vcpu_halt(&svm->vcpu,
>  			       KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
>  }
>  
> +static u64 ghcb_msr_ap_rst_resp(u64 value)
> +{
> +	return (u64)GHCB_MSR_AP_RESET_HOLD_RESP | (value << GHCB_DATA_LOW);
> +}
> +
>  static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>  {
>  	struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -2459,6 +2469,16 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>  
>  		break;
>  	}
> +	case GHCB_MSR_AP_RESET_HOLD_REQ:
> +		ret = sev_emulate_ap_reset_hold(svm, AP_RESET_HOLD_MSR_PROTO);
> +
> +		/*
> +		 * Preset the result to a non-SIPI return and then only set
> +		 * the result to non-zero when delivering a SIPI.
> +		 */
> +		svm->vmcb->control.ghcb_gpa = ghcb_msr_ap_rst_resp(0);

This can race with the SIPI and effectively corrupt svm->vmcb->control.ghcb_gpa.

        vCPU0                           vCPU1
                                        #VMGEXIT(RESET_HOLD)
                                        __kvm_vcpu_halt()
        INIT
        SIPI
        sev_vcpu_deliver_sipi_vector()
        ghcb_msr_ap_rst_resp(1);
                                        ghcb_msr_ap_rst_resp(0);

Note, the "INIT" above is mostly a guess.  I'm pretty sure it's necessary because
I don't see how KVM can possibly be correct otherwise.  The SIPI handler (below)
quite clearly expects the vCPU to have been in an AP reset hold, but the invocation
of sev_vcpu_deliver_sipi_vector is gated by the vCPU being in
KVM_MP_STATE_INIT_RECEIVED, not KVM_MP_STATE_AP_RESET_HOLD.  That implies the BSP
must INIT the AP.

                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        /* evaluate pending_events before reading the vector */ 
                        smp_rmb();
                        sipi_vector = apic->sipi_vector;
                        kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
                        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                }

But the GHCB 2.0 spec doesn't actually say that; it instead implies that SIPI is
supposed to be able to directly wake the AP.

  * When a guest AP reaches its HLT loop (or similar method for parking the AP),
    it instead can either:
      1. Issue an AP Reset Hold NAE event.
      2. Issue the AP Reset Hold Request MSR Protocol
  * The hypervisor treats treats either request like the guest issued a HLT
                   ^^^^^^^^^^^^^                                        ^^^
                   spec typo                                            ???
    instruction and marks the vCPU as halted.
  * When the hypervisor receives a SIPI request for the vCPU, it will not update
                                   ^^^^^^^^^^^^
    any register values and, instead, it will either complete the AP Reset Hold
    NAE event or complete the AP Reset Hold MSR protocol
  * Mark the vCPU as active, allowing the VMGEXIT to complete.
  * Upon return from the VMGEXIT, the AP must transition from its current execution
    mode into real mode and begin executing at the reset vector supplied in the SIPI
    request. 

Piecing things together, my understanding is that the "hold request" really is
intended to be a HLT, but with extra semantics where the host sets sw_exit_info_2
to indicate that the vCPU was woken by INIT-SIPI. 

It's probably too late to change the spec, but I'm going to complain anyways. 
This is all ridiculously convoluted and completely unnecessary.  As pointed out
in the SNP series regarding AP "creation", this can be fully handled in the guest
via a simple mailbox between firmware and kernel.  What's really fubar is that
the guest firmware and kernel already have a mailbox!  But it's defined by the
GHCB spec instead of e.g. ACPI, and so instead of handling this fully within the
guest, the hypervisor (and PSP to some extent on SNP because of the secrets page!!!)
gets involved.

The complications to support this in the guest firmware are hilarious, e.g. the
guest hasto manually switch from 64-bit mode to Real Mode just so that the kernel
can continue to use a horribly antiquated method for gaining control of APs.

> +
> +		break;
>  	case GHCB_MSR_TERM_REQ: {
>  		u64 reason_set, reason_code;
>  
> @@ -2544,7 +2564,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
>  		ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
>  		break;
>  	case SVM_VMGEXIT_AP_HLT_LOOP:
> -		ret = sev_emulate_ap_reset_hold(svm);
> +		ret = sev_emulate_ap_reset_hold(svm, AP_RESET_HOLD_NAE_EVENT);
>  		break;
>  	case SVM_VMGEXIT_AP_JUMP_TABLE: {
>  		struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
> @@ -2679,13 +2699,23 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)

Tying into above, handling this in SIPI is flawed.  For example, if the guest
does INIT-SIPI-SIPI without a reset hold, KVM would incorrect set sw_exit_info_2
on the SIPI.  Because this mess requires an INIT, KVM has lost track of whether
the guest was in KVM_MP_STATE_AP_RESET_HOLD and thus can't know if the SIPI
arrived after a reset hold.  Looking at KVM, IIUC, this bug is why the hack
"received_first_sipi" exists.

Of course this all begs the question of why there's a "reset hold" concept in
the first place.  It's literally HLT with a flag to say "you got INIT-SIPI".
But the guest has to supply and fill a jump table!  Just put the flag in the
jump table!!!!

>  		return;
>  	}
>  
> -	/*
> -	 * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
> -	 * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
> -	 * non-zero value.
> -	 */
> -	if (!svm->ghcb)
> -		return;
> -
> -	ghcb_set_sw_exit_info_2(svm->ghcb, 1);
> +	/* Subsequent SIPI */
> +	switch (svm->reset_hold_type) {
> +	case AP_RESET_HOLD_NAE_EVENT:
> +		/*
> +		 * Return from an AP Reset Hold VMGEXIT, where the guest will
> +		 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
> +		 */
> +		ghcb_set_sw_exit_info_2(svm->ghcb, 1);

Doesn't this need to check for a null svm->ghcb?

I also suspect a boolean might make it easier to understand the implications
and also make the whole thing less error prone, e.g.

        if (svm->reset_hold_msr_protocol) {

        } else if (svm->ghcb) {

        }

> +		break;
> +	case AP_RESET_HOLD_MSR_PROTO:
> +		/*
> +		 * Return from an AP Reset Hold VMGEXIT, where the guest will
> +		 * set the CS and RIP. Set GHCB data field to a non-zero value.
> +		 */
> +		svm->vmcb->control.ghcb_gpa = ghcb_msr_ap_rst_resp(1);
> +		break;
> +	default:
> +		break;
> +	}
>  }
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 68e5f16a0554..bf9379f1cfb8 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -69,6 +69,12 @@ enum {
>  /* TPR and CR2 are always written before VMRUN */
>  #define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
>  
> +enum ap_reset_hold_type {
> +	AP_RESET_HOLD_NONE,
> +	AP_RESET_HOLD_NAE_EVENT,
> +	AP_RESET_HOLD_MSR_PROTO,
> +};
> +
>  struct kvm_sev_info {
>  	bool active;		/* SEV enabled guest */
>  	bool es_active;		/* SEV-ES enabled guest */
> @@ -199,6 +205,8 @@ struct vcpu_svm {
>  	bool ghcb_sa_free;
>  
>  	bool guest_state_loaded;
> +
> +	enum ap_reset_hold_type reset_hold_type;
>  };
>  
>  struct svm_cpu_data {
> -- 
> 2.33.1
>
Tom Lendacky Oct. 20, 2021, 6:13 p.m. UTC | #2
On 10/20/21 12:40 PM, Sean Christopherson wrote:
> On Wed, Oct 20, 2021, Joerg Roedel wrote:
>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>
>> Add support for AP Reset Hold being invoked using the GHCB MSR protocol,
>> available in version 2 of the GHCB specification.
> 
> The changelog needs to explain how this is actually supposed to work.  Doesn't
> need gory details, just a basic explanation of the sequence of events to wake a
> vCPU that requested a reset hold.
> 
> I apologize in advance for the following rant(s).  There's some actionable feedback,
> but a lot of it is just me complaining about the reset hold nonsense.
> 
> For the actual feedback, attached are two patches: patch 1 eliminates the
> "first_sipi_received" hack, patch 2 is a (hopefully) fixed version of this patch
> (but doesn't have an updated changelog).  Both are compile tested only.  There
> will be a benign conflict with patch 05 of this series.
> 
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
>> Signed-off-by: Joerg Roedel <jroedel@suse.de>
>> ---
>>   arch/x86/include/asm/kvm_host.h |  1 -
>>   arch/x86/kvm/svm/sev.c          | 52 ++++++++++++++++++++++++++-------
>>   arch/x86/kvm/svm/svm.h          |  8 +++++
>>   3 files changed, 49 insertions(+), 12 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index b67f550616cf..5c6b1469cc3b 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -237,7 +237,6 @@ enum x86_intercept_stage;
>>   	KVM_GUESTDBG_INJECT_DB | \
>>   	KVM_GUESTDBG_BLOCKIRQ)
>>   
>> -
> 
> Spurious whitespace change.
> 
>>   #define PFERR_PRESENT_BIT 0
>>   #define PFERR_WRITE_BIT 1
>>   #define PFERR_USER_BIT 2
>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>> index 9afa71cb36e6..10af4ac83971 100644
>> --- a/arch/x86/kvm/svm/sev.c
>> +++ b/arch/x86/kvm/svm/sev.c
>> @@ -2246,6 +2246,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
>>   
>>   void sev_es_unmap_ghcb(struct vcpu_svm *svm)
>>   {
>> +	/* Clear any indication that the vCPU is in a type of AP Reset Hold */
>> +	svm->reset_hold_type = AP_RESET_HOLD_NONE;
>> +
>>   	if (!svm->ghcb)
>>   		return;
>>   
>> @@ -2405,14 +2408,21 @@ static u64 ghcb_msr_version_info(void)
>>   	return msr;
>>   }
>>   
>> -static int sev_emulate_ap_reset_hold(struct vcpu_svm *svm)
>> +static int sev_emulate_ap_reset_hold(struct vcpu_svm *svm, enum ap_reset_hold_type type)
>>   {
>>   	int ret = kvm_skip_emulated_instruction(&svm->vcpu);
>>   
>> +	svm->reset_hold_type = type;
>> +
>>   	return __kvm_vcpu_halt(&svm->vcpu,
>>   			       KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
>>   }
>>   
>> +static u64 ghcb_msr_ap_rst_resp(u64 value)
>> +{
>> +	return (u64)GHCB_MSR_AP_RESET_HOLD_RESP | (value << GHCB_DATA_LOW);
>> +}
>> +
>>   static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>>   {
>>   	struct vmcb_control_area *control = &svm->vmcb->control;
>> @@ -2459,6 +2469,16 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>>   
>>   		break;
>>   	}
>> +	case GHCB_MSR_AP_RESET_HOLD_REQ:
>> +		ret = sev_emulate_ap_reset_hold(svm, AP_RESET_HOLD_MSR_PROTO);
>> +
>> +		/*
>> +		 * Preset the result to a non-SIPI return and then only set
>> +		 * the result to non-zero when delivering a SIPI.
>> +		 */
>> +		svm->vmcb->control.ghcb_gpa = ghcb_msr_ap_rst_resp(0);
> 
> This can race with the SIPI and effectively corrupt svm->vmcb->control.ghcb_gpa.
> 
>          vCPU0                           vCPU1
>                                          #VMGEXIT(RESET_HOLD)
>                                          __kvm_vcpu_halt()
>          INIT
>          SIPI
>          sev_vcpu_deliver_sipi_vector()
>          ghcb_msr_ap_rst_resp(1);

This isn't possible. vCPU0 doesn't set vCPU1's GHCB value. vCPU1's GHCB 
value is set when vCPU1 handles events in vcpu_enter_guest().

Thanks,
Tom

>                                          ghcb_msr_ap_rst_resp(0);
> 
> Note, the "INIT" above is mostly a guess.  I'm pretty sure it's necessary because
> I don't see how KVM can possibly be correct otherwise.  The SIPI handler (below)
> quite clearly expects the vCPU to have been in an AP reset hold, but the invocation
> of sev_vcpu_deliver_sipi_vector is gated by the vCPU being in
> KVM_MP_STATE_INIT_RECEIVED, not KVM_MP_STATE_AP_RESET_HOLD.  That implies the BSP
> must INIT the AP.
> 
>                  if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>                          /* evaluate pending_events before reading the vector */
>                          smp_rmb();
>                          sipi_vector = apic->sipi_vector;
>                          kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
>                          vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>                  }
> 
> But the GHCB 2.0 spec doesn't actually say that; it instead implies that SIPI is
> supposed to be able to directly wake the AP.
> 
>    * When a guest AP reaches its HLT loop (or similar method for parking the AP),
>      it instead can either:
>        1. Issue an AP Reset Hold NAE event.
>        2. Issue the AP Reset Hold Request MSR Protocol
>    * The hypervisor treats treats either request like the guest issued a HLT
>                     ^^^^^^^^^^^^^                                        ^^^
>                     spec typo                                            ???
>      instruction and marks the vCPU as halted.
>    * When the hypervisor receives a SIPI request for the vCPU, it will not update
>                                     ^^^^^^^^^^^^
>      any register values and, instead, it will either complete the AP Reset Hold
>      NAE event or complete the AP Reset Hold MSR protocol
>    * Mark the vCPU as active, allowing the VMGEXIT to complete.
>    * Upon return from the VMGEXIT, the AP must transition from its current execution
>      mode into real mode and begin executing at the reset vector supplied in the SIPI
>      request.
> 
> Piecing things together, my understanding is that the "hold request" really is
> intended to be a HLT, but with extra semantics where the host sets sw_exit_info_2
> to indicate that the vCPU was woken by INIT-SIPI.
> 
> It's probably too late to change the spec, but I'm going to complain anyways.
> This is all ridiculously convoluted and completely unnecessary.  As pointed out
> in the SNP series regarding AP "creation", this can be fully handled in the guest
> via a simple mailbox between firmware and kernel.  What's really fubar is that
> the guest firmware and kernel already have a mailbox!  But it's defined by the
> GHCB spec instead of e.g. ACPI, and so instead of handling this fully within the
> guest, the hypervisor (and PSP to some extent on SNP because of the secrets page!!!)
> gets involved.
> 
> The complications to support this in the guest firmware are hilarious, e.g. the
> guest hasto manually switch from 64-bit mode to Real Mode just so that the kernel
> can continue to use a horribly antiquated method for gaining control of APs.
> 
>> +
>> +		break;
>>   	case GHCB_MSR_TERM_REQ: {
>>   		u64 reason_set, reason_code;
>>   
>> @@ -2544,7 +2564,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
>>   		ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
>>   		break;
>>   	case SVM_VMGEXIT_AP_HLT_LOOP:
>> -		ret = sev_emulate_ap_reset_hold(svm);
>> +		ret = sev_emulate_ap_reset_hold(svm, AP_RESET_HOLD_NAE_EVENT);
>>   		break;
>>   	case SVM_VMGEXIT_AP_JUMP_TABLE: {
>>   		struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
>> @@ -2679,13 +2699,23 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
> 
> Tying into above, handling this in SIPI is flawed.  For example, if the guest
> does INIT-SIPI-SIPI without a reset hold, KVM would incorrect set sw_exit_info_2
> on the SIPI.  Because this mess requires an INIT, KVM has lost track of whether
> the guest was in KVM_MP_STATE_AP_RESET_HOLD and thus can't know if the SIPI
> arrived after a reset hold.  Looking at KVM, IIUC, this bug is why the hack
> "received_first_sipi" exists.
> 
> Of course this all begs the question of why there's a "reset hold" concept in
> the first place.  It's literally HLT with a flag to say "you got INIT-SIPI".
> But the guest has to supply and fill a jump table!  Just put the flag in the
> jump table!!!!
> 
>>   		return;
>>   	}
>>   
>> -	/*
>> -	 * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
>> -	 * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
>> -	 * non-zero value.
>> -	 */
>> -	if (!svm->ghcb)
>> -		return;
>> -
>> -	ghcb_set_sw_exit_info_2(svm->ghcb, 1);
>> +	/* Subsequent SIPI */
>> +	switch (svm->reset_hold_type) {
>> +	case AP_RESET_HOLD_NAE_EVENT:
>> +		/*
>> +		 * Return from an AP Reset Hold VMGEXIT, where the guest will
>> +		 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
>> +		 */
>> +		ghcb_set_sw_exit_info_2(svm->ghcb, 1);
> 
> Doesn't this need to check for a null svm->ghcb?
> 
> I also suspect a boolean might make it easier to understand the implications
> and also make the whole thing less error prone, e.g.
> 
>          if (svm->reset_hold_msr_protocol) {
> 
>          } else if (svm->ghcb) {
> 
>          }
> 
>> +		break;
>> +	case AP_RESET_HOLD_MSR_PROTO:
>> +		/*
>> +		 * Return from an AP Reset Hold VMGEXIT, where the guest will
>> +		 * set the CS and RIP. Set GHCB data field to a non-zero value.
>> +		 */
>> +		svm->vmcb->control.ghcb_gpa = ghcb_msr_ap_rst_resp(1);
>> +		break;
>> +	default:
>> +		break;
>> +	}
>>   }
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index 68e5f16a0554..bf9379f1cfb8 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -69,6 +69,12 @@ enum {
>>   /* TPR and CR2 are always written before VMRUN */
>>   #define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
>>   
>> +enum ap_reset_hold_type {
>> +	AP_RESET_HOLD_NONE,
>> +	AP_RESET_HOLD_NAE_EVENT,
>> +	AP_RESET_HOLD_MSR_PROTO,
>> +};
>> +
>>   struct kvm_sev_info {
>>   	bool active;		/* SEV enabled guest */
>>   	bool es_active;		/* SEV-ES enabled guest */
>> @@ -199,6 +205,8 @@ struct vcpu_svm {
>>   	bool ghcb_sa_free;
>>   
>>   	bool guest_state_loaded;
>> +
>> +	enum ap_reset_hold_type reset_hold_type;
>>   };
>>   
>>   struct svm_cpu_data {
>> -- 
>> 2.33.1
>>
Tom Lendacky Oct. 20, 2021, 6:36 p.m. UTC | #3
On 10/20/21 12:40 PM, Sean Christopherson wrote:
> On Wed, Oct 20, 2021, Joerg Roedel wrote:
> 
> Tying into above, handling this in SIPI is flawed.  For example, if the guest
> does INIT-SIPI-SIPI without a reset hold, KVM would incorrect set sw_exit_info_2
> on the SIPI.  Because this mess requires an INIT, KVM has lost track of whether
> the guest was in KVM_MP_STATE_AP_RESET_HOLD and thus can't know if the SIPI
> arrived after a reset hold.  Looking at KVM, IIUC, this bug is why the hack
> "received_first_sipi" exists.

The received_first_sipi is because the APs have to be started the very 
first time in the traditional way. The AP can't issue an AP reset hold if 
it hasn't started to begin with in which case there would be no GHCB mapped.

After the check to see if the  GHCB is mapped was added to 
sev_vcpu_deliver_sipi_vector(), the "received_first_sipi" could probably 
have been deleted at that point.

Thanks,
Tom
Sean Christopherson Oct. 20, 2021, 6:38 p.m. UTC | #4
On Wed, Oct 20, 2021, Tom Lendacky wrote:
> On 10/20/21 12:40 PM, Sean Christopherson wrote:
> > On Wed, Oct 20, 2021, Joerg Roedel wrote:
> > This can race with the SIPI and effectively corrupt svm->vmcb->control.ghcb_gpa.
> > 
> >          vCPU0                           vCPU1
> >                                          #VMGEXIT(RESET_HOLD)
> >                                          __kvm_vcpu_halt()
> >          INIT
> >          SIPI
> >          sev_vcpu_deliver_sipi_vector()
> >          ghcb_msr_ap_rst_resp(1);
> 
> This isn't possible. vCPU0 doesn't set vCPU1's GHCB value. vCPU1's GHCB
> value is set when vCPU1 handles events in vcpu_enter_guest().

Argh, I was thinking of injecting regular IPIs across vCPUs.  In hindsight it
makes sense that INIT and SIPI are handled on the current vCPU, stuffing all that
state from a different vCPU would be needlessly complex.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b67f550616cf..5c6b1469cc3b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -237,7 +237,6 @@  enum x86_intercept_stage;
 	KVM_GUESTDBG_INJECT_DB | \
 	KVM_GUESTDBG_BLOCKIRQ)
 
-
 #define PFERR_PRESENT_BIT 0
 #define PFERR_WRITE_BIT 1
 #define PFERR_USER_BIT 2
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 9afa71cb36e6..10af4ac83971 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2246,6 +2246,9 @@  static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 
 void sev_es_unmap_ghcb(struct vcpu_svm *svm)
 {
+	/* Clear any indication that the vCPU is in a type of AP Reset Hold */
+	svm->reset_hold_type = AP_RESET_HOLD_NONE;
+
 	if (!svm->ghcb)
 		return;
 
@@ -2405,14 +2408,21 @@  static u64 ghcb_msr_version_info(void)
 	return msr;
 }
 
-static int sev_emulate_ap_reset_hold(struct vcpu_svm *svm)
+static int sev_emulate_ap_reset_hold(struct vcpu_svm *svm, enum ap_reset_hold_type type)
 {
 	int ret = kvm_skip_emulated_instruction(&svm->vcpu);
 
+	svm->reset_hold_type = type;
+
 	return __kvm_vcpu_halt(&svm->vcpu,
 			       KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
 }
 
+static u64 ghcb_msr_ap_rst_resp(u64 value)
+{
+	return (u64)GHCB_MSR_AP_RESET_HOLD_RESP | (value << GHCB_DATA_LOW);
+}
+
 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -2459,6 +2469,16 @@  static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
 
 		break;
 	}
+	case GHCB_MSR_AP_RESET_HOLD_REQ:
+		ret = sev_emulate_ap_reset_hold(svm, AP_RESET_HOLD_MSR_PROTO);
+
+		/*
+		 * Preset the result to a non-SIPI return and then only set
+		 * the result to non-zero when delivering a SIPI.
+		 */
+		svm->vmcb->control.ghcb_gpa = ghcb_msr_ap_rst_resp(0);
+
+		break;
 	case GHCB_MSR_TERM_REQ: {
 		u64 reason_set, reason_code;
 
@@ -2544,7 +2564,7 @@  int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 		ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
 		break;
 	case SVM_VMGEXIT_AP_HLT_LOOP:
-		ret = sev_emulate_ap_reset_hold(svm);
+		ret = sev_emulate_ap_reset_hold(svm, AP_RESET_HOLD_NAE_EVENT);
 		break;
 	case SVM_VMGEXIT_AP_JUMP_TABLE: {
 		struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
@@ -2679,13 +2699,23 @@  void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 		return;
 	}
 
-	/*
-	 * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
-	 * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
-	 * non-zero value.
-	 */
-	if (!svm->ghcb)
-		return;
-
-	ghcb_set_sw_exit_info_2(svm->ghcb, 1);
+	/* Subsequent SIPI */
+	switch (svm->reset_hold_type) {
+	case AP_RESET_HOLD_NAE_EVENT:
+		/*
+		 * Return from an AP Reset Hold VMGEXIT, where the guest will
+		 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+		 */
+		ghcb_set_sw_exit_info_2(svm->ghcb, 1);
+		break;
+	case AP_RESET_HOLD_MSR_PROTO:
+		/*
+		 * Return from an AP Reset Hold VMGEXIT, where the guest will
+		 * set the CS and RIP. Set GHCB data field to a non-zero value.
+		 */
+		svm->vmcb->control.ghcb_gpa = ghcb_msr_ap_rst_resp(1);
+		break;
+	default:
+		break;
+	}
 }
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 68e5f16a0554..bf9379f1cfb8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -69,6 +69,12 @@  enum {
 /* TPR and CR2 are always written before VMRUN */
 #define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
 
+enum ap_reset_hold_type {
+	AP_RESET_HOLD_NONE,
+	AP_RESET_HOLD_NAE_EVENT,
+	AP_RESET_HOLD_MSR_PROTO,
+};
+
 struct kvm_sev_info {
 	bool active;		/* SEV enabled guest */
 	bool es_active;		/* SEV-ES enabled guest */
@@ -199,6 +205,8 @@  struct vcpu_svm {
 	bool ghcb_sa_free;
 
 	bool guest_state_loaded;
+
+	enum ap_reset_hold_type reset_hold_type;
 };
 
 struct svm_cpu_data {