
[RFC,05/35] KVM: SVM: Add initial support for SEV-ES GHCB access to KVM

Message ID: 9e52807342691ff0d4b116af6e147021c61a2d71.1600114548.git.thomas.lendacky@amd.com
Series: SEV-ES hypervisor support

Commit Message

Tom Lendacky Sept. 14, 2020, 8:15 p.m. UTC
From: Tom Lendacky <thomas.lendacky@amd.com>

Provide initial support for accessing the GHCB when needing to access
registers for an SEV-ES guest. The support consists of:

  - Accessing the GHCB instead of the VMSA when reading and writing
    guest registers (after the VMSA has been encrypted).
  - Creating register access override functions for reading and writing
    guest registers from the common KVM support.
  - Allocating pages for the VMSA and GHCB when creating each vCPU
    - The VMSA page holds the encrypted VMSA for the vCPU
    - The GHCB page is used to hold a copy of the guest GHCB during
      VMGEXIT processing.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 arch/x86/include/asm/kvm_host.h  |   7 ++
 arch/x86/include/asm/msr-index.h |   1 +
 arch/x86/kvm/kvm_cache_regs.h    |  30 +++++--
 arch/x86/kvm/svm/svm.c           | 138 ++++++++++++++++++++++++++++++-
 arch/x86/kvm/svm/svm.h           |  65 ++++++++++++++-
 5 files changed, 230 insertions(+), 11 deletions(-)

Comments

Tom Lendacky Sept. 15, 2020, 1:24 p.m. UTC | #1
On 9/14/20 3:58 PM, Sean Christopherson wrote:
> On Mon, Sep 14, 2020 at 03:15:19PM -0500, Tom Lendacky wrote:
>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>
>> Provide initial support for accessing the GHCB when needing to access
>> registers for an SEV-ES guest. The support consists of:
>>
>>   - Accessing the GHCB instead of the VMSA when reading and writing
>>     guest registers (after the VMSA has been encrypted).
>>   - Creating register access override functions for reading and writing
>>     guest registers from the common KVM support.
>>   - Allocating pages for the VMSA and GHCB when creating each vCPU
>>     - The VMSA page holds the encrypted VMSA for the vCPU
>>     - The GHCB page is used to hold a copy of the guest GHCB during
>>       VMGEXIT processing.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h  |   7 ++
>>  arch/x86/include/asm/msr-index.h |   1 +
>>  arch/x86/kvm/kvm_cache_regs.h    |  30 +++++--
>>  arch/x86/kvm/svm/svm.c           | 138 ++++++++++++++++++++++++++++++-
>>  arch/x86/kvm/svm/svm.h           |  65 ++++++++++++++-
>>  5 files changed, 230 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 5303dbc5c9bc..c900992701d6 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -788,6 +788,9 @@ struct kvm_vcpu_arch {
>>  
>>  	/* AMD MSRC001_0015 Hardware Configuration */
>>  	u64 msr_hwcr;
>> +
>> +	/* SEV-ES support */
>> +	bool vmsa_encrypted;
> 
> 
> Peeking a little into the future, Intel needs a very similar flag for TDX[*].
> At a glance throughout the series, I don't see anything that is super SEV-ES
> specific, so I think we could do s/vmsa_encrypted/guest_state_protected (or
> something along those lines).

Yup, I can do that.

> 
> [*] https://software.intel.com/content/www/us/en/develop/articles/intel-trust-domain-extensions.html
> 
>>  };
>>  
>>  struct kvm_lpage_info {
>> @@ -1227,6 +1230,10 @@ struct kvm_x86_ops {
>>  	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
>>  
>>  	void (*migrate_timers)(struct kvm_vcpu *vcpu);
>> +
>> +	void (*reg_read_override)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
>> +	void (*reg_write_override)(struct kvm_vcpu *vcpu, enum kvm_reg reg,
>> +				   unsigned long val);
>>  };
>>  
>>  struct kvm_x86_nested_ops {
>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
>> index 249a4147c4b2..16f5b20bb099 100644
>> --- a/arch/x86/include/asm/msr-index.h
>> +++ b/arch/x86/include/asm/msr-index.h
>> @@ -466,6 +466,7 @@
>>  #define MSR_AMD64_IBSBRTARGET		0xc001103b
>>  #define MSR_AMD64_IBSOPDATA4		0xc001103d
>>  #define MSR_AMD64_IBS_REG_COUNT_MAX	8 /* includes MSR_AMD64_IBSBRTARGET */
>> +#define MSR_AMD64_VM_PAGE_FLUSH		0xc001011e
>>  #define MSR_AMD64_SEV_ES_GHCB		0xc0010130
>>  #define MSR_AMD64_SEV			0xc0010131
>>  #define MSR_AMD64_SEV_ENABLED_BIT	0
>> diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
>> index cfe83d4ae625..e87eb90999d5 100644
>> --- a/arch/x86/kvm/kvm_cache_regs.h
>> +++ b/arch/x86/kvm/kvm_cache_regs.h
>> @@ -9,15 +9,21 @@
>>  	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR  \
>>  	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
>>  
>> -#define BUILD_KVM_GPR_ACCESSORS(lname, uname)				      \
>> -static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
>> -{									      \
>> -	return vcpu->arch.regs[VCPU_REGS_##uname];			      \
>> -}									      \
>> -static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,	      \
>> -						unsigned long val)	      \
>> -{									      \
>> -	vcpu->arch.regs[VCPU_REGS_##uname] = val;			      \
>> +#define BUILD_KVM_GPR_ACCESSORS(lname, uname)					\
>> +static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)	\
>> +{										\
>> +	if (kvm_x86_ops.reg_read_override)					\
>> +		kvm_x86_ops.reg_read_override(vcpu, VCPU_REGS_##uname);		\
>> +										\
>> +	return vcpu->arch.regs[VCPU_REGS_##uname];				\
>> +}										\
>> +static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,		\
>> +						unsigned long val)		\
>> +{										\
>> +	if (kvm_x86_ops.reg_write_override)					\
>> +		kvm_x86_ops.reg_write_override(vcpu, VCPU_REGS_##uname, val);	\
>> +										\
>> +	vcpu->arch.regs[VCPU_REGS_##uname] = val;				\
>>  }
>>  BUILD_KVM_GPR_ACCESSORS(rax, RAX)
>>  BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
>> @@ -67,6 +73,9 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
>>  	if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
>>  		return 0;
>>  
>> +	if (kvm_x86_ops.reg_read_override)
>> +		kvm_x86_ops.reg_read_override(vcpu, reg);
>> +
>>  	if (!kvm_register_is_available(vcpu, reg))
>>  		kvm_x86_ops.cache_reg(vcpu, reg);
>>  
>> @@ -79,6 +88,9 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
>>  	if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
>>  		return;
>>  
>> +	if (kvm_x86_ops.reg_write_override)
>> +		kvm_x86_ops.reg_write_override(vcpu, reg, val);
> 
> 
> There has to be a more optimal approach for propagating registers between
> vcpu->arch.regs and the VMSA than adding a per-GPR hook.  Why not simply
> copy the entire set of registers to/from the VMSA on every exit and entry?
> AFAICT, valid_bits is only used in the read path, and KVM doesn't do anything
> sophisticated when it hits a !valid_bits read.

That would probably be ok. And actually, the code might be able to just
check the GHCB valid bitmap for valid regs on exit, copy them and then
clear the bitmap. The write code could check if vmsa_encrypted is set and
then set a "valid" bit for the reg that could be used to set regs on entry.

I'm not sure if turning kvm_vcpu_arch.regs into a struct and adding a
valid bit would be overkill or not.
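
Roughly, the copy-and-clear at exit could look something like this (purely a
sketch, not part of this patch; it reuses vcpu_to_vmsa_entry(), VMSA_REG_UNDEF
and the GHCB valid_bitmap introduced here, and the function name is made up):

/*
 * Sketch only: on VMGEXIT, copy every GPR the guest marked valid in the
 * GHCB into vcpu->arch.regs, then clear the bitmap so stale values are
 * not reused on a later exit.
 */
static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
{
	struct vmcb_save_area *ghcb_save = &svm->ghcb->save;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u64 *ghcb_regs = (u64 *)ghcb_save;
	int reg;

	for (reg = 0; reg < NR_VCPU_REGS; reg++) {
		unsigned int entry = vcpu_to_vmsa_entry(reg);

		if (entry == VMSA_REG_UNDEF)
			continue;

		/* Only consume entries the guest explicitly marked valid. */
		if (test_bit(entry, (unsigned long *)ghcb_save->valid_bitmap))
			vcpu->arch.regs[reg] = ghcb_regs[entry];
	}

	memset(ghcb_save->valid_bitmap, 0, sizeof(ghcb_save->valid_bitmap));
}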

> 
> If unconditional copying has a noticeable impact on e.g. IRQ handling
> latency, the save/restore could be limited to exits that may access guest
> state, which is presumably a well-defined, limited list.  Such exits are
> basically a slow path anyway, especially if the guest kernel is taking a
> #VC on the front end.  Adding hooks to KVM_{GET,SET}_REGS to ensure userspace
> accesses are handled correctly would be trivial.
> 
> Adding per-GPR hooks will bloat common KVM for both VMX and SVM, and will
> likely have a noticeable performance impact on SVM due to adding 60-70 cycles
> to every GPR access for the retpoline.  Static calls will take the sting out
> of that, but it's still a lot of code bytes that, IMO, are completely
> unnecessary.

Yes, definitely something I need to look into more.

> 
>> +
>>  	vcpu->arch.regs[reg] = val;
>>  	kvm_register_mark_dirty(vcpu, reg);
>>  }
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 779c167e42cc..d1f52211627a 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -1175,6 +1175,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>>  	struct page *msrpm_pages;
>>  	struct page *hsave_page;
>>  	struct page *nested_msrpm_pages;
>> +	struct page *vmsa_page = NULL;
>>  	int err;
>>  
>>  	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
>> @@ -1197,9 +1198,19 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>>  	if (!hsave_page)
>>  		goto free_page3;
>>  
>> +	if (sev_es_guest(svm->vcpu.kvm)) {
>> +		/*
>> +		 * SEV-ES guests require a separate VMSA page used to contain
>> +		 * the encrypted register state of the guest.
>> +		 */
>> +		vmsa_page = alloc_page(GFP_KERNEL);
>> +		if (!vmsa_page)
>> +			goto free_page4;
>> +	}
>> +
>>  	err = avic_init_vcpu(svm);
>>  	if (err)
>> -		goto free_page4;
>> +		goto free_page5;
>>  
>>  	/* We initialize this flag to true to make sure that the is_running
>>  	 * bit would be set the first time the vcpu is loaded.
>> @@ -1219,6 +1230,12 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>>  	svm->vmcb = page_address(page);
>>  	clear_page(svm->vmcb);
>>  	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
>> +
>> +	if (vmsa_page) {
>> +		svm->vmsa = page_address(vmsa_page);
>> +		clear_page(svm->vmsa);
>> +	}
>> +
>>  	svm->asid_generation = 0;
>>  	init_vmcb(svm);
>>  
>> @@ -1227,6 +1244,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
>>  
>>  	return 0;
>>  
>> +free_page5:
>> +	if (vmsa_page)
>> +		__free_page(vmsa_page);
>>  free_page4:
>>  	__free_page(hsave_page);
>>  free_page3:
>> @@ -1258,6 +1278,26 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
>>  	 */
>>  	svm_clear_current_vmcb(svm->vmcb);
>>  
>> +	if (sev_es_guest(vcpu->kvm)) {
>> +		struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
>> +
>> +		if (vcpu->arch.vmsa_encrypted) {
>> +			u64 page_to_flush;
>> +
>> +			/*
>> +			 * The VMSA page was used by hardware to hold guest
>> +			 * encrypted state, be sure to flush it before returning
>> +			 * it to the system. This is done using the VM Page
>> +			 * Flush MSR (which takes the page virtual address and
>> +			 * guest ASID).
>> +			 */
>> +			page_to_flush = (u64)svm->vmsa | sev->asid;
>> +			wrmsrl(MSR_AMD64_VM_PAGE_FLUSH, page_to_flush);
>> +		}
>> +
>> +		__free_page(virt_to_page(svm->vmsa));
>> +	}
>> +
>>  	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
>>  	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
>>  	__free_page(virt_to_page(svm->nested.hsave));
>> @@ -4012,6 +4052,99 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
>>  		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
>>  }
>>  
>> +/*
>> + * These return values represent the offset in quad words within the VM save
>> + * area. This allows them to be accessed by casting the save area to a u64
>> + * array.
>> + */
>> +#define VMSA_REG_ENTRY(_field)	 (offsetof(struct vmcb_save_area, _field) / sizeof(u64))
>> +#define VMSA_REG_UNDEF		 VMSA_REG_ENTRY(valid_bitmap)
>> +static inline unsigned int vcpu_to_vmsa_entry(enum kvm_reg reg)
>> +{
>> +	switch (reg) {
>> +	case VCPU_REGS_RAX:	return VMSA_REG_ENTRY(rax);
>> +	case VCPU_REGS_RBX:	return VMSA_REG_ENTRY(rbx);
>> +	case VCPU_REGS_RCX:	return VMSA_REG_ENTRY(rcx);
>> +	case VCPU_REGS_RDX:	return VMSA_REG_ENTRY(rdx);
>> +	case VCPU_REGS_RSP:	return VMSA_REG_ENTRY(rsp);
>> +	case VCPU_REGS_RBP:	return VMSA_REG_ENTRY(rbp);
>> +	case VCPU_REGS_RSI:	return VMSA_REG_ENTRY(rsi);
>> +	case VCPU_REGS_RDI:	return VMSA_REG_ENTRY(rdi);
>> +#ifdef CONFIG_X86_64
>> +	case VCPU_REGS_R8:	return VMSA_REG_ENTRY(r8);
>> +	case VCPU_REGS_R9:	return VMSA_REG_ENTRY(r9);
>> +	case VCPU_REGS_R10:	return VMSA_REG_ENTRY(r10);
>> +	case VCPU_REGS_R11:	return VMSA_REG_ENTRY(r11);
>> +	case VCPU_REGS_R12:	return VMSA_REG_ENTRY(r12);
>> +	case VCPU_REGS_R13:	return VMSA_REG_ENTRY(r13);
>> +	case VCPU_REGS_R14:	return VMSA_REG_ENTRY(r14);
>> +	case VCPU_REGS_R15:	return VMSA_REG_ENTRY(r15);
>> +#endif
>> +	case VCPU_REGS_RIP:	return VMSA_REG_ENTRY(rip);
>> +	default:
>> +		WARN_ONCE(1, "unsupported VCPU to VMSA register conversion\n");
>> +		return VMSA_REG_UNDEF;
>> +	}
>> +}
>> +
>> +/* For SEV-ES guests, populate the vCPU register from the appropriate VMSA/GHCB */
>> +static void svm_reg_read_override(struct kvm_vcpu *vcpu, enum kvm_reg reg)
>> +{
>> +	struct vmcb_save_area *vmsa;
>> +	struct vcpu_svm *svm;
>> +	unsigned int entry;
>> +	unsigned long val;
>> +	u64 *vmsa_reg;
>> +
>> +	if (!sev_es_guest(vcpu->kvm))
>> +		return;
>> +
>> +	entry = vcpu_to_vmsa_entry(reg);
>> +	if (entry == VMSA_REG_UNDEF)
>> +		return;
>> +
>> +	svm = to_svm(vcpu);
>> +	vmsa = get_vmsa(svm);
>> +	vmsa_reg = (u64 *)vmsa;
>> +	val = (unsigned long)vmsa_reg[entry];
>> +
>> +	/* If a GHCB is mapped, check the bitmap of valid entries */
>> +	if (svm->ghcb) {
>> +		if (!test_bit(entry, (unsigned long *)vmsa->valid_bitmap))
>> +			val = 0;
> 
> Is KVM relying on this being 0?  Would it make sense to stuff something like
> 0xaaaa... or 0xdeadbeefdeadbeef so that consumption of bogus data is more
> noticeable?

No, KVM isn't relying on this being 0. I thought about using something
other than 0 here, but settled on just using 0. I'm open to changing that,
though. I'm not sure if there's an easy way to short-circuit the intercept
and respond back with an error at this point; that would be optimal.

> 
>> +	}
>> +
>> +	vcpu->arch.regs[reg] = val;
>> +}
>> +
>> +/* For SEV-ES guests, set the vCPU register in the appropriate VMSA */
>> +static void svm_reg_write_override(struct kvm_vcpu *vcpu, enum kvm_reg reg,
>> +				   unsigned long val)
>> +{
>> +	struct vmcb_save_area *vmsa;
>> +	struct vcpu_svm *svm;
>> +	unsigned int entry;
>> +	u64 *vmsa_reg;
>> +
>> +	entry = vcpu_to_vmsa_entry(reg);
>> +	if (entry == VMSA_REG_UNDEF)
>> +		return;
>> +
>> +	svm = to_svm(vcpu);
>> +	vmsa = get_vmsa(svm);
>> +	vmsa_reg = (u64 *)vmsa;
>> +
>> +	/* If a GHCB is mapped, set the bit to indicate a valid entry */
>> +	if (svm->ghcb) {
>> +		unsigned int index = entry / 8;
>> +		unsigned int shift = entry % 8;
>> +
>> +		vmsa->valid_bitmap[index] |= BIT(shift);
>> +	}
>> +
>> +	vmsa_reg[entry] = val;
>> +}
>> +
>>  static void svm_vm_destroy(struct kvm *kvm)
>>  {
>>  	avic_vm_destroy(kvm);
>> @@ -4150,6 +4283,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>>  	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
>>  
>>  	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
>> +
>> +	.reg_read_override = svm_reg_read_override,
>> +	.reg_write_override = svm_reg_write_override,
>>  };
>>  
>>  static struct kvm_x86_init_ops svm_init_ops __initdata = {
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index f42ba9d158df..ff587536f571 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -159,6 +159,10 @@ struct vcpu_svm {
>>  	 */
>>  	struct list_head ir_list;
>>  	spinlock_t ir_list_lock;
>> +
>> +	/* SEV-ES support */
>> +	struct vmcb_save_area *vmsa;
>> +	struct ghcb *ghcb;
>>  };
>>  
>>  struct svm_cpu_data {
>> @@ -509,9 +513,34 @@ void sev_hardware_teardown(void);
>>  
>>  static inline struct vmcb_save_area *get_vmsa(struct vcpu_svm *svm)
>>  {
>> -	return &svm->vmcb->save;
>> +	struct vmcb_save_area *vmsa;
>> +
>> +	if (sev_es_guest(svm->vcpu.kvm)) {
>> +		/*
>> +		 * Before LAUNCH_UPDATE_VMSA, use the actual SEV-ES save area
>> +		 * to construct the initial state.  Afterwards, use the mapped
>> +		 * GHCB in a VMGEXIT or the traditional save area as a scratch
>> +		 * area when outside of a VMGEXIT.
>> +		 */
>> +		if (svm->vcpu.arch.vmsa_encrypted) {
>> +			if (svm->ghcb)
>> +				vmsa = &svm->ghcb->save;
>> +			else
>> +				vmsa = &svm->vmcb->save;
>> +		} else {
>> +			vmsa = svm->vmsa;
>> +		}
> 
> Not sure if it's actually better, but this whole thing could be:
> 
> 	if (!sev_es_guest(svm->vcpu.kvm))
> 		return &svm->vmcb->save;
> 
> 	if (!svm->vcpu.arch.vmsa_encrypted)
> 		return svm->vmsa;
> 
> 	return svm->ghcb ? &svm->ghcb->save : &svm->vmcb->save;
> 

It does look cleaner.

> 
>> +	} else {
>> +		vmsa = &svm->vmcb->save;
>> +	}
>> +
>> +	return vmsa;
>>  }
>>  
>> +#define SEV_ES_SET_VALID(_vmsa, _field)					\
>> +	__set_bit(GHCB_BITMAP_IDX(_field),				\
>> +		  (unsigned long *)(_vmsa)->valid_bitmap)
>> +
>>  #define DEFINE_VMSA_SEGMENT_ENTRY(_field, _entry, _size)		\
>>  	static inline _size						\
>>  	svm_##_field##_read_##_entry(struct vcpu_svm *svm)		\
>> @@ -528,6 +557,9 @@ static inline struct vmcb_save_area *get_vmsa(struct vcpu_svm *svm)
>>  		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
>>  									\
>>  		vmsa->_field._entry = value;				\
>> +		if (svm->vcpu.arch.vmsa_encrypted) {			\
> 
> Pretty sure braces are unnecessary on all these.

Yup, I can get rid of them.

Thanks,
Tom

> 
>> +			SEV_ES_SET_VALID(vmsa, _field);			\
>> +		}							\
>>  	}								\
>>  
>>  #define DEFINE_VMSA_SEGMENT_ACCESSOR(_field)				\
>> @@ -551,6 +583,9 @@ static inline struct vmcb_save_area *get_vmsa(struct vcpu_svm *svm)
>>  		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
>>  									\
>>  		vmsa->_field = *seg;					\
>> +		if (svm->vcpu.arch.vmsa_encrypted) {			\
>> +			SEV_ES_SET_VALID(vmsa, _field);			\
>> +		}							\
>>  	}
>>  
>>  DEFINE_VMSA_SEGMENT_ACCESSOR(cs)
>> @@ -579,6 +614,9 @@ DEFINE_VMSA_SEGMENT_ACCESSOR(tr)
>>  		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
>>  									\
>>  		vmsa->_field = value;					\
>> +		if (svm->vcpu.arch.vmsa_encrypted) {			\
>> +			SEV_ES_SET_VALID(vmsa, _field);			\
>> +		}							\
>>  	}								\
>>  									\
>>  	static inline void						\
>> @@ -587,6 +625,9 @@ DEFINE_VMSA_SEGMENT_ACCESSOR(tr)
>>  		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
>>  									\
>>  		vmsa->_field &= value;					\
>> +		if (svm->vcpu.arch.vmsa_encrypted) {			\
>> +			SEV_ES_SET_VALID(vmsa, _field);			\
>> +		}							\
>>  	}								\
>>  									\
>>  	static inline void						\
>> @@ -595,6 +636,9 @@ DEFINE_VMSA_SEGMENT_ACCESSOR(tr)
>>  		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
>>  									\
>>  		vmsa->_field |= value;					\
>> +		if (svm->vcpu.arch.vmsa_encrypted) {			\
>> +			SEV_ES_SET_VALID(vmsa, _field);			\
>> +		}							\
Sean Christopherson Sept. 15, 2020, 4:28 p.m. UTC | #2
On Tue, Sep 15, 2020 at 08:24:22AM -0500, Tom Lendacky wrote:
> On 9/14/20 3:58 PM, Sean Christopherson wrote:
> >> @@ -79,6 +88,9 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
> >>  	if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
> >>  		return;
> >>  
> >> +	if (kvm_x86_ops.reg_write_override)
> >> +		kvm_x86_ops.reg_write_override(vcpu, reg, val);
> > 
> > 
> > There has to be a more optimal approach for propagating registers between
> > vcpu->arch.regs and the VMSA than adding a per-GPR hook.  Why not simply
> > copy the entire set of registers to/from the VMSA on every exit and entry?
> > AFAICT, valid_bits is only used in the read path, and KVM doesn't do anything
> > sophisticated when it hits a !valid_bits read.
> 
> That would probably be ok. And actually, the code might be able to just
> check the GHCB valid bitmap for valid regs on exit, copy them and then
> clear the bitmap. The write code could check if vmsa_encrypted is set and
> then set a "valid" bit for the reg that could be used to set regs on entry.
> 
> I'm not sure if turning kvm_vcpu_arch.regs into a struct and adding a
> valid bit would be overkill or not.

KVM already has space in regs_avail and regs_dirty for GPRs, they're just not
used by the get/set helpers because they're always loaded/stored for both SVM
and VMX.

I assume nothing will break if KVM "writes" random GPRs in the VMSA?  I can't
see how the guest would achieve any level of security if it wantonly consumes
GPRs, i.e. it's the guest's responsibility to consume only the relevant GPRs.

If that holds true, then avoiding the copying isn't functionally necessary, and
is really just a performance optimization.  One potentially crazy idea would be
to change vcpu->arch.regs to be a pointer (defaulting to a __regs array), and then
have SEV-ES switch it to point directly at the VMSA array (I think the layout
is identical for x86-64?).
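
For reference, the existing tracking in arch/x86/kvm/kvm_cache_regs.h is
roughly the following (paraphrased from the kernel around this series, not
part of this patch):

static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
					     enum kvm_reg reg)
{
	/* Set when the register value cached in vcpu->arch.regs is current. */
	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}

static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
					   enum kvm_reg reg)
{
	/* Mark the cached value both current and in need of write-back. */
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
}

SEV-ES could clear the GPR bits in regs_avail on exit and consult regs_dirty
on entry to decide what to push into the GHCB, instead of hooking every access.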

> >> @@ -4012,6 +4052,99 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
> >>  		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
> >>  }
> >>  
> >> +/*
> >> + * These return values represent the offset in quad words within the VM save
> >> + * area. This allows them to be accessed by casting the save area to a u64
> >> + * array.
> >> + */
> >> +#define VMSA_REG_ENTRY(_field)	 (offsetof(struct vmcb_save_area, _field) / sizeof(u64))
> >> +#define VMSA_REG_UNDEF		 VMSA_REG_ENTRY(valid_bitmap)
> >> +static inline unsigned int vcpu_to_vmsa_entry(enum kvm_reg reg)
> >> +{
> >> +	switch (reg) {
> >> +	case VCPU_REGS_RAX:	return VMSA_REG_ENTRY(rax);
> >> +	case VCPU_REGS_RBX:	return VMSA_REG_ENTRY(rbx);
> >> +	case VCPU_REGS_RCX:	return VMSA_REG_ENTRY(rcx);
> >> +	case VCPU_REGS_RDX:	return VMSA_REG_ENTRY(rdx);
> >> +	case VCPU_REGS_RSP:	return VMSA_REG_ENTRY(rsp);
> >> +	case VCPU_REGS_RBP:	return VMSA_REG_ENTRY(rbp);
> >> +	case VCPU_REGS_RSI:	return VMSA_REG_ENTRY(rsi);
> >> +	case VCPU_REGS_RDI:	return VMSA_REG_ENTRY(rdi);
> >> +#ifdef CONFIG_X86_64

Is KVM SEV-ES going to support 32-bit builds?

> >> +	case VCPU_REGS_R8:	return VMSA_REG_ENTRY(r8);
> >> +	case VCPU_REGS_R9:	return VMSA_REG_ENTRY(r9);
> >> +	case VCPU_REGS_R10:	return VMSA_REG_ENTRY(r10);
> >> +	case VCPU_REGS_R11:	return VMSA_REG_ENTRY(r11);
> >> +	case VCPU_REGS_R12:	return VMSA_REG_ENTRY(r12);
> >> +	case VCPU_REGS_R13:	return VMSA_REG_ENTRY(r13);
> >> +	case VCPU_REGS_R14:	return VMSA_REG_ENTRY(r14);
> >> +	case VCPU_REGS_R15:	return VMSA_REG_ENTRY(r15);
> >> +#endif
> >> +	case VCPU_REGS_RIP:	return VMSA_REG_ENTRY(rip);
> >> +	default:
> >> +		WARN_ONCE(1, "unsupported VCPU to VMSA register conversion\n");
> >> +		return VMSA_REG_UNDEF;
> >> +	}
> >> +}
> >> +
> >> +/* For SEV-ES guests, populate the vCPU register from the appropriate VMSA/GHCB */
> >> +static void svm_reg_read_override(struct kvm_vcpu *vcpu, enum kvm_reg reg)
> >> +{
> >> +	struct vmcb_save_area *vmsa;
> >> +	struct vcpu_svm *svm;
> >> +	unsigned int entry;
> >> +	unsigned long val;
> >> +	u64 *vmsa_reg;
> >> +
> >> +	if (!sev_es_guest(vcpu->kvm))
> >> +		return;
> >> +
> >> +	entry = vcpu_to_vmsa_entry(reg);
> >> +	if (entry == VMSA_REG_UNDEF)
> >> +		return;
> >> +
> >> +	svm = to_svm(vcpu);
> >> +	vmsa = get_vmsa(svm);
> >> +	vmsa_reg = (u64 *)vmsa;
> >> +	val = (unsigned long)vmsa_reg[entry];
> >> +
> >> +	/* If a GHCB is mapped, check the bitmap of valid entries */
> >> +	if (svm->ghcb) {
> >> +		if (!test_bit(entry, (unsigned long *)vmsa->valid_bitmap))
> >> +			val = 0;
> > 
> > Is KVM relying on this being 0?  Would it make sense to stuff something like
> > 0xaaaa... or 0xdeadbeefdeadbeef so that consumption of bogus data is more
> > noticeable?
> 
> No, KVM isn't relying on this being 0. I thought about using something
> other than 0 here, but settled on just using 0. I'm open to changing that,
> though. I'm not sure if there's an easy way to short-circuit the intercept
> and respond back with an error at this point; that would be optimal.

Ya, responding with an error would be ideal.  At this point, we're taking the
same lazy approach for TDX and effectively consuming garbage if the guest
requests emulation but doesn't expose the necessary GPRs.  That being said,
TDX's guest/host ABI is quite rigid, so all the "is this register valid"
checks could be hardcoded into the higher level "emulation" flows.

Would that also be an option for SEV-ES?
Tom Lendacky Sept. 16, 2020, 2:54 p.m. UTC | #3
On 9/15/20 11:28 AM, Sean Christopherson wrote:
> On Tue, Sep 15, 2020 at 08:24:22AM -0500, Tom Lendacky wrote:
>> On 9/14/20 3:58 PM, Sean Christopherson wrote:
>>>> @@ -79,6 +88,9 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
>>>>  	if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
>>>>  		return;
>>>>  
>>>> +	if (kvm_x86_ops.reg_write_override)
>>>> +		kvm_x86_ops.reg_write_override(vcpu, reg, val);
>>>
>>>
>>> There has to be a more optimal approach for propagating registers between
>>> vcpu->arch.regs and the VMSA than adding a per-GPR hook.  Why not simply
>>> copy the entire set of registers to/from the VMSA on every exit and entry?
>>> AFAICT, valid_bits is only used in the read path, and KVM doesn't do anything
>>> sophisticated when it hits a !valid_bits read.
>>
>> That would probably be ok. And actually, the code might be able to just
>> check the GHCB valid bitmap for valid regs on exit, copy them and then
>> clear the bitmap. The write code could check if vmsa_encrypted is set and
>> then set a "valid" bit for the reg that could be used to set regs on entry.
>>
>> I'm not sure if turning kvm_vcpu_arch.regs into a struct and adding a
>> valid bit would be overkill or not.
> 
> KVM already has space in regs_avail and regs_dirty for GPRs, they're just not
> used by the get/set helpers because they're always loaded/stored for both SVM
> and VMX.
> 
> I assume nothing will break if KVM "writes" random GPRs in the VMSA?  I can't
> see how the guest would achieve any level of security if it wantonly consumes
> GPRs, i.e. it's the guest's responsibility to consume only the relevant GPRs.

Right, the guest should only read the registers that it is expecting to be
provided by the hypervisor as set forth in the GHCB spec. It shouldn't
load any other registers that the hypervisor provides. The Linux SEV-ES
guest support follows this model and will only load the registers that are
specified via the GHCB spec for a particular NAE event, ignoring anything
else provided.

> 
> If that holds true, then avoiding the copying isn't functionally necessary, and
> is really just a performance optimization.  One potentially crazy idea would be
> to change vcpu->arch.regs to be a pointer (defaulting to a __regs array), and then
> have SEV-ES switch it to point directly at the VMSA array (I think the layout
> is identical for x86-64?).

That would be nice, but it isn't quite laid out like that. Before SEV-ES
support, RAX and RSP were the only GPRs saved. With the arrival of SEV-ES,
the remaining registers were added to the VMSA, but at a location a number
of bytes after RAX and RSP. So right now, the slots where RAX and RSP would
fall in the new register block of the VMSA are just reserved areas (see offset
0x300 in the VMSA layout of the APM volume 2,
https://www.amd.com/system/files/TechDocs/24593.pdf).

I might be able to move the RAX and RSP values before the VMSA is
encrypted (or the GHCB returned), assuming those fields would stay
reserved, but I don't think that can be guaranteed.

Let me see if I can put something together using regs_avail and regs_dirty.
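
As a very rough sketch (names and placement made up, just to show the shape of
it), the entry-side sync could walk regs_dirty and push only what KVM actually
wrote into the mapped GHCB:

/*
 * Sketch only: before resuming the guest, copy the GPRs KVM wrote
 * (tracked in regs_dirty) into the GHCB and flag them valid, rather
 * than hooking every individual register write.
 */
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
{
	struct vmcb_save_area *ghcb_save = &svm->ghcb->save;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u64 *ghcb_regs = (u64 *)ghcb_save;
	unsigned int reg;

	for_each_set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty,
			 NR_VCPU_REGS) {
		unsigned int entry = vcpu_to_vmsa_entry(reg);

		if (entry == VMSA_REG_UNDEF)
			continue;

		ghcb_regs[entry] = vcpu->arch.regs[reg];
		__set_bit(entry, (unsigned long *)ghcb_save->valid_bitmap);
	}
}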

> 
>>>> @@ -4012,6 +4052,99 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
>>>>  		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
>>>>  }
>>>>  
>>>> +/*
>>>> + * These return values represent the offset in quad words within the VM save
>>>> + * area. This allows them to be accessed by casting the save area to a u64
>>>> + * array.
>>>> + */
>>>> +#define VMSA_REG_ENTRY(_field)	 (offsetof(struct vmcb_save_area, _field) / sizeof(u64))
>>>> +#define VMSA_REG_UNDEF		 VMSA_REG_ENTRY(valid_bitmap)
>>>> +static inline unsigned int vcpu_to_vmsa_entry(enum kvm_reg reg)
>>>> +{
>>>> +	switch (reg) {
>>>> +	case VCPU_REGS_RAX:	return VMSA_REG_ENTRY(rax);
>>>> +	case VCPU_REGS_RBX:	return VMSA_REG_ENTRY(rbx);
>>>> +	case VCPU_REGS_RCX:	return VMSA_REG_ENTRY(rcx);
>>>> +	case VCPU_REGS_RDX:	return VMSA_REG_ENTRY(rdx);
>>>> +	case VCPU_REGS_RSP:	return VMSA_REG_ENTRY(rsp);
>>>> +	case VCPU_REGS_RBP:	return VMSA_REG_ENTRY(rbp);
>>>> +	case VCPU_REGS_RSI:	return VMSA_REG_ENTRY(rsi);
>>>> +	case VCPU_REGS_RDI:	return VMSA_REG_ENTRY(rdi);
>>>> +#ifdef CONFIG_X86_64
> 
> Is KVM SEV-ES going to support 32-bit builds?

No, SEV-ES won't support 32-bit builds and since those fields are always
defined, I can just remove this #ifdef.

> 
>>>> +	case VCPU_REGS_R8:	return VMSA_REG_ENTRY(r8);
>>>> +	case VCPU_REGS_R9:	return VMSA_REG_ENTRY(r9);
>>>> +	case VCPU_REGS_R10:	return VMSA_REG_ENTRY(r10);
>>>> +	case VCPU_REGS_R11:	return VMSA_REG_ENTRY(r11);
>>>> +	case VCPU_REGS_R12:	return VMSA_REG_ENTRY(r12);
>>>> +	case VCPU_REGS_R13:	return VMSA_REG_ENTRY(r13);
>>>> +	case VCPU_REGS_R14:	return VMSA_REG_ENTRY(r14);
>>>> +	case VCPU_REGS_R15:	return VMSA_REG_ENTRY(r15);
>>>> +#endif
>>>> +	case VCPU_REGS_RIP:	return VMSA_REG_ENTRY(rip);
>>>> +	default:
>>>> +		WARN_ONCE(1, "unsupported VCPU to VMSA register conversion\n");
>>>> +		return VMSA_REG_UNDEF;
>>>> +	}
>>>> +}
>>>> +
>>>> +/* For SEV-ES guests, populate the vCPU register from the appropriate VMSA/GHCB */
>>>> +static void svm_reg_read_override(struct kvm_vcpu *vcpu, enum kvm_reg reg)
>>>> +{
>>>> +	struct vmcb_save_area *vmsa;
>>>> +	struct vcpu_svm *svm;
>>>> +	unsigned int entry;
>>>> +	unsigned long val;
>>>> +	u64 *vmsa_reg;
>>>> +
>>>> +	if (!sev_es_guest(vcpu->kvm))
>>>> +		return;
>>>> +
>>>> +	entry = vcpu_to_vmsa_entry(reg);
>>>> +	if (entry == VMSA_REG_UNDEF)
>>>> +		return;
>>>> +
>>>> +	svm = to_svm(vcpu);
>>>> +	vmsa = get_vmsa(svm);
>>>> +	vmsa_reg = (u64 *)vmsa;
>>>> +	val = (unsigned long)vmsa_reg[entry];
>>>> +
>>>> +	/* If a GHCB is mapped, check the bitmap of valid entries */
>>>> +	if (svm->ghcb) {
>>>> +		if (!test_bit(entry, (unsigned long *)vmsa->valid_bitmap))
>>>> +			val = 0;
>>>
>>> Is KVM relying on this being 0?  Would it make sense to stuff something like
>>> 0xaaaa... or 0xdeadbeefdeadbeef so that consumption of bogus data is more
>>> noticeable?
>>
>> No, KVM isn't relying on this being 0. I thought about using something
>> other than 0 here, but settled on just using 0. I'm open to changing that,
>> though. I'm not sure if there's an easy way to short-circuit the intercept
>> and respond back with an error at this point; that would be optimal.
> 
> Ya, responding with an error would be ideal.  At this point, we're taking the
> same lazy approach for TDX and effectively consuming garbage if the guest
> requests emulation but doesn't expose the necessary GPRs.  That being said,
> TDX's guest/host ABI is quite rigid, so all the "is this register valid"
> checks could be hardcoded into the higher level "emulation" flows.
> 
> Would that also be an option for SEV-ES?

Meaning adding the expected input checks at VMEXIT time in the VMGEXIT
handler, so that accesses later are guaranteed to be good? That is an
option and might also address one of the other points you brought up
about receiving exits that are not supported/expected.
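
Something along these lines, keyed on the NAE event code (sketch only; the
RAX/RCX requirement for CPUID follows the GHCB spec, while the helper itself
and where it would be called from are made up):

/*
 * Sketch only: at VMGEXIT time, fail the request if the guest did not
 * supply the GPRs the GHCB spec requires for this NAE event, so later
 * register reads are guaranteed to see guest-provided values.
 */
static bool sev_es_ghcb_inputs_valid(struct vcpu_svm *svm, u64 exit_code)
{
	struct vmcb_save_area *ghcb_save = &svm->ghcb->save;
	unsigned long *valid = (unsigned long *)ghcb_save->valid_bitmap;

	switch (exit_code) {
	case SVM_EXIT_CPUID:
		/* CPUID expects the leaf in RAX and the subleaf in RCX. */
		return test_bit(VMSA_REG_ENTRY(rax), valid) &&
		       test_bit(VMSA_REG_ENTRY(rcx), valid);
	default:
		/* Each other NAE event would get its own required set. */
		return true;
	}
}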

Thanks,
Tom

>

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5303dbc5c9bc..c900992701d6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -788,6 +788,9 @@  struct kvm_vcpu_arch {
 
 	/* AMD MSRC001_0015 Hardware Configuration */
 	u64 msr_hwcr;
+
+	/* SEV-ES support */
+	bool vmsa_encrypted;
 };
 
 struct kvm_lpage_info {
@@ -1227,6 +1230,10 @@  struct kvm_x86_ops {
 	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
 
 	void (*migrate_timers)(struct kvm_vcpu *vcpu);
+
+	void (*reg_read_override)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+	void (*reg_write_override)(struct kvm_vcpu *vcpu, enum kvm_reg reg,
+				   unsigned long val);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 249a4147c4b2..16f5b20bb099 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -466,6 +466,7 @@ 
 #define MSR_AMD64_IBSBRTARGET		0xc001103b
 #define MSR_AMD64_IBSOPDATA4		0xc001103d
 #define MSR_AMD64_IBS_REG_COUNT_MAX	8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_VM_PAGE_FLUSH		0xc001011e
 #define MSR_AMD64_SEV_ES_GHCB		0xc0010130
 #define MSR_AMD64_SEV			0xc0010131
 #define MSR_AMD64_SEV_ENABLED_BIT	0
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cfe83d4ae625..e87eb90999d5 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,15 +9,21 @@ 
 	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR  \
 	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
 
-#define BUILD_KVM_GPR_ACCESSORS(lname, uname)				      \
-static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
-{									      \
-	return vcpu->arch.regs[VCPU_REGS_##uname];			      \
-}									      \
-static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,	      \
-						unsigned long val)	      \
-{									      \
-	vcpu->arch.regs[VCPU_REGS_##uname] = val;			      \
+#define BUILD_KVM_GPR_ACCESSORS(lname, uname)					\
+static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)	\
+{										\
+	if (kvm_x86_ops.reg_read_override)					\
+		kvm_x86_ops.reg_read_override(vcpu, VCPU_REGS_##uname);		\
+										\
+	return vcpu->arch.regs[VCPU_REGS_##uname];				\
+}										\
+static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,		\
+						unsigned long val)		\
+{										\
+	if (kvm_x86_ops.reg_write_override)					\
+		kvm_x86_ops.reg_write_override(vcpu, VCPU_REGS_##uname, val);	\
+										\
+	vcpu->arch.regs[VCPU_REGS_##uname] = val;				\
 }
 BUILD_KVM_GPR_ACCESSORS(rax, RAX)
 BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
@@ -67,6 +73,9 @@  static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
 	if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
 		return 0;
 
+	if (kvm_x86_ops.reg_read_override)
+		kvm_x86_ops.reg_read_override(vcpu, reg);
+
 	if (!kvm_register_is_available(vcpu, reg))
 		kvm_x86_ops.cache_reg(vcpu, reg);
 
@@ -79,6 +88,9 @@  static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
 	if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
 		return;
 
+	if (kvm_x86_ops.reg_write_override)
+		kvm_x86_ops.reg_write_override(vcpu, reg, val);
+
 	vcpu->arch.regs[reg] = val;
 	kvm_register_mark_dirty(vcpu, reg);
 }
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 779c167e42cc..d1f52211627a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1175,6 +1175,7 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	struct page *msrpm_pages;
 	struct page *hsave_page;
 	struct page *nested_msrpm_pages;
+	struct page *vmsa_page = NULL;
 	int err;
 
 	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
@@ -1197,9 +1198,19 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	if (!hsave_page)
 		goto free_page3;
 
+	if (sev_es_guest(svm->vcpu.kvm)) {
+		/*
+		 * SEV-ES guests require a separate VMSA page used to contain
+		 * the encrypted register state of the guest.
+		 */
+		vmsa_page = alloc_page(GFP_KERNEL);
+		if (!vmsa_page)
+			goto free_page4;
+	}
+
 	err = avic_init_vcpu(svm);
 	if (err)
-		goto free_page4;
+		goto free_page5;
 
 	/* We initialize this flag to true to make sure that the is_running
 	 * bit would be set the first time the vcpu is loaded.
@@ -1219,6 +1230,12 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
 	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+
+	if (vmsa_page) {
+		svm->vmsa = page_address(vmsa_page);
+		clear_page(svm->vmsa);
+	}
+
 	svm->asid_generation = 0;
 	init_vmcb(svm);
 
@@ -1227,6 +1244,9 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
 	return 0;
 
+free_page5:
+	if (vmsa_page)
+		__free_page(vmsa_page);
 free_page4:
 	__free_page(hsave_page);
 free_page3:
@@ -1258,6 +1278,26 @@  static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	 */
 	svm_clear_current_vmcb(svm->vmcb);
 
+	if (sev_es_guest(vcpu->kvm)) {
+		struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+		if (vcpu->arch.vmsa_encrypted) {
+			u64 page_to_flush;
+
+			/*
+			 * The VMSA page was used by hardware to hold guest
+			 * encrypted state, be sure to flush it before returning
+			 * it to the system. This is done using the VM Page
+			 * Flush MSR (which takes the page virtual address and
+			 * guest ASID).
+			 */
+			page_to_flush = (u64)svm->vmsa | sev->asid;
+			wrmsrl(MSR_AMD64_VM_PAGE_FLUSH, page_to_flush);
+		}
+
+		__free_page(virt_to_page(svm->vmsa));
+	}
+
 	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 	__free_page(virt_to_page(svm->nested.hsave));
@@ -4012,6 +4052,99 @@  static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
 }
 
+/*
+ * These return values represent the offset in quad words within the VM save
+ * area. This allows them to be accessed by casting the save area to a u64
+ * array.
+ */
+#define VMSA_REG_ENTRY(_field)	 (offsetof(struct vmcb_save_area, _field) / sizeof(u64))
+#define VMSA_REG_UNDEF		 VMSA_REG_ENTRY(valid_bitmap)
+static inline unsigned int vcpu_to_vmsa_entry(enum kvm_reg reg)
+{
+	switch (reg) {
+	case VCPU_REGS_RAX:	return VMSA_REG_ENTRY(rax);
+	case VCPU_REGS_RBX:	return VMSA_REG_ENTRY(rbx);
+	case VCPU_REGS_RCX:	return VMSA_REG_ENTRY(rcx);
+	case VCPU_REGS_RDX:	return VMSA_REG_ENTRY(rdx);
+	case VCPU_REGS_RSP:	return VMSA_REG_ENTRY(rsp);
+	case VCPU_REGS_RBP:	return VMSA_REG_ENTRY(rbp);
+	case VCPU_REGS_RSI:	return VMSA_REG_ENTRY(rsi);
+	case VCPU_REGS_RDI:	return VMSA_REG_ENTRY(rdi);
+#ifdef CONFIG_X86_64
+	case VCPU_REGS_R8:	return VMSA_REG_ENTRY(r8);
+	case VCPU_REGS_R9:	return VMSA_REG_ENTRY(r9);
+	case VCPU_REGS_R10:	return VMSA_REG_ENTRY(r10);
+	case VCPU_REGS_R11:	return VMSA_REG_ENTRY(r11);
+	case VCPU_REGS_R12:	return VMSA_REG_ENTRY(r12);
+	case VCPU_REGS_R13:	return VMSA_REG_ENTRY(r13);
+	case VCPU_REGS_R14:	return VMSA_REG_ENTRY(r14);
+	case VCPU_REGS_R15:	return VMSA_REG_ENTRY(r15);
+#endif
+	case VCPU_REGS_RIP:	return VMSA_REG_ENTRY(rip);
+	default:
+		WARN_ONCE(1, "unsupported VCPU to VMSA register conversion\n");
+		return VMSA_REG_UNDEF;
+	}
+}
+
+/* For SEV-ES guests, populate the vCPU register from the appropriate VMSA/GHCB */
+static void svm_reg_read_override(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+	struct vmcb_save_area *vmsa;
+	struct vcpu_svm *svm;
+	unsigned int entry;
+	unsigned long val;
+	u64 *vmsa_reg;
+
+	if (!sev_es_guest(vcpu->kvm))
+		return;
+
+	entry = vcpu_to_vmsa_entry(reg);
+	if (entry == VMSA_REG_UNDEF)
+		return;
+
+	svm = to_svm(vcpu);
+	vmsa = get_vmsa(svm);
+	vmsa_reg = (u64 *)vmsa;
+	val = (unsigned long)vmsa_reg[entry];
+
+	/* If a GHCB is mapped, check the bitmap of valid entries */
+	if (svm->ghcb) {
+		if (!test_bit(entry, (unsigned long *)vmsa->valid_bitmap))
+			val = 0;
+	}
+
+	vcpu->arch.regs[reg] = val;
+}
+
+/* For SEV-ES guests, set the vCPU register in the appropriate VMSA */
+static void svm_reg_write_override(struct kvm_vcpu *vcpu, enum kvm_reg reg,
+				   unsigned long val)
+{
+	struct vmcb_save_area *vmsa;
+	struct vcpu_svm *svm;
+	unsigned int entry;
+	u64 *vmsa_reg;
+
+	entry = vcpu_to_vmsa_entry(reg);
+	if (entry == VMSA_REG_UNDEF)
+		return;
+
+	svm = to_svm(vcpu);
+	vmsa = get_vmsa(svm);
+	vmsa_reg = (u64 *)vmsa;
+
+	/* If a GHCB is mapped, set the bit to indicate a valid entry */
+	if (svm->ghcb) {
+		unsigned int index = entry / 8;
+		unsigned int shift = entry % 8;
+
+		vmsa->valid_bitmap[index] |= BIT(shift);
+	}
+
+	vmsa_reg[entry] = val;
+}
+
 static void svm_vm_destroy(struct kvm *kvm)
 {
 	avic_vm_destroy(kvm);
@@ -4150,6 +4283,9 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
 
 	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+	.reg_read_override = svm_reg_read_override,
+	.reg_write_override = svm_reg_write_override,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f42ba9d158df..ff587536f571 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -159,6 +159,10 @@  struct vcpu_svm {
 	 */
 	struct list_head ir_list;
 	spinlock_t ir_list_lock;
+
+	/* SEV-ES support */
+	struct vmcb_save_area *vmsa;
+	struct ghcb *ghcb;
 };
 
 struct svm_cpu_data {
@@ -509,9 +513,34 @@  void sev_hardware_teardown(void);
 
 static inline struct vmcb_save_area *get_vmsa(struct vcpu_svm *svm)
 {
-	return &svm->vmcb->save;
+	struct vmcb_save_area *vmsa;
+
+	if (sev_es_guest(svm->vcpu.kvm)) {
+		/*
+		 * Before LAUNCH_UPDATE_VMSA, use the actual SEV-ES save area
+		 * to construct the initial state.  Afterwards, use the mapped
+		 * GHCB in a VMGEXIT or the traditional save area as a scratch
+		 * area when outside of a VMGEXIT.
+		 */
+		if (svm->vcpu.arch.vmsa_encrypted) {
+			if (svm->ghcb)
+				vmsa = &svm->ghcb->save;
+			else
+				vmsa = &svm->vmcb->save;
+		} else {
+			vmsa = svm->vmsa;
+		}
+	} else {
+		vmsa = &svm->vmcb->save;
+	}
+
+	return vmsa;
 }
 
+#define SEV_ES_SET_VALID(_vmsa, _field)					\
+	__set_bit(GHCB_BITMAP_IDX(_field),				\
+		  (unsigned long *)(_vmsa)->valid_bitmap)
+
 #define DEFINE_VMSA_SEGMENT_ENTRY(_field, _entry, _size)		\
 	static inline _size						\
 	svm_##_field##_read_##_entry(struct vcpu_svm *svm)		\
@@ -528,6 +557,9 @@  static inline struct vmcb_save_area *get_vmsa(struct vcpu_svm *svm)
 		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
 									\
 		vmsa->_field._entry = value;				\
+		if (svm->vcpu.arch.vmsa_encrypted) {			\
+			SEV_ES_SET_VALID(vmsa, _field);			\
+		}							\
 	}								\
 
 #define DEFINE_VMSA_SEGMENT_ACCESSOR(_field)				\
@@ -551,6 +583,9 @@  static inline struct vmcb_save_area *get_vmsa(struct vcpu_svm *svm)
 		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
 									\
 		vmsa->_field = *seg;					\
+		if (svm->vcpu.arch.vmsa_encrypted) {			\
+			SEV_ES_SET_VALID(vmsa, _field);			\
+		}							\
 	}
 
 DEFINE_VMSA_SEGMENT_ACCESSOR(cs)
@@ -579,6 +614,9 @@  DEFINE_VMSA_SEGMENT_ACCESSOR(tr)
 		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
 									\
 		vmsa->_field = value;					\
+		if (svm->vcpu.arch.vmsa_encrypted) {			\
+			SEV_ES_SET_VALID(vmsa, _field);			\
+		}							\
 	}								\
 									\
 	static inline void						\
@@ -587,6 +625,9 @@  DEFINE_VMSA_SEGMENT_ACCESSOR(tr)
 		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
 									\
 		vmsa->_field &= value;					\
+		if (svm->vcpu.arch.vmsa_encrypted) {			\
+			SEV_ES_SET_VALID(vmsa, _field);			\
+		}							\
 	}								\
 									\
 	static inline void						\
@@ -595,6 +636,9 @@  DEFINE_VMSA_SEGMENT_ACCESSOR(tr)
 		struct vmcb_save_area *vmsa = get_vmsa(svm);		\
 									\
 		vmsa->_field |= value;					\
+		if (svm->vcpu.arch.vmsa_encrypted) {			\
+			SEV_ES_SET_VALID(vmsa, _field);			\
+		}							\
 	}
 
 #define DEFINE_VMSA_ACCESSOR(_field)					\
@@ -629,6 +673,25 @@  DEFINE_VMSA_ACCESSOR(last_excp_to)
 DEFINE_VMSA_U8_ACCESSOR(cpl)
 DEFINE_VMSA_ACCESSOR(rip)
 DEFINE_VMSA_ACCESSOR(rax)
+DEFINE_VMSA_ACCESSOR(rbx)
+DEFINE_VMSA_ACCESSOR(rcx)
+DEFINE_VMSA_ACCESSOR(rdx)
 DEFINE_VMSA_ACCESSOR(rsp)
+DEFINE_VMSA_ACCESSOR(rbp)
+DEFINE_VMSA_ACCESSOR(rsi)
+DEFINE_VMSA_ACCESSOR(rdi)
+DEFINE_VMSA_ACCESSOR(r8)
+DEFINE_VMSA_ACCESSOR(r9)
+DEFINE_VMSA_ACCESSOR(r10)
+DEFINE_VMSA_ACCESSOR(r11)
+DEFINE_VMSA_ACCESSOR(r12)
+DEFINE_VMSA_ACCESSOR(r13)
+DEFINE_VMSA_ACCESSOR(r14)
+DEFINE_VMSA_ACCESSOR(r15)
+DEFINE_VMSA_ACCESSOR(sw_exit_code)
+DEFINE_VMSA_ACCESSOR(sw_exit_info_1)
+DEFINE_VMSA_ACCESSOR(sw_exit_info_2)
+DEFINE_VMSA_ACCESSOR(sw_scratch)
+DEFINE_VMSA_ACCESSOR(xcr0)
 
 #endif