diff mbox series

[1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

Message ID 20210112063703.539893-1-wei.huang2@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions | expand

Commit Message

Wei Huang Jan. 12, 2021, 6:37 a.m. UTC
From: Bandan Das <bsd@redhat.com>

While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
before checking VMCB's instruction intercept. If EAX falls into such
memory areas, #GP is triggered before VMEXIT. This causes problem under
nested virtualization. To solve this problem, KVM needs to trap #GP and
check the instructions triggering #GP. For VM execution instructions,
KVM emulates these instructions; otherwise it re-injects #GP back to
guest VMs.

Signed-off-by: Bandan Das <bsd@redhat.com>
Co-developed-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/kvm_host.h |   8 +-
 arch/x86/kvm/mmu.h              |   1 +
 arch/x86/kvm/mmu/mmu.c          |   7 ++
 arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
 arch/x86/kvm/svm/svm.h          |   8 ++
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 arch/x86/kvm/x86.c              |  37 +++++++-
 7 files changed, 146 insertions(+), 74 deletions(-)

Comments

Maxim Levitsky Jan. 12, 2021, 11:09 a.m. UTC | #1
On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> From: Bandan Das <bsd@redhat.com>
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problem under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
> 
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>

This is the ultimate fix for this bug that I had in mind, 
but I didn't dare to develop it, thinking it won't be accepted 
due to the added complexity.

From a cursory look this look all right, and I will review 
and test this either today or tomorrow.

Thank you very much for doing the right fix for this bug.

Best regards,
	Maxim Levitsky

> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h              |   1 +
>  arch/x86/kvm/mmu/mmu.c          |   7 ++
>  arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>  arch/x86/kvm/svm/svm.h          |   8 ++
>  arch/x86/kvm/vmx/vmx.c          |   2 +-
>  arch/x86/kvm/x86.c              |  37 +++++++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *			     Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>   *			backdoor emulation, which is opt in via module param.
>   *			VMware backoor emulation handles select instructions
> - *			and reinjects the #GP for all other cases.
> + *			and reinjects #GP for all other cases. This also
> + *			handles other cases where #GP condition needs to be
> + *			handled and emulated appropriately
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>   *		 case the CR2/GPA value pass on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		    (1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>  #define EMULTYPE_PF		    (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);
>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include <asm/io.h>
>  #include <asm/vmx.h>
>  #include <asm/kvm_page_track.h>
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}
> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  		if (!(efer & EFER_SVME)) {
>  			svm_leave_nested(svm);
>  			svm_set_gif(svm, true);
> +			clr_exception_intercept(svm, GP_VECTOR);
>  
>  			/*
>  			 * Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> +	/* Enable GP interception for SVM instructions if needed */
> +	if (efer & EFER_SVME)
> +		set_exception_intercept(svm, GP_VECTOR);
> +
>  	return 0;
>  }
>  
> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int vmload_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmsave_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmrun_interception(struct vcpu_svm *svm)
> +{
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	return nested_svm_vmrun(svm);
> +}
> +
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>  }
>  
>  static bool is_erratum_383(void)
> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>  	return kvm_emulate_hypercall(&svm->vcpu);
>  }
>  
> -static int vmload_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmsave_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmrun_interception(struct vcpu_svm *svm)
> -{
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	return nested_svm_vmrun(svm);
> -}
> -
>  void svm_set_gif(struct vcpu_svm *svm, bool value)
>  {
>  	if (value) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 0fe874ae5498..d5dffcf59afa 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>  	recalc_intercepts(svm);
>  }
>  
> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
> +{
> +	struct vmcb *vmcb = get_host_vmcb(svm);
> +
> +	WARN_ON_ONCE(bit >= 32);
> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
> +}
> +
>  static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>  {
>  	struct vmcb *vmcb = get_host_vmcb(svm);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 2af05d3b0590..5fac2f7cba24 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  			return 1;
>  		}
> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>  	}
>  
>  	/*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a8969a6dd06..c3662fc3b1bc 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>  	++vcpu->stat.insn_emulation_fail;
>  	trace_kvm_emulate_insn_failed(vcpu);
>  
> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>  		return 1;
>  	}
> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>  	return false;
>  }
>  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);
> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;
>  	}
>  
>  	/*
Vitaly Kuznetsov Jan. 12, 2021, 12:15 p.m. UTC | #2
Wei Huang <wei.huang2@amd.com> writes:

> From: Bandan Das <bsd@redhat.com>
>
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problem under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h              |   1 +
>  arch/x86/kvm/mmu/mmu.c          |   7 ++
>  arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>  arch/x86/kvm/svm/svm.h          |   8 ++
>  arch/x86/kvm/vmx/vmx.c          |   2 +-
>  arch/x86/kvm/x86.c              |  37 +++++++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *			     Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>   *			backdoor emulation, which is opt in via module param.
>   *			VMware backoor emulation handles select instructions
> - *			and reinjects the #GP for all other cases.
> + *			and reinjects #GP for all other cases. This also
> + *			handles other cases where #GP condition needs to be
> + *			handled and emulated appropriately
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>   *		 case the CR2/GPA value pass on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		    (1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>  #define EMULTYPE_PF		    (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);

Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 

>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include <asm/io.h>
>  #include <asm/vmx.h>
>  #include <asm/kvm_page_track.h>
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}

While _e820__mapped_any()'s doc says '..  checks if any part of the
range <start,end> is mapped ..' it seems to me that the real check is
[start, end) so we should use 'gpa' instead of 'gpa-1', no?

> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  		if (!(efer & EFER_SVME)) {
>  			svm_leave_nested(svm);
>  			svm_set_gif(svm, true);
> +			clr_exception_intercept(svm, GP_VECTOR);
>  
>  			/*
>  			 * Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> +	/* Enable GP interception for SVM instructions if needed */
> +	if (efer & EFER_SVME)
> +		set_exception_intercept(svm, GP_VECTOR);
> +
>  	return 0;
>  }
>  
> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int vmload_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmsave_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmrun_interception(struct vcpu_svm *svm)
> +{
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	return nested_svm_vmrun(svm);
> +}
> +
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>  }
>  
>  static bool is_erratum_383(void)
> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>  	return kvm_emulate_hypercall(&svm->vcpu);
>  }
>  
> -static int vmload_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmsave_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmrun_interception(struct vcpu_svm *svm)
> -{
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	return nested_svm_vmrun(svm);
> -}
> -

Maybe if you'd do it the other way around and put gp_interception()
after vm{load,save,run}_interception(), the diff (and code churn)
would've been smaller? 

>  void svm_set_gif(struct vcpu_svm *svm, bool value)
>  {
>  	if (value) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 0fe874ae5498..d5dffcf59afa 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>  	recalc_intercepts(svm);
>  }
>  
> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
> +{
> +	struct vmcb *vmcb = get_host_vmcb(svm);
> +
> +	WARN_ON_ONCE(bit >= 32);
> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
> +}
> +
>  static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>  {
>  	struct vmcb *vmcb = get_host_vmcb(svm);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 2af05d3b0590..5fac2f7cba24 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  			return 1;
>  		}
> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>  	}
>  
>  	/*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a8969a6dd06..c3662fc3b1bc 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>  	++vcpu->stat.insn_emulation_fail;
>  	trace_kvm_emulate_insn_failed(vcpu);
>  
> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>  		return 1;
>  	}
> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>  	return false;
>  }
>  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)

Nit: it seems we either return '0' or 'ctxt->modrm' which is 'u8', so
'u8' instead of 'int' maybe?

> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);
> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;
>  	}
>  
>  	/*
Paolo Bonzini Jan. 12, 2021, 2:01 p.m. UTC | #3
On 12/01/21 07:37, Wei Huang wrote:
>   static int gp_interception(struct vcpu_svm *svm)
>   {
>   	struct kvm_vcpu *vcpu = &svm->vcpu;
>   	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>   
>   	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>   	 */
>   	if (error_code) {
>   		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>   		return 1;
>   	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>   }
>   

Passing back the third byte is quick hacky.  Instead of this change to 
kvm_emulate_instruction, I'd rather check the instruction bytes in 
gp_interception before calling kvm_emulate_instruction.  That would be 
something like:

- move "kvm_clear_exception_queue(vcpu);" inside the "if 
(!(emulation_type & EMULTYPE_NO_DECODE))".  It doesn't apply when you 
are coming back from userspace.

- extract the "if (!(emulation_type & EMULTYPE_NO_DECODE))" body to a 
new function x86_emulate_decoded_instruction.  Call it from 
gp_interception, we know this is not a pagefault and therefore 
vcpu->arch.write_fault_to_shadow_pgtable must be false.

- check ctxt->insn_bytes for an SVM instruction

- if not an SVM instruction, call kvm_emulate_instruction(vcpu, 
EMULTYPE_VMWARE_GP|EMULTYPE_NO_DECODE).

Thanks,

Paolo
Andy Lutomirski Jan. 12, 2021, 3:11 p.m. UTC | #4
> On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
> Wei Huang <wei.huang2@amd.com> writes:
> 
>> From: Bandan Das <bsd@redhat.com>
>> 
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problem under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>> 
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> ---
>> arch/x86/include/asm/kvm_host.h |   8 +-
>> arch/x86/kvm/mmu.h              |   1 +
>> arch/x86/kvm/mmu/mmu.c          |   7 ++
>> arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>> arch/x86/kvm/svm/svm.h          |   8 ++
>> arch/x86/kvm/vmx/vmx.c          |   2 +-
>> arch/x86/kvm/x86.c              |  37 +++++++-
>> 7 files changed, 146 insertions(+), 74 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 3d6616f6f6ef..0ddc309f5a14 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>  *                 due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>  *                 Used to test the full emulator from userspace.
>>  *
>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>  *            backdoor emulation, which is opt in via module param.
>>  *            VMware backoor emulation handles select instructions
>> - *            and reinjects the #GP for all other cases.
>> + *            and reinjects #GP for all other cases. This also
>> + *            handles other cases where #GP condition needs to be
>> + *            handled and emulated appropriately
>>  *
>>  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>>  *         case the CR2/GPA value pass on the stack is valid.
>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>> #define EMULTYPE_SKIP            (1 << 2)
>> #define EMULTYPE_ALLOW_RETRY_PF        (1 << 3)
>> #define EMULTYPE_TRAP_UD_FORCED        (1 << 4)
>> -#define EMULTYPE_VMWARE_GP        (1 << 5)
>> +#define EMULTYPE_PARAVIRT_GP        (1 << 5)
>> #define EMULTYPE_PF            (1 << 6)
>> 
>> int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 581925e476d6..1a2fff4e7140 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>> 
>> int kvm_mmu_post_init_vm(struct kvm *kvm);
>> void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>> +bool kvm_is_host_reserved_region(u64 gpa);
> 
> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
> 
>> 
>> #endif
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d16481aa29d..c5c4aaf01a1a 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -50,6 +50,7 @@
>> #include <asm/io.h>
>> #include <asm/vmx.h>
>> #include <asm/kvm_page_track.h>
>> +#include <asm/e820/api.h>
>> #include "trace.h"
>> 
>> extern bool itlb_multihit_kvm_mitigation;
>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>> }
>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>> 
>> +bool kvm_is_host_reserved_region(u64 gpa)
>> +{
>> +    return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>> +}
> 
> While _e820__mapped_any()'s doc says '..  checks if any part of the
> range <start,end> is mapped ..' it seems to me that the real check is
> [start, end) so we should use 'gpa' instead of 'gpa-1', no?

Why do you need to check GPA at all?
Maxim Levitsky Jan. 12, 2021, 3:17 p.m. UTC | #5
On Tue, 2021-01-12 at 07:11 -0800, Andy Lutomirski wrote:
> > On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> > 
> > Wei Huang <wei.huang2@amd.com> writes:
> > 
> > > From: Bandan Das <bsd@redhat.com>
> > > 
> > > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > > before checking VMCB's instruction intercept. If EAX falls into such
> > > memory areas, #GP is triggered before VMEXIT. This causes problem under
> > > nested virtualization. To solve this problem, KVM needs to trap #GP and
> > > check the instructions triggering #GP. For VM execution instructions,
> > > KVM emulates these instructions; otherwise it re-injects #GP back to
> > > guest VMs.
> > > 
> > > Signed-off-by: Bandan Das <bsd@redhat.com>
> > > Co-developed-by: Wei Huang <wei.huang2@amd.com>
> > > Signed-off-by: Wei Huang <wei.huang2@amd.com>
> > > ---
> > > arch/x86/include/asm/kvm_host.h |   8 +-
> > > arch/x86/kvm/mmu.h              |   1 +
> > > arch/x86/kvm/mmu/mmu.c          |   7 ++
> > > arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
> > > arch/x86/kvm/svm/svm.h          |   8 ++
> > > arch/x86/kvm/vmx/vmx.c          |   2 +-
> > > arch/x86/kvm/x86.c              |  37 +++++++-
> > > 7 files changed, 146 insertions(+), 74 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 3d6616f6f6ef..0ddc309f5a14 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
> > >  *                 due to an intercepted #UD (see EMULTYPE_TRAP_UD).
> > >  *                 Used to test the full emulator from userspace.
> > >  *
> > > - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> > > + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
> > >  *            backdoor emulation, which is opt in via module param.
> > >  *            VMware backoor emulation handles select instructions
> > > - *            and reinjects the #GP for all other cases.
> > > + *            and reinjects #GP for all other cases. This also
> > > + *            handles other cases where #GP condition needs to be
> > > + *            handled and emulated appropriately
> > >  *
> > >  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
> > >  *         case the CR2/GPA value pass on the stack is valid.
> > > @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
> > > #define EMULTYPE_SKIP            (1 << 2)
> > > #define EMULTYPE_ALLOW_RETRY_PF        (1 << 3)
> > > #define EMULTYPE_TRAP_UD_FORCED        (1 << 4)
> > > -#define EMULTYPE_VMWARE_GP        (1 << 5)
> > > +#define EMULTYPE_PARAVIRT_GP        (1 << 5)
> > > #define EMULTYPE_PF            (1 << 6)
> > > 
> > > int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > > index 581925e476d6..1a2fff4e7140 100644
> > > --- a/arch/x86/kvm/mmu.h
> > > +++ b/arch/x86/kvm/mmu.h
> > > @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
> > > 
> > > int kvm_mmu_post_init_vm(struct kvm *kvm);
> > > void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> > > +bool kvm_is_host_reserved_region(u64 gpa);
> > 
> > Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
> > 
> > > #endif
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 6d16481aa29d..c5c4aaf01a1a 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -50,6 +50,7 @@
> > > #include <asm/io.h>
> > > #include <asm/vmx.h>
> > > #include <asm/kvm_page_track.h>
> > > +#include <asm/e820/api.h>
> > > #include "trace.h"
> > > 
> > > extern bool itlb_multihit_kvm_mitigation;
> > > @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
> > > }
> > > EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> > > 
> > > +bool kvm_is_host_reserved_region(u64 gpa)
> > > +{
> > > +    return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> > > +}
> > 
> > While _e820__mapped_any()'s doc says '..  checks if any part of the
> > range <start,end> is mapped ..' it seems to me that the real check is
> > [start, end) so we should use 'gpa' instead of 'gpa-1', no?
> 
> Why do you need to check GPA at all?
> 
To reduce the scope of the workaround.

The errata only happens when you use one of SVM instructions
in the guest with EAX that happens to be inside one
of the host reserved memory regions (for example SMM).

So it is not expected for an SVM instruction with EAX that is a valid host
physical address to get a #GP due to this errata. 

Best regards,
	Maxim Levitsky

>
Andy Lutomirski Jan. 12, 2021, 3:22 p.m. UTC | #6
> On Jan 12, 2021, at 7:17 AM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> 
> On Tue, 2021-01-12 at 07:11 -0800, Andy Lutomirski wrote:
>>>> On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>> 
>>> Wei Huang <wei.huang2@amd.com> writes:
>>> 
>>>> From: Bandan Das <bsd@redhat.com>
>>>> 
>>>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>>>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>>>> before checking VMCB's instruction intercept. If EAX falls into such
>>>> memory areas, #GP is triggered before VMEXIT. This causes problem under
>>>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>>>> check the instructions triggering #GP. For VM execution instructions,
>>>> KVM emulates these instructions; otherwise it re-injects #GP back to
>>>> guest VMs.
>>>> 
>>>> Signed-off-by: Bandan Das <bsd@redhat.com>
>>>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>>>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>>>> ---
>>>> arch/x86/include/asm/kvm_host.h |   8 +-
>>>> arch/x86/kvm/mmu.h              |   1 +
>>>> arch/x86/kvm/mmu/mmu.c          |   7 ++
>>>> arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>>>> arch/x86/kvm/svm/svm.h          |   8 ++
>>>> arch/x86/kvm/vmx/vmx.c          |   2 +-
>>>> arch/x86/kvm/x86.c              |  37 +++++++-
>>>> 7 files changed, 146 insertions(+), 74 deletions(-)
>>>> 
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>> index 3d6616f6f6ef..0ddc309f5a14 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>>> *                 due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>>> *                 Used to test the full emulator from userspace.
>>>> *
>>>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>>>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>>> *            backdoor emulation, which is opt in via module param.
>>>> *            VMware backoor emulation handles select instructions
>>>> - *            and reinjects the #GP for all other cases.
>>>> + *            and reinjects #GP for all other cases. This also
>>>> + *            handles other cases where #GP condition needs to be
>>>> + *            handled and emulated appropriately
>>>> *
>>>> * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>>>> *         case the CR2/GPA value pass on the stack is valid.
>>>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>>>> #define EMULTYPE_SKIP            (1 << 2)
>>>> #define EMULTYPE_ALLOW_RETRY_PF        (1 << 3)
>>>> #define EMULTYPE_TRAP_UD_FORCED        (1 << 4)
>>>> -#define EMULTYPE_VMWARE_GP        (1 << 5)
>>>> +#define EMULTYPE_PARAVIRT_GP        (1 << 5)
>>>> #define EMULTYPE_PF            (1 << 6)
>>>> 
>>>> int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>>>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>>>> index 581925e476d6..1a2fff4e7140 100644
>>>> --- a/arch/x86/kvm/mmu.h
>>>> +++ b/arch/x86/kvm/mmu.h
>>>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>>>> 
>>>> int kvm_mmu_post_init_vm(struct kvm *kvm);
>>>> void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>>>> +bool kvm_is_host_reserved_region(u64 gpa);
>>> 
>>> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
>>> 
>>>> #endif
>>>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>>>> index 6d16481aa29d..c5c4aaf01a1a 100644
>>>> --- a/arch/x86/kvm/mmu/mmu.c
>>>> +++ b/arch/x86/kvm/mmu/mmu.c
>>>> @@ -50,6 +50,7 @@
>>>> #include <asm/io.h>
>>>> #include <asm/vmx.h>
>>>> #include <asm/kvm_page_track.h>
>>>> +#include <asm/e820/api.h>
>>>> #include "trace.h"
>>>> 
>>>> extern bool itlb_multihit_kvm_mitigation;
>>>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>>>> }
>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>> 
>>>> +bool kvm_is_host_reserved_region(u64 gpa)
>>>> +{
>>>> +    return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>>>> +}
>>> 
>>> While _e820__mapped_any()'s doc says '..  checks if any part of the
>>> range <start,end> is mapped ..' it seems to me that the real check is
>>> [start, end) so we should use 'gpa' instead of 'gpa-1', no?
>> 
>> Why do you need to check GPA at all?
>> 
> To reduce the scope of the workaround.
> 
> The errata only happens when you use one of SVM instructions
> in the guest with EAX that happens to be inside one
> of the host reserved memory regions (for example SMM).

This code reduces the scope of the workaround at the cost of increasing the complexity of the workaround and adding a nonsensical coupling between KVM and host details and adding an export that really doesn’t deserve to be exported.

Is there an actual concrete benefit to this check?
Bandan Das Jan. 12, 2021, 3:46 p.m. UTC | #7
Andy Lutomirski <luto@amacapital.net> writes:
...
>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
>>>>> "trace.h"
>>>>> 
>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>>> 
>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
>>>>  While _e820__mapped_any()'s doc says '..  checks if any part of
>>>> the range <start,end> is mapped ..' it seems to me that the real
>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
>>>> no?
>>>  Why do you need to check GPA at all?
>>> 
>> To reduce the scope of the workaround.
>> 
>> The errata only happens when you use one of SVM instructions in the
>> guest with EAX that happens to be inside one of the host reserved
>> memory regions (for example SMM).
>
> This code reduces the scope of the workaround at the cost of
> increasing the complexity of the workaround and adding a nonsensical
> coupling between KVM and host details and adding an export that really
> doesn’t deserve to be exported.
>
> Is there an actual concrete benefit to this check?

Besides reducing the scope, my intention for the check was that we should
know if such exceptions occur for any other undiscovered reasons with other
memory types rather than hiding them under this workaround.

Bandan
Andy Lutomirski Jan. 12, 2021, 3:51 p.m. UTC | #8
> On Jan 12, 2021, at 7:46 AM, Bandan Das <bsd@redhat.com> wrote:
> 
> Andy Lutomirski <luto@amacapital.net> writes:
> ...
>>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
>>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
>>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
>>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
>>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
>>>>>> "trace.h"
>>>>>> 
>>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
>>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
>>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>>>> 
>>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
>>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
>>>>> While _e820__mapped_any()'s doc says '..  checks if any part of
>>>>> the range <start,end> is mapped ..' it seems to me that the real
>>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
>>>>> no?
>>>> Why do you need to check GPA at all?
>>>> 
>>> To reduce the scope of the workaround.
>>> 
>>> The errata only happens when you use one of SVM instructions in the
>>> guest with EAX that happens to be inside one of the host reserved
>>> memory regions (for example SMM).
>> 
>> This code reduces the scope of the workaround at the cost of
>> increasing the complexity of the workaround and adding a nonsensical
>> coupling between KVM and host details and adding an export that really
>> doesn’t deserve to be exported.
>> 
>> Is there an actual concrete benefit to this check?
> 
> Besides reducing the scope, my intention for the check was that we should
> know if such exceptions occur for any other undiscovered reasons with other
> memory types rather than hiding them under this workaround.

Ask AMD?

I would also believe that someone somewhere has a firmware that simply omits the problematic region instead of listing it as reserved.

> 
> Bandan
> 
> 
>
Sean Christopherson Jan. 12, 2021, 5:36 p.m. UTC | #9
On Tue, Jan 12, 2021, Wei Huang wrote:
> From: Bandan Das <bsd@redhat.com>
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept.

It would be very helpful to list exactly which CPUs are/aren't affected, even if
that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
I assume it's all CPUs without the new CPUID flag?
Sean Christopherson Jan. 12, 2021, 5:42 p.m. UTC | #10
On Tue, Jan 12, 2021, Paolo Bonzini wrote:
> On 12/01/21 07:37, Wei Huang wrote:
> >   static int gp_interception(struct vcpu_svm *svm)
> >   {
> >   	struct kvm_vcpu *vcpu = &svm->vcpu;
> >   	u32 error_code = svm->vmcb->control.exit_info_1;
> > -
> > -	WARN_ON_ONCE(!enable_vmware_backdoor);
> > +	int rc;
> >   	/*
> > -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> > -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> > +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> > +	 * them has non-zero error codes.
> >   	 */
> >   	if (error_code) {
> >   		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
> >   		return 1;
> >   	}
> > -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> > +
> > +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> > +	if (rc > 1)
> > +		rc = svm_emulate_vm_instr(vcpu, rc);
> > +	return rc;
> >   }
> 
> Passing back the third byte is quick hacky.  Instead of this change to
> kvm_emulate_instruction, I'd rather check the instruction bytes in
> gp_interception before calling kvm_emulate_instruction.

Agreed.  And I'd also prefer that any pure refactoring is done in separate
patch(es) so that the actual functional change is better isolated.

On a related topic, it feels like nested should be disabled by default on SVM
until it's truly ready for primetime, with the patch tagged for stable.  That
way we don't have to worry about crafting non-trivial fixes (like this one) to
make them backport-friendly.
Sean Christopherson Jan. 12, 2021, 5:56 p.m. UTC | #11
On Tue, Jan 12, 2021, Andy Lutomirski wrote:
> 
> > On Jan 12, 2021, at 7:46 AM, Bandan Das <bsd@redhat.com> wrote:
> > 
> > Andy Lutomirski <luto@amacapital.net> writes:
> > ...
> >>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
> >>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
> >>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
> >>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
> >>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
> >>>>>> "trace.h"
> >>>>>> 
> >>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
> >>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
> >>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> >>>>>> 
> >>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
> >>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
> >>>>> While _e820__mapped_any()'s doc says '..  checks if any part of
> >>>>> the range <start,end> is mapped ..' it seems to me that the real
> >>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
> >>>>> no?
> >>>> Why do you need to check GPA at all?
> >>>> 
> >>> To reduce the scope of the workaround.
> >>> 
> >>> The errata only happens when you use one of SVM instructions in the
> >>> guest with EAX that happens to be inside one of the host reserved
> >>> memory regions (for example SMM).
> >> 
> >> This code reduces the scope of the workaround at the cost of
> >> increasing the complexity of the workaround and adding a nonsensical
> >> coupling between KVM and host details and adding an export that really
> >> doesn’t deserve to be exported.
> >> 
> >> Is there an actual concrete benefit to this check?
> > 
> > Besides reducing the scope, my intention for the check was that we should
> > know if such exceptions occur for any other undiscovered reasons with other
> > memory types rather than hiding them under this workaround.
> 
> Ask AMD?
> 
> I would also believe that someone somewhere has a firmware that simply omits
> the problematic region instead of listing it as reserved.

I agree with Andy, odds are very good that attempting to be precise will lead to
pain due to false negatives.

And, KVM's SVM instruction emulation needs to be be rock solid regardless of
this behavior since KVM unconditionally intercepts the instruction, i.e. there's
basically zero risk to KVM.
Sean Christopherson Jan. 12, 2021, 5:59 p.m. UTC | #12
On Tue, Jan 12, 2021, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Wei Huang wrote:
> > From: Bandan Das <bsd@redhat.com>
> > 
> > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > before checking VMCB's instruction intercept.
> 
> It would be very helpful to list exactly which CPUs are/aren't affected, even if
> that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
> I assume it's all CPUs without the new CPUID flag?

Ah, despite calling this an 'errata', the bad behavior is explicitly documented
in the APM, i.e. it's an architecture bug, not a silicon bug.

Can you reword the changelog to make it clear that the premature #GP is the
correct architectural behavior for CPUs without the new CPUID flag?
Andy Lutomirski Jan. 12, 2021, 6:58 p.m. UTC | #13
On Tue, Jan 12, 2021 at 9:59 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jan 12, 2021, Sean Christopherson wrote:
> > On Tue, Jan 12, 2021, Wei Huang wrote:
> > > From: Bandan Das <bsd@redhat.com>
> > >
> > > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > > before checking VMCB's instruction intercept.
> >
> > It would be very helpful to list exactly which CPUs are/aren't affected, even if
> > that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
> > I assume it's all CPUs without the new CPUID flag?
>
> Ah, despite calling this an 'errata', the bad behavior is explicitly documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.
>
> Can you reword the changelog to make it clear that the premature #GP is the
> correct architectural behavior for CPUs without the new CPUID flag?

Andrew Cooper points out that there may be a nicer workaround.  Make
sure that the SMRAM and HT region (FFFD00000000 - FFFFFFFFFFFF) are
marked as reserved in the guest, too.

--Andy
Sean Christopherson Jan. 12, 2021, 7:40 p.m. UTC | #14
On Tue, Jan 12, 2021, Wei Huang wrote:
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
v> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>  }
 
...
  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);
> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;

I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
be handled further up the stack.

>  	}
>  
>  	/*
> -- 
> 2.27.0
>
Bandan Das Jan. 12, 2021, 8 p.m. UTC | #15
Sean Christopherson <seanjc@google.com> writes:
...
>> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
>> -	    !is_vmware_backdoor_opcode(ctxt)) {
>> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> -		return 1;
>> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>> +		vminstr = is_vm_instr_opcode(ctxt);
>> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
>> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +			return 1;
>> +		}
>> +		if (vminstr)
>> +			return vminstr;
>
> I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
> x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
> be handled further up the stack.
>
So, the condition is that L2 executes a vmload and #GPs on a reserved address, jumps to L0 - L0 doesn't
check if L1 has asked for the instruction to be intercepted and goes on with emulating
vmload and returning back to L2 ?

>>  	}
>>  
>>  	/*
>> -- 
>> 2.27.0
>>
Wei Huang Jan. 12, 2021, 9:05 p.m. UTC | #16
On 1/12/21 5:09 AM, Maxim Levitsky wrote:
> On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
>> From: Bandan Das <bsd@redhat.com>
>>
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problem under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> 
> This is the ultimate fix for this bug that I had in mind,
> but I didn't dare to develop it, thinking it won't be accepted
> due to the added complexity.
> 
>  From a cursory look this look all right, and I will review
> and test this either today or tomorrow.

My tests mainly relied on the kvm-unit-test you developed (thanks BTW), 
on machines w/ and w/o CPUID_0x8000000A_EDX[28]=1. Both cases passed.

>
Wei Huang Jan. 12, 2021, 9:50 p.m. UTC | #17
On 1/12/21 6:15 AM, Vitaly Kuznetsov wrote:
> Wei Huang <wei.huang2@amd.com> writes:
> 
>> From: Bandan Das <bsd@redhat.com>
>>
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problem under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h |   8 +-
>>   arch/x86/kvm/mmu.h              |   1 +
>>   arch/x86/kvm/mmu/mmu.c          |   7 ++
>>   arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>>   arch/x86/kvm/svm/svm.h          |   8 ++
>>   arch/x86/kvm/vmx/vmx.c          |   2 +-
>>   arch/x86/kvm/x86.c              |  37 +++++++-
>>   7 files changed, 146 insertions(+), 74 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 3d6616f6f6ef..0ddc309f5a14 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>    *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>    *			     Used to test the full emulator from userspace.
>>    *
>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>    *			backdoor emulation, which is opt in via module param.
>>    *			VMware backoor emulation handles select instructions
>> - *			and reinjects the #GP for all other cases.
>> + *			and reinjects #GP for all other cases. This also
>> + *			handles other cases where #GP condition needs to be
>> + *			handled and emulated appropriately
>>    *
>>    * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>>    *		 case the CR2/GPA value pass on the stack is valid.
>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>>   #define EMULTYPE_SKIP		    (1 << 2)
>>   #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>>   #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
>> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
>> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>>   #define EMULTYPE_PF		    (1 << 6)
>>   
>>   int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 581925e476d6..1a2fff4e7140 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>>   
>>   int kvm_mmu_post_init_vm(struct kvm *kvm);
>>   void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>> +bool kvm_is_host_reserved_region(u64 gpa);
> 
> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe?

Will do in v2.

> 
>>   
>>   #endif
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d16481aa29d..c5c4aaf01a1a 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -50,6 +50,7 @@
>>   #include <asm/io.h>
>>   #include <asm/vmx.h>
>>   #include <asm/kvm_page_track.h>
>> +#include <asm/e820/api.h>
>>   #include "trace.h"
>>   
>>   extern bool itlb_multihit_kvm_mitigation;
>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>>   }
>>   EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>   
>> +bool kvm_is_host_reserved_region(u64 gpa)
>> +{
>> +	return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>> +}
> 
> While _e820__mapped_any()'s doc says '..  checks if any part of the
> range <start,end> is mapped ..' it seems to me that the real check is
> [start, end) so we should use 'gpa' instead of 'gpa-1', no?

I think you are right. The statement of "entry->addr >= end || 
entry->addr + entry->size <= start" shows the checking is against the 
area of [start, end).

> 
>> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
>> +
>>   void kvm_mmu_zap_all(struct kvm *kvm)
>>   {
>>   	struct kvm_mmu_page *sp, *node;
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 7ef171790d02..74620d32aa82 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>>   		if (!(efer & EFER_SVME)) {
>>   			svm_leave_nested(svm);
>>   			svm_set_gif(svm, true);
>> +			clr_exception_intercept(svm, GP_VECTOR);
>>   
>>   			/*
>>   			 * Free the nested guest state, unless we are in SMM.
>> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>>   
>>   	svm->vmcb->save.efer = efer | EFER_SVME;
>>   	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
>> +	/* Enable GP interception for SVM instructions if needed */
>> +	if (efer & EFER_SVME)
>> +		set_exception_intercept(svm, GP_VECTOR);
>> +
>>   	return 0;
>>   }
>>   
>> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>>   	return 1;
>>   }
>>   
>> +static int vmload_interception(struct vcpu_svm *svm)
>> +{
>> +	struct vmcb *nested_vmcb;
>> +	struct kvm_host_map map;
>> +	int ret;
>> +
>> +	if (nested_svm_check_permissions(svm))
>> +		return 1;
>> +
>> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> +	if (ret) {
>> +		if (ret == -EINVAL)
>> +			kvm_inject_gp(&svm->vcpu, 0);
>> +		return 1;
>> +	}
>> +
>> +	nested_vmcb = map.hva;
>> +
>> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> +
>> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
>> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vmsave_interception(struct vcpu_svm *svm)
>> +{
>> +	struct vmcb *nested_vmcb;
>> +	struct kvm_host_map map;
>> +	int ret;
>> +
>> +	if (nested_svm_check_permissions(svm))
>> +		return 1;
>> +
>> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> +	if (ret) {
>> +		if (ret == -EINVAL)
>> +			kvm_inject_gp(&svm->vcpu, 0);
>> +		return 1;
>> +	}
>> +
>> +	nested_vmcb = map.hva;
>> +
>> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> +
>> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
>> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vmrun_interception(struct vcpu_svm *svm)
>> +{
>> +	if (nested_svm_check_permissions(svm))
>> +		return 1;
>> +
>> +	return nested_svm_vmrun(svm);
>> +}
>> +
>> +/* Emulate SVM VM execution instructions */
>> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
>> +{
>> +	struct vcpu_svm *svm = to_svm(vcpu);
>> +
>> +	switch (modrm) {
>> +	case 0xd8: /* VMRUN */
>> +		return vmrun_interception(svm);
>> +	case 0xda: /* VMLOAD */
>> +		return vmload_interception(svm);
>> +	case 0xdb: /* VMSAVE */
>> +		return vmsave_interception(svm);
>> +	default:
>> +		/* inject a #GP for all other cases */
>> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +		return 1;
>> +	}
>> +}
>> +
>>   static int gp_interception(struct vcpu_svm *svm)
>>   {
>>   	struct kvm_vcpu *vcpu = &svm->vcpu;
>>   	u32 error_code = svm->vmcb->control.exit_info_1;
>> -
>> -	WARN_ON_ONCE(!enable_vmware_backdoor);
>> +	int rc;
>>   
>>   	/*
>> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
>> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
>> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
>> +	 * them has non-zero error codes.
>>   	 */
>>   	if (error_code) {
>>   		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>   		return 1;
>>   	}
>> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +
>> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>> +	if (rc > 1)
>> +		rc = svm_emulate_vm_instr(vcpu, rc);
>> +	return rc;
>>   }
>>   
>>   static bool is_erratum_383(void)
>> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>>   	return kvm_emulate_hypercall(&svm->vcpu);
>>   }
>>   
>> -static int vmload_interception(struct vcpu_svm *svm)
>> -{
>> -	struct vmcb *nested_vmcb;
>> -	struct kvm_host_map map;
>> -	int ret;
>> -
>> -	if (nested_svm_check_permissions(svm))
>> -		return 1;
>> -
>> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> -	if (ret) {
>> -		if (ret == -EINVAL)
>> -			kvm_inject_gp(&svm->vcpu, 0);
>> -		return 1;
>> -	}
>> -
>> -	nested_vmcb = map.hva;
>> -
>> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> -
>> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
>> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> -
>> -	return ret;
>> -}
>> -
>> -static int vmsave_interception(struct vcpu_svm *svm)
>> -{
>> -	struct vmcb *nested_vmcb;
>> -	struct kvm_host_map map;
>> -	int ret;
>> -
>> -	if (nested_svm_check_permissions(svm))
>> -		return 1;
>> -
>> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> -	if (ret) {
>> -		if (ret == -EINVAL)
>> -			kvm_inject_gp(&svm->vcpu, 0);
>> -		return 1;
>> -	}
>> -
>> -	nested_vmcb = map.hva;
>> -
>> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> -
>> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
>> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> -
>> -	return ret;
>> -}
>> -
>> -static int vmrun_interception(struct vcpu_svm *svm)
>> -{
>> -	if (nested_svm_check_permissions(svm))
>> -		return 1;
>> -
>> -	return nested_svm_vmrun(svm);
>> -}
>> -
> 
> Maybe if you'd do it the other way around and put gp_interception()
> after vm{load,save,run}_interception(), the diff (and code churn)
> would've been smaller?

Agreed.

> 
>>   void svm_set_gif(struct vcpu_svm *svm, bool value)
>>   {
>>   	if (value) {
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index 0fe874ae5498..d5dffcf59afa 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>>   	recalc_intercepts(svm);
>>   }
>>   
>> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
>> +{
>> +	struct vmcb *vmcb = get_host_vmcb(svm);
>> +
>> +	WARN_ON_ONCE(bit >= 32);
>> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
>> +}
>> +
>>   static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>>   {
>>   	struct vmcb *vmcb = get_host_vmcb(svm);
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 2af05d3b0590..5fac2f7cba24 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>>   			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>   			return 1;
>>   		}
>> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>>   	}
>>   
>>   	/*
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 9a8969a6dd06..c3662fc3b1bc 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>>   	++vcpu->stat.insn_emulation_fail;
>>   	trace_kvm_emulate_insn_failed(vcpu);
>>   
>> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
>> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>>   		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>>   		return 1;
>>   	}
>> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>>   	return false;
>>   }
>>   
>> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> 
> Nit: it seems we either return '0' or 'ctxt->modrm' which is 'u8', so
> 'u8' instead of 'int' maybe?

Agreed. Also Paolo has some comments around this area as well. We will 
take these comments into consideration in v2.

> 
>> +{
>> +	unsigned long rax;
>> +
>> +	if (ctxt->b != 0x1)
>> +		return 0;
>> +
>> +	switch (ctxt->modrm) {
>> +	case 0xd8: /* VMRUN */
>> +	case 0xda: /* VMLOAD */
>> +	case 0xdb: /* VMSAVE */
>> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
>> +		if (!kvm_is_host_reserved_region(rax))
>> +			return 0;
>> +		break;
>> +	default:
>> +		return 0;
>> +	}
>> +
>> +	return ctxt->modrm;
>> +}
>> +
>>   static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>>   {
>>   	switch (ctxt->opcode_len) {
>> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>>   	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>>   	bool writeback = true;
>>   	bool write_fault_to_spt;
>> +	int vminstr;
>>   
>>   	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>>   		return 1;
>> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>>   		}
>>   	}
>>   
>> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
>> -	    !is_vmware_backdoor_opcode(ctxt)) {
>> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> -		return 1;
>> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>> +		vminstr = is_vm_instr_opcode(ctxt);
>> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
>> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +			return 1;
>> +		}
>> +		if (vminstr)
>> +			return vminstr;
>>   	}
>>   
>>   	/*
>
Wei Huang Jan. 13, 2021, 4:55 a.m. UTC | #18
On 1/12/21 11:56 AM, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Andy Lutomirski wrote:
>>
>>> On Jan 12, 2021, at 7:46 AM, Bandan Das <bsd@redhat.com> wrote:
>>>
>>> Andy Lutomirski <luto@amacapital.net> writes:
>>> ...
>>>>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
>>>>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
>>>>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
>>>>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
>>>>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
>>>>>>>> "trace.h"
>>>>>>>>
>>>>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
>>>>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
>>>>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>>>>>>
>>>>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
>>>>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
>>>>>>> While _e820__mapped_any()'s doc says '..  checks if any part of
>>>>>>> the range <start,end> is mapped ..' it seems to me that the real
>>>>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
>>>>>>> no?
>>>>>> Why do you need to check GPA at all?
>>>>>>
>>>>> To reduce the scope of the workaround.
>>>>>
>>>>> The errata only happens when you use one of SVM instructions in the
>>>>> guest with EAX that happens to be inside one of the host reserved
>>>>> memory regions (for example SMM).
>>>>
>>>> This code reduces the scope of the workaround at the cost of
>>>> increasing the complexity of the workaround and adding a nonsensical
>>>> coupling between KVM and host details and adding an export that really
>>>> doesn’t deserve to be exported.
>>>>
>>>> Is there an actual concrete benefit to this check?
>>>
>>> Besides reducing the scope, my intention for the check was that we should
>>> know if such exceptions occur for any other undiscovered reasons with other
>>> memory types rather than hiding them under this workaround.
>>
>> Ask AMD?

There are several checking before VMRUN launch. The function, 
e820__mapped_raw_any(), was definitely one of the easies way to figure 
out the problematic regions we had.

>>
>> I would also believe that someone somewhere has a firmware that simply omits
>> the problematic region instead of listing it as reserved.
> 
> I agree with Andy, odds are very good that attempting to be precise will lead to
> pain due to false negatives.
> 
> And, KVM's SVM instruction emulation needs to be be rock solid regardless of
> this behavior since KVM unconditionally intercepts the instruction, i.e. there's
> basically zero risk to KVM.
> 

Are you saying that the instruction decode before 
kvm_is_host_reserved_region() already guarantee the instructions #GP hit 
are SVM execution instructions (see below)? If so, I think this argument 
is fair.

+	switch (ctxt->modrm) {
+	case 0xd8: /* VMRUN */
+	case 0xda: /* VMLOAD */
+	case 0xdb: /* VMSAVE */

Bandan: What is your thoughts about removing kvm_is_host_reserved_region()?
Wei Huang Jan. 13, 2021, 5:03 a.m. UTC | #19
On 1/12/21 11:59 AM, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Sean Christopherson wrote:
>> On Tue, Jan 12, 2021, Wei Huang wrote:
>>> From: Bandan Das <bsd@redhat.com>
>>>
>>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>>> before checking VMCB's instruction intercept.
>>
>> It would be very helpful to list exactly which CPUs are/aren't affected, even if
>> that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
>> I assume it's all CPUs without the new CPUID flag?

This behavior was dated back to fairly old CPUs. It is fair to assume 
that _most_ CPUs without this CPUID bit can demonstrate such behavior.

> 
> Ah, despite calling this an 'errata', the bad behavior is explicitly documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.
> 
> Can you reword the changelog to make it clear that the premature #GP is the
> correct architectural behavior for CPUs without the new CPUID flag?

Sure, will do in the next version.

>
Wei Huang Jan. 13, 2021, 5:15 a.m. UTC | #20
On 1/12/21 12:58 PM, Andy Lutomirski wrote:
> Andrew Cooper points out that there may be a nicer workaround.  Make
> sure that the SMRAM and HT region (FFFD00000000 - FFFFFFFFFFFF) are
> marked as reserved in the guest, too.

In theory this proposed solution can avoid intercepting #GP. But in 
reality SMRAM regions can be different on different machines. So this 
solution can break after VM migration.
Paolo Bonzini Jan. 13, 2021, 12:35 p.m. UTC | #21
On 12/01/21 18:42, Sean Christopherson wrote:
> On a related topic, it feels like nested should be disabled by default on SVM
> until it's truly ready for primetime, with the patch tagged for stable.  That
> way we don't have to worry about crafting non-trivial fixes (like this one) to
> make them backport-friendly.

Well, that's historical; I wish it had been disabled by default back in 
the day.

However, after 10 years and after the shakedown last year, it's hard to 
justify breaking backwards compatibility.  Nested SVM is not any less 
ready than nested VMX---just a little less optimized for things such as 
TLB flushes and ASID/VPID---even without this fix.  The erratum has 
visible effects only on a minority of AMD systems (it depends on an 
unlucky placement of TSEG on L0), and it is easy to work around it by 
lowering the amount of <4G memory in L1.

Paolo
Paolo Bonzini Jan. 13, 2021, 12:40 p.m. UTC | #22
On 12/01/21 18:59, Sean Christopherson wrote:
>> It would be very helpful to list exactly which CPUs are/aren't affected, even if
>> that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
>> I assume it's all CPUs without the new CPUID flag?
> Ah, despite calling this an 'errata', the bad behavior is explicitly documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.

I would still call it an errata for the case when virtualized 
VMSAVE/VMLOAD is enabled (and therefore VMLOAD intercepts are disabled). 
  In that case, the problem is that the GPA does not go through NPT 
before it is checked against *host* reserved memory regions.

In fact I hope that,  on processors that have the fix, VMSAVE/VMLOAD 
from guest mode _does_ check the GPA after it's been translated!

Paolo
Maxim Levitsky Jan. 14, 2021, 11:42 a.m. UTC | #23
On Tue, 2021-01-12 at 23:15 -0600, Wei Huang wrote:
> 
> On 1/12/21 12:58 PM, Andy Lutomirski wrote:
> > Andrew Cooper points out that there may be a nicer workaround.  Make
> > sure that the SMRAM and HT region (FFFD00000000 - FFFFFFFFFFFF) are
> > marked as reserved in the guest, too.
> 
> In theory this proposed solution can avoid intercepting #GP. But in 
> reality SMRAM regions can be different on different machines. So this 
> solution can break after VM migration.
> 
I should add to this, that on my 3970X,
I just noticed that the problematic SMRAM region moved on
its own (likely due to the fact that I moved some pcie cards around recently).

Best regards,
	Maxim Levitsky
Maxim Levitsky Jan. 14, 2021, 11:47 a.m. UTC | #24
On Tue, 2021-01-12 at 15:00 -0500, Bandan Das wrote:
> Sean Christopherson <seanjc@google.com> writes:
> ...
> > > -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> > > -	    !is_vmware_backdoor_opcode(ctxt)) {
> > > -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > -		return 1;
> > > +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> > > +		vminstr = is_vm_instr_opcode(ctxt);
> > > +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> > > +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > +			return 1;
> > > +		}
> > > +		if (vminstr)
> > > +			return vminstr;
> > 
> > I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> > L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
> > x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
> > be handled further up the stack.

Actually IMHO this exactly what we want. We want L0 to always intercept
these #GPs, and hide them from the guest.

What we do need to do (and I prepared and attached a patch for that, is that if we run
a guest, we want to inject corresponding vmexit (like SVM_EXIT_VMRUN)
instead of emulating the instruction.
The attached patch does this, and it made my kvm unit test pass,
even if the test was run in a VM (with an unpatched kernel).

This together with setting that X86_FEATURE_SVME_ADDR_CHK bit for
the guest will allow us to hide that errata completely from the guest
which is a very good thing.
(for example for guests that we can't modify)


Best regards,
	Maxim Levitsky



> > 
> So, the condition is that L2 executes a vmload and #GPs on a reserved address, jumps to L0 - L0 doesn't
> check if L1 has asked for the instruction to be intercepted and goes on with emulating
> vmload and returning back to L2 ?



> 
> > >  	}
> > >  
> > >  	/*
> > > -- 
> > > 2.27.0
> > >
Maxim Levitsky Jan. 14, 2021, 11:55 a.m. UTC | #25
On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> From: Bandan Das <bsd@redhat.com>
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problem under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
> 
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h              |   1 +
>  arch/x86/kvm/mmu/mmu.c          |   7 ++
>  arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>  arch/x86/kvm/svm/svm.h          |   8 ++
>  arch/x86/kvm/vmx/vmx.c          |   2 +-
>  arch/x86/kvm/x86.c              |  37 +++++++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *			     Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
I would prefer to see this change in a separate patch.


>   *			backdoor emulation, which is opt in via module param.
>   *			VMware backoor emulation handles select instructions
> - *			and reinjects the #GP for all other cases.
> + *			and reinjects #GP for all other cases. This also
> + *			handles other cases where #GP condition needs to be
> + *			handled and emulated appropriately
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>   *		 case the CR2/GPA value pass on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		    (1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>  #define EMULTYPE_PF		    (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);
>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include <asm/io.h>
>  #include <asm/vmx.h>
>  #include <asm/kvm_page_track.h>
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}
> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  		if (!(efer & EFER_SVME)) {
>  			svm_leave_nested(svm);
>  			svm_set_gif(svm, true);
> +			clr_exception_intercept(svm, GP_VECTOR);
Wouldn't that be wrong if we intercept #GP due to the vmware backdoor?

I would add a flag that will be true when the workaround for the errata is enabled,
and use it together with flag that enables vmware backdoor for decisions
such as the above.

The flag can even be a module param to allow users to disable it if they
really want to.

>  
>  			/*
>  			 * Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> +	/* Enable GP interception for SVM instructions if needed */
> +	if (efer & EFER_SVME)
> +		set_exception_intercept(svm, GP_VECTOR);
> +
>  	return 0;
>  }
>  
> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int vmload_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmsave_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmrun_interception(struct vcpu_svm *svm)
> +{
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	return nested_svm_vmrun(svm);
> +}

I would prefer the move of these functions (I didn't check
if you changed them as well) to be done in a separate patch.

> +
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
The warning could be kept with extended condition.

> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
Could be great to mention (once published) the link to the errata
here or somewhere so readers understand what it is all about.

>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
As others mentioned in the review, I also would try to use something
else that passing the mod/reg/rm byte in the return value.
Otherwise it will backfire sooner or later.


>  }
>  
>  static bool is_erratum_383(void)
> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>  	return kvm_emulate_hypercall(&svm->vcpu);
>  }
>  
> -static int vmload_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmsave_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmrun_interception(struct vcpu_svm *svm)
> -{
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	return nested_svm_vmrun(svm);
> -}
> -
>  void svm_set_gif(struct vcpu_svm *svm, bool value)
>  {
>  	if (value) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 0fe874ae5498..d5dffcf59afa 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>  	recalc_intercepts(svm);
>  }
>  
> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
> +{
> +	struct vmcb *vmcb = get_host_vmcb(svm);
> +
> +	WARN_ON_ONCE(bit >= 32);
> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
> +}
This function doesn't seem to be used anywhere.

> +
>  static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>  {
>  	struct vmcb *vmcb = get_host_vmcb(svm);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 2af05d3b0590..5fac2f7cba24 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  			return 1;
>  		}
> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>  	}
>  
>  	/*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a8969a6dd06..c3662fc3b1bc 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>  	++vcpu->stat.insn_emulation_fail;
>  	trace_kvm_emulate_insn_failed(vcpu);
>  
> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>  		return 1;
>  	}
> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>  	return false;
>  }
>  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
Don't you also want to check that 'ctxt->opcode_len == 2'?
Prefixes? AMD's PRM doesn't mention if these are allowed/ignored/etc.

I guess they are ignored but this should be double checked vs real hardware.

> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);

As I said above, I would add some flag if workaround for the errata is used,
and use it here.

> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;
>  	}
>  
>  	/*

Best regards,
	Maxim Levitsky
Sean Christopherson Jan. 14, 2021, 5:19 p.m. UTC | #26
On Thu, Jan 14, 2021, Maxim Levitsky wrote:
> On Tue, 2021-01-12 at 15:00 -0500, Bandan Das wrote:
> > Sean Christopherson <seanjc@google.com> writes:
> > ...
> > > > -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> > > > -	    !is_vmware_backdoor_opcode(ctxt)) {
> > > > -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > > -		return 1;
> > > > +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> > > > +		vminstr = is_vm_instr_opcode(ctxt);
> > > > +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> > > > +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > > +			return 1;
> > > > +		}
> > > > +		if (vminstr)
> > > > +			return vminstr;
> > > 
> > > I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> > > L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
> > > x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
> > > be handled further up the stack.
> 
> Actually IMHO this exactly what we want. We want L0 to always intercept
> these #GPs, and hide them from the guest.
> 
> What we do need to do (and I prepared and attached a patch for that, is that
> if we run a guest, we want to inject corresponding vmexit (like
> SVM_EXIT_VMRUN) instead of emulating the instruction.

Yes, lack of forwarding to L1 as a nested exit is what I meant by "doesn't
correctly handle".
Wei Huang Jan. 15, 2021, 7 a.m. UTC | #27
On 1/12/21 8:01 AM, Paolo Bonzini wrote:
> On 12/01/21 07:37, Wei Huang wrote:
>>   static int gp_interception(struct vcpu_svm *svm)
>>   {
>>       struct kvm_vcpu *vcpu = &svm->vcpu;
>>       u32 error_code = svm->vmcb->control.exit_info_1;
>> -
>> -    WARN_ON_ONCE(!enable_vmware_backdoor);
>> +    int rc;
>>         /*
>> -     * VMware backdoor emulation on #GP interception only handles IN{S},
>> -     * OUT{S}, and RDPMC, none of which generate a non-zero error code.
>> +     * Only VMware backdoor and SVM VME errata are handled. Neither of
>> +     * them has non-zero error codes.
>>        */
>>       if (error_code) {
>>           kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>           return 1;
>>       }
>> -    return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +
>> +    rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>> +    if (rc > 1)
>> +        rc = svm_emulate_vm_instr(vcpu, rc);
>> +    return rc;
>>   }
>>   
> 
> Passing back the third byte is quick hacky.  Instead of this change to
> kvm_emulate_instruction, I'd rather check the instruction bytes in
> gp_interception before calling kvm_emulate_instruction.  That would be
> something like:
> 
> - move "kvm_clear_exception_queue(vcpu);" inside the "if
> (!(emulation_type & EMULTYPE_NO_DECODE))".  It doesn't apply when you
> are coming back from userspace.
> 
> - extract the "if (!(emulation_type & EMULTYPE_NO_DECODE))" body to a
> new function x86_emulate_decoded_instruction.  Call it from
> gp_interception, we know this is not a pagefault and therefore
> vcpu->arch.write_fault_to_shadow_pgtable must be false.

If the whole body inside if-statement is moved out, do you expect the
interface of x86_emulate_decoded_instruction to be something like:

int x86_emulate_decoded_instruction(struct kvm_vcpu *vcpu,
                                    gpa_t cr2_or_gpa,
                                    int emulation_type, void *insn,
                                    int insn_len,
                                    bool write_fault_to_spt)

And if so, what is the emulation type to use when calling this function
from svm.c? EMULTYPE_VMWARE_GP?

> 
> - check ctxt->insn_bytes for an SVM instruction
> 
> - if not an SVM instruction, call kvm_emulate_instruction(vcpu,
> EMULTYPE_VMWARE_GP|EMULTYPE_NO_DECODE).
> 
> Thanks,
> 
> Paolo
>
Paolo Bonzini Jan. 17, 2021, 6:20 p.m. UTC | #28
On 15/01/21 08:00, Wei Huang wrote:
> If the whole body inside if-statement is moved out, do you expect the
> interface of x86_emulate_decoded_instruction to be something like:
> 
> int x86_emulate_decoded_instruction(struct kvm_vcpu *vcpu,
>                                      gpa_t cr2_or_gpa,
>                                      int emulation_type, void *insn,
>                                      int insn_len,
>                                      bool write_fault_to_spt)

An idea is to making the body of the new function just

         init_emulate_ctxt(vcpu);

         /*
          * We will reenter on the same instruction since
          * we do not set complete_userspace_io.  This does not
          * handle watchpoints yet, those would be handled in
          * the emulate_ops.
          */
         if (!(emulation_type & EMULTYPE_SKIP) &&
             kvm_vcpu_check_breakpoint(vcpu, &r))
                 return r;

         ctxt->interruptibility = 0;
         ctxt->have_exception = false;
         ctxt->exception.vector = -1;
         ctxt->exception.error_code_valid = false;

         ctxt->perm_ok = false;

         ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;

         r = x86_decode_insn(ctxt, insn, insn_len);

         trace_kvm_emulate_insn_start(vcpu);
         ++vcpu->stat.insn_emulation;
         return r;

because for the new caller, on EMULATION_FAILED you can just re-enter 
the guest.

> And if so, what is the emulation type to use when calling this function
> from svm.c? EMULTYPE_VMWARE_GP?

Just 0 I think.

Paolo
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..0ddc309f5a14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1450,10 +1450,12 @@  extern u64 kvm_mce_cap_supported;
  *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
  *			     Used to test the full emulator from userspace.
  *
- * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
  *			backdoor emulation, which is opt in via module param.
  *			VMware backoor emulation handles select instructions
- *			and reinjects the #GP for all other cases.
+ *			and reinjects #GP for all other cases. This also
+ *			handles other cases where #GP condition needs to be
+ *			handled and emulated appropriately
  *
  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
  *		 case the CR2/GPA value pass on the stack is valid.
@@ -1463,7 +1465,7 @@  extern u64 kvm_mce_cap_supported;
 #define EMULTYPE_SKIP		    (1 << 2)
 #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
 #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
-#define EMULTYPE_VMWARE_GP	    (1 << 5)
+#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
 #define EMULTYPE_PF		    (1 << 6)
 
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 581925e476d6..1a2fff4e7140 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -219,5 +219,6 @@  int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+bool kvm_is_host_reserved_region(u64 gpa);
 
 #endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..c5c4aaf01a1a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -50,6 +50,7 @@ 
 #include <asm/io.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
+#include <asm/e820/api.h>
 #include "trace.h"
 
 extern bool itlb_multihit_kvm_mitigation;
@@ -5675,6 +5676,12 @@  void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
+bool kvm_is_host_reserved_region(u64 gpa)
+{
+	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
+}
+EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
+
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7ef171790d02..74620d32aa82 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -288,6 +288,7 @@  int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 		if (!(efer & EFER_SVME)) {
 			svm_leave_nested(svm);
 			svm_set_gif(svm, true);
+			clr_exception_intercept(svm, GP_VECTOR);
 
 			/*
 			 * Free the nested guest state, unless we are in SMM.
@@ -309,6 +310,10 @@  int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	svm->vmcb->save.efer = efer | EFER_SVME;
 	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+	/* Enable GP interception for SVM instructions if needed */
+	if (efer & EFER_SVME)
+		set_exception_intercept(svm, GP_VECTOR);
+
 	return 0;
 }
 
@@ -1957,22 +1962,104 @@  static int ac_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int vmload_interception(struct vcpu_svm *svm)
+{
+	struct vmcb *nested_vmcb;
+	struct kvm_host_map map;
+	int ret;
+
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+	if (ret) {
+		if (ret == -EINVAL)
+			kvm_inject_gp(&svm->vcpu, 0);
+		return 1;
+	}
+
+	nested_vmcb = map.hva;
+
+	ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+	kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+	return ret;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm)
+{
+	struct vmcb *nested_vmcb;
+	struct kvm_host_map map;
+	int ret;
+
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+	if (ret) {
+		if (ret == -EINVAL)
+			kvm_inject_gp(&svm->vcpu, 0);
+		return 1;
+	}
+
+	nested_vmcb = map.hva;
+
+	ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+	kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+	return ret;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm)
+{
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	return nested_svm_vmrun(svm);
+}
+
+/* Emulate SVM VM execution instructions */
+static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	switch (modrm) {
+	case 0xd8: /* VMRUN */
+		return vmrun_interception(svm);
+	case 0xda: /* VMLOAD */
+		return vmload_interception(svm);
+	case 0xdb: /* VMSAVE */
+		return vmsave_interception(svm);
+	default:
+		/* inject a #GP for all other cases */
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
+}
+
 static int gp_interception(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	u32 error_code = svm->vmcb->control.exit_info_1;
-
-	WARN_ON_ONCE(!enable_vmware_backdoor);
+	int rc;
 
 	/*
-	 * VMware backdoor emulation on #GP interception only handles IN{S},
-	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
+	 * Only VMware backdoor and SVM VME errata are handled. Neither of
+	 * them has non-zero error codes.
 	 */
 	if (error_code) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 		return 1;
 	}
-	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+
+	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
+	if (rc > 1)
+		rc = svm_emulate_vm_instr(vcpu, rc);
+	return rc;
 }
 
 static bool is_erratum_383(void)
@@ -2113,66 +2200,6 @@  static int vmmcall_interception(struct vcpu_svm *svm)
 	return kvm_emulate_hypercall(&svm->vcpu);
 }
 
-static int vmload_interception(struct vcpu_svm *svm)
-{
-	struct vmcb *nested_vmcb;
-	struct kvm_host_map map;
-	int ret;
-
-	if (nested_svm_check_permissions(svm))
-		return 1;
-
-	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-	if (ret) {
-		if (ret == -EINVAL)
-			kvm_inject_gp(&svm->vcpu, 0);
-		return 1;
-	}
-
-	nested_vmcb = map.hva;
-
-	ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-	kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
-	return ret;
-}
-
-static int vmsave_interception(struct vcpu_svm *svm)
-{
-	struct vmcb *nested_vmcb;
-	struct kvm_host_map map;
-	int ret;
-
-	if (nested_svm_check_permissions(svm))
-		return 1;
-
-	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-	if (ret) {
-		if (ret == -EINVAL)
-			kvm_inject_gp(&svm->vcpu, 0);
-		return 1;
-	}
-
-	nested_vmcb = map.hva;
-
-	ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-	kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
-	return ret;
-}
-
-static int vmrun_interception(struct vcpu_svm *svm)
-{
-	if (nested_svm_check_permissions(svm))
-		return 1;
-
-	return nested_svm_vmrun(svm);
-}
-
 void svm_set_gif(struct vcpu_svm *svm, bool value)
 {
 	if (value) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 0fe874ae5498..d5dffcf59afa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -350,6 +350,14 @@  static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 	recalc_intercepts(svm);
 }
 
+static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	WARN_ON_ONCE(bit >= 32);
+	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+}
+
 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
 	struct vmcb *vmcb = get_host_vmcb(svm);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2af05d3b0590..5fac2f7cba24 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4774,7 +4774,7 @@  static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 			return 1;
 		}
-		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
 	}
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a8969a6dd06..c3662fc3b1bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7014,7 +7014,7 @@  static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 	++vcpu->stat.insn_emulation_fail;
 	trace_kvm_emulate_insn_failed(vcpu);
 
-	if (emulation_type & EMULTYPE_VMWARE_GP) {
+	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 		return 1;
 	}
@@ -7267,6 +7267,28 @@  static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 	return false;
 }
 
+static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned long rax;
+
+	if (ctxt->b != 0x1)
+		return 0;
+
+	switch (ctxt->modrm) {
+	case 0xd8: /* VMRUN */
+	case 0xda: /* VMLOAD */
+	case 0xdb: /* VMSAVE */
+		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
+		if (!kvm_is_host_reserved_region(rax))
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return ctxt->modrm;
+}
+
 static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
 {
 	switch (ctxt->opcode_len) {
@@ -7305,6 +7327,7 @@  int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 	bool writeback = true;
 	bool write_fault_to_spt;
+	int vminstr;
 
 	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
 		return 1;
@@ -7367,10 +7390,14 @@  int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		}
 	}
 
-	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
-	    !is_vmware_backdoor_opcode(ctxt)) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-		return 1;
+	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
+		vminstr = is_vm_instr_opcode(ctxt);
+		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
+			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+			return 1;
+		}
+		if (vminstr)
+			return vminstr;
 	}
 
 	/*