diff mbox series

[07/21] KVM: VMX: Introduce test mode related to EPT violation VE

Message ID 20240227232100.478238-8-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series TDX/SNP part 1 of n, for 6.9 | expand

Commit Message

Paolo Bonzini Feb. 27, 2024, 11:20 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM uses the
suppress #VE bit in EPT entries selectively, in order to be able to trap
non-present conditions.  However, #VE isn't used for VMX and it's a bug
if it happens.  To be defensive and test that VMX case isn't broken
introduce an option ept_violation_ve_test and when it's set, BUG the vm.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Message-Id: <d6db6ba836605c0412e166359ba5c46a63c22f86.1705965635.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/vmx.h | 12 +++++++
 arch/x86/kvm/vmx/vmcs.h    |  5 +++
 arch/x86/kvm/vmx/vmx.c     | 69 +++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/vmx.h     |  6 +++-
 4 files changed, 90 insertions(+), 2 deletions(-)

Comments

Sean Christopherson Feb. 28, 2024, 1:56 a.m. UTC | #1
On Tue, Feb 27, 2024, Paolo Bonzini wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM uses the
> suppress #VE bit in EPT entries selectively, in order to be able to trap
> non-present conditions.  However, #VE isn't used for VMX and it's a bug
> if it happens.  To be defensive and test that VMX case isn't broken
> introduce an option ept_violation_ve_test and when it's set, BUG the vm.

This needs to be two patches:

 1. Add the architecture #defines, enums, structures, and is_ve_fault().
 2. Add the forced #VE enabling test code

> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Message-Id: <d6db6ba836605c0412e166359ba5c46a63c22f86.1705965635.git.isaku.yamahata@intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/vmx.h | 12 +++++++
>  arch/x86/kvm/vmx/vmcs.h    |  5 +++
>  arch/x86/kvm/vmx/vmx.c     | 69 +++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/vmx.h     |  6 +++-
>  4 files changed, 90 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 76ed39541a52..f703bae0c4ac 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -70,6 +70,7 @@
>  #define SECONDARY_EXEC_ENCLS_EXITING		VMCS_CONTROL_BIT(ENCLS_EXITING)
>  #define SECONDARY_EXEC_RDSEED_EXITING		VMCS_CONTROL_BIT(RDSEED_EXITING)
>  #define SECONDARY_EXEC_ENABLE_PML               VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
> +#define SECONDARY_EXEC_EPT_VIOLATION_VE		VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
>  #define SECONDARY_EXEC_PT_CONCEAL_VMX		VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
>  #define SECONDARY_EXEC_ENABLE_XSAVES		VMCS_CONTROL_BIT(XSAVES)
>  #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
> @@ -225,6 +226,8 @@ enum vmcs_field {
>  	VMREAD_BITMAP_HIGH              = 0x00002027,
>  	VMWRITE_BITMAP                  = 0x00002028,
>  	VMWRITE_BITMAP_HIGH             = 0x00002029,
> +	VE_INFORMATION_ADDRESS		= 0x0000202A,
> +	VE_INFORMATION_ADDRESS_HIGH	= 0x0000202B,
>  	XSS_EXIT_BITMAP                 = 0x0000202C,
>  	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
>  	ENCLS_EXITING_BITMAP		= 0x0000202E,
> @@ -630,4 +633,13 @@ enum vmx_l1d_flush_state {
>  
>  extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
>  
> +struct vmx_ve_information {
> +	u32 exit_reason;
> +	u32 delivery;
> +	u64 exit_qualification;
> +	u64 guest_linear_address;
> +	u64 guest_physical_address;
> +	u16 eptp_index;
> +};

Should this be __packed since it's hardware-defined, or are we ok relying on the
compiler to not be stupid?

>  #endif
> diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
> index 7c1996b433e2..b25625314658 100644
> --- a/arch/x86/kvm/vmx/vmcs.h
> +++ b/arch/x86/kvm/vmx/vmcs.h
> @@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
>  	return is_exception_n(intr_info, NM_VECTOR);
>  }
>  
> +static inline bool is_ve_fault(u32 intr_info)
> +{
> +	return is_exception_n(intr_info, VE_VECTOR);
> +}
> +
>  /* Undocumented: icebp/int1 */
>  static inline bool is_icebp(u32 intr_info)
>  {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 9239a89dea22..6468f421ba9e 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -126,6 +126,9 @@ module_param(error_on_inconsistent_vmcs_config, bool, 0444);
>  static bool __read_mostly dump_invalid_vmcs = 0;
>  module_param(dump_invalid_vmcs, bool, 0644);
>  
> +static bool __read_mostly ept_violation_ve_test;
> +module_param(ept_violation_ve_test, bool, 0444);

I would much prefer to enable #VE if CONFIG_KVM_PROVE_MMU=y.  We already have
too many module params to deal with for testing, and practically speaking the
only people who will ever turn this on are the same people that run with
CONFIG_KVM_PROVE_MMU=y.

>  #define MSR_BITMAP_MODE_X2APIC		1
>  #define MSR_BITMAP_MODE_X2APIC_APICV	2
>  
> @@ -868,6 +871,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
>  
>  	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
>  	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
> +	/*
> +	 * #VE isn't used for VMX.  To test against unexpected changes
> +	 * related to #VE for VMX, intercept unexpected #VE and warn on it.
> +	 */
> +	if (ept_violation_ve_test)
> +		eb |= 1u << VE_VECTOR;
>  	/*
>  	 * Guest access to VMware backdoor ports could legitimately
>  	 * trigger #GP because of TSS I/O permission bitmap.
> @@ -2603,6 +2613,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  					&_cpu_based_2nd_exec_control))
>  			return -EIO;
>  	}
> +	if (!ept_violation_ve_test)
> +		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;

_If_ we add a module param, the param needs to be disabled if #VE isn't supported.

>  #ifndef CONFIG_X86_64
>  	if (!(_cpu_based_2nd_exec_control &
>  				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> @@ -2627,6 +2640,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			return -EIO;
>  
>  		vmx_cap->ept = 0;
> +		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  	}
>  	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
>  	    vmx_cap->vpid) {
> @@ -4592,6 +4606,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
>  	if (!enable_ept) {
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
> +		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  		enable_unrestricted_guest = 0;
>  	}
>  	if (!enable_unrestricted_guest)
> @@ -4715,8 +4730,40 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  
>  	exec_controls_set(vmx, vmx_exec_control(vmx));
>  
> -	if (cpu_has_secondary_exec_ctrls())
> +	if (cpu_has_secondary_exec_ctrls()) {
>  		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
> +		if (secondary_exec_controls_get(vmx) &
> +		    SECONDARY_EXEC_EPT_VIOLATION_VE) {
> +			if (!vmx->ve_info) {
> +				/* ve_info must be page aligned. */
> +				struct page *page;
> +
> +				BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
> +				page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +				if (page)
> +					vmx->ve_info = page_to_virt(page);
> +			}
> +			if (vmx->ve_info) {
> +				/*
> +				 * Allow #VE delivery. CPU sets this field to
> +				 * 0xFFFFFFFF on #VE delivery.  Another #VE can
> +				 * occur only if software clears the field.
> +				 */
> +				vmx->ve_info->delivery = 0;

This is completely unnecessary, the entire page is zero-allocated.

> +				vmcs_write64(VE_INFORMATION_ADDRESS,
> +					     __pa(vmx->ve_info));
> +			} else {
> +				/*
> +				 * Because SECONDARY_EXEC_EPT_VIOLATION_VE is
> +				 * used only when ept_violation_ve_test is true,
> +				 * it's okay to go with the bit disabled.

No, it's not.  This is silly on multiple fronts.  (a) KVM knows if it's going to
enable #VE when the vCPU is first created, the allocation can and should be done
at that time along with all the other allocations needed for the VM.  (b) Except
for error injection from syzkaller and friends, there is basically zero chance
the VM will live on if one 4KiB allocation fails.  (c) This will never be enabled
in production; it's totally fine if the vCPU creation fails during testing, because
as a above, that will practically never happen outside of deliberate error
injection.

> +				 */
> +				pr_err("Failed to allocate ve_info. disabling EPT_VIOLATION_VE.\n");
> +				secondary_exec_controls_clearbit(vmx,
> +								 SECONDARY_EXEC_EPT_VIOLATION_VE);
> +			}
> +		}
> +	}
>  
>  	if (cpu_has_tertiary_exec_ctrls())
>  		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
> @@ -5204,6 +5251,12 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  	if (is_invalid_opcode(intr_info))
>  		return handle_ud(vcpu);
>  
> +	/*
> +	 * #VE isn't supposed to happen.  Block the VM if it does.

This is not a useful comment.  Obviously #VE isn't supposed to happen, otherwise
KVM wouldn't be bugging the VM. 

> +	 */
> +	if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> +		return -EIO;
> +
>  	error_code = 0;
>  	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
>  		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> @@ -6393,6 +6446,18 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
>  	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
>  		pr_err("Virtual processor ID = 0x%04x\n",
>  		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
> +	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
> +		struct vmx_ve_information *ve_info;
> +
> +		pr_err("VE info address = 0x%016llx\n",
> +		       vmcs_read64(VE_INFORMATION_ADDRESS));
> +		ve_info = __va(vmcs_read64(VE_INFORMATION_ADDRESS));

Why!?!?  You have the address in vcpu_vmx, just use that.  If KVM is dumping
the VMCS, then something has gone wrong, possible in hardware or ucode.
Derefencing an address from the VMCS, which could very well be corrupted, is a
terrible idea.  This could easily escalate from a dead VM into a dead host.

> +		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
> +		       ve_info->exit_reason, ve_info->delivery,
> +		       ve_info->exit_qualification,
> +		       ve_info->guest_linear_address,
> +		       ve_info->guest_physical_address, ve_info->eptp_index);
> +	}
>  }
>  
>  /*
> @@ -7433,6 +7498,8 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
>  	free_vpid(vmx->vpid);
>  	nested_vmx_free_vcpu(vcpu);
>  	free_loaded_vmcs(vmx->loaded_vmcs);
> +	if (vmx->ve_info)

Unnecessary, free_page() does this for you.

> +		free_page((unsigned long)vmx->ve_info);
>  }
>  
>  static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index e3b0985bb74a..1ea1e5c8930d 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -364,6 +364,9 @@ struct vcpu_vmx {
>  		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  	} shadow_msr_intercept;
> +
> +	/* ve_info must be page aligned. */

This is also not a useful comment.  Even the one at the allocation site is of
dubious value.

> +	struct vmx_ve_information *ve_info;
>  };
>  
>  struct kvm_vmx {
> @@ -576,7 +579,8 @@ static inline u8 vmx_get_rvi(void)
>  	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
>  	 SECONDARY_EXEC_BUS_LOCK_DETECTION |				\
>  	 SECONDARY_EXEC_NOTIFY_VM_EXITING |				\
> -	 SECONDARY_EXEC_ENCLS_EXITING)
> +	 SECONDARY_EXEC_ENCLS_EXITING |					\
> +	 SECONDARY_EXEC_EPT_VIOLATION_VE)
>  
>  #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
>  #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL			\
> -- 
> 2.39.0
> 
>
Huang, Kai March 12, 2024, 1:35 a.m. UTC | #2
On 28/02/2024 12:20 pm, Paolo Bonzini wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM uses the
> suppress #VE bit in EPT entries selectively, in order to be able to trap
> non-present conditions.  However, #VE isn't used for VMX and it's a bug
> if it happens.  To be defensive and test that VMX case isn't broken
> introduce an option ept_violation_ve_test and when it's set, BUG the vm.

I am wondering from HW's point of view, is it OK for the kernel to 
explicitly send #VE IPI, in which case, IIUC, the guest can legally get 
the #VE w/o being a TDX guest?
Sean Christopherson March 12, 2024, 4:54 p.m. UTC | #3
On Tue, Mar 12, 2024, Kai Huang wrote:
> On 28/02/2024 12:20 pm, Paolo Bonzini wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM uses the
> > suppress #VE bit in EPT entries selectively, in order to be able to trap
> > non-present conditions.  However, #VE isn't used for VMX and it's a bug
> > if it happens.  To be defensive and test that VMX case isn't broken
> > introduce an option ept_violation_ve_test and when it's set, BUG the vm.
> 
> I am wondering from HW's point of view, is it OK for the kernel to
> explicitly send #VE IPI, in which case, IIUC, the guest can legally get the
> #VE w/o being a TDX guest?

Ooh, fun.  Short answer: there's nothing to worry about here.

Legally, no.  Vectors 0-31 are reserved.  However, I do _think_ the guest could
technically send IPIs on vectors 16-31, as the local APIC doesn't outright reject
such vectors.  But such software would be in clear violation of the SDM.

  11.5.2 Valid Interrupt Vectors
  
  The Intel 64 and IA-32 architectures define 256 vector numbers, ranging from
  0 through 255 (see Section 6.2, “Exception and Interrupt Vectors”). Local and
  I/O APICs support 240 of these vectors (in the range of 16 to 255) as valid
  interrupts.
  
  When an interrupt vector in the range of 0 to 15 is sent or received through
  the local APIC, the APIC indicates an illegal vector in its Error Status
  Register (see Section 11.5.3, “Error Handling”). The Intel 64 and IA-32
  architectures reserve vectors 16 through 31 for predefined interrupts,
  exceptions, and Intel-reserved encodings (see Table 6-1). However, the local
  APIC does not treat vectors in this range as illegal.

  When an illegal vector value (0 to 15) is written to an LVT entry and the delivery
  mode is Fixed (bits 8-11 equal 0), the APIC may signal an illegal vector error,
  without regard to whether the mask bit is set or whether an interrupt is actually
  seen on the input.

where Table 6-1 defines the various exceptions, including #VE, and for vectors
22-31 says "Intel reserved. Do not use."  Vectors 32-255 are explicitly described
as "User Defined (Non-reserved) Interrupts" that can be generated via "External
interrupt or INT n instruction."

However, INTn is far more interesting than IPIs, as INTn can definitely generate
interrupts for vectors 0-31, and the legality of software generating such interrupts
is questionable.  E.g. KVM used to "forward" NMI VM-Exits to the kernel by doing
INTn with vector 2.

Key word "interrupts"!  IPIs are hardware interrupts, and INTn generates software
interrupts, neither of which are subject to exception bitmap interception:

  Exceptions (faults, traps, and aborts) cause VM exits based on the exception
  bitmap (see Section 25.6.3). If an exception occurs, its vector (in the range
  0–31) is used to select a bit in the exception bitmap. If the bit is 1, a VM
  exit occurs; if the bit is 0, the exception is delivered normally through the
  guest IDT. This use of the exception bitmap applies also to exceptions generated
  by the instructions INT1, INT3, INTO, BOUND, UD0, UD1, and UD2.

with a footnote that further says:

  INT1 and INT3 refer to the instructions with opcodes F1 and CC, respectively,
  and not to INT n with value 1 or 3 for n.

So while a misbehaving guest could generate a software interrupt on vector 20,
it would not be a true #VE, i.e. not an exception, and thus would not generate
an EXCEPTION_NMI VM-Exit.  I.e. the KVM_BUG_ON() can't be triggered by the guest
(assuming hardware isn't broken).
Huang, Kai March 12, 2024, 9:03 p.m. UTC | #4
On Tue, 2024-03-12 at 09:54 -0700, Sean Christopherson wrote:
> On Tue, Mar 12, 2024, Kai Huang wrote:
> > On 28/02/2024 12:20 pm, Paolo Bonzini wrote:
> > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > 
> > > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM uses the
> > > suppress #VE bit in EPT entries selectively, in order to be able to trap
> > > non-present conditions.  However, #VE isn't used for VMX and it's a bug
> > > if it happens.  To be defensive and test that VMX case isn't broken
> > > introduce an option ept_violation_ve_test and when it's set, BUG the vm.
> > 
> > I am wondering from HW's point of view, is it OK for the kernel to
> > explicitly send #VE IPI, in which case, IIUC, the guest can legally get the
> > #VE w/o being a TDX guest?
> 
> Ooh, fun.  Short answer: there's nothing to worry about here.
> 
> Legally, no.  Vectors 0-31 are reserved.  However, I do _think_ the guest could
> technically send IPIs on vectors 16-31, as the local APIC doesn't outright reject
> such vectors.  But such software would be in clear violation of the SDM.
> 
>   11.5.2 Valid Interrupt Vectors
>   
>   The Intel 64 and IA-32 architectures define 256 vector numbers, ranging from
>   0 through 255 (see Section 6.2, “Exception and Interrupt Vectors”). Local and
>   I/O APICs support 240 of these vectors (in the range of 16 to 255) as valid
>   interrupts.
>   
>   When an interrupt vector in the range of 0 to 15 is sent or received through
>   the local APIC, the APIC indicates an illegal vector in its Error Status
>   Register (see Section 11.5.3, “Error Handling”). The Intel 64 and IA-32
>   architectures reserve vectors 16 through 31 for predefined interrupts,
>   exceptions, and Intel-reserved encodings (see Table 6-1). However, the local
>   APIC does not treat vectors in this range as illegal.
> 
>   When an illegal vector value (0 to 15) is written to an LVT entry and the delivery
>   mode is Fixed (bits 8-11 equal 0), the APIC may signal an illegal vector error,
>   without regard to whether the mask bit is set or whether an interrupt is actually
>   seen on the input.

I hate the "may" here :-)

> 
> where Table 6-1 defines the various exceptions, including #VE, and for vectors
> 22-31 says "Intel reserved. Do not use."  Vectors 32-255 are explicitly described
> as "User Defined (Non-reserved) Interrupts" that can be generated via "External
> interrupt or INT n instruction."
> 
> However, INTn is far more interesting than IPIs, as INTn can definitely generate
> interrupts for vectors 0-31, and the legality of software generating such interrupts
> is questionable.  E.g. KVM used to "forward" NMI VM-Exits to the kernel by doing
> INTn with vector 2.
> 
> Key word "interrupts"!  IPIs are hardware interrupts, and INTn generates software
> interrupts, neither of which are subject to exception bitmap interception:
> 
>   Exceptions (faults, traps, and aborts) cause VM exits based on the exception
>   bitmap (see Section 25.6.3). If an exception occurs, its vector (in the range
>   0–31) is used to select a bit in the exception bitmap. If the bit is 1, a VM
>   exit occurs; if the bit is 0, the exception is delivered normally through the
>   guest IDT. This use of the exception bitmap applies also to exceptions generated
>   by the instructions INT1, INT3, INTO, BOUND, UD0, UD1, and UD2.
> 
> with a footnote that further says:
> 
>   INT1 and INT3 refer to the instructions with opcodes F1 and CC, respectively,
>   and not to INT n with value 1 or 3 for n.
> 
> So while a misbehaving guest could generate a software interrupt on vector 20,
> it would not be a true #VE, i.e. not an exception, and thus would not generate
> an EXCEPTION_NMI VM-Exit.  I.e. the KVM_BUG_ON() can't be triggered by the guest
> (assuming hardware isn't broken).
> 

Ah, right, software-interrupts but not exceptions.  

Thanks for the full explanation!
diff mbox series

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 76ed39541a52..f703bae0c4ac 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,6 +70,7 @@ 
 #define SECONDARY_EXEC_ENCLS_EXITING		VMCS_CONTROL_BIT(ENCLS_EXITING)
 #define SECONDARY_EXEC_RDSEED_EXITING		VMCS_CONTROL_BIT(RDSEED_EXITING)
 #define SECONDARY_EXEC_ENABLE_PML               VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE		VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
 #define SECONDARY_EXEC_PT_CONCEAL_VMX		VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
 #define SECONDARY_EXEC_ENABLE_XSAVES		VMCS_CONTROL_BIT(XSAVES)
 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -225,6 +226,8 @@  enum vmcs_field {
 	VMREAD_BITMAP_HIGH              = 0x00002027,
 	VMWRITE_BITMAP                  = 0x00002028,
 	VMWRITE_BITMAP_HIGH             = 0x00002029,
+	VE_INFORMATION_ADDRESS		= 0x0000202A,
+	VE_INFORMATION_ADDRESS_HIGH	= 0x0000202B,
 	XSS_EXIT_BITMAP                 = 0x0000202C,
 	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
 	ENCLS_EXITING_BITMAP		= 0x0000202E,
@@ -630,4 +633,13 @@  enum vmx_l1d_flush_state {
 
 extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
 
+struct vmx_ve_information {
+	u32 exit_reason;
+	u32 delivery;
+	u64 exit_qualification;
+	u64 guest_linear_address;
+	u64 guest_physical_address;
+	u16 eptp_index;
+};
+
 #endif
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@  static inline bool is_nm_fault(u32 intr_info)
 	return is_exception_n(intr_info, NM_VECTOR);
 }
 
+static inline bool is_ve_fault(u32 intr_info)
+{
+	return is_exception_n(intr_info, VE_VECTOR);
+}
+
 /* Undocumented: icebp/int1 */
 static inline bool is_icebp(u32 intr_info)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9239a89dea22..6468f421ba9e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -126,6 +126,9 @@  module_param(error_on_inconsistent_vmcs_config, bool, 0444);
 static bool __read_mostly dump_invalid_vmcs = 0;
 module_param(dump_invalid_vmcs, bool, 0644);
 
+static bool __read_mostly ept_violation_ve_test;
+module_param(ept_violation_ve_test, bool, 0444);
+
 #define MSR_BITMAP_MODE_X2APIC		1
 #define MSR_BITMAP_MODE_X2APIC_APICV	2
 
@@ -868,6 +871,12 @@  void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
 
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
+	/*
+	 * #VE isn't used for VMX.  To test against unexpected changes
+	 * related to #VE for VMX, intercept unexpected #VE and warn on it.
+	 */
+	if (ept_violation_ve_test)
+		eb |= 1u << VE_VECTOR;
 	/*
 	 * Guest access to VMware backdoor ports could legitimately
 	 * trigger #GP because of TSS I/O permission bitmap.
@@ -2603,6 +2613,9 @@  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 					&_cpu_based_2nd_exec_control))
 			return -EIO;
 	}
+	if (!ept_violation_ve_test)
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
 #ifndef CONFIG_X86_64
 	if (!(_cpu_based_2nd_exec_control &
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2627,6 +2640,7 @@  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			return -EIO;
 
 		vmx_cap->ept = 0;
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 	}
 	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
 	    vmx_cap->vpid) {
@@ -4592,6 +4606,7 @@  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 	if (!enable_ept) {
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 		enable_unrestricted_guest = 0;
 	}
 	if (!enable_unrestricted_guest)
@@ -4715,8 +4730,40 @@  static void init_vmcs(struct vcpu_vmx *vmx)
 
 	exec_controls_set(vmx, vmx_exec_control(vmx));
 
-	if (cpu_has_secondary_exec_ctrls())
+	if (cpu_has_secondary_exec_ctrls()) {
 		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+		if (secondary_exec_controls_get(vmx) &
+		    SECONDARY_EXEC_EPT_VIOLATION_VE) {
+			if (!vmx->ve_info) {
+				/* ve_info must be page aligned. */
+				struct page *page;
+
+				BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+				page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+				if (page)
+					vmx->ve_info = page_to_virt(page);
+			}
+			if (vmx->ve_info) {
+				/*
+				 * Allow #VE delivery. CPU sets this field to
+				 * 0xFFFFFFFF on #VE delivery.  Another #VE can
+				 * occur only if software clears the field.
+				 */
+				vmx->ve_info->delivery = 0;
+				vmcs_write64(VE_INFORMATION_ADDRESS,
+					     __pa(vmx->ve_info));
+			} else {
+				/*
+				 * Because SECONDARY_EXEC_EPT_VIOLATION_VE is
+				 * used only when ept_violation_ve_test is true,
+				 * it's okay to go with the bit disabled.
+				 */
+				pr_err("Failed to allocate ve_info. disabling EPT_VIOLATION_VE.\n");
+				secondary_exec_controls_clearbit(vmx,
+								 SECONDARY_EXEC_EPT_VIOLATION_VE);
+			}
+		}
+	}
 
 	if (cpu_has_tertiary_exec_ctrls())
 		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -5204,6 +5251,12 @@  static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	if (is_invalid_opcode(intr_info))
 		return handle_ud(vcpu);
 
+	/*
+	 * #VE isn't supposed to happen.  Block the VM if it does.
+	 */
+	if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
+		return -EIO;
+
 	error_code = 0;
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -6393,6 +6446,18 @@  void dump_vmcs(struct kvm_vcpu *vcpu)
 	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
 		pr_err("Virtual processor ID = 0x%04x\n",
 		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
+	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+		struct vmx_ve_information *ve_info;
+
+		pr_err("VE info address = 0x%016llx\n",
+		       vmcs_read64(VE_INFORMATION_ADDRESS));
+		ve_info = __va(vmcs_read64(VE_INFORMATION_ADDRESS));
+		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+		       ve_info->exit_reason, ve_info->delivery,
+		       ve_info->exit_qualification,
+		       ve_info->guest_linear_address,
+		       ve_info->guest_physical_address, ve_info->eptp_index);
+	}
 }
 
 /*
@@ -7433,6 +7498,8 @@  static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
 	free_vpid(vmx->vpid);
 	nested_vmx_free_vcpu(vcpu);
 	free_loaded_vmcs(vmx->loaded_vmcs);
+	if (vmx->ve_info)
+		free_page((unsigned long)vmx->ve_info);
 }
 
 static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e3b0985bb74a..1ea1e5c8930d 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -364,6 +364,9 @@  struct vcpu_vmx {
 		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	} shadow_msr_intercept;
+
+	/* ve_info must be page aligned. */
+	struct vmx_ve_information *ve_info;
 };
 
 struct kvm_vmx {
@@ -576,7 +579,8 @@  static inline u8 vmx_get_rvi(void)
 	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
 	 SECONDARY_EXEC_BUS_LOCK_DETECTION |				\
 	 SECONDARY_EXEC_NOTIFY_VM_EXITING |				\
-	 SECONDARY_EXEC_ENCLS_EXITING)
+	 SECONDARY_EXEC_ENCLS_EXITING |					\
+	 SECONDARY_EXEC_EPT_VIOLATION_VE)
 
 #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
 #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL			\