[v7,041/102] KVM: VMX: Introduce test mode related to EPT violation VE

Message ID cadf3221e3f7b911c810f15cfe300dd5337a966d.1656366338.git.isaku.yamahata@intel.com
State New, archived
Series KVM TDX basic feature support

Commit Message

Isaku Yamahata June 27, 2022, 9:53 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
programs the CPU to conditionally inject #VE and sets the #VE suppress
bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
isn't broken), introduce the option ept_violation_ve_test and, when it
is set, report an error on any unexpected #VE.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/vmx.h | 12 +++++++
 arch/x86/kvm/vmx/vmx.c     | 68 +++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/vmx.h     |  3 ++
 3 files changed, 82 insertions(+), 1 deletion(-)
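
(Note: as defined in this patch, ept_violation_ve_test is a read-only (0444)
kvm_intel module parameter, so it can only be set at module load time, e.g.
"modprobe kvm_intel ept_violation_ve_test=1", or
kvm_intel.ept_violation_ve_test=1 on the kernel command line.)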

Comments

Huang, Kai July 8, 2022, 2:23 a.m. UTC | #1
On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> programs the CPU to conditionally inject #VE and sets the #VE suppress
> bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> isn't broken), introduce the option ept_violation_ve_test and, when it
> is set, report an error on any unexpected #VE.

I don't see why we need this patch.  It may be helpful during your testing, but
why do we need it in the formal submission?

And for a normal guest, what prevents one vcpu from sending a #VE IPI to
another vcpu?
 
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/vmx.h | 12 +++++++
>  arch/x86/kvm/vmx/vmx.c     | 68 +++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/vmx.h     |  3 ++
>  3 files changed, 82 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 6231ef005a50..f0f8eecf55ac 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -68,6 +68,7 @@
>  #define SECONDARY_EXEC_ENCLS_EXITING		VMCS_CONTROL_BIT(ENCLS_EXITING)
>  #define SECONDARY_EXEC_RDSEED_EXITING		VMCS_CONTROL_BIT(RDSEED_EXITING)
>  #define SECONDARY_EXEC_ENABLE_PML               VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
> +#define SECONDARY_EXEC_EPT_VIOLATION_VE		VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
>  #define SECONDARY_EXEC_PT_CONCEAL_VMX		VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
>  #define SECONDARY_EXEC_XSAVES			VMCS_CONTROL_BIT(XSAVES)
>  #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
> @@ -223,6 +224,8 @@ enum vmcs_field {
>  	VMREAD_BITMAP_HIGH              = 0x00002027,
>  	VMWRITE_BITMAP                  = 0x00002028,
>  	VMWRITE_BITMAP_HIGH             = 0x00002029,
> +	VE_INFORMATION_ADDRESS		= 0x0000202A,
> +	VE_INFORMATION_ADDRESS_HIGH	= 0x0000202B,
>  	XSS_EXIT_BITMAP                 = 0x0000202C,
>  	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
>  	ENCLS_EXITING_BITMAP		= 0x0000202E,
> @@ -628,4 +631,13 @@ enum vmx_l1d_flush_state {
>  
>  extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
>  
> +struct vmx_ve_information {
> +	u32 exit_reason;
> +	u32 delivery;
> +	u64 exit_qualification;
> +	u64 guest_linear_address;
> +	u64 guest_physical_address;
> +	u16 eptp_index;
> +};
> +
>  #endif
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index e2415ac55317..e3d304b14df0 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -126,6 +126,9 @@ module_param(error_on_inconsistent_vmcs_config, bool, 0444);
>  static bool __read_mostly dump_invalid_vmcs = 0;
>  module_param(dump_invalid_vmcs, bool, 0644);
>  
> +static bool __read_mostly ept_violation_ve_test = 0;
> +module_param(ept_violation_ve_test, bool, 0444);
> +
>  #define MSR_BITMAP_MODE_X2APIC		1
>  #define MSR_BITMAP_MODE_X2APIC_APICV	2
>  
> @@ -726,6 +729,13 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
>  
>  	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
>  	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
> +	/*
> +	 * #VE isn't used for VMX, only for TDX.  To catch unexpected
> +	 * changes that would break the no-#VE assumption for VMX,
> +	 * intercept any #VE and warn about it.
> +	 */
> +	if (ept_violation_ve_test)
> +		eb |= 1u << VE_VECTOR;
>  	/*
>  	 * Guest access to VMware backdoor ports could legitimately
>  	 * trigger #GP because of TSS I/O permission bitmap.
> @@ -2524,6 +2534,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			SECONDARY_EXEC_NOTIFY_VM_EXITING;
>  		if (cpu_has_sgx())
>  			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
> +		if (ept_violation_ve_test)
> +			opt2 |= SECONDARY_EXEC_EPT_VIOLATION_VE;
>  		if (adjust_vmx_controls(min2, opt2,
>  					MSR_IA32_VMX_PROCBASED_CTLS2,
>  					&_cpu_based_2nd_exec_control) < 0)
> @@ -2558,6 +2570,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			return -EIO;
>  
>  		vmx_cap->ept = 0;
> +		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  	}
>  	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
>  	    vmx_cap->vpid) {
> @@ -4390,6 +4403,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
>  	if (!enable_ept) {
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
> +		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  		enable_unrestricted_guest = 0;
>  	}
>  	if (!enable_unrestricted_guest)
> @@ -4517,8 +4531,40 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  
>  	exec_controls_set(vmx, vmx_exec_control(vmx));
>  
> -	if (cpu_has_secondary_exec_ctrls())
> +	if (cpu_has_secondary_exec_ctrls()) {
>  		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
> +		if (secondary_exec_controls_get(vmx) &
> +		    SECONDARY_EXEC_EPT_VIOLATION_VE) {
> +			if (!vmx->ve_info) {
> +				/* ve_info must be page aligned. */
> +				struct page *page;
> +
> +				BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
> +				page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +				if (page)
> +					vmx->ve_info = page_to_virt(page);
> +			}
> +			if (vmx->ve_info) {
> +				/*
> +				 * Allow #VE delivery. CPU sets this field to
> +				 * 0xFFFFFFFF on #VE delivery.  Another #VE can
> +				 * occur only if software clears the field.
> +				 */
> +				vmx->ve_info->delivery = 0;
> +				vmcs_write64(VE_INFORMATION_ADDRESS,
> +					     __pa(vmx->ve_info));
> +			} else {
> +				/*
> +				 * Because SECONDARY_EXEC_EPT_VIOLATION_VE is
> +				 * used only when ept_violation_ve_test is true,
> +				 * it's okay to go with the bit disabled.
> +				 */
> +				pr_err("Failed to allocate ve_info. disabling EPT_VIOLATION_VE.\n");
> +				secondary_exec_controls_clearbit(
> +					vmx, SECONDARY_EXEC_EPT_VIOLATION_VE);
> +			}
> +		}
> +	}
>  
>  	if (cpu_has_tertiary_exec_ctrls())
>  		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
> @@ -5116,7 +5162,14 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
>  			return 1;
>  		fallthrough;
> +	case VE_VECTOR:
>  	default:
> +		if (ept_violation_ve_test && ex_no == VE_VECTOR) {
> +			pr_err("VMEXIT due to unexpected #VE.\n");
> +			secondary_exec_controls_clearbit(
> +				vmx, SECONDARY_EXEC_EPT_VIOLATION_VE);
> +			return 1;
> +		}
>  		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
>  		kvm_run->ex.exception = ex_no;
>  		kvm_run->ex.error_code = error_code;
> @@ -6182,6 +6235,17 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
>  	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
>  		pr_err("Virtual processor ID = 0x%04x\n",
>  		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
> +	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
> +		struct vmx_ve_information *ve_info;
> +		pr_err("VE info address = 0x%016llx\n",
> +		       vmcs_read64(VE_INFORMATION_ADDRESS));
> +		ve_info = __va(vmcs_read64(VE_INFORMATION_ADDRESS));
> +		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
> +		       ve_info->exit_reason, ve_info->delivery,
> +		       ve_info->exit_qualification,
> +		       ve_info->guest_linear_address,
> +		       ve_info->guest_physical_address, ve_info->eptp_index);
> +	}
>  }
>  
>  /*
> @@ -7173,6 +7237,8 @@ void vmx_vcpu_free(struct kvm_vcpu *vcpu)
>  	free_vpid(vmx->vpid);
>  	nested_vmx_free_vcpu(vcpu);
>  	free_loaded_vmcs(vmx->loaded_vmcs);
> +	if (vmx->ve_info)
> +		free_page((unsigned long)vmx->ve_info);
>  }
>  
>  int vmx_vcpu_create(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 9feb994e5ea2..60d93c38e014 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -338,6 +338,9 @@ struct vcpu_vmx {
>  		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  	} shadow_msr_intercept;
> +
> +	/* ve_info must be page aligned. */
> +	struct vmx_ve_information *ve_info;
>  };
>  
>  struct kvm_vmx {
Isaku Yamahata July 19, 2022, 2:49 p.m. UTC | #2
On Fri, Jul 08, 2022 at 02:23:43PM +1200,
Kai Huang <kai.huang@intel.com> wrote:

> On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> > programs the CPU to conditionally inject #VE and sets the #VE suppress
> > bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> > for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> > isn't broken), introduce the option ept_violation_ve_test and, when it
> > is set, report an error on any unexpected #VE.
> 
> I don't see why we need this patch.  It may be helpful during your testing, but
> why do we need it in the formal submission?
> 
> And for a normal guest, what prevents one vcpu from sending a #VE IPI to
> another vcpu?

Paolo suggested it as follows.  Maybe it should be a kernel config option.
(I forgot to add a Suggested-by tag; I'll add it.)

https://lore.kernel.org/lkml/84d56339-4a8a-6ddb-17cb-12074588ba9c@redhat.com/

> On 3/4/22 20:48, isaku.yamahata@intel.com wrote:
> > +	if (enable_ept) {
> > +		const u64 init_value = enable_tdx ? VMX_EPT_SUPPRESS_VE_BIT : 0ull;
> >  		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
> > -				      cpu_has_vmx_ept_execute_only());
> > +				      cpu_has_vmx_ept_execute_only(), init_value);
> > +		kvm_mmu_set_spte_init_value(init_value);
> > +	}
> 
> I think kvm-intel.ko should use VMX_EPT_SUPPRESS_VE_BIT unconditionally 
> as the init value.  The bit is ignored anyway if the "EPT-violation #VE" 
> execution control is 0.  Otherwise looks good, but I have a couple more 
> crazy ideas:
> 
> 1) there could even be a test mode where KVM enables the execution 
> control, traps #VE in the exception bitmap, and shouts loudly if it gets 
> a #VE.  That might avoid hard-to-find bugs due to forgetting about 
> VMX_EPT_SUPPRESS_VE_BIT.
> 
> 2) or even, perhaps the init_value for the TDP MMU could set bit 63 
> _unconditionally_, because KVM always sets the NX bit on AMD hardware. 
> That would remove the whole infrastructure to keep shadow_init_value, 
> because it would be constant 0 in mmu.c and constant BIT(63) in tdp_mmu.c.
> 
> Sean, what do you think?
> 
> Paolo
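
To make Paolo's idea (2) concrete, a minimal sketch (the name
SHADOW_NONPRESENT_VALUE is illustrative here, not code from this series):
tdp_mmu.c could use a constant non-present SPTE value with bit 63 set, i.e.
the EPT "suppress #VE" bit, which the CPU ignores while the EPT-violation
#VE execution control is 0:

#define SHADOW_NONPRESENT_VALUE	BIT_ULL(63)

static inline void tdp_mmu_clear_spte(u64 *sptep)
{
	/* Non-present SPTE that still suppresses EPT-violation #VE. */
	WRITE_ONCE(*sptep, SHADOW_NONPRESENT_VALUE);
}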
Huang, Kai July 20, 2022, 5:13 a.m. UTC | #3
On Tue, 2022-07-19 at 07:49 -0700, Isaku Yamahata wrote:
> On Fri, Jul 08, 2022 at 02:23:43PM +1200,
> Kai Huang <kai.huang@intel.com> wrote:
> 
> > On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > 
> > > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> > > programs the CPU to conditionally inject #VE and sets the #VE suppress
> > > bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> > > for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> > > isn't broken), introduce the option ept_violation_ve_test and, when it
> > > is set, report an error on any unexpected #VE.
> > 
> > I don't see why we need this patch.  It may be helpful during your testing, but
> > why do we need it in the formal submission?
> > 
> > And for a normal guest, what prevents one vcpu from sending a #VE IPI to
> > another vcpu?
> 
> Paolo suggested it as follows.  Maybe it should be a kernel config option.
> (I forgot to add a Suggested-by tag; I'll add it.)
> 
> https://lore.kernel.org/lkml/84d56339-4a8a-6ddb-17cb-12074588ba9c@redhat.com/
> 
> > 

OK.  But can we assume a normal guest won't send a #VE IPI?
Isaku Yamahata July 27, 2022, 11:39 p.m. UTC | #4
On Wed, Jul 20, 2022 at 05:13:08PM +1200,
Kai Huang <kai.huang@intel.com> wrote:

> On Tue, 2022-07-19 at 07:49 -0700, Isaku Yamahata wrote:
> > On Fri, Jul 08, 2022 at 02:23:43PM +1200,
> > Kai Huang <kai.huang@intel.com> wrote:
> > 
> > > On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > 
> > > > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> > > > programs the CPU to conditionally inject #VE and sets the #VE suppress
> > > > bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> > > > for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> > > > isn't broken), introduce the option ept_violation_ve_test and, when it
> > > > is set, report an error on any unexpected #VE.
> > > 
> > > I don't see why we need this patch.  It may be helpful during your testing, but
> > > why do we need it in the formal submission?
> > > 
> > > And for a normal guest, what prevents one vcpu from sending a #VE IPI to
> > > another vcpu?
> > 
> > Paolo suggested it as follows.  Maybe it should be a kernel config option.
> > (I forgot to add a Suggested-by tag; I'll add it.)
> > 
> > https://lore.kernel.org/lkml/84d56339-4a8a-6ddb-17cb-12074588ba9c@redhat.com/
> > 
> > > 
> 
> OK.  But can we assume a normal guest won't send a #VE IPI?

Theoretically nothing prevents that.  I wouldn't say "normal".
Anyway, this is off by default.
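
For concreteness, an illustrative guest-side sketch (not code from this
series) of what "sending #VE" means: nothing architecturally reserves
vector 20, so a guest running in x2APIC mode can simply hand it to its
local APIC as a self-IPI:

#define VE_VECTOR		20
#define MSR_X2APIC_SELF_IPI	0x83f	/* architectural x2APIC SELF IPI MSR */

/* Deliver vector 20 ("#VE") to the current vcpu. */
static void send_ve_self_ipi(void)
{
	wrmsrl(MSR_X2APIC_SELF_IPI, VE_VECTOR);
}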
Huang, Kai July 28, 2022, 12:54 a.m. UTC | #5
On Wed, 2022-07-27 at 16:39 -0700, Isaku Yamahata wrote:
> On Wed, Jul 20, 2022 at 05:13:08PM +1200,
> Kai Huang <kai.huang@intel.com> wrote:
> 
> > On Tue, 2022-07-19 at 07:49 -0700, Isaku Yamahata wrote:
> > > On Fri, Jul 08, 2022 at 02:23:43PM +1200,
> > > Kai Huang <kai.huang@intel.com> wrote:
> > > 
> > > > On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > > 
> > > > > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> > > > > programs the CPU to conditionally inject #VE and sets the #VE suppress
> > > > > bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> > > > > for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> > > > > isn't broken), introduce the option ept_violation_ve_test and, when it
> > > > > is set, report an error on any unexpected #VE.
> > > > 
> > > > I don't see why we need this patch.  It may be helpful during your testing, but
> > > > why do we need it in the formal submission?
> > > > 
> > > > And for a normal guest, what prevents one vcpu from sending a #VE IPI to
> > > > another vcpu?
> > > 
> > > Paolo suggested it as follows.  Maybe it should be a kernel config option.
> > > (I forgot to add a Suggested-by tag; I'll add it.)
> > > 
> > > https://lore.kernel.org/lkml/84d56339-4a8a-6ddb-17cb-12074588ba9c@redhat.com/
> > > 
> > > > 
> > 
> > OK.  But can we assume a normal guest won't send a #VE IPI?
> 
> Theoretically nothing prevents that.  I wouldn't say "normal".
> Anyway, this is off by default.

I don't think whether it is on or off by default matters.  If it can happen
legitimately in the guest, it doesn't look right to print out something like
the message below:

	pr_err("VMEXIT due to unexpected #VE.\n");

Anyway, I will let the maintainers decide.
Sean Christopherson July 28, 2022, 8:11 p.m. UTC | #6
On Thu, Jul 28, 2022, Kai Huang wrote:
> On Wed, 2022-07-27 at 16:39 -0700, Isaku Yamahata wrote:
> > On Wed, Jul 20, 2022 at 05:13:08PM +1200,
> > Kai Huang <kai.huang@intel.com> wrote:
> > 
> > > On Tue, 2022-07-19 at 07:49 -0700, Isaku Yamahata wrote:
> > > > On Fri, Jul 08, 2022 at 02:23:43PM +1200,
> > > > Kai Huang <kai.huang@intel.com> wrote:
> > > > 
> > > > > On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > > > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > > > 
> > > > > > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> > > > > > programs the CPU to conditionally inject #VE and sets the #VE suppress
> > > > > > bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> > > > > > for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> > > > > > isn't broken), introduce the option ept_violation_ve_test and, when it
> > > > > > is set, report an error on any unexpected #VE.
> > > > > 
> > > > > I don't see why we need this patch.  It may be helpful during your testing, but
> > > > > why do we need it in the formal submission?
> > > > > 
> > > > > And for a normal guest, what prevents one vcpu from sending a #VE IPI to
> > > > > another vcpu?
> > > > 
> > > > Paolo suggested it as follows.  Maybe it should be a kernel config option.
> > > > (I forgot to add a Suggested-by tag; I'll add it.)
> > > > 
> > > > https://lore.kernel.org/lkml/84d56339-4a8a-6ddb-17cb-12074588ba9c@redhat.com/
> > > > 
> > > > > 
> > > 
> > > OK.  But can we assume a normal guest won't send a #VE IPI?
> > 
> > Theoretically nothing prevents that.  I wouldn't say "normal".
> > Anyway, this is off by default.
> 
> I don't think whether it is on or off by default matters.

It matters in the sense that the module param is intended purely for testing, i.e.
there's zero reason to ever enable it in production.  That changes what is and
isn't a reasonable response to an unexpected #VE.

> If it can happen legitimately in the guest, it doesn't look right to print
> out something like the message below:
> 
> 	pr_err("VMEXIT due to unexpected #VE.\n");

Agreed.  In this particular case I think the right approach is to treat an
unexpected #VE as a fatal KVM bug.  Yes, disabling EPT violation #VEs would likely
allow the guest to live, but as above the module param should never be enabled in
production.  And if we get a #VE with the module param disabled, then KVM is truly
in the weeds and killing the VM is the safe option.

E.g. something like

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4fd25e1d6ec9..54b9cb56f6e2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5010,6 +5010,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
        if (is_invalid_opcode(intr_info))
                return handle_ud(vcpu);

+       if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
+               return -EIO;
+
        error_code = 0;
        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
Isaku Yamahata Aug. 9, 2022, 12:48 a.m. UTC | #7
On Thu, Jul 28, 2022 at 08:11:59PM +0000,
Sean Christopherson <seanjc@google.com> wrote:

> On Thu, Jul 28, 2022, Kai Huang wrote:
> > On Wed, 2022-07-27 at 16:39 -0700, Isaku Yamahata wrote:
> > > On Wed, Jul 20, 2022 at 05:13:08PM +1200,
> > > Kai Huang <kai.huang@intel.com> wrote:
> > > 
> > > > On Tue, 2022-07-19 at 07:49 -0700, Isaku Yamahata wrote:
> > > > > On Fri, Jul 08, 2022 at 02:23:43PM +1200,
> > > > > Kai Huang <kai.huang@intel.com> wrote:
> > > > > 
> > > > > > On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > > > > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > > > > 
> > > > > > > To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM
> > > > > > > programs the CPU to conditionally inject #VE and sets the #VE suppress
> > > > > > > bit in EPT entries.  For the VMX case, #VE isn't used; if a #VE occurs
> > > > > > > for VMX, it's a bug.  To be defensive (i.e. to test that the VMX case
> > > > > > > isn't broken), introduce the option ept_violation_ve_test and, when it
> > > > > > > is set, report an error on any unexpected #VE.
> > > > > > 
> > > > > > I don't see why we need this patch.  It may be helpful during your testing, but
> > > > > > why do we need it in the formal submission?
> > > > > > 
> > > > > > And for a normal guest, what prevents one vcpu from sending a #VE IPI to
> > > > > > another vcpu?
> > > > > 
> > > > > Paolo suggested it as follows.  Maybe it should be a kernel config option.
> > > > > (I forgot to add a Suggested-by tag; I'll add it.)
> > > > > 
> > > > > https://lore.kernel.org/lkml/84d56339-4a8a-6ddb-17cb-12074588ba9c@redhat.com/
> > > > > 
> > > > > > 
> > > > 
> > > > OK.  But can we assume a normal guest won't send a #VE IPI?
> > > 
> > > Theoretically nothing prevents that.  I wouldn't say "normal".
> > > Anyway, this is off by default.
> > 
> > I don't think whether it is on or off by default matters.
> 
> It matters in the sense that the module param is intended purely for testing, i.e.
> there's zero reason to ever enable it in production.  That changes what is and
> isn't a reasonable response to an unexpected #VE.
> 
> > If it can happen legitimately in the guest, it doesn't look right to print
> > out something like the message below:
> > 
> > 	pr_err("VMEXIT due to unexpected #VE.\n");
> 
> Agreed.  In this particular case I think the right approach is to treat an
> unexpected #VE as a fatal KVM bug.  Yes, disabling EPT violation #VEs would likely
> allow the guest to live, but as above the module param should never be enabled in
> production.  And if we get a #VE with the module param disabled, then KVM is truly
> in the weeds and killing the VM is the safe option.
> 
> E.g. something like

Thanks, I finally ended up with the following:

diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index ac290a44a693..9277676057a7 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
 	return is_exception_n(intr_info, NM_VECTOR);
 }
 
+static inline bool is_ve_fault(u32 intr_info)
+{
+	return is_exception_n(intr_info, VE_VECTOR);
+}
+
 /* Undocumented: icebp/int1 */
 static inline bool is_icebp(u32 intr_info)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 881db80ceee9..c3e4c0d17b63 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5047,6 +5047,12 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	if (is_invalid_opcode(intr_info))
 		return handle_ud(vcpu);
 
+	/*
+	 * #VE isn't supposed to happen.  Treat an unexpected #VE as a KVM bug.
+	 */
+	if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
+		return -EIO;
+
 	error_code = 0;
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5167,14 +5173,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
 			return 1;
 		fallthrough;
-	case VE_VECTOR:
 	default:
-		if (ept_violation_ve_test && ex_no == VE_VECTOR) {
-			pr_err("VMEXIT due to unexpected #VE.\n");
-			secondary_exec_controls_clearbit(
-				vmx, SECONDARY_EXEC_EPT_VIOLATION_VE);
-			return 1;
-		}
 		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
 		kvm_run->ex.exception = ex_no;
 		kvm_run->ex.error_code = error_code;

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 6231ef005a50..f0f8eecf55ac 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -68,6 +68,7 @@ 
 #define SECONDARY_EXEC_ENCLS_EXITING		VMCS_CONTROL_BIT(ENCLS_EXITING)
 #define SECONDARY_EXEC_RDSEED_EXITING		VMCS_CONTROL_BIT(RDSEED_EXITING)
 #define SECONDARY_EXEC_ENABLE_PML               VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE		VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
 #define SECONDARY_EXEC_PT_CONCEAL_VMX		VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
 #define SECONDARY_EXEC_XSAVES			VMCS_CONTROL_BIT(XSAVES)
 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -223,6 +224,8 @@  enum vmcs_field {
 	VMREAD_BITMAP_HIGH              = 0x00002027,
 	VMWRITE_BITMAP                  = 0x00002028,
 	VMWRITE_BITMAP_HIGH             = 0x00002029,
+	VE_INFORMATION_ADDRESS		= 0x0000202A,
+	VE_INFORMATION_ADDRESS_HIGH	= 0x0000202B,
 	XSS_EXIT_BITMAP                 = 0x0000202C,
 	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
 	ENCLS_EXITING_BITMAP		= 0x0000202E,
@@ -628,4 +631,13 @@  enum vmx_l1d_flush_state {
 
 extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
 
+struct vmx_ve_information {
+	u32 exit_reason;
+	u32 delivery;
+	u64 exit_qualification;
+	u64 guest_linear_address;
+	u64 guest_physical_address;
+	u16 eptp_index;
+};
+
 #endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e2415ac55317..e3d304b14df0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -126,6 +126,9 @@  module_param(error_on_inconsistent_vmcs_config, bool, 0444);
 static bool __read_mostly dump_invalid_vmcs = 0;
 module_param(dump_invalid_vmcs, bool, 0644);
 
+static bool __read_mostly ept_violation_ve_test = 0;
+module_param(ept_violation_ve_test, bool, 0444);
+
 #define MSR_BITMAP_MODE_X2APIC		1
 #define MSR_BITMAP_MODE_X2APIC_APICV	2
 
@@ -726,6 +729,13 @@  void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
 
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
+	/*
+	 * #VE isn't used for VMX, only for TDX.  To catch unexpected
+	 * changes that would break the no-#VE assumption for VMX,
+	 * intercept any #VE and warn about it.
+	 */
+	if (ept_violation_ve_test)
+		eb |= 1u << VE_VECTOR;
 	/*
 	 * Guest access to VMware backdoor ports could legitimately
 	 * trigger #GP because of TSS I/O permission bitmap.
@@ -2524,6 +2534,8 @@  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			SECONDARY_EXEC_NOTIFY_VM_EXITING;
 		if (cpu_has_sgx())
 			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
+		if (ept_violation_ve_test)
+			opt2 |= SECONDARY_EXEC_EPT_VIOLATION_VE;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -2558,6 +2570,7 @@  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			return -EIO;
 
 		vmx_cap->ept = 0;
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 	}
 	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
 	    vmx_cap->vpid) {
@@ -4390,6 +4403,7 @@  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 	if (!enable_ept) {
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 		enable_unrestricted_guest = 0;
 	}
 	if (!enable_unrestricted_guest)
@@ -4517,8 +4531,40 @@  static void init_vmcs(struct vcpu_vmx *vmx)
 
 	exec_controls_set(vmx, vmx_exec_control(vmx));
 
-	if (cpu_has_secondary_exec_ctrls())
+	if (cpu_has_secondary_exec_ctrls()) {
 		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+		if (secondary_exec_controls_get(vmx) &
+		    SECONDARY_EXEC_EPT_VIOLATION_VE) {
+			if (!vmx->ve_info) {
+				/* ve_info must be page aligned. */
+				struct page *page;
+
+				BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+				page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+				if (page)
+					vmx->ve_info = page_to_virt(page);
+			}
+			if (vmx->ve_info) {
+				/*
+				 * Allow #VE delivery. CPU sets this field to
+				 * 0xFFFFFFFF on #VE delivery.  Another #VE can
+				 * occur only if software clears the field.
+				 */
+				vmx->ve_info->delivery = 0;
+				vmcs_write64(VE_INFORMATION_ADDRESS,
+					     __pa(vmx->ve_info));
+			} else {
+				/*
+				 * Because SECONDARY_EXEC_EPT_VIOLATION_VE is
+				 * used only when ept_violation_ve_test is true,
+				 * it's okay to go with the bit disabled.
+				 */
+				pr_err("Failed to allocate ve_info. disabling EPT_VIOLATION_VE.\n");
+				secondary_exec_controls_clearbit(
+					vmx, SECONDARY_EXEC_EPT_VIOLATION_VE);
+			}
+		}
+	}
 
 	if (cpu_has_tertiary_exec_ctrls())
 		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -5116,7 +5162,14 @@  static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
 			return 1;
 		fallthrough;
+	case VE_VECTOR:
 	default:
+		if (ept_violation_ve_test && ex_no == VE_VECTOR) {
+			pr_err("VMEXIT due to unexpected #VE.\n");
+			secondary_exec_controls_clearbit(
+				vmx, SECONDARY_EXEC_EPT_VIOLATION_VE);
+			return 1;
+		}
 		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
 		kvm_run->ex.exception = ex_no;
 		kvm_run->ex.error_code = error_code;
@@ -6182,6 +6235,17 @@  void dump_vmcs(struct kvm_vcpu *vcpu)
 	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
 		pr_err("Virtual processor ID = 0x%04x\n",
 		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
+	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+		struct vmx_ve_information *ve_info;
+		pr_err("VE info address = 0x%016llx\n",
+		       vmcs_read64(VE_INFORMATION_ADDRESS));
+		ve_info = __va(vmcs_read64(VE_INFORMATION_ADDRESS));
+		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+		       ve_info->exit_reason, ve_info->delivery,
+		       ve_info->exit_qualification,
+		       ve_info->guest_linear_address,
+		       ve_info->guest_physical_address, ve_info->eptp_index);
+	}
 }
 
 /*
@@ -7173,6 +7237,8 @@  void vmx_vcpu_free(struct kvm_vcpu *vcpu)
 	free_vpid(vmx->vpid);
 	nested_vmx_free_vcpu(vcpu);
 	free_loaded_vmcs(vmx->loaded_vmcs);
+	if (vmx->ve_info)
+		free_page((unsigned long)vmx->ve_info);
 }
 
 int vmx_vcpu_create(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9feb994e5ea2..60d93c38e014 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -338,6 +338,9 @@  struct vcpu_vmx {
 		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	} shadow_msr_intercept;
+
+	/* ve_info must be page aligned. */
+	struct vmx_ve_information *ve_info;
 };
 
 struct kvm_vmx {
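
For context on the "delivery" field programmed in init_vmcs() above, a
sketch of the consumer side (illustrative, not code from this series;
fixup_gpa() is a hypothetical helper): a guest that actually handles #VE
reads the information area and must clear the field to re-arm delivery,
since hardware sets it to 0xFFFFFFFF when a #VE is delivered:

static void handle_ve(struct vmx_ve_information *ve)
{
	/*
	 * Resolve the EPT violation that was morphed into this #VE; the
	 * faulting GPA is in ve->guest_physical_address.
	 */
	fixup_gpa(ve->guest_physical_address);	/* hypothetical helper */

	/* Re-arm: hardware delivers no further #VE until this is cleared. */
	ve->delivery = 0;
}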