diff mbox series

[v8,3/7] KVM: VMX: Pass through CET related MSRs

Message ID 20191101085222.27997-4-weijiang.yang@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduce support for guest CET feature | expand

Commit Message

Yang, Weijiang Nov. 1, 2019, 8:52 a.m. UTC
CET MSRs pass through Guest directly to enhance performance.
CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
these MSRs are defined in kernel and re-used here.

MSR_IA32_U_CET and MSR_IA32_PL3_SSP are used for user mode protection,
the contents could differ from process to process, therefore,
kernel needs to save/restore them during context switch, it makes
sense to pass through them so that the guest kernel can
use xsaves/xrstors to operate them efficiently. Other MSRs are used
for non-user mode protection. See CET spec for detailed info.

The difference between CET VMCS state fields and xsave components is that,
the former used for CET state storage during VMEnter/VMExit,
whereas the latter used for state retention between Guest task/process
switch.

Co-developed-by: Zhang Yi Z <yi.z.zhang@linux.intel.com>
Signed-off-by: Zhang Yi Z <yi.z.zhang@linux.intel.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/kvm/cpuid.c   |  4 +--
 arch/x86/kvm/cpuid.h   |  2 ++
 arch/x86/kvm/vmx/vmx.c | 65 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 2 deletions(-)

Comments

Sean Christopherson Dec. 10, 2019, 9:18 p.m. UTC | #1
On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> CET MSRs pass through Guest directly to enhance performance.
> CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> these MSRs are defined in kernel and re-used here.
> 
> MSR_IA32_U_CET and MSR_IA32_PL3_SSP are used for user mode protection,
> the contents could differ from process to process, therefore,
> kernel needs to save/restore them during context switch, it makes
> sense to pass through them so that the guest kernel can
> use xsaves/xrstors to operate them efficiently. Other MSRs are used
> for non-user mode protection. See CET spec for detailed info.
> 
> The difference between CET VMCS state fields and xsave components is that,
> the former used for CET state storage during VMEnter/VMExit,
> whereas the latter used for state retention between Guest task/process
> switch.
> 
> Co-developed-by: Zhang Yi Z <yi.z.zhang@linux.intel.com>
> Signed-off-by: Zhang Yi Z <yi.z.zhang@linux.intel.com>
> Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> ---
>  arch/x86/kvm/cpuid.c   |  4 +--
>  arch/x86/kvm/cpuid.h   |  2 ++
>  arch/x86/kvm/vmx/vmx.c | 65 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 69 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index dd387a785c1e..4166c4fcad1e 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -371,13 +371,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
>  		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
>  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> -		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
> +		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK);
>  
>  	/* cpuid 7.0.edx*/
>  	const u32 kvm_cpuid_7_0_edx_x86_features =
>  		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
>  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> -		F(MD_CLEAR);
> +		F(MD_CLEAR) | F(IBT);

Advertising CET to userspace/guest needs to be done at the end of the
series, or at least after CR4.CET is no longer reserved, e.g. KVM_SET_SREGS
will fail and the guest will get a #GP when trying to set CR4.CET.

I'm pretty sure I've said this at least twice in previous versions of
this series...

>  
>  	/* cpuid 7.1.eax */
>  	const u32 kvm_cpuid_7_1_eax_x86_features =
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index d78a61408243..1d77b880084d 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -27,6 +27,8 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
>  
>  int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
>  
> +u64 kvm_supported_xss(void);
> +
>  static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
>  {
>  	return vcpu->arch.maxphyaddr;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index a84198cff397..db03d9dc1297 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2918,6 +2918,24 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  	vmcs_writel(GUEST_CR3, guest_cr3);
>  }
>  
> +static bool guest_cet_allowed(struct kvm_vcpu *vcpu, u32 feature, u32 mode)
> +{
> +	u64 kvm_xss = kvm_supported_xss();
> +
> +	/*
> +	 * Sanity check for guest CET dependencies, guest_cpu_has(SHSTK|IBT) has
> +	 * implied corresponding host CET status check.
> +	 */
> +	if (feature == X86_FEATURE_SHSTK)
> +		return guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) &&
> +		       (kvm_xss & mode);
> +	else if (feature == X86_FEATURE_IBT)
> +		return guest_cpuid_has(vcpu, X86_FEATURE_IBT) &&
> +		       (kvm_xss & mode);
> +
> +	return false;
> +}
> +
>  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -7001,6 +7019,50 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
>  		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
>  }
>  
> +static void vmx_pass_cet_msrs(struct kvm_vcpu *vcpu)

"pass" isn't accurate, this function also does the opposite.  Maybe 
vmx_update_intercept_for_cet_msr()?  Or reuse the PT naming and go with
cet_update_intercept_for_msr()?

> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> +
> +	/*
> +	 * U_CET is required for USER CET, per CET spec., meanwhile U_CET and
> +	 * PL3_SPP are a bundle for USER CET xsaves.
> +	 */
> +	if (guest_cet_allowed(vcpu, X86_FEATURE_SHSTK, XFEATURE_MASK_CET_USER) ||
> +	    guest_cet_allowed(vcpu, X86_FEATURE_IBT, XFEATURE_MASK_CET_USER)) {

IMO, the guest_cet_allowed() wrappers do more harm than good, e.g. I find
this easier to understand because it doesn't require digging into a random
helper.

	if ((kvm_supported_xss() & XFEATURE_MASK_CET_USER) &&
	    (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) ||
	     guest_cpuid_has(vcpu, X86_FEATURE_IBT)))

> +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_U_CET, MSR_TYPE_RW);
> +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL3_SSP, MSR_TYPE_RW);
> +	} else {
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_U_CET, MSR_TYPE_RW, true);
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL3_SSP, MSR_TYPE_RW, true);
> +	}
> +	/*
> +	 * S_CET is required for KERNEL CET, meanwhile PL0_SSP ... PL2_SSP are a bundle
> +	 * for CET KERNEL xsaves.
> +	 */
> +	if (guest_cet_allowed(vcpu, X86_FEATURE_SHSTK, XFEATURE_MASK_CET_KERNEL) ||
> +	    guest_cet_allowed(vcpu, X86_FEATURE_IBT, XFEATURE_MASK_CET_KERNEL)) {
> +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_S_CET, MSR_TYPE_RW);
> +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL0_SSP, MSR_TYPE_RW);
> +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL1_SSP, MSR_TYPE_RW);
> +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL2_SSP, MSR_TYPE_RW);
> +
> +		/* SSP_TAB only available for KERNEL SHSTK.*/
> +		if (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK))
> +			vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB,
> +						      MSR_TYPE_RW);
> +		else
> +			vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB,
> +						  MSR_TYPE_RW, true);
> +	} else {
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_S_CET, MSR_TYPE_RW, true);
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL0_SSP, MSR_TYPE_RW, true);
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL1_SSP, MSR_TYPE_RW, true);
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL2_SSP, MSR_TYPE_RW, true);
> +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB, MSR_TYPE_RW, true);
> +	}
> +}
> +
>  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -7025,6 +7087,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
>  	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
>  			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
>  		update_intel_pt_cfg(vcpu);
> +
> +	if (!is_guest_mode(vcpu))
> +		vmx_pass_cet_msrs(vcpu);

Hmm, this looks insufficent, e.g. deliberately toggling CET from on->off
while in guest mode would put KVM in a weird state as the msr bitmap for
L1 would still allow L1 to access the CET MSRs.

Allowing KVM_SET_CPUID{2} while running a nested guest seems bogus, can we
kill that path entirely with -EINVAL?

>  }
>  
>  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
> -- 
> 2.17.2
>
Yang, Weijiang Dec. 11, 2019, 1:32 a.m. UTC | #2
On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > CET MSRs pass through Guest directly to enhance performance.
> > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > these MSRs are defined in kernel and re-used here.
> > 
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index dd387a785c1e..4166c4fcad1e 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -371,13 +371,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
> >  		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
> >  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
> >  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> > -		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
> > +		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK);
> >  
> >  	/* cpuid 7.0.edx*/
> >  	const u32 kvm_cpuid_7_0_edx_x86_features =
> >  		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
> >  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> > -		F(MD_CLEAR);
> > +		F(MD_CLEAR) | F(IBT);
> 
> Advertising CET to userspace/guest needs to be done at the end of the
> series, or at least after CR4.CET is no longer reserved, e.g. KVM_SET_SREGS
> will fail and the guest will get a #GP when trying to set CR4.CET.
> 
> I'm pretty sure I've said this at least twice in previous versions of
> this series...

Thanks Sean for picking these up!
The reason is, starting from this patch, I'm using guest_cpuid_has(CET)
to check the availability of guest CET CPUID, so logically I would like to let
the readers understand CET related CPUID word is
defined as above. But no problem, I can move these definitions to a
latter patch as the patchset only meaningful as a whole. 
> 
> >  
> >  	/* cpuid 7.1.eax */
> >  	const u32 kvm_cpuid_7_1_eax_x86_features =
> > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> > index d78a61408243..1d77b880084d 100644
> > --- a/arch/x86/kvm/cpuid.h
> > +++ b/arch/x86/kvm/cpuid.h
> > @@ -27,6 +27,8 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
> >  
> > +static void vmx_pass_cet_msrs(struct kvm_vcpu *vcpu)
> 
> "pass" isn't accurate, this function also does the opposite.  Maybe 
> vmx_update_intercept_for_cet_msr()?  Or reuse the PT naming and go with
> cet_update_intercept_for_msr()?
>
Sure, will change it.

> > +{
> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> > +
> > +	/*
> > +	 * U_CET is required for USER CET, per CET spec., meanwhile U_CET and
> > +	 * PL3_SPP are a bundle for USER CET xsaves.
> > +	 */
> > +	if (guest_cet_allowed(vcpu, X86_FEATURE_SHSTK, XFEATURE_MASK_CET_USER) ||
> > +	    guest_cet_allowed(vcpu, X86_FEATURE_IBT, XFEATURE_MASK_CET_USER)) {
> 
> IMO, the guest_cet_allowed() wrappers do more harm than good, e.g. I find
> this easier to understand because it doesn't require digging into a random
> helper.
> 
> 	if ((kvm_supported_xss() & XFEATURE_MASK_CET_USER) &&
> 	    (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) ||
> 	     guest_cpuid_has(vcpu, X86_FEATURE_IBT)))
> 
Hmm, sounds like it's an unnecessary wrapper, will remove it, thanks!

> > +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_U_CET, MSR_TYPE_RW);
> > +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL3_SSP, MSR_TYPE_RW);
> > +	} else {
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_U_CET, MSR_TYPE_RW, true);
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL3_SSP, MSR_TYPE_RW, true);
> > +	}
> > +	/*
> > +	 * S_CET is required for KERNEL CET, meanwhile PL0_SSP ... PL2_SSP are a bundle
> > +	 * for CET KERNEL xsaves.
> > +	 */
> > +	if (guest_cet_allowed(vcpu, X86_FEATURE_SHSTK, XFEATURE_MASK_CET_KERNEL) ||
> > +	    guest_cet_allowed(vcpu, X86_FEATURE_IBT, XFEATURE_MASK_CET_KERNEL)) {
> > +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_S_CET, MSR_TYPE_RW);
> > +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL0_SSP, MSR_TYPE_RW);
> > +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL1_SSP, MSR_TYPE_RW);
> > +		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL2_SSP, MSR_TYPE_RW);
> > +
> > +		/* SSP_TAB only available for KERNEL SHSTK.*/
> > +		if (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK))
> > +			vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB,
> > +						      MSR_TYPE_RW);
> > +		else
> > +			vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB,
> > +						  MSR_TYPE_RW, true);
> > +	} else {
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_S_CET, MSR_TYPE_RW, true);
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL0_SSP, MSR_TYPE_RW, true);
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL1_SSP, MSR_TYPE_RW, true);
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL2_SSP, MSR_TYPE_RW, true);
> > +		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB, MSR_TYPE_RW, true);
> > +	}
> > +}
> > +
> >  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> >  {
> >  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > @@ -7025,6 +7087,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> >  	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
> >  			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
> >  		update_intel_pt_cfg(vcpu);
> > +
> > +	if (!is_guest_mode(vcpu))
> > +		vmx_pass_cet_msrs(vcpu);
> 
> Hmm, this looks insufficent, e.g. deliberately toggling CET from on->off
> while in guest mode would put KVM in a weird state as the msr bitmap for
> L1 would still allow L1 to access the CET MSRs.
Not sure I understand correctly, guest_cpu_has(CET) implies the check of
host CET status, if CET is off in host, CET MSRs won't exposed to L1
guest.
> 
> Allowing KVM_SET_CPUID{2} while running a nested guest seems bogus, can we
> kill that path entirely with -EINVAL?
> 
Do you mean prevent L1 using KVM_SET_CPUID{2} to expose CET feature bits to
L2?

> >  }
> >  
> >  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
> > -- 
> > 2.17.2
> >
Sean Christopherson Dec. 11, 2019, 1:50 a.m. UTC | #3
On Wed, Dec 11, 2019 at 09:32:07AM +0800, Yang Weijiang wrote:
> On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> > On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > > CET MSRs pass through Guest directly to enhance performance.
> > > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > > these MSRs are defined in kernel and re-used here.
> > > 
> > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > > index dd387a785c1e..4166c4fcad1e 100644
> > > --- a/arch/x86/kvm/cpuid.c
> > > +++ b/arch/x86/kvm/cpuid.c
> > > @@ -371,13 +371,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
> > >  		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
> > >  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
> > >  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> > > -		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
> > > +		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK);
> > >  
> > >  	/* cpuid 7.0.edx*/
> > >  	const u32 kvm_cpuid_7_0_edx_x86_features =
> > >  		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
> > >  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> > > -		F(MD_CLEAR);
> > > +		F(MD_CLEAR) | F(IBT);
> > 
> > Advertising CET to userspace/guest needs to be done at the end of the
> > series, or at least after CR4.CET is no longer reserved, e.g. KVM_SET_SREGS
> > will fail and the guest will get a #GP when trying to set CR4.CET.
> > 
> > I'm pretty sure I've said this at least twice in previous versions of
> > this series...
> 
> Thanks Sean for picking these up!
> The reason is, starting from this patch, I'm using guest_cpuid_has(CET)
> to check the availability of guest CET CPUID, so logically I would like to let
> the readers understand CET related CPUID word is
> defined as above. But no problem, I can move these definitions to a
> latter patch as the patchset only meaningful as a whole. 

Adding usage of guest_cpuid_has(CET) without advertising CET is perfectly
ok from a functionality perspective.  Having a user without a consumer
isn't ideal, but it's better than having one gigantic patch.

The problem with advertising CET when it's not fully supported is that it
will break bisection, e.g. trying to boot a CET-enabled guest would get a
#GP during boot and likely crash.  Whether or not a series is useful when
taken as a whole is orthogonal to the integrity of each invidiual patch.
Yang, Weijiang Dec. 11, 2019, 2:27 a.m. UTC | #4
On Tue, Dec 10, 2019 at 05:50:52PM -0800, Sean Christopherson wrote:
> On Wed, Dec 11, 2019 at 09:32:07AM +0800, Yang Weijiang wrote:
> > On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> > > On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > > > CET MSRs pass through Guest directly to enhance performance.
> > > > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > > > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > > > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > > > these MSRs are defined in kernel and re-used here.
> > > > 
> > > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > > > index dd387a785c1e..4166c4fcad1e 100644
> > > > --- a/arch/x86/kvm/cpuid.c
> > > > +++ b/arch/x86/kvm/cpuid.c
> > > > @@ -371,13 +371,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
> > > >  		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
> > > >  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
> > > >  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> > > > -		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
> > > > +		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK);
> > > >  
> > > >  	/* cpuid 7.0.edx*/
> > > >  	const u32 kvm_cpuid_7_0_edx_x86_features =
> > > >  		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
> > > >  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> > > > -		F(MD_CLEAR);
> > > > +		F(MD_CLEAR) | F(IBT);
> > > 
> > > Advertising CET to userspace/guest needs to be done at the end of the
> > > series, or at least after CR4.CET is no longer reserved, e.g. KVM_SET_SREGS
> > > will fail and the guest will get a #GP when trying to set CR4.CET.
> > > 
> > > I'm pretty sure I've said this at least twice in previous versions of
> > > this series...
> > 
> > Thanks Sean for picking these up!
> > The reason is, starting from this patch, I'm using guest_cpuid_has(CET)
> > to check the availability of guest CET CPUID, so logically I would like to let
> > the readers understand CET related CPUID word is
> > defined as above. But no problem, I can move these definitions to a
> > latter patch as the patchset only meaningful as a whole. 
> 
> Adding usage of guest_cpuid_has(CET) without advertising CET is perfectly
> ok from a functionality perspective.  Having a user without a consumer
> isn't ideal, but it's better than having one gigantic patch.
> 
> The problem with advertising CET when it's not fully supported is that it
> will break bisection, e.g. trying to boot a CET-enabled guest would get a
> #GP during boot and likely crash.  Whether or not a series is useful when
> taken as a whole is orthogonal to the integrity of each invidiual patch.

Oh, I omitted case likes bisection, you're right, I'll change it, thanks
a lot!
Yang, Weijiang Dec. 16, 2019, 2:18 a.m. UTC | #5
On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > CET MSRs pass through Guest directly to enhance performance.
> > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > these MSRs are defined in kernel and re-used here.
> > 
 > +
> >  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> >  {
> >  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > @@ -7025,6 +7087,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> >  	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
> >  			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
> >  		update_intel_pt_cfg(vcpu);
> > +
> > +	if (!is_guest_mode(vcpu))
> > +		vmx_pass_cet_msrs(vcpu);
> 
> Hmm, this looks insufficent, e.g. deliberately toggling CET from on->off
> while in guest mode would put KVM in a weird state as the msr bitmap for
> L1 would still allow L1 to access the CET MSRs.
>
Hi, Sean,
I don't get you, there's guest mode check before access CET msrs, it'll
fail if it's in guest mode.

> Allowing KVM_SET_CPUID{2} while running a nested guest seems bogus, can we
> kill that path entirely with -EINVAL?
>
Do you mean don't expose CET cpuids to L2 guest?
thanks!

> >  }
> >  
> >  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
> > -- 
> > 2.17.2
> >
Sean Christopherson Dec. 18, 2019, 12:34 a.m. UTC | #6
On Mon, Dec 16, 2019 at 10:18:16AM +0800, Yang Weijiang wrote:
> On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> > On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > > CET MSRs pass through Guest directly to enhance performance.
> > > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > > these MSRs are defined in kernel and re-used here.
> > > 
>  > +
> > >  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> > >  {
> > >  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > > @@ -7025,6 +7087,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> > >  	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
> > >  			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
> > >  		update_intel_pt_cfg(vcpu);
> > > +
> > > +	if (!is_guest_mode(vcpu))
> > > +		vmx_pass_cet_msrs(vcpu);
> > 
> > Hmm, this looks insufficent, e.g. deliberately toggling CET from on->off
> > while in guest mode would put KVM in a weird state as the msr bitmap for
> > L1 would still allow L1 to access the CET MSRs.
> >
> Hi, Sean,
> I don't get you, there's guest mode check before access CET msrs, it'll
> fail if it's in guest mode.

KVM can exit to userspae while L2 is active.  If userspace then did a
KVM_SET_CPUID2, e.g. instead of KVM_RUN, vmx_cpuid_update() would skip
vmx_pass_cet_msrs() and KVM would never update L1's MSR bitmaps.

> > Allowing KVM_SET_CPUID{2} while running a nested guest seems bogus, can we
> > kill that path entirely with -EINVAL?
> >
> Do you mean don't expose CET cpuids to L2 guest?

I mean completely disallow KVM_SET_CPUID and KVM_SET_CPUID2 if
is_guest_mode() is true.  My question is mostly directed at Paolo and
anyone else that has an opinion on whether we can massage the ABI to
retroactively change KVM_SET_CPUID{2} behavior.
Yang, Weijiang Dec. 18, 2019, 1:55 p.m. UTC | #7
On Tue, Dec 17, 2019 at 04:34:55PM -0800, Sean Christopherson wrote:
> On Mon, Dec 16, 2019 at 10:18:16AM +0800, Yang Weijiang wrote:
> > On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> > > On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > > > CET MSRs pass through Guest directly to enhance performance.
> > > > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > > > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > > > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > > > these MSRs are defined in kernel and re-used here.
> > > > 
> >  > +
> > > >  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> > > >  {
> > > >  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > > > @@ -7025,6 +7087,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> > > >  	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
> > > >  			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
> > > >  		update_intel_pt_cfg(vcpu);
> > > > +
> > > > +	if (!is_guest_mode(vcpu))
> > > > +		vmx_pass_cet_msrs(vcpu);
> > > 
> > > Hmm, this looks insufficent, e.g. deliberately toggling CET from on->off
> > > while in guest mode would put KVM in a weird state as the msr bitmap for
> > > L1 would still allow L1 to access the CET MSRs.
> > >
> > Hi, Sean,
> > I don't get you, there's guest mode check before access CET msrs, it'll
> > fail if it's in guest mode.
> 
> KVM can exit to userspae while L2 is active.  If userspace then did a
> KVM_SET_CPUID2, e.g. instead of KVM_RUN, vmx_cpuid_update() would skip
> vmx_pass_cet_msrs() and KVM would never update L1's MSR bitmaps.
>
Thanks, it makes sense to me. Given current implementation, how about
removing above check and adding it in CET CPUID
enumeration for L2 so that no CET msrs passed through to L2 when
is_guest_mode() is true?

> > > Allowing KVM_SET_CPUID{2} while running a nested guest seems bogus, can we
> > > kill that path entirely with -EINVAL?
> > >
> > Do you mean don't expose CET cpuids to L2 guest?
> 
> I mean completely disallow KVM_SET_CPUID and KVM_SET_CPUID2 if
> is_guest_mode() is true.  My question is mostly directed at Paolo and
> anyone else that has an opinion on whether we can massage the ABI to
> retroactively change KVM_SET_CPUID{2} behavior.
This sounds like something deserving an individual patch after get
agreement in community. I'll put it aside right now.
Sean Christopherson Dec. 18, 2019, 4:02 p.m. UTC | #8
On Wed, Dec 18, 2019 at 09:55:13PM +0800, Yang Weijiang wrote:
> On Tue, Dec 17, 2019 at 04:34:55PM -0800, Sean Christopherson wrote:
> > On Mon, Dec 16, 2019 at 10:18:16AM +0800, Yang Weijiang wrote:
> > > On Tue, Dec 10, 2019 at 01:18:21PM -0800, Sean Christopherson wrote:
> > > > On Fri, Nov 01, 2019 at 04:52:18PM +0800, Yang Weijiang wrote:
> > > > > CET MSRs pass through Guest directly to enhance performance.
> > > > > CET runtime control settings are stored in MSR_IA32_{U,S}_CET,
> > > > > Shadow Stack Pointer(SSP) are stored in MSR_IA32_PL{0,1,2,3}_SSP,
> > > > > SSP table base address is stored in MSR_IA32_INT_SSP_TAB,
> > > > > these MSRs are defined in kernel and re-used here.
> > > > > 
> > >  > +
> > > > >  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> > > > >  {
> > > > >  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > > > > @@ -7025,6 +7087,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
> > > > >  	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
> > > > >  			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
> > > > >  		update_intel_pt_cfg(vcpu);
> > > > > +
> > > > > +	if (!is_guest_mode(vcpu))
> > > > > +		vmx_pass_cet_msrs(vcpu);
> > > > 
> > > > Hmm, this looks insufficent, e.g. deliberately toggling CET from on->off
> > > > while in guest mode would put KVM in a weird state as the msr bitmap for
> > > > L1 would still allow L1 to access the CET MSRs.
> > > >
> > > Hi, Sean,
> > > I don't get you, there's guest mode check before access CET msrs, it'll
> > > fail if it's in guest mode.
> > 
> > KVM can exit to userspae while L2 is active.  If userspace then did a
> > KVM_SET_CPUID2, e.g. instead of KVM_RUN, vmx_cpuid_update() would skip
> > vmx_pass_cet_msrs() and KVM would never update L1's MSR bitmaps.
> >
> Thanks, it makes sense to me. Given current implementation, how about
> removing above check and adding it in CET CPUID
> enumeration for L2 so that no CET msrs passed through to L2 when
> is_guest_mode() is true?

But you still need to update L1's MSR bitmaps.  That can obviously be done
all at once, but it's annoying and IMO unnecessarily complex.

> > > > Allowing KVM_SET_CPUID{2} while running a nested guest seems bogus, can we
> > > > kill that path entirely with -EINVAL?
> > > >
> > > Do you mean don't expose CET cpuids to L2 guest?
> > 
> > I mean completely disallow KVM_SET_CPUID and KVM_SET_CPUID2 if
> > is_guest_mode() is true.  My question is mostly directed at Paolo and
> > anyone else that has an opinion on whether we can massage the ABI to
> > retroactively change KVM_SET_CPUID{2} behavior.
>
> This sounds like something deserving an individual patch after get
> agreement in community. I'll put it aside right now.

Normally I would agree, but I think in this case it would significantly
reduce the complexity and implementation cost of CET support.  I'll send a
patch to kickstart the conversation, it's a tiny change in terms of code.
diff mbox series

Patch

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index dd387a785c1e..4166c4fcad1e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -371,13 +371,13 @@  static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK);
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
 		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
 		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
-		F(MD_CLEAR);
+		F(MD_CLEAR) | F(IBT);
 
 	/* cpuid 7.1.eax */
 	const u32 kvm_cpuid_7_1_eax_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d78a61408243..1d77b880084d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -27,6 +27,8 @@  bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
 
+u64 kvm_supported_xss(void);
+
 static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.maxphyaddr;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a84198cff397..db03d9dc1297 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2918,6 +2918,24 @@  void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
+static bool guest_cet_allowed(struct kvm_vcpu *vcpu, u32 feature, u32 mode)
+{
+	u64 kvm_xss = kvm_supported_xss();
+
+	/*
+	 * Sanity check for guest CET dependencies, guest_cpu_has(SHSTK|IBT) has
+	 * implied corresponding host CET status check.
+	 */
+	if (feature == X86_FEATURE_SHSTK)
+		return guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) &&
+		       (kvm_xss & mode);
+	else if (feature == X86_FEATURE_IBT)
+		return guest_cpuid_has(vcpu, X86_FEATURE_IBT) &&
+		       (kvm_xss & mode);
+
+	return false;
+}
+
 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7001,6 +7019,50 @@  static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
 		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
 }
 
+static void vmx_pass_cet_msrs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
+	/*
+	 * U_CET is required for USER CET, per CET spec., meanwhile U_CET and
+	 * PL3_SPP are a bundle for USER CET xsaves.
+	 */
+	if (guest_cet_allowed(vcpu, X86_FEATURE_SHSTK, XFEATURE_MASK_CET_USER) ||
+	    guest_cet_allowed(vcpu, X86_FEATURE_IBT, XFEATURE_MASK_CET_USER)) {
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_U_CET, MSR_TYPE_RW);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL3_SSP, MSR_TYPE_RW);
+	} else {
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_U_CET, MSR_TYPE_RW, true);
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL3_SSP, MSR_TYPE_RW, true);
+	}
+	/*
+	 * S_CET is required for KERNEL CET, meanwhile PL0_SSP ... PL2_SSP are a bundle
+	 * for CET KERNEL xsaves.
+	 */
+	if (guest_cet_allowed(vcpu, X86_FEATURE_SHSTK, XFEATURE_MASK_CET_KERNEL) ||
+	    guest_cet_allowed(vcpu, X86_FEATURE_IBT, XFEATURE_MASK_CET_KERNEL)) {
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_S_CET, MSR_TYPE_RW);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL0_SSP, MSR_TYPE_RW);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL1_SSP, MSR_TYPE_RW);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PL2_SSP, MSR_TYPE_RW);
+
+		/* SSP_TAB only available for KERNEL SHSTK.*/
+		if (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK))
+			vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB,
+						      MSR_TYPE_RW);
+		else
+			vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB,
+						  MSR_TYPE_RW, true);
+	} else {
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_S_CET, MSR_TYPE_RW, true);
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL0_SSP, MSR_TYPE_RW, true);
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL1_SSP, MSR_TYPE_RW, true);
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PL2_SSP, MSR_TYPE_RW, true);
+		vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_INT_SSP_TAB, MSR_TYPE_RW, true);
+	}
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7025,6 +7087,9 @@  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
 			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
 		update_intel_pt_cfg(vcpu);
+
+	if (!is_guest_mode(vcpu))
+		vmx_pass_cet_msrs(vcpu);
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)