
[v9,7/7] KVM: X86: Add user-space access interface for CET MSRs

Message ID 20191227021133.11993-8-weijiang.yang@intel.com (mailing list archive)
State New, archived
Series Introduce support for guest CET feature

Commit Message

Yang, Weijiang Dec. 27, 2019, 2:11 a.m. UTC
There are two different places where guest CET states are stored. States
managed with XSAVES/XRSTORS, as saved/restored in the previous patch, can be
read/written directly from/to the MSRs. States stored in VMCS fields are
accessed via vmcs_read/vmcs_write.

Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |   3 +-
 arch/x86/kvm/cpuid.c            |   5 +-
 arch/x86/kvm/vmx/vmx.c          | 138 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |  11 +++
 4 files changed, 154 insertions(+), 3 deletions(-)

Comments

Sean Christopherson March 3, 2020, 10:28 p.m. UTC | #1
Subject should be something like "Enable CET virtualization", or maybe
move CPUID changes to a separate final patch?

On Fri, Dec 27, 2019 at 10:11:33AM +0800, Yang Weijiang wrote:
> There are two different places where guest CET states are stored. States
> managed with XSAVES/XRSTORS, as saved/restored in the previous patch, can be
> read/written directly from/to the MSRs. States stored in VMCS fields are
> accessed via vmcs_read/vmcs_write.
> 
> Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +-
>  arch/x86/kvm/cpuid.c            |   5 +-
>  arch/x86/kvm/vmx/vmx.c          | 138 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c              |  11 +++
>  4 files changed, 154 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 64bf379381e4..34140462084f 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -90,7 +90,8 @@
>  			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
>  			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
>  			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
> -			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
> +			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
> +			  | X86_CR4_CET))
>  
>  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
>  
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 126a31b99823..4414bd110f3c 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -385,13 +385,14 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
>  		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
>  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> -		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
> +		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK) |
> +		0 /*WAITPKG*/;
>  
>  	/* cpuid 7.0.edx*/
>  	const u32 kvm_cpuid_7_0_edx_x86_features =
>  		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
>  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> -		F(MD_CLEAR);
> +		F(MD_CLEAR) | F(IBT);
>  
>  	/* cpuid 7.1.eax */
>  	const u32 kvm_cpuid_7_1_eax_x86_features =
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 0a75b65d03f0..52ac67604026 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1763,6 +1763,96 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
>  	return 0;
>  }
>  
> +#define CET_MSR_RSVD_BITS_1    0x3
> +#define CET_MSR_RSVD_BITS_2   (0xF << 6)

Would it make sense to use GENMASK?
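
For reference, a GENMASK-based version of the two masks might look like the
sketch below (bit positions taken from the existing defines, names unchanged):

#define CET_MSR_RSVD_BITS_1	GENMASK_ULL(1, 0)	/* SSP bits 1:0 are reserved */
#define CET_MSR_RSVD_BITS_2	GENMASK_ULL(9, 6)	/* CET ctl MSR bits 9:6 are reserved */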

> +static bool cet_ssp_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> +{
> +	u64 data = msr->data;
> +	u32 high_word = data >> 32;
> +
> +	if (is_64_bit_mode(vcpu)) {
> +		if (data & CET_MSR_RSVD_BITS_1)

This looks odd.  I assume it should look more like cet_ctl_write_allowed()?
E.g.

	if (data & CET_MSR_RSVD_BITS_1)
		return false;

	if (!is_64_bit_mode(vcpu) && high_word)
		return false;
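
Folding that in, the whole helper might then read something like this (just a
sketch of the suggested shape, not the actual respin):

static bool cet_ssp_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	u64 data = msr->data;

	/* Bits 1:0 of the SSP MSRs are reserved. */
	if (data & CET_MSR_RSVD_BITS_1)
		return false;

	/* Bits 63:32 must be clear outside of 64-bit mode. */
	if (!is_64_bit_mode(vcpu) && (data >> 32))
		return false;

	return true;
}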

> +			return false;
> +	} else if (high_word) {
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool cet_ctl_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> +{
> +	u64 data = msr->data;
> +	u32 high_word = data >> 32;
> +
> +	if (data & CET_MSR_RSVD_BITS_2)
> +		return false;
> +
> +	if (!is_64_bit_mode(vcpu) && high_word)
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool cet_ssp_access_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> +{
> +	u64 kvm_xss;
> +	u32 index = msr->index;
> +
> +	if (is_guest_mode(vcpu))

Hmm, this seems wrong, e.g. shouldn't WRMSR be allowed if L1 passes the MSR
to L2, which is the only way to reach this, if I'm not mistaken.

> +		return false;
> +
> +	if (!boot_cpu_has(X86_FEATURE_SHSTK))
> +		return false;
> +
> +	if (!msr->host_initiated &&
> +	    !guest_cpuid_has(vcpu, X86_FEATURE_SHSTK))
> +		return false;
> +
> +	if (index == MSR_IA32_INT_SSP_TAB)
> +		return true;
> +
> +	kvm_xss = kvm_supported_xss();
> +
> +	if (index == MSR_IA32_PL3_SSP) {
> +		if (!(kvm_xss & XFEATURE_MASK_CET_USER))
> +			return false;
> +	} else if (!(kvm_xss & XFEATURE_MASK_CET_KERNEL)) {
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool cet_ctl_access_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> +{
> +	u64 kvm_xss;
> +	u32 index = msr->index;
> +
> +	if (is_guest_mode(vcpu))
> +		return false;
> +
> +	kvm_xss = kvm_supported_xss();
> +
> +	if (!boot_cpu_has(X86_FEATURE_SHSTK) &&
> +	    !boot_cpu_has(X86_FEATURE_IBT))
> +		return false;
> +
> +	if (!msr->host_initiated &&
> +	    !guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) &&
> +	    !guest_cpuid_has(vcpu, X86_FEATURE_IBT))
> +		return false;
> +
> +	if (index == MSR_IA32_U_CET) {
> +		if (!(kvm_xss & XFEATURE_MASK_CET_USER))
> +			return false;
> +	} else if (!(kvm_xss & XFEATURE_MASK_CET_KERNEL)) {
> +		return false;
> +	}
> +
> +	return true;
> +}
>  /*
>   * Reads an msr value (of 'msr_index') into 'pdata'.
>   * Returns 0 on success, non-0 otherwise.
> @@ -1886,6 +1976,26 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  		else
>  			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>  		break;
> +	case MSR_IA32_S_CET:
> +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> +			return 1;
> +		msr_info->data = vmcs_readl(GUEST_S_CET);
> +		break;
> +	case MSR_IA32_INT_SSP_TAB:
> +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> +			return 1;
> +		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
> +		break;
> +	case MSR_IA32_U_CET:
> +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> +			return 1;
> +		rdmsrl(MSR_IA32_U_CET, msr_info->data);
> +		break;
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> +			return 1;
> +		rdmsrl(msr_info->index, msr_info->data);

Ugh, thought of another problem.  If a SoftIRQ runs after an IRQ it can
load the kernel FPU state.  So for all the XSAVES MSRs we'll need a helper
similar to vmx_write_guest_kernel_gs_base(), except XSAVES has to be even
more restrictive and disable IRQs entirely.  E.g.

static void vmx_get_xsave_msr(struct msr_data *msr_info)
{
	local_irq_disable();
	if (test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_return();
	rdmsrl(msr_info->index, msr_info->data);
	local_irq_enable();
}
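
The WRMSR side would presumably want the same protection; a minimal sketch of
a companion helper, assuming the same naming convention (not part of the
posted series):

static void vmx_set_xsave_msr(struct msr_data *msr_info)
{
	local_irq_disable();
	if (test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_return();
	wrmsrl(msr_info->index, msr_info->data);
	local_irq_enable();
}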

> +		break;
>  	case MSR_TSC_AUX:
>  		if (!msr_info->host_initiated &&
>  		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> @@ -2147,6 +2257,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  		else
>  			vmx->pt_desc.guest.addr_a[index / 2] = data;
>  		break;
> +	case MSR_IA32_S_CET:
> +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> +			return 1;
> +		if (!cet_ctl_write_allowed(vcpu, msr_info))
> +			return 1;
> +		vmcs_writel(GUEST_S_CET, data);
> +		break;
> +	case MSR_IA32_INT_SSP_TAB:
> +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> +			return 1;
> +		if (!is_64_bit_mode(vcpu))
> +			return 1;
> +		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
> +		break;
> +	case MSR_IA32_U_CET:
> +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> +			return 1;
> +		if (!cet_ctl_write_allowed(vcpu, msr_info))
> +			return 1;
> +		wrmsrl(MSR_IA32_U_CET, data);
> +		break;
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> +			return 1;
> +		if (!cet_ssp_write_allowed(vcpu, msr_info))
> +			return 1;
> +		wrmsrl(msr_info->index, data);
> +		break;
>  	case MSR_TSC_AUX:
>  		if (!msr_info->host_initiated &&
>  		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6dbe77365b22..7de6faa6aa51 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1186,6 +1186,10 @@ static const u32 msrs_to_save_all[] = {
>  	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
>  	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
>  	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
> +
> +	MSR_IA32_XSS, MSR_IA32_U_CET, MSR_IA32_S_CET,
> +	MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
> +	MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
>  };
>  
>  static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
> @@ -1468,6 +1472,13 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
>  		 * invokes 64-bit SYSENTER.
>  		 */
>  		data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
> +		break;
> +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> +	case MSR_IA32_U_CET:
> +	case MSR_IA32_S_CET:
> +	case MSR_IA32_INT_SSP_TAB:
> +		if (is_noncanonical_address(data, vcpu))
> +			return 1;
>  	}
>  
>  	msr.data = data;
> -- 
> 2.17.2
>
Yang, Weijiang March 4, 2020, 3:18 p.m. UTC | #2
On Tue, Mar 03, 2020 at 02:28:27PM -0800, Sean Christopherson wrote:
> Subject should be something like "Enable CET virtualization", or maybe
> move CPUID changes to a separate final patch?
>
OK, let me put the CPUID/CR4.CET into a separate patch.

> On Fri, Dec 27, 2019 at 10:11:33AM +0800, Yang Weijiang wrote:
> > There are two different places where guest CET states are stored. States
> > managed with XSAVES/XRSTORS, as saved/restored in the previous patch, can be
> > read/written directly from/to the MSRs. States stored in VMCS fields are
> > accessed via vmcs_read/vmcs_write.
> > 
> > Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |   3 +-
> >  arch/x86/kvm/cpuid.c            |   5 +-
> >  arch/x86/kvm/vmx/vmx.c          | 138 ++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/x86.c              |  11 +++
> >  4 files changed, 154 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 64bf379381e4..34140462084f 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -90,7 +90,8 @@
> >  			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
> >  			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
> >  			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
> > -			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
> > +			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
> > +			  | X86_CR4_CET))
> >  
> >  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
> >  
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index 126a31b99823..4414bd110f3c 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -385,13 +385,14 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
> >  		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
> >  		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
> >  		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
> > -		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
> > +		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK) |
> > +		0 /*WAITPKG*/;
> >  
> >  	/* cpuid 7.0.edx*/
> >  	const u32 kvm_cpuid_7_0_edx_x86_features =
> >  		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
> >  		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
> > -		F(MD_CLEAR);
> > +		F(MD_CLEAR) | F(IBT);
> >  
> >  	/* cpuid 7.1.eax */
> >  	const u32 kvm_cpuid_7_1_eax_x86_features =
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 0a75b65d03f0..52ac67604026 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -1763,6 +1763,96 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
> >  	return 0;
> >  }
> >  
> > +#define CET_MSR_RSVD_BITS_1    0x3
> > +#define CET_MSR_RSVD_BITS_2   (0xF << 6)
> 
> Would it make sense to use GENMASK?
>
Yes, will change it, thank you.

> > +static bool cet_ssp_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> > +{
> > +	u64 data = msr->data;
> > +	u32 high_word = data >> 32;
> > +
> > +	if (is_64_bit_mode(vcpu)) {
> > +		if (data & CET_MSR_RSVD_BITS_1)
> 
> This looks odd.  I assume it should look more like cet_ctl_write_allowed()?
> E.g.
> 
> 	if (data & CET_MSR_RSVD_BITS_1)
> 		return false;
> 
> 	if (!is_64_bit_mode(vcpu) && high_word)
> 		return false;
> 
Correct, this looks much better.
> > +			return false;
> > +	} else if (high_word) {
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static bool cet_ctl_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> > +{
> > +	u64 data = msr->data;
> > +	u32 high_word = data >> 32;
> > +
> > +	if (data & CET_MSR_RSVD_BITS_2)
> > +		return false;
> > +
> > +	if (!is_64_bit_mode(vcpu) && high_word)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static bool cet_ssp_access_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> > +{
> > +	u64 kvm_xss;
> > +	u32 index = msr->index;
> > +
> > +	if (is_guest_mode(vcpu))
> 
> Hmm, this seems wrong, e.g. shouldn't WRMSR be allowed if L1 passes the MSR
> to L2, which is the only way to reach this, if I'm not mistaken.
> 
Actually I tried to hide the CET feature from the L2 guest, but the code for
this part was broken; I'm still working on it...

> > +		return false;
> > +
> > +	if (!boot_cpu_has(X86_FEATURE_SHSTK))
> > +		return false;
> > +
> > +	if (!msr->host_initiated &&
> > +	    !guest_cpuid_has(vcpu, X86_FEATURE_SHSTK))
> > +		return false;
> > +
> > +	if (index == MSR_IA32_INT_SSP_TAB)
> > +		return true;
> > +
> > +	kvm_xss = kvm_supported_xss();
> > +
> > +	if (index == MSR_IA32_PL3_SSP) {
> > +		if (!(kvm_xss & XFEATURE_MASK_CET_USER))
> > +			return false;
> > +	} else if (!(kvm_xss & XFEATURE_MASK_CET_KERNEL)) {
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static bool cet_ctl_access_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
> > +{
> > +	u64 kvm_xss;
> > +	u32 index = msr->index;
> > +
> > +	if (is_guest_mode(vcpu))
> > +		return false;
> > +
> > +	kvm_xss = kvm_supported_xss();
> > +
> > +	if (!boot_cpu_has(X86_FEATURE_SHSTK) &&
> > +	    !boot_cpu_has(X86_FEATURE_IBT))
> > +		return false;
> > +
> > +	if (!msr->host_initiated &&
> > +	    !guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) &&
> > +	    !guest_cpuid_has(vcpu, X86_FEATURE_IBT))
> > +		return false;
> > +
> > +	if (index == MSR_IA32_U_CET) {
> > +		if (!(kvm_xss & XFEATURE_MASK_CET_USER))
> > +			return false;
> > +	} else if (!(kvm_xss & XFEATURE_MASK_CET_KERNEL)) {
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> >  /*
> >   * Reads an msr value (of 'msr_index') into 'pdata'.
> >   * Returns 0 on success, non-0 otherwise.
> > @@ -1886,6 +1976,26 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >  		else
> >  			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
> >  		break;
> > +	case MSR_IA32_S_CET:
> > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		msr_info->data = vmcs_readl(GUEST_S_CET);
> > +		break;
> > +	case MSR_IA32_INT_SSP_TAB:
> > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > +		break;
> > +	case MSR_IA32_U_CET:
> > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		rdmsrl(MSR_IA32_U_CET, msr_info->data);
> > +		break;
> > +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		rdmsrl(msr_info->index, msr_info->data);
> 
> Ugh, thought of another problem.  If a SoftIRQ runs after an IRQ it can
> load the kernel FPU state.  So for all the XSAVES MSRs we'll need a helper
> similar to vmx_write_guest_kernel_gs_base(), except XSAVES has to be even
> more restrictive and disable IRQs entirely.  E.g.
> 
> static void vmx_get_xsave_msr(struct msr_data *msr_info)
> {
> 	local_irq_disable();
> 	if (test_thread_flag(TIF_NEED_FPU_LOAD))
> 		switch_fpu_return();
> 	rdmsrl(msr_info->index, msr_info->data);
> 	local_irq_enable();
In this case, would a softirq destroy the vcpu->arch.guest.fpu state which
had been restored into the XSAVES MSRs we were accessing? So should we restore
guest.fpu again, or something else? In the previous patch, we restored guest.fpu
before accessing the XSAVES MSRs.

> }
> 
> > +		break;
> >  	case MSR_TSC_AUX:
> >  		if (!msr_info->host_initiated &&
> >  		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> > @@ -2147,6 +2257,34 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >  		else
> >  			vmx->pt_desc.guest.addr_a[index / 2] = data;
> >  		break;
> > +	case MSR_IA32_S_CET:
> > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		if (!cet_ctl_write_allowed(vcpu, msr_info))
> > +			return 1;
> > +		vmcs_writel(GUEST_S_CET, data);
> > +		break;
> > +	case MSR_IA32_INT_SSP_TAB:
> > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		if (!is_64_bit_mode(vcpu))
> > +			return 1;
> > +		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
> > +		break;
> > +	case MSR_IA32_U_CET:
> > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		if (!cet_ctl_write_allowed(vcpu, msr_info))
> > +			return 1;
> > +		wrmsrl(MSR_IA32_U_CET, data);
> > +		break;
> > +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > +			return 1;
> > +		if (!cet_ssp_write_allowed(vcpu, msr_info))
> > +			return 1;
> > +		wrmsrl(msr_info->index, data);
> > +		break;
> >  	case MSR_TSC_AUX:
> >  		if (!msr_info->host_initiated &&
> >  		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 6dbe77365b22..7de6faa6aa51 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1186,6 +1186,10 @@ static const u32 msrs_to_save_all[] = {
> >  	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
> >  	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
> >  	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
> > +
> > +	MSR_IA32_XSS, MSR_IA32_U_CET, MSR_IA32_S_CET,
> > +	MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
> > +	MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
> >  };
> >  
> >  static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
> > @@ -1468,6 +1472,13 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
> >  		 * invokes 64-bit SYSENTER.
> >  		 */
> >  		data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
> > +		break;
> > +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> > +	case MSR_IA32_U_CET:
> > +	case MSR_IA32_S_CET:
> > +	case MSR_IA32_INT_SSP_TAB:
> > +		if (is_noncanonical_address(data, vcpu))
> > +			return 1;
> >  	}
> >  
> >  	msr.data = data;
> > -- 
> > 2.17.2
> >
Sean Christopherson March 4, 2020, 3:45 p.m. UTC | #3
On Wed, Mar 04, 2020 at 11:18:15PM +0800, Yang Weijiang wrote:
> On Tue, Mar 03, 2020 at 02:28:27PM -0800, Sean Christopherson wrote:
> > > @@ -1886,6 +1976,26 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > >  		else
> > >  			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
> > >  		break;
> > > +	case MSR_IA32_S_CET:
> > > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > > +			return 1;
> > > +		msr_info->data = vmcs_readl(GUEST_S_CET);
> > > +		break;
> > > +	case MSR_IA32_INT_SSP_TAB:
> > > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > > +			return 1;
> > > +		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > > +		break;
> > > +	case MSR_IA32_U_CET:
> > > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > > +			return 1;
> > > +		rdmsrl(MSR_IA32_U_CET, msr_info->data);
> > > +		break;
> > > +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> > > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > > +			return 1;
> > > +		rdmsrl(msr_info->index, msr_info->data);
> > 
> > Ugh, thought of another problem.  If a SoftIRQ runs after an IRQ it can
> > load the kernel FPU state.  So for all the XSAVES MSRs we'll need a helper
> > similar to vmx_write_guest_kernel_gs_base(), except XSAVES has to be even
> > more restrictive and disable IRQs entirely.  E.g.
> > 
> > static void vmx_get_xsave_msr(struct msr_data *msr_info)
> > {
> > 	local_irq_disable();
> > 	if (test_thread_flag(TIF_NEED_FPU_LOAD))
> > 		switch_fpu_return();
> > 	rdmsrl(msr_info->index, msr_info->data);
> > 	local_irq_enable();
> In this case, would a softirq destroy the vcpu->arch.guest.fpu state which
> had been restored into the XSAVES MSRs we were accessing?

Doing kernel_fpu_begin() from a softirq would swap guest.fpu out of the
CPU's registers.  It sets TIF_NEED_FPU_LOAD to mark the task as needing to
reload its FPU state prior to returning to userspace.  So it doesn't
destroy it per se.  The result is that KVM would read/write the CET MSRs
after they're loaded from the kernel's FPU state instead of reading the
MSRs loaded from the guest's FPU state.

> So should we restore
> guest.fpu again, or something else? In the previous patch, we restored guest.fpu
> before accessing the XSAVES MSRs.

There are three different FPU states:

  - kernel
  - userspace
  - guest

RDMSR/WRMSR for CET MSRs need to run while the guest.fpu state is loaded
into the CPU registers[1].  At the beginning of the syscall from userspace,
i.e. the vCPU ioctl(), the task's FPU state[2] holds userspace FPU state.
Patch 6/7 swaps out the userspace state and loads the guest state.

But, if a softirq runs between kvm_load_guest_fpu() and now, and executes
kernel_fpu_begin(), it will swap the guest state (out of CPU registers)
and load the kernel state (into CPU registers).  The actual RDMSR/WRMSR
needs to ensure the guest state is still loaded by checking and handling
TIF_NEED_FPU_LOAD.

[1] An alternative to doing switch_fpu_return() on TIF_NEED_FPU_LOAD would
    be to calculate the offset into the xsave and read/write directly
    to/from memory.  But IMO that's unnecessary complexity as the guest's
    fpu state still needs to be reloaded before re-entering the guest, e.g.
    if vmx_{g,s}et_msr() is invoked on {RD,WR}MSR intercept, while loading
    or saving MSR state from userspace isn't a hot path.

[2] I worded this to say "task's FPU state" because it's also possible the
    CPU registers hold kernel state at the beginning of the vCPU ioctl(),
    e.g. because of softirq.
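
As a rough illustration, the raw rdmsrl()/wrmsrl() calls in the MSR handlers
would then go through such a helper, e.g. (a sketch using the names from this
thread, not the actual follow-up patch):

	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
		if (!cet_ssp_access_allowed(vcpu, msr_info))
			return 1;
		/* Read while the guest FPU state is guaranteed to be loaded. */
		vmx_get_xsave_msr(msr_info);
		break;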
Yang, Weijiang March 5, 2020, 12:31 p.m. UTC | #4
On Wed, Mar 04, 2020 at 07:45:38AM -0800, Sean Christopherson wrote:
> On Wed, Mar 04, 2020 at 11:18:15PM +0800, Yang Weijiang wrote:
> > On Tue, Mar 03, 2020 at 02:28:27PM -0800, Sean Christopherson wrote:
> > > > @@ -1886,6 +1976,26 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > > >  		else
> > > >  			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
> > > >  		break;
> > > > +	case MSR_IA32_S_CET:
> > > > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > > > +			return 1;
> > > > +		msr_info->data = vmcs_readl(GUEST_S_CET);
> > > > +		break;
> > > > +	case MSR_IA32_INT_SSP_TAB:
> > > > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > > > +			return 1;
> > > > +		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > > > +		break;
> > > > +	case MSR_IA32_U_CET:
> > > > +		if (!cet_ctl_access_allowed(vcpu, msr_info))
> > > > +			return 1;
> > > > +		rdmsrl(MSR_IA32_U_CET, msr_info->data);
> > > > +		break;
> > > > +	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
> > > > +		if (!cet_ssp_access_allowed(vcpu, msr_info))
> > > > +			return 1;
> > > > +		rdmsrl(msr_info->index, msr_info->data);
> > > 
> > > Ugh, thought of another problem.  If a SoftIRQ runs after an IRQ it can
> > > load the kernel FPU state.  So for all the XSAVES MSRs we'll need a helper
> > > similar to vmx_write_guest_kernel_gs_base(), except XSAVES has to be even
> > > more restrictive and disable IRQs entirely.  E.g.
> > > 
> > > static void vmx_get_xsave_msr(struct msr_data *msr_info)
> > > {
> > > 	local_irq_disable();
> > > 	if (test_thread_flag(TIF_NEED_FPU_LOAD))
> > > 		switch_fpu_return();
> > > 	rdmsrl(msr_info->index, msr_info->data);
> > > 	local_irq_enable();
> > In this case, would a softirq destroy the vcpu->arch.guest.fpu state which
> > had been restored into the XSAVES MSRs we were accessing?
> 
> Doing kernel_fpu_begin() from a softirq would swap guest.fpu out of the
> CPU's registers.  It sets TIF_NEED_FPU_LOAD to mark the task as needing to
> reload its FPU state prior to returning to userspace.  So it doesn't
> destroy it per se.  The result is that KVM would read/write the CET MSRs
> after they're loaded from the kernel's FPU state instead of reading the
> MSRs loaded from the guest's FPU state.
>
OK, will wrap the access code with a helper, thank you!
> > So should we restore
> > guest.fpu again, or something else? In the previous patch, we restored guest.fpu
> > before accessing the XSAVES MSRs.
> 
> There are three different FPU states:
> 
>   - kernel
>   - userspace
>   - guest
> 
> RDMSR/WRMSR for CET MSRs need to run while the guest.fpu state is loaded
> into the CPU registers[1].  At the beginning of the syscall from userspace,
> i.e. the vCPU ioctl(), the task's FPU state[2] holds userspace FPU state.
> Patch 6/7 swaps out the userspace state and loads the guest state.
> 
> But, if a softirq runs between kvm_load_guest_fpu() and now, and executes
> kernel_fpu_begin(), it will swap the guest state (out of CPU registers)
> and load the kernel state (into CPU registers).  The actual RDMSR/WRMSR
> needs to ensure the guest state is still loaded by checking and handling
> TIF_NEED_FPU_LOAD.
> 
> [1] An alternative to doing switch_fpu_return() on TIF_NEED_FPU_LOAD would
>     be to calculate the offset into the xsave and read/write directly
>     to/from memory.  But IMO that's unnecessary complexity as the guest's
>     fpu state still needs to be reloaded before re-entering the guest, e.g.
>     if vmx_{g,s}et_msr() is invoked on {RD,WR}MSR intercept, while loading
>     or saving MSR state from userspace isn't a hot path.
> 
> [2] I worded this to say "task's FPU state" because it's also possible the
>     CPU registers hold kernel state at the beginning of the vCPU ioctl(),
>     e.g. because of softirq.

It's clear to me, thanks for the explanation.

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 64bf379381e4..34140462084f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -90,7 +90,8 @@ 
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
-			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
+			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
+			  | X86_CR4_CET))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 126a31b99823..4414bd110f3c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -385,13 +385,14 @@  static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SHSTK) |
+		0 /*WAITPKG*/;
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
 		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
 		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
-		F(MD_CLEAR);
+		F(MD_CLEAR) | F(IBT);
 
 	/* cpuid 7.1.eax */
 	const u32 kvm_cpuid_7_1_eax_x86_features =
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0a75b65d03f0..52ac67604026 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1763,6 +1763,96 @@  static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 	return 0;
 }
 
+#define CET_MSR_RSVD_BITS_1    0x3
+#define CET_MSR_RSVD_BITS_2   (0xF << 6)
+
+static bool cet_ssp_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+	u64 data = msr->data;
+	u32 high_word = data >> 32;
+
+	if (is_64_bit_mode(vcpu)) {
+		if (data & CET_MSR_RSVD_BITS_1)
+			return false;
+	} else if (high_word) {
+		return false;
+	}
+
+	return true;
+}
+
+static bool cet_ctl_write_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+	u64 data = msr->data;
+	u32 high_word = data >> 32;
+
+	if (data & CET_MSR_RSVD_BITS_2)
+		return false;
+
+	if (!is_64_bit_mode(vcpu) && high_word)
+		return false;
+
+	return true;
+}
+
+static bool cet_ssp_access_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+	u64 kvm_xss;
+	u32 index = msr->index;
+
+	if (is_guest_mode(vcpu))
+		return false;
+
+	if (!boot_cpu_has(X86_FEATURE_SHSTK))
+		return false;
+
+	if (!msr->host_initiated &&
+	    !guest_cpuid_has(vcpu, X86_FEATURE_SHSTK))
+		return false;
+
+	if (index == MSR_IA32_INT_SSP_TAB)
+		return true;
+
+	kvm_xss = kvm_supported_xss();
+
+	if (index == MSR_IA32_PL3_SSP) {
+		if (!(kvm_xss & XFEATURE_MASK_CET_USER))
+			return false;
+	} else if (!(kvm_xss & XFEATURE_MASK_CET_KERNEL)) {
+		return false;
+	}
+
+	return true;
+}
+
+static bool cet_ctl_access_allowed(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+	u64 kvm_xss;
+	u32 index = msr->index;
+
+	if (is_guest_mode(vcpu))
+		return false;
+
+	kvm_xss = kvm_supported_xss();
+
+	if (!boot_cpu_has(X86_FEATURE_SHSTK) &&
+	    !boot_cpu_has(X86_FEATURE_IBT))
+		return false;
+
+	if (!msr->host_initiated &&
+	    !guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) &&
+	    !guest_cpuid_has(vcpu, X86_FEATURE_IBT))
+		return false;
+
+	if (index == MSR_IA32_U_CET) {
+		if (!(kvm_xss & XFEATURE_MASK_CET_USER))
+			return false;
+	} else if (!(kvm_xss & XFEATURE_MASK_CET_KERNEL)) {
+		return false;
+	}
+
+	return true;
+}
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -1886,6 +1976,26 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
+	case MSR_IA32_S_CET:
+		if (!cet_ctl_access_allowed(vcpu, msr_info))
+			return 1;
+		msr_info->data = vmcs_readl(GUEST_S_CET);
+		break;
+	case MSR_IA32_INT_SSP_TAB:
+		if (!cet_ssp_access_allowed(vcpu, msr_info))
+			return 1;
+		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+		break;
+	case MSR_IA32_U_CET:
+		if (!cet_ctl_access_allowed(vcpu, msr_info))
+			return 1;
+		rdmsrl(MSR_IA32_U_CET, msr_info->data);
+		break;
+	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+		if (!cet_ssp_access_allowed(vcpu, msr_info))
+			return 1;
+		rdmsrl(msr_info->index, msr_info->data);
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
@@ -2147,6 +2257,34 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			vmx->pt_desc.guest.addr_a[index / 2] = data;
 		break;
+	case MSR_IA32_S_CET:
+		if (!cet_ctl_access_allowed(vcpu, msr_info))
+			return 1;
+		if (!cet_ctl_write_allowed(vcpu, msr_info))
+			return 1;
+		vmcs_writel(GUEST_S_CET, data);
+		break;
+	case MSR_IA32_INT_SSP_TAB:
+		if (!cet_ctl_access_allowed(vcpu, msr_info))
+			return 1;
+		if (!is_64_bit_mode(vcpu))
+			return 1;
+		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+		break;
+	case MSR_IA32_U_CET:
+		if (!cet_ctl_access_allowed(vcpu, msr_info))
+			return 1;
+		if (!cet_ctl_write_allowed(vcpu, msr_info))
+			return 1;
+		wrmsrl(MSR_IA32_U_CET, data);
+		break;
+	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+		if (!cet_ssp_access_allowed(vcpu, msr_info))
+			return 1;
+		if (!cet_ssp_write_allowed(vcpu, msr_info))
+			return 1;
+		wrmsrl(msr_info->index, data);
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6dbe77365b22..7de6faa6aa51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1186,6 +1186,10 @@  static const u32 msrs_to_save_all[] = {
 	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
 	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
 	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+
+	MSR_IA32_XSS, MSR_IA32_U_CET, MSR_IA32_S_CET,
+	MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
+	MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
 };
 
 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1468,6 +1472,13 @@  static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
 		 * invokes 64-bit SYSENTER.
 		 */
 		data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+		break;
+	case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+	case MSR_IA32_U_CET:
+	case MSR_IA32_S_CET:
+	case MSR_IA32_INT_SSP_TAB:
+		if (is_noncanonical_address(data, vcpu))
+			return 1;
 	}
 
 	msr.data = data;