diff mbox series

[v5,09/19] KVM:x86: Make guest supervisor states as non-XSAVE managed

Message ID 20230803042732.88515-10-weijiang.yang@intel.com (mailing list archive)
State New, archived
Headers show
Series Enable CET Virtualization | expand

Commit Message

Yang, Weijiang Aug. 3, 2023, 4:27 a.m. UTC
Save guest CET supervisor states, i.e.,PL{0,1,2}_SSP, when vCPU
is exiting to userspace or being preempted. Reload the MSRs
before vm-entry.

Embeded the helpers in {vmx,svm}_prepare_switch_to_guest() and
vmx_prepare_switch_to_host()/svm_prepare_host_switch() to employ
existing guest state management and optimize the invocation of
the helpers.

Enabling CET supervisor state management in KVM due to:
 -Introducing unnecessary XSAVE operation when switch to non-vCPU
userspace within current FPU framework.
 -Forcing allocating additional space for CET supervisor states in
each thread context regardless whether it's vCPU thread or not.

Suggested-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.h            | 11 +++++++++++
 arch/x86/kvm/svm/svm.c          |  2 ++
 arch/x86/kvm/vmx/vmx.c          |  2 ++
 arch/x86/kvm/x86.c              | 27 +++++++++++++++++++++++++++
 arch/x86/kvm/x86.h              |  3 +++
 6 files changed, 46 insertions(+)

Comments

Chao Gao Aug. 3, 2023, 11:15 a.m. UTC | #1
On Thu, Aug 03, 2023 at 12:27:22AM -0400, Yang Weijiang wrote:
>+void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>+{
>+	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
>+		rdmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
>+		rdmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
>+		rdmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);

>+		/*
>+		 * Omit reset to host PL{1,2}_SSP because Linux will never use
>+		 * these MSRs.
>+		 */
>+		wrmsrl(MSR_IA32_PL0_SSP, 0);

This wrmsrl() can be dropped because host doesn't support SSS yet.

>+	}
>+}
>+EXPORT_SYMBOL_GPL(save_cet_supervisor_ssp);
>+
>+void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>+{
>+	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {

ditto

>+		wrmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
>+		wrmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
>+		wrmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
>+	}
>+}
>+EXPORT_SYMBOL_GPL(reload_cet_supervisor_ssp);
>+
> int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
> {
> 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
>@@ -12133,6 +12158,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> 
> 	vcpu->arch.cr3 = 0;
> 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
>+	memset(vcpu->arch.cet_s_ssp, 0, sizeof(vcpu->arch.cet_s_ssp));
> 
> 	/*
> 	 * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
>@@ -12313,6 +12339,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
> 		pmu->need_cleanup = true;
> 		kvm_make_request(KVM_REQ_PMU, vcpu);
> 	}
>+

remove the stray newline.

> 	static_call(kvm_x86_sched_in)(vcpu, cpu);
> }
> 
>diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>index 6e6292915f8c..c69fc027f5ec 100644
>--- a/arch/x86/kvm/x86.h
>+++ b/arch/x86/kvm/x86.h
>@@ -501,6 +501,9 @@ static inline void kvm_machine_check(void)
> 
> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
>+void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu);
>+void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu);

nit: please add kvm_ prefix to the function names because they are exposed to
other modules. "cet" in the names is a little redundant. I slightly prefer
kvm_save/load_guest_supervisor_ssp()

Overall, this patch looks good to me. Hence,

Reviewed-by: Chao Gao <chao.gao@intel.com>

>+
> int kvm_spec_ctrl_test_value(u64 value);
> bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
> int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
>-- 
>2.27.0
>
Yang, Weijiang Aug. 4, 2023, 3:26 a.m. UTC | #2
On 8/3/2023 7:15 PM, Chao Gao wrote:
> On Thu, Aug 03, 2023 at 12:27:22AM -0400, Yang Weijiang wrote:
>> +void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>> +{
>> +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
>> +		rdmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
>> +		rdmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
>> +		rdmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
>> +		/*
>> +		 * Omit reset to host PL{1,2}_SSP because Linux will never use
>> +		 * these MSRs.
>> +		 */
>> +		wrmsrl(MSR_IA32_PL0_SSP, 0);
> This wrmsrl() can be dropped because host doesn't support SSS yet.
Frankly speaking, I want to remove this line of code. But that would mess up the MSR
on host side, i.e., from host perspective, the MSRs could be filled with garbage data,
and looks awful. Anyway, I can remove it.
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(save_cet_supervisor_ssp);
>> +
>> +void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>> +{
>> +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
> ditto
Below is to reload guest supervisor SSPs instead of resetting host ones.
>> +		wrmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
>> +		wrmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
>> +		wrmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(reload_cet_supervisor_ssp);
>> +
>> int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
>> {
>> 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
>> @@ -12133,6 +12158,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>
>> 	vcpu->arch.cr3 = 0;
>> 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
>> +	memset(vcpu->arch.cet_s_ssp, 0, sizeof(vcpu->arch.cet_s_ssp));
>>
>> 	/*
>> 	 * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
>> @@ -12313,6 +12339,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>> 		pmu->need_cleanup = true;
>> 		kvm_make_request(KVM_REQ_PMU, vcpu);
>> 	}
>> +
> remove the stray newline.
OK.
>> 	static_call(kvm_x86_sched_in)(vcpu, cpu);
>> }
>>
>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>> index 6e6292915f8c..c69fc027f5ec 100644
>> --- a/arch/x86/kvm/x86.h
>> +++ b/arch/x86/kvm/x86.h
>> @@ -501,6 +501,9 @@ static inline void kvm_machine_check(void)
>>
>> void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
>> void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
>> +void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu);
>> +void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu);
> nit: please add kvm_ prefix to the function names because they are exposed to
> other modules. "cet" in the names is a little redundant. I slightly prefer
> kvm_save/load_guest_supervisor_ssp()
Sure, actually I wanted to add the prefix, but at a second thought, the functions with
kvm_ are mostly generic functions in KVM, but here are the CET specific functions.
>
> Overall, this patch looks good to me. Hence,
>
> Reviewed-by: Chao Gao <chao.gao@intel.com>
Thanks a lot for the review!
>> +
>> int kvm_spec_ctrl_test_value(u64 value);
>> bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
>> int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
>> -- 
>> 2.27.0
>>
Sean Christopherson Aug. 4, 2023, 8:45 p.m. UTC | #3
On Fri, Aug 04, 2023, Weijiang Yang wrote:
> On 8/3/2023 7:15 PM, Chao Gao wrote:
> > On Thu, Aug 03, 2023 at 12:27:22AM -0400, Yang Weijiang wrote:
> > > +void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
> > > +{
> > > +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {

Drop the unlikely, KVM should not speculate on the guest configuration or underlying
hardware.

> > > +		rdmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
> > > +		rdmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
> > > +		rdmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
> > > +		/*
> > > +		 * Omit reset to host PL{1,2}_SSP because Linux will never use
> > > +		 * these MSRs.
> > > +		 */
> > > +		wrmsrl(MSR_IA32_PL0_SSP, 0);
> > This wrmsrl() can be dropped because host doesn't support SSS yet.
> Frankly speaking, I want to remove this line of code. But that would mess up the MSR
> on host side, i.e., from host perspective, the MSRs could be filled with garbage data,
> and looks awful.

So?  :-)

That's the case for all of the MSRs that KVM defers restoring until the host
returns to userspace, i.e. running in the host with bogus values in hardware is
nothing new.

And as I mentioned in the other thread regarding the assertion that SSS isn't
enabled in the host, sanitizing hardware values for something that should never
be consumed is a fools errand.

> Anyway, I can remove it.

Yes please, though it may be a moot point.

> > > +	}
> > > +}
> > > +EXPORT_SYMBOL_GPL(save_cet_supervisor_ssp);
> > > +
> > > +void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
> > > +{
> > > +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
> > ditto
> Below is to reload guest supervisor SSPs instead of resetting host ones.
> > > +		wrmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
> > > +		wrmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
> > > +		wrmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);

Pulling back in the justification from v3:

 the Pros:
  - Super easy to implement for KVM.
  - Automatically avoids saving and restoring this data when the vmexit
    is handled within KVM.

 the Cons:
  - Unnecessarily restores XFEATURE_CET_KERNEL when switching to
    non-KVM task's userspace.
  - Forces allocating space for this state on all tasks, whether or not
    they use KVM, and with likely zero users today and the near future.
  - Complicates the FPU optimization thinking by including things that
    can have no affect on userspace in the FPU

IMO the pros far outweigh the cons.  3x RDMSR and 3x WRMSR when loading host/guest
state is non-trivial overhead.  That can be mitigated, e.g. by utilizing the
user return MSR framework, but it's still unpalatable.  It's unlikely many guests
will SSS in the *near* future, but I don't want to end up with code that performs
poorly in the future and needs to be rewritten.

Especially because another big negative is that not utilizing XSTATE bleeds into
KVM's ABI.  Userspace has to be told to manually save+restore MSRs instead of just
letting KVM_{G,S}ET_XSAVE handle the state.  And that will create a bit of a
snafu if Linux does gain support for SSS.

On the other hand, the extra per-task memory is all of 24 bytes.  AFAICT, there's
literally zero effect on guest XSTATE allocations because those are vmalloc'd and
thus rounded up to PAGE_SIZE, i.e. the next 4KiB.  And XSTATE needs to be 64-byte
aligned, so the 24 bytes is only actually meaningful if the current size is within
24 bytes of the next cahce line.  And the "current" size is variable depending on
which features are present and enabled, i.e. it's a roll of the dice as to whether
or not using XSTATE for supervisor CET would actually increase memory usage.  And
_if_ it does increase memory consumption, I have a very hard time believing an
extra 64 bytes in the worst case scenario is a dealbreaker.

If the performance is a concern, i.e. we don't want to eat saving/restoring the
MSRs when switching to/from host FPU context, then I *think* that's simply a matter
of keeping guest state resident when loading non-guest FPU state.

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1015af1ae562..8e7599e3b923 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -167,6 +167,16 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
                 */
                xfd_update_state(fpstate);
 
+               /*
+                * Leave supervisor CET state as-is when loading host state
+                * (kernel or userspace).  Supervisor CET state is managed via
+                * XSTATE for KVM guests, but the host never consumes said
+                * state (doesn't support supervisor shadow stacks), i.e. it's
+                * safe to keep guest state loaded into hardware.
+                */
+               if (!fpstate->is_guest)
+                       mask &= ~XFEATURE_MASK_CET_KERNEL;
+
                /*
                 * Restoring state always needs to modify all features
                 * which are in @mask even if the current task cannot use


So unless I'm missing something, NAK to this approach, at least not without trying
the kernel FPU approach, i.e. I want somelike like to PeterZ or tglx to actually
full on NAK the kernel approach before we consider shoving a hack into KVM.
Peter Zijlstra Aug. 4, 2023, 8:59 p.m. UTC | #4
On Fri, Aug 04, 2023 at 01:45:11PM -0700, Sean Christopherson wrote:

> So unless I'm missing something, NAK to this approach, at least not without trying
> the kernel FPU approach, i.e. I want somelike like to PeterZ or tglx to actually
> full on NAK the kernel approach before we consider shoving a hack into KVM.

Not having fully followed things (I'll go read up), SSS is blocked on
FRED. But it is definitely on the books to do SSS once FRED is go.

So if the approach as chosen gets in the way of host kernel SS
management, that is a wee problem.
Paolo Bonzini Aug. 4, 2023, 9:32 p.m. UTC | #5
On 8/4/23 22:45, Sean Christopherson wrote:
>>>> +void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
> Drop the unlikely, KVM should not speculate on the guest configuration or underlying
> hardware.

In general unlikely() can still be a good idea if you have a fast path 
vs. a slow path; the extra cost of a branch will be much more visible on 
the fast path.  That said the compiler should already be doing that.

>  the Pros:
>   - Super easy to implement for KVM.
>   - Automatically avoids saving and restoring this data when the vmexit
>     is handled within KVM.
>
>  the Cons:
>   - Unnecessarily restores XFEATURE_CET_KERNEL when switching to
>     non-KVM task's userspace.
>   - Forces allocating space for this state on all tasks, whether or not
>     they use KVM, and with likely zero users today and the near future.
>   - Complicates the FPU optimization thinking by including things that
>     can have no affect on userspace in the FPU

I'm not sure if Linux will ever use XFEATURE_CET_KERNEL.  Linux does not 
use MSR_IA32_PL{1,2}_SSP; MSR_IA32_PL0_SSP probably would be per-CPU but 
it is not used while in ring 0 (except for SETSSBSY) and the restore can 
be delayed until return to userspace.  It is not unlike the SYSCALL MSRs.

So I would treat the bit similar to the dynamic features even if it's 
not guarded by XFD, for example

#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
#define XFEATURE_MASK_USER_OPTIONAL \
	(XFEATURE_MASK_DYNAMIC | XFEATURE_MASK_CET_KERNEL)

where XFEATURE_MASK_USER_DYNAMIC is used for xfd-related tasks but 
everything else uses XFEATURE_MASK_USER_OPTIONAL.

Then you'd enable the feature by hand when allocating the guest fpstate.

> Especially because another big negative is that not utilizing XSTATE bleeds into
> KVM's ABI.  Userspace has to be told to manually save+restore MSRs instead of just
> letting KVM_{G,S}ET_XSAVE handle the state.  And that will create a bit of a
> snafu if Linux does gain support for SSS.

I don't think this matters, we don't have any MSRs in KVM_GET/SET_XSAVE 
and in fact we can't even add them since the uABI uses the non-compacted 
format.  MSRs should be retrieved and set via KVM_GET/SET_MSR and 
userspace will learn about the index automatically via 
KVM_GET_MSR_INDEX_LIST.

Paolo
Yang, Weijiang Aug. 9, 2023, 2:39 a.m. UTC | #6
On 8/5/2023 4:45 AM, Sean Christopherson wrote:
> On Fri, Aug 04, 2023, Weijiang Yang wrote:
>> On 8/3/2023 7:15 PM, Chao Gao wrote:
>>> On Thu, Aug 03, 2023 at 12:27:22AM -0400, Yang Weijiang wrote:
>>>> +void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
> Drop the unlikely, KVM should not speculate on the guest configuration or underlying
> hardware.
OK.
>>>> +		rdmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
>>>> +		rdmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
>>>> +		rdmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
>>>> +		/*
>>>> +		 * Omit reset to host PL{1,2}_SSP because Linux will never use
>>>> +		 * these MSRs.
>>>> +		 */
>>>> +		wrmsrl(MSR_IA32_PL0_SSP, 0);
>>> This wrmsrl() can be dropped because host doesn't support SSS yet.
>> Frankly speaking, I want to remove this line of code. But that would mess up the MSR
>> on host side, i.e., from host perspective, the MSRs could be filled with garbage data,
>> and looks awful.
> So?  :-)
>
> That's the case for all of the MSRs that KVM defers restoring until the host
> returns to userspace, i.e. running in the host with bogus values in hardware is
> nothing new.
CET PL{0,1,2}_SSP are a bit different from other MSRs, the latter will be reloaded with host values
at some points after VM-Exit, but the CET MSRs are "leaked" and never be handled anywhere.
>
> And as I mentioned in the other thread regarding the assertion that SSS isn't
> enabled in the host, sanitizing hardware values for something that should never
> be consumed is a fools errand.
>
>> Anyway, I can remove it.
> Yes please, though it may be a moot point.
>
>>>> +	}
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(save_cet_supervisor_ssp);
>>>> +
>>>> +void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
>>> ditto
>> Below is to reload guest supervisor SSPs instead of resetting host ones.
>>>> +		wrmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
>>>> +		wrmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
>>>> +		wrmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
> Pulling back in the justification from v3:
>
>   the Pros:
>    - Super easy to implement for KVM.
>    - Automatically avoids saving and restoring this data when the vmexit
>      is handled within KVM.
>
>   the Cons:
>    - Unnecessarily restores XFEATURE_CET_KERNEL when switching to
>      non-KVM task's userspace.
>    - Forces allocating space for this state on all tasks, whether or not
>      they use KVM, and with likely zero users today and the near future.
>    - Complicates the FPU optimization thinking by including things that
>      can have no affect on userspace in the FPU
>
> IMO the pros far outweigh the cons.  3x RDMSR and 3x WRMSR when loading host/guest
> state is non-trivial overhead.  That can be mitigated, e.g. by utilizing the
> user return MSR framework, but it's still unpalatable.  It's unlikely many guests
> will SSS in the *near* future, but I don't want to end up with code that performs
> poorly in the future and needs to be rewritten.
> Especially because another big negative is that not utilizing XSTATE bleeds into
> KVM's ABI.  Userspace has to be told to manually save+restore MSRs instead of just
> letting KVM_{G,S}ET_XSAVE handle the state.  And that will create a bit of a
> snafu if Linux does gain support for SSS.
>
> On the other hand, the extra per-task memory is all of 24 bytes.  AFAICT, there's
> literally zero effect on guest XSTATE allocations because those are vmalloc'd and
> thus rounded up to PAGE_SIZE, i.e. the next 4KiB.  And XSTATE needs to be 64-byte
> aligned, so the 24 bytes is only actually meaningful if the current size is within
> 24 bytes of the next cahce line.  And the "current" size is variable depending on
> which features are present and enabled, i.e. it's a roll of the dice as to whether
> or not using XSTATE for supervisor CET would actually increase memory usage.  And
> _if_ it does increase memory consumption, I have a very hard time believing an
> extra 64 bytes in the worst case scenario is a dealbreaker.
>
> If the performance is a concern, i.e. we don't want to eat saving/restoring the
> MSRs when switching to/from host FPU context, then I *think* that's simply a matter
> of keeping guest state resident when loading non-guest FPU state.
>
> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> index 1015af1ae562..8e7599e3b923 100644
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -167,6 +167,16 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
>                   */
>                  xfd_update_state(fpstate);
>   
> +               /*
> +                * Leave supervisor CET state as-is when loading host state
> +                * (kernel or userspace).  Supervisor CET state is managed via
> +                * XSTATE for KVM guests, but the host never consumes said
> +                * state (doesn't support supervisor shadow stacks), i.e. it's
> +                * safe to keep guest state loaded into hardware.
> +                */
> +               if (!fpstate->is_guest)
> +                       mask &= ~XFEATURE_MASK_CET_KERNEL;
> +
>                  /*
>                   * Restoring state always needs to modify all features
>                   * which are in @mask even if the current task cannot use
>
>
> So unless I'm missing something, NAK to this approach, at least not without trying
> the kernel FPU approach, i.e. I want somelike like to PeterZ or tglx to actually
> full on NAK the kernel approach before we consider shoving a hack into KVM.
I will discuss it with the stakeholders, and get back to this when it's clear. Thanks!
Yang, Weijiang Aug. 9, 2023, 2:51 a.m. UTC | #7
On 8/5/2023 5:32 AM, Paolo Bonzini wrote:
> On 8/4/23 22:45, Sean Christopherson wrote:
>>>>> +void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
>> Drop the unlikely, KVM should not speculate on the guest configuration or underlying
>> hardware.
>
> In general unlikely() can still be a good idea if you have a fast path vs. a slow path; the extra cost of a branch will be much more visible on the fast path.  That said the compiler should already be doing that.
This is my original assumption that compiler can help do some level of optimization with the modifier. Thanks!
>>  the Pros:
>>   - Super easy to implement for KVM.
>>   - Automatically avoids saving and restoring this data when the vmexit
>>     is handled within KVM.
>>
>>  the Cons:
>>   - Unnecessarily restores XFEATURE_CET_KERNEL when switching to
>>     non-KVM task's userspace.
>>   - Forces allocating space for this state on all tasks, whether or not
>>     they use KVM, and with likely zero users today and the near future.
>>   - Complicates the FPU optimization thinking by including things that
>>     can have no affect on userspace in the FPU
>
> I'm not sure if Linux will ever use XFEATURE_CET_KERNEL.  Linux does not use MSR_IA32_PL{1,2}_SSP; MSR_IA32_PL0_SSP probably would be per-CPU but it is not used while in ring 0 (except for SETSSBSY) and the restore can be delayed until return to userspace.  It is not unlike the SYSCALL MSRs.
>
> So I would treat the bit similar to the dynamic features even if it's not guarded by XFD, for example
>
> #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
> #define XFEATURE_MASK_USER_OPTIONAL \
>     (XFEATURE_MASK_DYNAMIC | XFEATURE_MASK_CET_KERNEL)
>
> where XFEATURE_MASK_USER_DYNAMIC is used for xfd-related tasks but everything else uses XFEATURE_MASK_USER_OPTIONAL.
>
> Then you'd enable the feature by hand when allocating the guest fpstate.
Yes, this is another way to optimize the kernel-managed solution, I'll investigate it, thanks!
>> Especially because another big negative is that not utilizing XSTATE bleeds into
>> KVM's ABI.  Userspace has to be told to manually save+restore MSRs instead of just
>> letting KVM_{G,S}ET_XSAVE handle the state.  And that will create a bit of a
>> snafu if Linux does gain support for SSS.
>
> I don't think this matters, we don't have any MSRs in KVM_GET/SET_XSAVE and in fact we can't even add them since the uABI uses the non-compacted format.  MSRs should be retrieved and set via KVM_GET/SET_MSR and userspace will learn about the index automatically via KVM_GET_MSR_INDEX_LIST.
> Paolo
>
Yang, Weijiang Aug. 10, 2023, 9:29 a.m. UTC | #8
Hi, Dave, Thomas and Peter,

I would like to connect you to this discussion loop about CET supervisor states support
in kernel so that you may directly talk to KVM maintainers, thanks!
The discussion background/problem/solution as below:

Background:
When KVM enumerates shadow stack support for guest in CPUID(0x7, 0).ECX[bit7],
architecturally it claims both SS user and supervisor mode are supported. Although
the latter is not supported in Linux, but in virtualization world, the guest OS could
be non-Linux system, so KVM supervisor state support is necessary in this case.
Two solutions are on the table:
1) Enable CET supervisor support in Linux kernel like user mode support.
2) Enable support in KVM domain.

Problem:
The Pros/Cons for each solution(my individual thoughts):
In kernel solution:
Pros:
- Avoid saving/restoring 3 supervisor MSRs(PL{0,1,2}_SSP) at vCPU execution path.
- Easy for KVM to manage guest CET xstate bits for guest.
Cons:
- Unnecessary supervisor state xsaves/xrstors operation for non-vCPU thread.
- Potentially extra storage space(24 bytes) for thread context.

KVM solution:
Pros:
- Not touch current kernel FPU management framework and logic.
- No extra space and operation for non-vCPU thread.
Cons:
- Manually saving/restoring 3 supervisor MSRs is a performance burden to KVM.
- It looks more like a hack method for KVM, and some handling logic seems a bit awkward.

KVM maintainers request it supported in kernel instead of KVM to make things streamlined.

We'd like to hear your voice of in kernel solution, favor vs. objection?
Any important points we omitted?
Appreciated!

Solution:
Below is the supervisor state enabling patch in kernel, not include Sean's suggestion below.
=====================================================================
 From 53f9890c76e4163a0fead3afe198d0c17136120e Mon Sep 17 00:00:00 2001
From: Yang Weijiang <weijiang.yang@intel.com>
Date: Thu, 10 Aug 2023 00:10:55 -0400
Subject: [RFC PATCH] x86: fpu: Enable CET supervisor state support

Enable CET supervisor state support within current FPU states management
framework. CET shadow stack feature is enabled with CPUID(0x7,0).ECX[bit7],
if the bit is set, archchtectually both user and supervisor SHSTK should
be supported, i.e., when KVM enumerates the feature bit to guest, it claims
both modes are supported by the VMM.

The user mode SHSTK XSAVE states comprise of IA32_{U_CET,PL3_SSP},
and supervisor mode states inlude IA32_PL{0,1,2}_SSP. The xstate support
for the former is enclosed in native user mode shadow stack series, but
the latter is not supported yet.

KVM is going to support guest shadow stack which means guest's supervisor
shadow stack states should also be well managed by VMM.

To make KVM fully support guest shadow stack states, there're at least two
approaches, one is to enable supervisor xstate bit in kernel, which is
straightforward and can fit well in all cases. The alternative is to enable
the support within KVM domain and manually save/restore the states per vCPU
thread, i.e., introduce addtional WRMSR/RDMSR at vCPU execution path.

This patch doesn't optimize CET supervisor state management, just follow
the implementation of user mode state support.

Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
  arch/x86/include/asm/fpu/types.h  | 14 ++++++++++++--
  arch/x86/include/asm/fpu/xstate.h |  6 +++---
  arch/x86/kernel/fpu/xstate.c      |  6 +++++-
  3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb810074f1e7..c6fd13a17205 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -116,7 +116,7 @@ enum xfeature {
         XFEATURE_PKRU,
         XFEATURE_PASID,
         XFEATURE_CET_USER,
-       XFEATURE_CET_KERNEL_UNUSED,
+       XFEATURE_CET_KERNEL,
         XFEATURE_RSRVD_COMP_13,
         XFEATURE_RSRVD_COMP_14,
         XFEATURE_LBR,
@@ -139,7 +139,7 @@ enum xfeature {
  #define XFEATURE_MASK_PKRU             (1 << XFEATURE_PKRU)
  #define XFEATURE_MASK_PASID            (1 << XFEATURE_PASID)
  #define XFEATURE_MASK_CET_USER         (1 << XFEATURE_CET_USER)
-#define XFEATURE_MASK_CET_KERNEL       (1 << XFEATURE_CET_KERNEL_UNUSED)
+#define XFEATURE_MASK_CET_KERNEL       (1 << XFEATURE_CET_KERNEL)
  #define XFEATURE_MASK_LBR              (1 << XFEATURE_LBR)
  #define XFEATURE_MASK_XTILE_CFG                (1 << XFEATURE_XTILE_CFG)
  #define XFEATURE_MASK_XTILE_DATA       (1 << XFEATURE_XTILE_DATA)
@@ -264,6 +264,16 @@ struct cet_user_state {
         u64 user_ssp;
  };

+/*
+ * State component 12 is Control-flow Enforcement supervisor states
+ */
+struct cet_supervisor_state {
+       /* supervisor ssp pointers  */
+       u64 pl0_ssp;
+       u64 pl1_ssp;
+       u64 pl2_ssp;
+};
+
  /*
   * State component 15: Architectural LBR configuration state.
   * The size of Arch LBR state depends on the number of LBRs (lbr_depth).
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index d4427b88ee12..3b4a038d3c57 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -51,7 +51,8 @@

  /* All currently supported supervisor features */
  #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
-                                           XFEATURE_MASK_CET_USER)
+                                           XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL)

  /*
   * A supervisor state component may not always contain valuable information,
@@ -78,8 +79,7 @@
   * Unsupported supervisor features. When a supervisor feature in this mask is
   * supported in the future, move it to the supported supervisor feature mask.
   */
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
- XFEATURE_MASK_CET_KERNEL)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)

  /* All supervisor states including supported and unsupported states. */
  #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4fa4751912d9..fc346c7c6916 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -51,7 +51,7 @@ static const char *xfeature_names[] =
         "Protection Keys User registers",
         "PASID state",
         "Control-flow User registers",
-       "Control-flow Kernel registers (unused)",
+       "Control-flow Kernel registers",
         "unknown xstate feature",
         "unknown xstate feature",
         "unknown xstate feature",
@@ -74,6 +74,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
         [XFEATURE_PKRU]                         = X86_FEATURE_PKU,
         [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
         [XFEATURE_CET_USER]                     = X86_FEATURE_SHSTK,
+       [XFEATURE_CET_KERNEL]                   = X86_FEATURE_SHSTK,
         [XFEATURE_XTILE_CFG]                    = X86_FEATURE_AMX_TILE,
         [XFEATURE_XTILE_DATA]                   = X86_FEATURE_AMX_TILE,
  };
@@ -278,6 +279,7 @@ static void __init print_xstate_features(void)
         print_xstate_feature(XFEATURE_MASK_PKRU);
         print_xstate_feature(XFEATURE_MASK_PASID);
         print_xstate_feature(XFEATURE_MASK_CET_USER);
+       print_xstate_feature(XFEATURE_MASK_CET_KERNEL);
         print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
         print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
  }
@@ -347,6 +349,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
          XFEATURE_MASK_BNDCSR |                 \
          XFEATURE_MASK_PASID |                  \
          XFEATURE_MASK_CET_USER |               \
+        XFEATURE_MASK_CET_KERNEL |             \
          XFEATURE_MASK_XTILE)

  /*
@@ -547,6 +550,7 @@ static bool __init check_xstate_against_struct(int nr)
         case XFEATURE_PASID:      return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
         case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
         case XFEATURE_CET_USER:   return XCHECK_SZ(sz, nr, struct cet_user_state);
+       case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
         case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
         default:
                 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
--
2.27.0




On 8/5/2023 4:45 AM, Sean Christopherson wrote:
> [...]
> Pulling back in the justification from v3:
>
>   the Pros:
>    - Super easy to implement for KVM.
>    - Automatically avoids saving and restoring this data when the vmexit
>      is handled within KVM.
>
>   the Cons:
>    - Unnecessarily restores XFEATURE_CET_KERNEL when switching to
>      non-KVM task's userspace.
>    - Forces allocating space for this state on all tasks, whether or not
>      they use KVM, and with likely zero users today and the near future.
>    - Complicates the FPU optimization thinking by including things that
>      can have no affect on userspace in the FPU
>
> IMO the pros far outweigh the cons.  3x RDMSR and 3x WRMSR when loading host/guest
> state is non-trivial overhead.  That can be mitigated, e.g. by utilizing the
> user return MSR framework, but it's still unpalatable.  It's unlikely many guests
> will SSS in the *near* future, but I don't want to end up with code that performs
> poorly in the future and needs to be rewritten.
>
> Especially because another big negative is that not utilizing XSTATE bleeds into
> KVM's ABI.  Userspace has to be told to manually save+restore MSRs instead of just
> letting KVM_{G,S}ET_XSAVE handle the state.  And that will create a bit of a
> snafu if Linux does gain support for SSS.
>
> On the other hand, the extra per-task memory is all of 24 bytes.  AFAICT, there's
> literally zero effect on guest XSTATE allocations because those are vmalloc'd and
> thus rounded up to PAGE_SIZE, i.e. the next 4KiB.  And XSTATE needs to be 64-byte
> aligned, so the 24 bytes is only actually meaningful if the current size is within
> 24 bytes of the next cahce line.  And the "current" size is variable depending on
> which features are present and enabled, i.e. it's a roll of the dice as to whether
> or not using XSTATE for supervisor CET would actually increase memory usage.  And
> _if_ it does increase memory consumption, I have a very hard time believing an
> extra 64 bytes in the worst case scenario is a dealbreaker.
>
> If the performance is a concern, i.e. we don't want to eat saving/restoring the
> MSRs when switching to/from host FPU context, then I *think* that's simply a matter
> of keeping guest state resident when loading non-guest FPU state.
>
> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> index 1015af1ae562..8e7599e3b923 100644
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -167,6 +167,16 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
>                   */
>                  xfd_update_state(fpstate);
>   
> +               /*
> +                * Leave supervisor CET state as-is when loading host state
> +                * (kernel or userspace).  Supervisor CET state is managed via
> +                * XSTATE for KVM guests, but the host never consumes said
> +                * state (doesn't support supervisor shadow stacks), i.e. it's
> +                * safe to keep guest state loaded into hardware.
> +                */
> +               if (!fpstate->is_guest)
> +                       mask &= ~XFEATURE_MASK_CET_KERNEL;
> +
>                  /*
>                   * Restoring state always needs to modify all features
>                   * which are in @mask even if the current task cannot use
>
>
> So unless I'm missing something, NAK to this approach, at least not without trying
> the kernel FPU approach, i.e. I want somelike like to PeterZ or tglx to actually
> full on NAK the kernel approach before we consider shoving a hack into KVM.
Dave Hansen Aug. 10, 2023, 2:29 p.m. UTC | #9
On 8/10/23 02:29, Yang, Weijiang wrote:
...
> When KVM enumerates shadow stack support for guest in CPUID(0x7, 
> 0).ECX[bit7], architecturally it claims both SS user and supervisor
> mode are supported. Although the latter is not supported in Linux,
> but in virtualization world, the guest OS could be non-Linux system,
> so KVM supervisor state support is necessary in this case.

What actual OSes need this support?

> Two solutions are on the table:
> 1) Enable CET supervisor support in Linux kernel like user mode support.

We _will_ do this eventually, but not until FRED is merged.  The core
kernel also probably won't be managing the MSRs on non-FRED hardware.

I think what you're really talking about here is that the kernel would
enable CET_S XSAVE state management so that CET_S state could be managed
by the core kernel's FPU code.

That is, frankly, *NOT* like the user mode support at all.

> 2) Enable support in KVM domain.
> 
> Problem:
> The Pros/Cons for each solution(my individual thoughts):
> In kernel solution:
> Pros:
> - Avoid saving/restoring 3 supervisor MSRs(PL{0,1,2}_SSP) at vCPU
>   execution path.
> - Easy for KVM to manage guest CET xstate bits for guest.
> Cons:
> - Unnecessary supervisor state xsaves/xrstors operation for non-vCPU
>   thread.

What operations would be unnecessary exactly?

> - Potentially extra storage space(24 bytes) for thread context.

Yep.  This one is pretty unavoidable.  But, we've kept MPX around in
this state for a looooooong time and nobody really seemed to care.

> KVM solution:
> Pros:
> - Not touch current kernel FPU management framework and logic.
> - No extra space and operation for non-vCPU thread.
> Cons:
> - Manually saving/restoring 3 supervisor MSRs is a performance burden to
>   KVM.
> - It looks more like a hack method for KVM, and some handling logic
>   seems a bit awkward.

In a perfect world, we'd just allocate space for CET_S in the KVM
fpstates.  The core kernel fpstates would have
XSTATE_BV[13]==XCOMP_BV[13]==0.  An XRSTOR of the core kernel fpstates
would just set CET_S to its init state.

But I suspect that would be too much work to implement in practice.  It
would be akin to a new lesser kind of dynamic xstate, one that didn't
interact with XFD and *NEVER* gets allocated in the core kernel
fpstates, even on demand.

I want to hear more about who is going to use CET_S state under KVM in
practice.  I don't want to touch it if this is some kind of purely
academic exercise.  But it's also silly to hack some kind of temporary
solution into KVM that we'll rip out in a year when real supervisor
shadow stack support comes along.

If it's actually necessary, we should probably just eat the 24 bytes in
the fpstates, flip the bit in IA32_XSS and move on.  There shouldn't be
any other meaningful impact to the core kernel.
Paolo Bonzini Aug. 10, 2023, 3:15 p.m. UTC | #10
On 8/10/23 16:29, Dave Hansen wrote:
> On 8/10/23 02:29, Yang, Weijiang wrote:
> ...
>> When KVM enumerates shadow stack support for guest in CPUID(0x7,
>> 0).ECX[bit7], architecturally it claims both SS user and supervisor
>> mode are supported. Although the latter is not supported in Linux,
>> but in virtualization world, the guest OS could be non-Linux system,
>> so KVM supervisor state support is necessary in this case.
> 
> What actual OSes need this support?

I think Xen could use it when running nested.  But KVM cannot expose 
support for CET in CPUID, and at the same time fake support for 
MSR_IA32_PL{0,1,2}_SSP (e.g. inject a #GP if it's ever written to a 
nonzero value).

I suppose we could invent our own paravirtualized CPUID bit for 
"supervisor IBT works but supervisor SHSTK doesn't".  Linux could check 
that but I don't think it's a good idea.

So... do, or do not.  There is no try. :)

>> Two solutions are on the table:
>> 1) Enable CET supervisor support in Linux kernel like user mode support.
> 
> We _will_ do this eventually, but not until FRED is merged.  The core
> kernel also probably won't be managing the MSRs on non-FRED hardware.
> 
> I think what you're really talking about here is that the kernel would
> enable CET_S XSAVE state management so that CET_S state could be managed
> by the core kernel's FPU code.

Yes, I understand it that way too.

> That is, frankly, *NOT* like the user mode support at all.

I agree.

>> 2) Enable support in KVM domain.
>>
>> Problem:
>> The Pros/Cons for each solution(my individual thoughts):
>> In kernel solution:
>> Pros:
>> - Avoid saving/restoring 3 supervisor MSRs(PL{0,1,2}_SSP) at vCPU
>>    execution path.
>> - Easy for KVM to manage guest CET xstate bits for guest.
>> Cons:
>> - Unnecessary supervisor state xsaves/xrstors operation for non-vCPU
>>    thread.
> 
> What operations would be unnecessary exactly?

Saving/restoring PL0/1/2_SSP when switching from one usermode task's 
fpstate to another.

>> KVM solution:
>> Pros:
>> - Not touch current kernel FPU management framework and logic.
>> - No extra space and operation for non-vCPU thread.
>> Cons:
>> - Manually saving/restoring 3 supervisor MSRs is a performance burden to
>>    KVM.
>> - It looks more like a hack method for KVM, and some handling logic
>>    seems a bit awkward.
> 
> In a perfect world, we'd just allocate space for CET_S in the KVM
> fpstates.  The core kernel fpstates would have
> XSTATE_BV[13]==XCOMP_BV[13]==0.  An XRSTOR of the core kernel fpstates
> would just set CET_S to its init state.

Yep.  I don't think it's a lot of work to implement.  The basic idea as 
you point out below is something like

#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
#define XFEATURE_MASK_USER_OPTIONAL \
     (XFEATURE_MASK_DYNAMIC | XFEATURE_MASK_CET_KERNEL)

where XFEATURE_MASK_USER_DYNAMIC is used for xfd-related tasks 
(including the ARCH_GET_XCOMP_SUPP arch_prctl) but everything else uses 
XFEATURE_MASK_USER_OPTIONAL.

KVM would enable the feature by hand when allocating the guest fpstate. 
Disabled features would be cleared from EDX:EAX when calling 
XSAVE/XSAVEC/XSAVES.

> But I suspect that would be too much work to implement in practice.  It
> would be akin to a new lesser kind of dynamic xstate, one that didn't
> interact with XFD and *NEVER* gets allocated in the core kernel
> fpstates, even on demand.
> 
> I want to hear more about who is going to use CET_S state under KVM in
> practice.  I don't want to touch it if this is some kind of purely
> academic exercise.  But it's also silly to hack some kind of temporary
> solution into KVM that we'll rip out in a year when real supervisor
> shadow stack support comes along.
> 
> If it's actually necessary, we should probably just eat the 24 bytes in
> the fpstates, flip the bit in IA32_XSS and move on.  There shouldn't be
> any other meaningful impact to the core kernel.

If that's good to you, why not.

Paolo
Sean Christopherson Aug. 10, 2023, 3:37 p.m. UTC | #11
On Thu, Aug 10, 2023, Paolo Bonzini wrote:
> On 8/10/23 16:29, Dave Hansen wrote:
> > On 8/10/23 02:29, Yang, Weijiang wrote:
> > ...
> > > When KVM enumerates shadow stack support for guest in CPUID(0x7,
> > > 0).ECX[bit7], architecturally it claims both SS user and supervisor
> > > mode are supported. Although the latter is not supported in Linux,
> > > but in virtualization world, the guest OS could be non-Linux system,
> > > so KVM supervisor state support is necessary in this case.
> > 
> > What actual OSes need this support?
> 
> I think Xen could use it when running nested.  But KVM cannot expose support
> for CET in CPUID, and at the same time fake support for
> MSR_IA32_PL{0,1,2}_SSP (e.g. inject a #GP if it's ever written to a nonzero
> value).
> 
> I suppose we could invent our own paravirtualized CPUID bit for "supervisor
> IBT works but supervisor SHSTK doesn't".  Linux could check that but I don't
> think it's a good idea.
> 
> So... do, or do not.  There is no try. :)

> > I want to hear more about who is going to use CET_S state under KVM in
> > practice.  I don't want to touch it if this is some kind of purely
> > academic exercise.  But it's also silly to hack some kind of temporary
> > solution into KVM that we'll rip out in a year when real supervisor
> > shadow stack support comes along.

As Paolo alluded to, this is about KVM faithfully emulating the architecture.
There is no combination of CPUID bits that allows KVM to advertise SHSTK for
userspace without advertising SHSTK for supervisor.

Whether or not there are any users in the short term is unfortunately irrelevant
from KVM's perspective.
Yang, Weijiang Aug. 11, 2023, 3:03 a.m. UTC | #12
On 8/10/2023 11:15 PM, Paolo Bonzini wrote:
> On 8/10/23 16:29, Dave Hansen wrote:
>> On 8/10/23 02:29, Yang, Weijiang wrote:
>> ...
>>> When KVM enumerates shadow stack support for guest in CPUID(0x7,
>>> 0).ECX[bit7], architecturally it claims both SS user and supervisor
>>> mode are supported. Although the latter is not supported in Linux,
>>> but in virtualization world, the guest OS could be non-Linux system,
>>> so KVM supervisor state support is necessary in this case.
>>
>> What actual OSes need this support?
>
> I think Xen could use it when running nested.  But KVM cannot expose support for CET in CPUID, and at the same time fake support for MSR_IA32_PL{0,1,2}_SSP (e.g. inject a #GP if it's ever written to a nonzero value).
>
> I suppose we could invent our own paravirtualized CPUID bit for "supervisor IBT works but supervisor SHSTK doesn't".  Linux could check that but I don't think it's a good idea.
>
> So... do, or do not.  There is no try. :)
>
>>> Two solutions are on the table:
>>> 1) Enable CET supervisor support in Linux kernel like user mode support.
>>
>> We _will_ do this eventually, but not until FRED is merged.  The core
>> kernel also probably won't be managing the MSRs on non-FRED hardware.
>>
>> I think what you're really talking about here is that the kernel would
>> enable CET_S XSAVE state management so that CET_S state could be managed
>> by the core kernel's FPU code.
>
> Yes, I understand it that way too.

Sorry for confusion, I missed the word "state" here.

>> That is, frankly, *NOT* like the user mode support at all.
>
> I agree.
>
>>> 2) Enable support in KVM domain.
>>>
>>> Problem:
>>> The Pros/Cons for each solution(my individual thoughts):
>>> In kernel solution:
>>> Pros:
>>> - Avoid saving/restoring 3 supervisor MSRs(PL{0,1,2}_SSP) at vCPU
>>>    execution path.
>>> - Easy for KVM to manage guest CET xstate bits for guest.
>>> Cons:
>>> - Unnecessary supervisor state xsaves/xrstors operation for non-vCPU
>>>    thread.
>>
>> What operations would be unnecessary exactly?
>
> Saving/restoring PL0/1/2_SSP when switching from one usermode task's fpstate to another.
>
>>> KVM solution:
>>> Pros:
>>> - Not touch current kernel FPU management framework and logic.
>>> - No extra space and operation for non-vCPU thread.
>>> Cons:
>>> - Manually saving/restoring 3 supervisor MSRs is a performance burden to
>>>    KVM.
>>> - It looks more like a hack method for KVM, and some handling logic
>>>    seems a bit awkward.
>>
>> In a perfect world, we'd just allocate space for CET_S in the KVM
>> fpstates.  The core kernel fpstates would have
>> XSTATE_BV[13]==XCOMP_BV[13]==0.  An XRSTOR of the core kernel fpstates
>> would just set CET_S to its init state.
>
> Yep.  I don't think it's a lot of work to implement.  The basic idea as you point out below is something like
>
> #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
> #define XFEATURE_MASK_USER_OPTIONAL \
>     (XFEATURE_MASK_DYNAMIC | XFEATURE_MASK_CET_KERNEL)
>
> where XFEATURE_MASK_USER_DYNAMIC is used for xfd-related tasks (including the ARCH_GET_XCOMP_SUPP arch_prctl) but everything else uses XFEATURE_MASK_USER_OPTIONAL.
>
> KVM would enable the feature by hand when allocating the guest fpstate. Disabled features would be cleared from EDX:EAX when calling XSAVE/XSAVEC/XSAVES.

OK, I'll move ahead in that direction.

>> But I suspect that would be too much work to implement in practice.  It
>> would be akin to a new lesser kind of dynamic xstate, one that didn't
>> interact with XFD and *NEVER* gets allocated in the core kernel
>> fpstates, even on demand.
>>
>> I want to hear more about who is going to use CET_S state under KVM in
>> practice.  I don't want to touch it if this is some kind of purely
>> academic exercise.  But it's also silly to hack some kind of temporary
>> solution into KVM that we'll rip out in a year when real supervisor
>> shadow stack support comes along.
>>
>> If it's actually necessary, we should probably just eat the 24 bytes in
>> the fpstates, flip the bit in IA32_XSS and move on.  There shouldn't be
>> any other meaningful impact to the core kernel.
>
> If that's good to you, why not.

Thanks to all of you for quickly helping out!

> Paolo
>
Dave Hansen Aug. 28, 2023, 9 p.m. UTC | #13
On 8/10/23 08:15, Paolo Bonzini wrote:
> On 8/10/23 16:29, Dave Hansen wrote:
>> What actual OSes need this support?
> 
> I think Xen could use it when running nested.  But KVM cannot expose
> support for CET in CPUID, and at the same time fake support for
> MSR_IA32_PL{0,1,2}_SSP (e.g. inject a #GP if it's ever written to a
> nonzero value).
> 
> I suppose we could invent our own paravirtualized CPUID bit for
> "supervisor IBT works but supervisor SHSTK doesn't".  Linux could check
> that but I don't think it's a good idea.
> 
> So... do, or do not.  There is no try. :)

Ahh, that makes sense.  This is needed for implementing the
*architecture*, not because some OS actually wants to _do_ it.

...
>> In a perfect world, we'd just allocate space for CET_S in the KVM
>> fpstates.  The core kernel fpstates would have
>> XSTATE_BV[13]==XCOMP_BV[13]==0.  An XRSTOR of the core kernel fpstates
>> would just set CET_S to its init state.
> 
> Yep.  I don't think it's a lot of work to implement.  The basic idea as
> you point out below is something like
> 
> #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
> #define XFEATURE_MASK_USER_OPTIONAL \
>     (XFEATURE_MASK_DYNAMIC | XFEATURE_MASK_CET_KERNEL)
> 
> where XFEATURE_MASK_USER_DYNAMIC is used for xfd-related tasks
> (including the ARCH_GET_XCOMP_SUPP arch_prctl) but everything else uses
> XFEATURE_MASK_USER_OPTIONAL.
> 
> KVM would enable the feature by hand when allocating the guest fpstate.
> Disabled features would be cleared from EDX:EAX when calling
> XSAVE/XSAVEC/XSAVES.

OK, so let's _try_ this perfect-world solution.  KVM fpstates get
fpstate->xfeatures[13] set, but no normal task fpstates have that bit
set.  Most of the infrastructure should be there to handle this without
much fuss because it _should_ be looking at generic things like
fpstate->size and fpstate->features.

But who knows what trouble this will turn up.  It could get nasty and
not worth it, but we should at least try it.
Yang, Weijiang Aug. 29, 2023, 7:05 a.m. UTC | #14
On 8/29/2023 5:00 AM, Dave Hansen wrote:
> On 8/10/23 08:15, Paolo Bonzini wrote:
>> On 8/10/23 16:29, Dave Hansen wrote:
>>> What actual OSes need this support?
>> I think Xen could use it when running nested.  But KVM cannot expose
>> support for CET in CPUID, and at the same time fake support for
>> MSR_IA32_PL{0,1,2}_SSP (e.g. inject a #GP if it's ever written to a
>> nonzero value).
>>
>> I suppose we could invent our own paravirtualized CPUID bit for
>> "supervisor IBT works but supervisor SHSTK doesn't".  Linux could check
>> that but I don't think it's a good idea.
>>
>> So... do, or do not.  There is no try. :)
> Ahh, that makes sense.  This is needed for implementing the
> *architecture*, not because some OS actually wants to _do_ it.
>
> ...
>>> In a perfect world, we'd just allocate space for CET_S in the KVM
>>> fpstates.  The core kernel fpstates would have
>>> XSTATE_BV[13]==XCOMP_BV[13]==0.  An XRSTOR of the core kernel fpstates
>>> would just set CET_S to its init state.
>> Yep.  I don't think it's a lot of work to implement.  The basic idea as
>> you point out below is something like
>>
>> #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
>> #define XFEATURE_MASK_USER_OPTIONAL \
>>      (XFEATURE_MASK_DYNAMIC | XFEATURE_MASK_CET_KERNEL)
>>
>> where XFEATURE_MASK_USER_DYNAMIC is used for xfd-related tasks
>> (including the ARCH_GET_XCOMP_SUPP arch_prctl) but everything else uses
>> XFEATURE_MASK_USER_OPTIONAL.
>>
>> KVM would enable the feature by hand when allocating the guest fpstate.
>> Disabled features would be cleared from EDX:EAX when calling
>> XSAVE/XSAVEC/XSAVES.
> OK, so let's _try_ this perfect-world solution.  KVM fpstates get
> fpstate->xfeatures[13] set, but no normal task fpstates have that bit
> set.  Most of the infrastructure should be there to handle this without
> much fuss because it _should_ be looking at generic things like
> fpstate->size and fpstate->features.
>
> But who knows what trouble this will turn up.  It could get nasty and
> not worth it, but we should at least try it.

Thanks Dave for clarity!
I'm moving in that direction...
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 20bbcd95511f..69cbc9d9b277 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -805,6 +805,7 @@  struct kvm_vcpu_arch {
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u64 guest_supported_xss;
+	u64 cet_s_ssp[3];
 
 	struct kvm_pio_request pio;
 	void *pio_data;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b1658c0de847..b221a663de4c 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -232,4 +232,15 @@  static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
 	return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
 }
 
+/*
+ * FIXME: When the "KVM-governed" enabling patchset is merge, rebase this
+ * series on top of that and add a new patch for CET to replace this helper
+ * with the qualified one.
+ */
+static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
+					  unsigned int feature)
+{
+	return kvm_cpu_cap_has(feature) && guest_cpuid_has(vcpu, feature);
+}
+
 #endif
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1bc0936bbd51..8652e86fbfb2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1515,11 +1515,13 @@  static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	if (likely(tsc_aux_uret_slot >= 0))
 		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
 
+	reload_cet_supervisor_ssp(vcpu);
 	svm->guest_state_loaded = true;
 }
 
 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
 {
+	save_cet_supervisor_ssp(vcpu);
 	to_svm(vcpu)->guest_state_loaded = false;
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c8d9870cfecb..6aa76124e81e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1323,6 +1323,7 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	gs_base = segment_base(gs_sel);
 #endif
 
+	reload_cet_supervisor_ssp(vcpu);
 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
 	vmx->guest_state_loaded = true;
 }
@@ -1362,6 +1363,7 @@  static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
 	load_fixmap_gdt(raw_smp_processor_id());
+	save_cet_supervisor_ssp(&vmx->vcpu);
 	vmx->guest_state_loaded = false;
 	vmx->guest_uret_msrs_loaded = false;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d68ef87fe007..5b63441fd2d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11128,6 +11128,31 @@  static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	trace_kvm_fpu(0);
 }
 
+void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
+		rdmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
+		rdmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
+		rdmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
+		/*
+		 * Omit reset to host PL{1,2}_SSP because Linux will never use
+		 * these MSRs.
+		 */
+		wrmsrl(MSR_IA32_PL0_SSP, 0);
+	}
+}
+EXPORT_SYMBOL_GPL(save_cet_supervisor_ssp);
+
+void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(guest_can_use(vcpu, X86_FEATURE_SHSTK))) {
+		wrmsrl(MSR_IA32_PL0_SSP, vcpu->arch.cet_s_ssp[0]);
+		wrmsrl(MSR_IA32_PL1_SSP, vcpu->arch.cet_s_ssp[1]);
+		wrmsrl(MSR_IA32_PL2_SSP, vcpu->arch.cet_s_ssp[2]);
+	}
+}
+EXPORT_SYMBOL_GPL(reload_cet_supervisor_ssp);
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
@@ -12133,6 +12158,7 @@  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vcpu->arch.cr3 = 0;
 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+	memset(vcpu->arch.cet_s_ssp, 0, sizeof(vcpu->arch.cet_s_ssp));
 
 	/*
 	 * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
@@ -12313,6 +12339,7 @@  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 		pmu->need_cleanup = true;
 		kvm_make_request(KVM_REQ_PMU, vcpu);
 	}
+
 	static_call(kvm_x86_sched_in)(vcpu, cpu);
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6e6292915f8c..c69fc027f5ec 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -501,6 +501,9 @@  static inline void kvm_machine_check(void)
 
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
+void save_cet_supervisor_ssp(struct kvm_vcpu *vcpu);
+void reload_cet_supervisor_ssp(struct kvm_vcpu *vcpu);
+
 int kvm_spec_ctrl_test_value(u64 value);
 bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,