
[v8,4/4] kvm: vmx: virtualize split lock detection

Message ID 20200414063129.133630-5-xiaoyao.li@intel.com (mailing list archive)
State New, archived
Series KVM: Add virtualization support of split lock detection

Commit Message

Xiaoyao Li April 14, 2020, 6:31 a.m. UTC
Because the TEST_CTRL MSR is per-core scoped, i.e., the sibling threads
in the same physical CPU core share the same MSR, only advertise split
lock detection to the guest when SMT is disabled or unsupported, for
simplicity.

1) When the host sld_state is sld_off, split lock detection is
   unsupported or disabled, so it cannot be exposed to the guest.

2) When the host sld_state is sld_warn, split lock detection can be
   exposed to the guest if SMT is off (nosmt). Further, to avoid the
   overhead of toggling MSR_TEST_CTRL.SLD on every VM-enter/VM-exit, the
   guest's SLD setting is loaded while in KVM context.

3) When the host sld_state is sld_fatal, split lock detection can also
   be exposed to the guest if SMT is off (nosmt), but the feature is
   forced on for the guest, i.e., the hardware MSR_TEST_CTRL.SLD bit
   stays set even if the guest clears its SLD bit.
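
For reference, the guest-visible enumeration boils down to the following
check (see the kvm_get_core_capabilities() hunk below):

	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && !cpu_smt_possible())
		data |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;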

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
---
 arch/x86/kvm/vmx/vmx.c | 79 +++++++++++++++++++++++++++++++++++++-----
 arch/x86/kvm/vmx/vmx.h |  2 ++
 arch/x86/kvm/x86.c     | 17 +++++++--
 3 files changed, 86 insertions(+), 12 deletions(-)

Comments

Thomas Gleixner April 15, 2020, 5:43 p.m. UTC | #1
Xiaoyao Li <xiaoyao.li@intel.com> writes:
> +/*
> + * Note: for guest, feature split lock detection can only be enumerated through
> + * MSR_IA32_CORE_CAPABILITIES bit. The FMS enumeration is unsupported.

That comment is confusing at best.

> + */
> +static inline bool guest_cpu_has_feature_sld(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.core_capabilities &
> +	       MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;
> +}
> +
> +static inline bool guest_cpu_sld_on(struct vcpu_vmx *vmx)
> +{
> +	return vmx->msr_test_ctrl & MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
> +}
> +
> +static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
> +{
> +	/*
> +	 * Toggle SLD if the guest wants it enabled but its been disabled for
> +	 * the userspace VMM, and vice versa.  Note, TIF_SLD is true if SLD has
> +	 * been turned off.  Yes, it's a terrible name.

Instead of writing that useless blurb you could have written a patch
which changes TIF_SLD to TIF_SLD_OFF to make it clear.

> +	 */
> +	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
> +	    on == test_thread_flag(TIF_SLD)) {
> +		    sld_update_msr(on);
> +		    update_thread_flag(TIF_SLD, !on);

Of course you completely fail to explain why TIF_SLD needs to be fiddled
with.

> @@ -1188,6 +1217,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>  #endif
>
> 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
> +
> +	vmx->host_sld_on = !test_thread_flag(TIF_SLD);

This inverted storage is non-intuitive. What's wrong with simply
reflecting the TIF_SLD state?

> +	vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
> +
>	vmx->guest_state_loaded = true;
> }
>
> @@ -1226,6 +1259,9 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
> 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
>  #endif
> 	load_fixmap_gdt(raw_smp_processor_id());
> +
> +	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
> +

vmx_prepare_switch_to_guest() is called via:

kvm_arch_vcpu_ioctl_run()
  vcpu_run()
    vcpu_enter_guest()
      preempt_disable();
      kvm_x86_ops.prepare_guest_switch(vcpu);

but vmx_prepare_switch_to_host() is invoked at the very end of:

kvm_arch_vcpu_ioctl_run()
  .....
  vcpu_run()
  .....
  vcpu_put()
    vmx_vcpu_put()
      vmx_prepare_switch_to_host();

That asymmetry does not make any sense without an explanation.

What's even worse is that vmx_prepare_switch_to_host() is invoked with
preemption enabled, so MSR state and TIF_SLD state can get out of sync
on preemption/migration.

> @@ -1946,9 +1992,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> 
> 	switch (msr_index) {
> 	case MSR_TEST_CTRL:
> -		if (data)
> +		if (data & ~vmx_msr_test_ctrl_valid_bits(vcpu))
> 			return 1;
> 
> +		vmx->msr_test_ctrl = data;
> +
> +		preempt_disable();

This preempt_disable/enable() lacks explanation as well.

> +		if (vmx->guest_state_loaded)
> +			vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
> +		preempt_enable();

How is updating msr_test_ctrl valid if this is invoked from the IOCTL,
i.e. host_initiated == true?

That said, I also hate the fact that you export both the low level MSR
function _and_ the state variable. Having all these details including the
TIF mangling in the VMX code is just wrong.

Thanks,

        tglx
Sean Christopherson April 15, 2020, 7:18 p.m. UTC | #2
On Wed, Apr 15, 2020 at 07:43:22PM +0200, Thomas Gleixner wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> > +/*
> > + * Note: for guest, feature split lock detection can only be enumerated through
> > + * MSR_IA32_CORE_CAPABILITIES bit. The FMS enumeration is unsupported.
> 
> That comment is confusing at best.
> 
> > + */
> > +static inline bool guest_cpu_has_feature_sld(struct kvm_vcpu *vcpu)
> > +{
> > +	return vcpu->arch.core_capabilities &
> > +	       MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;
> > +}
> > +
> > +static inline bool guest_cpu_sld_on(struct vcpu_vmx *vmx)
> > +{
> > +	return vmx->msr_test_ctrl & MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
> > +}
> > +
> > +static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
> > +{
> > +	/*
> > +	 * Toggle SLD if the guest wants it enabled but its been disabled for
> > +	 * the userspace VMM, and vice versa.  Note, TIF_SLD is true if SLD has
> > +	 * been turned off.  Yes, it's a terrible name.
> 
> Instead of writing that useless blurb you could have written a patch
> which changes TIF_SLD to TIF_SLD_OFF to make it clear.

Hah, that's my comment, though I must admit I didn't fully intend for the
editorial at the end to get submitted upstream.

Anyways, I _did_ point out that TIF_SLD is a terrible name[1][2], and my
feedback got ignored/overlooked.  I'd be more than happy to write a patch,
I didn't do so because I assumed that people wanted TIF_SLD as the name for
whatever reason.

[1] https://lkml.kernel.org/r/20191122184457.GA31235@linux.intel.com
[2] https://lkml.kernel.org/r/20200115225724.GA18268@linux.intel.com

> > +	 */
> > +	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
> > +	    on == test_thread_flag(TIF_SLD)) {
> > +		    sld_update_msr(on);
> > +		    update_thread_flag(TIF_SLD, !on);
> 
> Of course you completely fail to explain why TIF_SLD needs to be fiddled
> with.

Ya, that comment should be something like:

	* Toggle SLD if the guest wants it enabled but it's been disabled for
	* the userspace VMM, and vice versa, so that the flag and MSR state
	* are consistent, i.e. its handling during task switches naturally does
	* the right thing if KVM is preempted with guest state loaded.

> > @@ -1188,6 +1217,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> >  #endif
> >
> > 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
> > +
> > +	vmx->host_sld_on = !test_thread_flag(TIF_SLD);
> 
> This inverted storage is non-intuitive. What's wrong with simply
> reflecting the TIF_SLD state?

So that the guest/host tracking uses the same polarity, and IMO it makes
the restoration code more intuitive, e.g.:

	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
vs
	vmx_update_sld(&vmx->vcpu, !vmx->host_tif_sld);

I.e. the inversion needs to happen somewhere.

> > +	vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
> > +
> >	vmx->guest_state_loaded = true;
> > }
> >
> > @@ -1226,6 +1259,9 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
> > 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
> >  #endif
> > 	load_fixmap_gdt(raw_smp_processor_id());
> > +
> > +	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
> > +
> 
> vmx_prepare_switch_to_guest() is called via:
> 
> kvm_arch_vcpu_ioctl_run()
>   vcpu_run()
>     vcpu_enter_guest()
>       preempt_disable();
>       kvm_x86_ops.prepare_guest_switch(vcpu);
> 
> but vmx_prepare_switch_to_host() is invoked at the very end of:
> 
> kvm_arch_vcpu_ioctl_run()
>   .....
>   vcpu_run()
>   .....
>   vcpu_put()
>     vmx_vcpu_put()
>       vmx_prepare_switch_to_host();
> 
> That asymmetry does not make any sense without an explanation.

Deferring the "switch to host" until the vCPU is put allows KVM to keep
certain guest state loaded when staying in the vCPU run loop, e.g.
MSR_KERNEL_GS_BASE can be exposed to the guest without having to save and
restore it on every VM-Enter/VM-Exit.

I agree that all of KVM's state save/load trickery lacks documentation,
I'll put that on my todo list.
 
> What's even worse is that vmx_prepare_switch_to_host() is invoked with
> preemption enabled, so MSR state and TIF_SLD state can get out of sync
> on preemption/migration.

It shouldn't be (called with preemption enabled):

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu); <-- leads to vmx_prepare_switch_to_host()
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}

> > @@ -1946,9 +1992,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > 
> > 	switch (msr_index) {
> > 	case MSR_TEST_CTRL:
> > -		if (data)
> > +		if (data & ~vmx_msr_test_ctrl_valid_bits(vcpu))
> > 			return 1;
> > 
> > +		vmx->msr_test_ctrl = data;
> > +
> > +		preempt_disable();
> 
> This preempt_disable/enable() lacks explanation as well.

Is an explanation still needed if it's made clear (somewhere) that
interacting with guest_state_loaded needs to be done with preemption
disabled?
 
> > +		if (vmx->guest_state_loaded)
> > +			vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
> > +		preempt_enable();
> 
> How is updating msr_test_ctrl valid if this is invoked from the IOCTL,
> i.e. host_initiated == true?

Not sure I understand the underlying question.  The host is always allowed
to manipulate guest state, including MSRs.

I'm pretty sure guest_state_loaded should always be false if host_initiated
is true, e.g. we could technically do a WARN on guest_state_loaded and
host_initiated, but the ioctl() is obviously not a hot path and nothing
will break if the assumption doesn't hold.
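
Concretely, that suggestion would amount to something like the following
sketch in the MSR_TEST_CTRL case of vmx_set_msr() (illustrative only):

	WARN_ON_ONCE(msr_info->host_initiated && vmx->guest_state_loaded);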

> That said, I also hate the fact that you export both the low level MSR
> function _and_ the state variable. Having all these details including the
> TIF mangling in the VMX code is just wrong.

I'm not a fan of exporting the low level state either, but IIRC trying to
hide the low level details while achieving the same resulting functionality
was even messier.

I don't see any way to avoid having KVM differentiate between sld_warn and
sld_fatal.  Even if KVM is able to virtualize SLD in sld_fatal mode, e.g.
by telling the guest it must not try to disable SLD, KVM would still need
to know the kernel is sld_fatal so that it can forward that information to
the guest.

It'd be possible to avoid mucking with TIF or exporting the MSR helper, but
that would require KVM to manually save/restore the MSR when KVM is
preempted with guest state loaded.  That probably wouldn't actually affect
performance for most use cases, but IMO it's not worth the extra headache
just to avoid exporting a helper.
Thomas Gleixner April 15, 2020, 7:47 p.m. UTC | #3
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> Because the TEST_CTRL MSR is per-core scoped, i.e., the sibling threads
> in the same physical CPU core share the same MSR, only advertise split
> lock detection to the guest when SMT is disabled or unsupported, for
> simplicity.

That's not for simplicity. It's for correctness because you cannot
provide consistent state to a guest.

Thanks,

        tglx
Thomas Gleixner April 15, 2020, 9:22 p.m. UTC | #4
Sean,

Sean Christopherson <sean.j.christopherson@intel.com> writes:
> On Wed, Apr 15, 2020 at 07:43:22PM +0200, Thomas Gleixner wrote:
>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>> > +static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
>> > +{
>> > +	/*
>> > +	 * Toggle SLD if the guest wants it enabled but its been disabled for
>> > +	 * the userspace VMM, and vice versa.  Note, TIF_SLD is true if SLD has
>> > +	 * been turned off.  Yes, it's a terrible name.
>> 
>> Instead of writing that useless blurb you could have written a patch
>> which changes TIF_SLD to TIF_SLD_OFF to make it clear.
>
> Hah, that's my comment, though I must admit I didn't fully intend for the
> editorial at the end to get submitted upstream.
>
> Anyways, I _did_ point out that TIF_SLD is a terrible name[1][2], and my
> feedback got ignored/overlooked.  I'd be more than happy to write a
> patch, I didn't do so because I assumed that people wanted TIF_SLD as the name for
> whatever reason.

I somehow missed that in the maze of mails regarding this stuff. I've
already written a patch to rename it to TIF_SLD_DISABLED which is pretty
self explaining. But see below.
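
A minimal sketch of what that rename amounts to at the call site under
discussion (illustrative only; the actual patch also renames the flag
definition itself and the remaining users):

	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
	    on == test_thread_flag(TIF_SLD_DISABLED)) {
		    sld_update_msr(on);
		    update_thread_flag(TIF_SLD_DISABLED, !on);
	}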

>> > +	 */
>> > +	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
>> > +	    on == test_thread_flag(TIF_SLD)) {
>> > +		    sld_update_msr(on);
>> > +		    update_thread_flag(TIF_SLD, !on);
>> 
>> Of course you completely fail to explain why TIF_SLD needs to be fiddled
>> with.
>
> Ya, that comment should be something like:
>
> 	* Toggle SLD if the guest wants it enabled but it's been disabled for
> 	* the userspace VMM, and vice versa, so that the flag and MSR state
> 	* are consistent, i.e. its handling during task switches naturally does
> 	* the right thing if KVM is preempted with guest state loaded.

Something to that effect.

>> > @@ -1188,6 +1217,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>> >  #endif
>> >
>> > 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
>> > +
>> > +	vmx->host_sld_on = !test_thread_flag(TIF_SLD);
>> 
>> This inverted storage is non-intuitive. What's wrong with simply
>> reflecting the TIF_SLD state?
>
> So that the guest/host tracking uses the same polarity, and IMO it makes
> the restoration code more intuitive, e.g.:
>
> 	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
> vs
> 	vmx_update_sld(&vmx->vcpu, !vmx->host_tif_sld);
>
> I.e. the inversion needs to happen somewhere.

Correct, but we can make it consistently use the 'disabled' convention.

I briefly thought about renaming the flag to TIF_SLD_ENABLED, set it by
default and update the 5 places where it is used. But that's
inconsistent as well simply because it does not make any sense to set
that flag when detection is not available or disabled on the command
line.

>> > @@ -1226,6 +1259,9 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
>> > 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
>> >  #endif
>> > 	load_fixmap_gdt(raw_smp_processor_id());
>> > +
>> > +	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
>> > +
>> 
>> vmx_prepare_switch_to_guest() is called via:
>> 
>> kvm_arch_vcpu_ioctl_run()
>>   vcpu_run()
>>     vcpu_enter_guest()
>>       preempt_disable();
>>       kvm_x86_ops.prepare_guest_switch(vcpu);
>> 
>> but vmx_prepare_switch_to_host() is invoked at the very end of:
>> 
>> kvm_arch_vcpu_ioctl_run()
>>   .....
>>   vcpu_run()
>>   .....
>>   vcpu_put()
>>     vmx_vcpu_put()
>>       vmx_prepare_switch_to_host();
>> 
>> That asymmetry does not make any sense without an explanation.
>
> Deferring the "switch to host" until the vCPU is put allows KVM to keep
> certain guest state loaded when staying in the vCPU run loop, e.g.
> MSR_KERNEL_GS_BASE can be exposed to the guest without having to save and
> restore it on every VM-Enter/VM-Exit.

I know why this is done (after staring at the callchains for a while),
but 5 lines of explanation at least in the changelog would have saved my
time.

>> What's even worse is that vmx_prepare_switch_to_host() is invoked with
>> preemption enabled, so MSR state and TIF_SLD state can get out of sync
>> on preemption/migration.
>
> It shouldn't be (called with preemption enabled):
>
> void vcpu_put(struct kvm_vcpu *vcpu)
> {
> 	preempt_disable();
> 	kvm_arch_vcpu_put(vcpu); <-- leads to vmx_prepare_switch_to_host()
> 	preempt_notifier_unregister(&vcpu->preempt_notifier);
> 	__this_cpu_write(kvm_running_vcpu, NULL);
> 	preempt_enable();
> }

Ooops. How did I miss that?

>> > @@ -1946,9 +1992,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > 
>> > 	switch (msr_index) {
>> > 	case MSR_TEST_CTRL:
>> > -		if (data)
>> > +		if (data & ~vmx_msr_test_ctrl_valid_bits(vcpu))
>> > 			return 1;
>> > 
>> > +		vmx->msr_test_ctrl = data;
>> > +
>> > +		preempt_disable();
>> 
>> This preempt_disable/enable() lacks explanation as well.
>
> Is an explanation still needed if it's made clear (somewhere) that
> interacting with guest_state_loaded needs to be done with preemption
> disabled?

Well, the thing is that finding that explanation somewhere is not always
trivial. Aside from that, this really is the wrong place to do it. It
wants to be inside a function which is invoked from here, and that
function should have the appropriate commentary.
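
One possible shape of such a helper, as a sketch (the function name here
is made up, not taken from the series):

	static void vmx_sync_sld_to_hardware(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		/*
		 * guest_state_loaded is only stable while preemption is
		 * disabled; getting preempted between the check and the MSR
		 * write could run the switch-to-host path and leave the MSR
		 * and TIF state inconsistent.
		 */
		preempt_disable();
		if (vmx->guest_state_loaded)
			vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
		preempt_enable();
	}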
  
>> > +		if (vmx->guest_state_loaded)
>> > +			vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
>> > +		preempt_enable();
>> 
>> How is updating msr_test_ctrl valid if this is invoked from the IOCTL,
>> i.e. host_initiated == true?
>
> Not sure I understand the underlying question.  The host is always allowed
> to manipulate guest state, including MSRs.

Fair enough.

> I'm pretty sure guest_state_loaded should always be false if host_initiated
> is true, e.g. we could technically do a WARN on guest_state_loaded and
> host_initiated, but the ioctl() is obviously not a hot path and nothing
> will break if the assumption doesn't hold.

You'd create inconsistent state because the guest internal state cache
is not updated, but if you can update it with !loaded then you can do
that anyway. Shrug.

>> That said, I also hate the fact that you export both the low level MSR
>> function _and_ the state variable. Having all these details including the
>> TIF mangling in the VMX code is just wrong.
>
> I'm not a fan of exporting the low level state either, but IIRC trying to
> hide the low level details while achieving the same resulting functionality
> was even messier.
>
> I don't see any way to avoid having KVM differentiate between sld_warn and
> sld_fatal.  Even if KVM is able to virtualize SLD in sld_fatal mode, e.g.
> by telling the guest it must not try to disable SLD, KVM would still need
> to know the kernel is sld_fatal so that it can forward that information to
> the guest.

Huch? There is absolutely zero code like that. The only place where
sld_state is used is:

+ static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
+ {
+	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
+	    on == test_thread_flag(TIF_SLD)) {
+		    sld_update_msr(on);
+		    update_thread_flag(TIF_SLD, !on);
+	}

You might have some faint memories from the previous trainwrecks :)

The fatal mode emulation which is used in this patch set is simply that
the guest can 'write' to the MSR but it's not propagated to the real
MSR. It's just stored in the guest state. There is no way that you can
tell the guest that the MSR is there but fake.

The alternative solution is to prevent the exposure of SLD to the guest
in fatal mode. But that does not buy anything.

The detection is anyway incomplete. If the SLD #AC is raised in guest's
user mode and the guest has user #AC enabled then the exception is
injected into the guest unconditionally and independent of the host's
and guest's SLD state. That's entirely correct because a SLD #AC in user
mode is also a user mode alignment violation; it's not distinguishable.

You could of course analyse the offending instruction and check for a
lock prefix and a cache line overlap, but that still does not prevent
false positives. When the guest is non-malicious and has proper user #AC
handling in place then it would be wrong or at least very surprising to
kill it just because the detection code decided that it is a dangerous
split lock attempt.

So we can go with the proposed mode of allowing the write but not
propagating it. If the resulting split lock #AC originates from CPL != 3
then the guest will be killed with SIGBUS. If it originates from CPL ==
3 and the guest has user #AC disabled then it will be killed as well.

Thanks,

        tglx
Sean Christopherson April 15, 2020, 9:43 p.m. UTC | #5
On Wed, Apr 15, 2020 at 11:22:11PM +0200, Thomas Gleixner wrote:
> Sean Christopherson <sean.j.christopherson@intel.com> writes:
> > I don't see any way to avoid having KVM differentiate between sld_warn and
> > sld_fatal.  Even if KVM is able to virtualize SLD in sld_fatal mode, e.g.
> > by telling the guest it must not try to disable SLD, KVM would still need
> > to know the kernel is sld_fatal so that it can forward that information to
> > the guest.
> 
> Huch? There is absolutely zero code like that. The only place where
> sld_state is used is:
> 
> + static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
> + {
> +	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
> +	    on == test_thread_flag(TIF_SLD)) {
> +		    sld_update_msr(on);
> +		    update_thread_flag(TIF_SLD, !on);
> +	}
> 
> You might have some faint memories from the previous trainwrecks :)

Yeah, I was thinking SLD was only being exposed if the host is sld_warn.
I'll work with Xiaoyao to figure out a cleaner interface for this code.

> The fatal mode emulation which is used in this patch set is simply that
> the guest can 'write' to the MSR but it's not propagated to the real
> MSR. It's just stored in the guest state. There is no way that you can
> tell the guest that the MSR is there but fake.
>
> The alternative solution is to prevent the exposure of SLD to the guest
> in fatal mode. But that does not buy anything.
> 
> The detection is anyway incomplete. If the SLD #AC is raised in guest's
> user mode and the guest has user #AC enabled then the exception is
> injected into the guest unconditionally and independent of the host's
> and guest's SLD state. That's entirely correct because a SLD #AC in user
> mode is also a user mode alignment violation; it's not distinguishable.
> 
> You could of course analyse the offending instruction and check for a
> lock prefix and a cache line overlap, but that still does not prevent
> false positives. When the guest is non-malicious and has proper user #AC
> handling in place then it would be wrong or at least very surprising to
> kill it just because the detection code decided that it is a dangerous
> split lock attempt.
> 
> So we can go with the proposed mode of allowing the write but not
> propagating it. If the resulting split lock #AC originates from CPL != 3
> then the guest will be killed with SIGBUS. If it originates from CPL ==
> 3 and the guest has user #AC disabled then it will be killed as well.

An idea that's been floated around to avoid killing the guest on a CPL==3
split-lock #AC is to add a STICKY bit to MSR_TEST_CTRL that KVM can
virtualize to tell the guest that attempting to disable SLD is futile,
e.g. so that the guest can kill its misbehaving userspace apps instead of
trying to disable SLD and getting killed by the host.

I've discussed it with a few folks internally and it sounds like getting
such a bit added to the SDM would be doable, even if Intel never ships
hardware that supports the bit.  The thought is that getting the STICKY
bit architecturally defined would give KVM/Linux leverage to persuade guest
kernels to add support for the bit.  But anything that touches the SDM
doesn't exactly happen quickly :-/.
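
A rough illustration of how a guest might consume such a hint; the bit
name and position are hypothetical, and nothing like this exists in the
SDM or in shipping hardware today:

	/* Hypothetical "SLD is forced on and cannot be cleared" bit. */
	#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_STICKY	BIT(31)

	rdmsrl(MSR_TEST_CTRL, test_ctrl);
	if (test_ctrl & MSR_TEST_CTRL_SPLIT_LOCK_DETECT_STICKY)
		/* Clearing SLD is futile; deal with the offending task. */
		pr_warn_once("split lock detection enforced by hypervisor\n");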
Xiaoyao Li April 16, 2020, 2:12 a.m. UTC | #6
On 4/16/2020 3:47 AM, Thomas Gleixner wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> 
>> Because the TEST_CTRL MSR is per-core scoped, i.e., the sibling threads
>> in the same physical CPU core share the same MSR, only advertise split
>> lock detection to the guest when SMT is disabled or unsupported, for
>> simplicity.
> 
> That's not for simplicity. It's for correctness because you cannot
> provide consistent state to a guest.
> 

I'll correct it.

Thanks!
-Xiaoyao
Xiaoyao Li April 16, 2020, 12:34 p.m. UTC | #7
On 4/16/2020 5:22 AM, Thomas Gleixner wrote:
> Sean,
> 
> Sean Christopherson <sean.j.christopherson@intel.com> writes:
>> On Wed, Apr 15, 2020 at 07:43:22PM +0200, Thomas Gleixner wrote:
>>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>>>> +static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
>>>> +{
>>>> +	/*
>>>> +	 * Toggle SLD if the guest wants it enabled but its been disabled for
>>>> +	 * the userspace VMM, and vice versa.  Note, TIF_SLD is true if SLD has
>>>> +	 * been turned off.  Yes, it's a terrible name.
>>>
>>> Instead of writing that useless blurb you could have written a patch
>>> which changes TIF_SLD to TIF_SLD_OFF to make it clear.
>>
>> Hah, that's my comment, though I must admit I didn't fully intend for the
>> editorial at the end to get submitted upstream.
>>
>> Anyways, I _did_ point out that TIF_SLD is a terrible name[1][2], and my
>> feedback got ignored/overlooked.  I'd be more than happy to write a
>> patch, I didn't do so because I assumed that people wanted TIF_SLD as the name for
>> whatever reason.
> 
> I somehow missed that in the maze of mails regarding this stuff. I've
> already written a patch to rename it to TIF_SLD_DISABLED which is pretty
> self explaining. But see below.
> 
[...]
> 
>>>> @@ -1188,6 +1217,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>>>>   #endif
>>>>
>>>> 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
>>>> +
>>>> +	vmx->host_sld_on = !test_thread_flag(TIF_SLD);
>>>
>>> This inverted storage is non-intuitive. What's wrong with simply
>>> reflecting the TIF_SLD state?
>>
>> So that the guest/host tracking uses the same polarity, and IMO it makes
>> the restoration code more intuitive, e.g.:
>>
>> 	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
>> vs
>> 	vmx_update_sld(&vmx->vcpu, !vmx->host_tif_sld);
>>
>> I.e. the inversion needs to happen somewhere.
> 
> Correct, but we can make it consistently use the 'disabled' convention.
> 
> I briefly thought about renaming the flag to TIF_SLD_ENABLED, set it by
> default and update the 5 places where it is used. But that's
> inconsistent as well simply because it does not make any sense to set
> that flag when detection is not available or disabled on the command
> line.
> 

Assuming you'll pick TIF_SLD_DISABLED, I guess we need to set this flag 
by default for the case where SLD is not available or is disabled on the 
command line, for consistency?
Thomas Gleixner April 16, 2020, 1:33 p.m. UTC | #8
Xiaoyao Li <xiaoyao.li@intel.com> writes:
> On 4/16/2020 5:22 AM, Thomas Gleixner wrote:
>> I briefly thought about renaming the flag to TIF_SLD_ENABLED, set it by
>> default and update the 5 places where it is used. But that's
>> inconsistent as well simply because it does not make any sense to set
>> that flag when detection is not available or disabled on the command
>> line.
>> 
>
> Assuming you'll pick TIF_SLD_DISABLED, I guess we need to set this flag 
> by default for the case where SLD is not available or is disabled on the 
> command line, for consistency?

No, because nothing cares if SLD is off. There is no way to make this
fully consistent under all circumstances.

Thanks,

        tglx
Sean Christopherson May 5, 2020, 3:07 a.m. UTC | #9
On Wed, Apr 15, 2020 at 02:43:18PM -0700, Sean Christopherson wrote:
> On Wed, Apr 15, 2020 at 11:22:11PM +0200, Thomas Gleixner wrote:
> > Sean Christopherson <sean.j.christopherson@intel.com> writes:
> > > I don't see any way to avoid having KVM differentiate between sld_warn and
> > > sld_fatal.  Even if KVM is able to virtualize SLD in sld_fatal mode, e.g.
> > > by telling the guest it must not try to disable SLD, KVM would still need
> > > to know the kernel is sld_fatal so that it can forward that information to
> > > the guest.
> > 
> > Huch? There is absolutely zero code like that. The only place where
> > sld_state is used is:
> > 
> > + static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
> > + {
> > +	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
> > +	    on == test_thread_flag(TIF_SLD)) {
> > +		    sld_update_msr(on);
> > +		    update_thread_flag(TIF_SLD, !on);
> > +	}
> > 
> > You might have some faint memories from the previous trainwrecks :)
> 
> Yeah, I was thinking SLD was only being exposed if the host is sld_warn.
> I'll work with Xiaoyao to figure out a cleaner interface for this code.

...

> > So we can go with the proposed mode of allowing the write but not
> > propagating it. If the resulting split lock #AC originates from CPL != 3
> > then the guest will be killed with SIGBUS. If it originates from CPL ==
> > 3 and the guest has user #AC disabled then it will be killed as well.
> 
> An idea that's been floated around to avoid killing the guest on a CPL==3
> split-lock #AC is to add a STICKY bit to MSR_TEST_CTRL that KVM can
> virtualize to tell the guest that attempting to disable SLD is futile,
> e.g. so that the guest can kill its misbehaving userspace apps instead of
> trying to disable SLD and getting killed by the host.

Circling back to this.  KVM needs access to sld_state in one form or another
if we want to add a KVM hint when the host is in fatal mode.  Three options
I've come up with:

  1. Bite the bullet and export sld_state.  

  2. Add an is_split_fatal_wrapper().  Ugly since it needs to be non-inline
     to avoid triggering (1).

  3. Add a synthetic feature flag, e.g. X86_FEATURE_SLD_FATAL, and drop
     sld_state altogether.

I like (3) because it requires the least amount of code when all is said
and done, doesn't require more exports, and as a bonus it'd probably be nice
for userspace to see sld_fatal in /proc/cpuinfo.

Thoughts?
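
A sketch of what option 3 could look like; the flag name is from the
proposal above, but the bit position and the exact hook shown here are
placeholders rather than what eventually landed:

	/* arch/x86/include/asm/cpufeatures.h, bit position illustrative */
	#define X86_FEATURE_SLD_FATAL	(11*32+ 7) /* split lock detection in fatal mode */

	/* arch/x86/kernel/cpu/intel.c, once fatal mode is selected */
	if (state == sld_fatal)
		setup_force_cpu_cap(X86_FEATURE_SLD_FATAL);

	/* KVM side: no new exports needed */
	bool sld_fatal = boot_cpu_has(X86_FEATURE_SLD_FATAL);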
Thomas Gleixner May 6, 2020, 12:12 p.m. UTC | #10
Sean Christopherson <sean.j.christopherson@intel.com> writes:
> On Wed, Apr 15, 2020 at 02:43:18PM -0700, Sean Christopherson wrote:
>> On Wed, Apr 15, 2020 at 11:22:11PM +0200, Thomas Gleixner wrote:
>> > So we can go with the proposed mode of allowing the write but not
>> > propagating it. If the resulting split lock #AC originates from CPL != 3
>> > then the guest will be killed with SIGBUS. If it originates from CPL ==
>> > 3 and the guest has user #AC disabled then it will be killed as well.
>> 
>> An idea that's been floated around to avoid killing the guest on a CPL==3
>> split-lock #AC is to add a STICKY bit to MSR_TEST_CTRL that KVM can
>> virtualize to tell the guest that attempting to disable SLD is futile,
>> e.g. so that the guest can kill its misbehaving userspace apps instead of
>> trying to disable SLD and getting killed by the host.
>
> Circling back to this.  KVM needs access to sld_state in one form or another
> if we want to add a KVM hint when the host is in fatal mode.  Three options
> I've come up with:
>
>   1. Bite the bullet and export sld_state.  
>
>   2. Add an is_split_fatal_wrapper().  Ugly since it needs to be non-inline
>      to avoid triggering (1).
>
>   3. Add a synthetic feature flag, e.g. X86_FEATURE_SLD_FATAL, and drop
>      sld_state altogether.
>
> I like (3) because it requires the least amount of code when all is said
> and done, doesn't require more exports, and as a bonus it'd probably be nice
> for userspace to see sld_fatal in /proc/cpuinfo.

#3 makes sense and is elegant.

Thanks,

        tglx

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ae394ed174cd..2077abe4edf9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1120,6 +1120,35 @@  void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
 	}
 }
 
+/*
+ * Note: for guest, feature split lock detection can only be enumerated through
+ * MSR_IA32_CORE_CAPABILITIES bit. The FMS enumeration is unsupported.
+ */
+static inline bool guest_cpu_has_feature_sld(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.core_capabilities &
+	       MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;
+}
+
+static inline bool guest_cpu_sld_on(struct vcpu_vmx *vmx)
+{
+	return vmx->msr_test_ctrl & MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+}
+
+static inline void vmx_update_sld(struct kvm_vcpu *vcpu, bool on)
+{
+	/*
+	 * Toggle SLD if the guest wants it enabled but its been disabled for
+	 * the userspace VMM, and vice versa.  Note, TIF_SLD is true if SLD has
+	 * been turned off.  Yes, it's a terrible name.
+	 */
+	if (sld_state == sld_warn && guest_cpu_has_feature_sld(vcpu) &&
+	    on == test_thread_flag(TIF_SLD)) {
+		    sld_update_msr(on);
+		    update_thread_flag(TIF_SLD, !on);
+	}
+}
+
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1188,6 +1217,10 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 #endif
 
 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+
+	vmx->host_sld_on = !test_thread_flag(TIF_SLD);
+	vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
+
 	vmx->guest_state_loaded = true;
 }
 
@@ -1226,6 +1259,9 @@  static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
 	load_fixmap_gdt(raw_smp_processor_id());
+
+	vmx_update_sld(&vmx->vcpu, vmx->host_sld_on);
+
 	vmx->guest_state_loaded = false;
 	vmx->guest_msrs_ready = false;
 }
@@ -1777,6 +1813,16 @@  static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 	}
 }
 
+static inline u64 vmx_msr_test_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+{
+	u64 valid_bits = 0;
+
+	if (guest_cpu_has_feature_sld(vcpu))
+		valid_bits |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+	return valid_bits;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -1790,7 +1836,7 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 	switch (msr_info->index) {
 	case MSR_TEST_CTRL:
-		msr_info->data = 0;
+		msr_info->data = vmx->msr_test_ctrl;
 		break;
 #ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
@@ -1946,9 +1992,15 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 	switch (msr_index) {
 	case MSR_TEST_CTRL:
-		if (data)
+		if (data & ~vmx_msr_test_ctrl_valid_bits(vcpu))
 			return 1;
 
+		vmx->msr_test_ctrl = data;
+
+		preempt_disable();
+		if (vmx->guest_state_loaded)
+			vmx_update_sld(vcpu, guest_cpu_sld_on(vmx));
+		preempt_enable();
 		break;
 	case MSR_EFER:
 		ret = kvm_set_msr_common(vcpu, msr_info);
@@ -4266,7 +4318,7 @@  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vmx->rmode.vm86_active = 0;
 	vmx->spec_ctrl = 0;
-
+	vmx->msr_test_ctrl = 0;
 	vmx->msr_ia32_umwait_control = 0;
 
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
@@ -4596,24 +4648,33 @@  static int handle_machine_check(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static inline bool guest_cpu_alignment_check_enabled(struct kvm_vcpu *vcpu)
+{
+	return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+	       (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
+}
+
 /*
  * If the host has split lock detection disabled, then #AC is
  * unconditionally injected into the guest, which is the pre split lock
  * detection behaviour.
  *
  * If the host has split lock detection enabled then #AC is
- * only injected into the guest when:
- *  - Guest CPL == 3 (user mode)
- *  - Guest has #AC detection enabled in CR0
- *  - Guest EFLAGS has AC bit set
+ * injected into the guest when:
+ * 1) guest has alignment check enabled;
+ * or 2) guest has split lock detection enabled;
  */
 static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
 {
 	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
 		return true;
 
-	return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
-	       (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
+	/*
+	 * A split lock access must be an unaligned access, so we should check
+	 * guest_cpu_alignment_check_enabled() first.
+	 */
+	return guest_cpu_alignment_check_enabled(vcpu) ||
+	       guest_cpu_sld_on(to_vmx(vcpu));
 }
 
 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index aab9df55336e..b3c5be90b023 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -216,12 +216,14 @@  struct vcpu_vmx {
 	int                   nmsrs;
 	int                   save_nmsrs;
 	bool                  guest_msrs_ready;
+	bool                  host_sld_on;
 #ifdef CONFIG_X86_64
 	u64		      msr_host_kernel_gs_base;
 	u64		      msr_guest_kernel_gs_base;
 #endif
 
 	u64		      spec_ctrl;
+	u64                   msr_test_ctrl;
 	u32		      msr_ia32_umwait_control;
 
 	u32 secondary_exec_control;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index adfd4d74ea53..8c8f5ccfd98b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1189,7 +1189,7 @@  static const u32 msrs_to_save_all[] = {
 #endif
 	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
-	MSR_IA32_SPEC_CTRL,
+	MSR_IA32_SPEC_CTRL, MSR_TEST_CTRL,
 	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1371,7 +1371,12 @@  static u64 kvm_get_arch_capabilities(void)
 
 static u64 kvm_get_core_capabilities(void)
 {
-	return 0;
+	u64 data = 0;
+
+	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && !cpu_smt_possible())
+		data |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;
+
+	return data;
 }
 
 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2764,7 +2769,8 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu->arch.arch_capabilities = data;
 		break;
 	case MSR_IA32_CORE_CAPS:
-		if (!msr_info->host_initiated)
+		if (!msr_info->host_initiated ||
+		    data & ~kvm_get_core_capabilities())
 			return 1;
 		vcpu->arch.core_capabilities = data;
 		break;
@@ -5243,6 +5249,11 @@  static void kvm_init_msr_list(void)
 		 * to the guests in some cases.
 		 */
 		switch (msrs_to_save_all[i]) {
+		case MSR_TEST_CTRL:
+			if (!(kvm_get_core_capabilities() &
+			      MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
+				continue;
+			break;
 		case MSR_IA32_BNDCFGS:
 			if (!kvm_mpx_supported())
 				continue;