diff mbox

kvm: x86: do not leak guest xcr0 into host interrupt handlers

Message ID 1459365887-146735-1-git-send-email-dmatlack@google.com (mailing list archive)
State New, archived
Headers show

Commit Message

David Matlack March 30, 2016, 7:24 p.m. UTC
An interrupt handler that uses the fpu can kill a KVM VM, if it runs
under the following conditions:
 - the guest's xcr0 register is loaded on the cpu
 - the guest's fpu context is not loaded
 - the host is using eagerfpu

Note that the guest's xcr0 register and fpu context are not loaded as
part of the atomic world switch into "guest mode". They are loaded by
KVM while the cpu is still in "host mode".

Usage of the fpu in interrupt context is gated by irq_fpu_usable(). The
interrupt handler will look something like this:

if (irq_fpu_usable()) {
        kernel_fpu_begin();

        [... code that uses the fpu ...]

        kernel_fpu_end();
}

As long as the guest's fpu is not loaded and the host is using eager
fpu, irq_fpu_usable() returns true (interrupted_kernel_fpu_idle()
returns true). The interrupt handler proceeds to use the fpu with
the guest's xcr0 live.

kernel_fpu_begin() saves the current fpu context. If this uses
XSAVE[OPT], it may leave the xsave area in an undesirable state.
According to the SDM, during XSAVE bit i of XSTATE_BV is not modified
if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and
xcr0[i] == 0 following an XSAVE.

kernel_fpu_end() restores the fpu context. Now if any bit i in
XSTATE_BV == 1 while xcr0[i] == 0, XRSTOR generates a #GP. The
fault is trapped and SIGSEGV is delivered to the current process.

Only pre-4.2 kernels appear to be vulnerable to this sequence of
events. Commit 653f52c ("kvm,x86: load guest FPU context more eagerly")
from 4.2 forces the guest's fpu to always be loaded on eagerfpu hosts.

This patch fixes the bug by keeping the host's xcr0 loaded outside
of the interrupts-disabled region where KVM switches into guest mode.

Cc: stable@vger.kernel.org
Suggested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/kvm/x86.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

Comments

Paolo Bonzini March 31, 2016, 8:05 a.m. UTC | #1
On 30/03/2016 21:24, David Matlack wrote:
> An interrupt handler that uses the fpu can kill a KVM VM, if it runs
> under the following conditions:
>  - the guest's xcr0 register is loaded on the cpu
>  - the guest's fpu context is not loaded
>  - the host is using eagerfpu
> 
> Note that the guest's xcr0 register and fpu context are not loaded as
> part of the atomic world switch into "guest mode". They are loaded by
> KVM while the cpu is still in "host mode".
> 
> Usage of the fpu in interrupt context is gated by irq_fpu_usable(). The
> interrupt handler will look something like this:
> 
> if (irq_fpu_usable()) {
>         kernel_fpu_begin();
> 
>         [... code that uses the fpu ...]
> 
>         kernel_fpu_end();
> }
> 
> As long as the guest's fpu is not loaded and the host is using eager
> fpu, irq_fpu_usable() returns true (interrupted_kernel_fpu_idle()
> returns true). The interrupt handler proceeds to use the fpu with
> the guest's xcr0 live.
> 
> kernel_fpu_begin() saves the current fpu context. If this uses
> XSAVE[OPT], it may leave the xsave area in an undesirable state.
> According to the SDM, during XSAVE bit i of XSTATE_BV is not modified
> if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and
> xcr0[i] == 0 following an XSAVE.
> 
> kernel_fpu_end() restores the fpu context. Now if any bit i in
> XSTATE_BV == 1 while xcr0[i] == 0, XRSTOR generates a #GP. The
> fault is trapped and SIGSEGV is delivered to the current process.
> 
> Only pre-4.2 kernels appear to be vulnerable to this sequence of
> events. Commit 653f52c ("kvm,x86: load guest FPU context more eagerly")
> from 4.2 forces the guest's fpu to always be loaded on eagerfpu hosts.
> 
> This patch fixes the bug by keeping the host's xcr0 loaded outside
> of the interrupts-disabled region where KVM switches into guest mode.
> 
> Cc: stable@vger.kernel.org
> Suggested-by: Andy Lutomirski <luto@amacapital.net>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/x86.c | 10 ++++------
>  1 file changed, 4 insertions(+), 6 deletions(-)

On guest entry we get:

> @@ -6590,8 +6589,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->prepare_guest_switch(vcpu);
>  	if (vcpu->fpu_active)
>  		kvm_load_guest_fpu(vcpu);

One fewer kvm_put_guest_xcr0, at least in eager mode.

> -	kvm_load_guest_xcr0(vcpu);
> -

One fewer kvm_load_guest_xcr0.

>  	vcpu->mode = IN_GUEST_MODE;
>  
>  	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> @@ -6607,6 +6604,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	local_irq_disable();
>  
> +	kvm_load_guest_xcr0(vcpu);

One more kvm_load_guest_xcr0.

>  	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
>  	    || need_resched() || signal_pending(current)) {
>  		vcpu->mode = OUTSIDE_GUEST_MODE;
> @@ -6667,6 +6666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	vcpu->mode = OUTSIDE_GUEST_MODE;
>  	smp_wmb();
>  
> +	kvm_put_guest_xcr0(vcpu);

One more kvm_put_guest_xcr0.

So everything balances out.  Considering that the logic is cleaner, I
can apply this to all released kernels.

Paolo

>  	/* Interrupt is enabled by handle_external_intr() */
>  	kvm_x86_ops->handle_external_intr(vcpu);
>  
> @@ -7314,7 +7315,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>  	 * and assume host would use all available bits.
>  	 * Guest xcr0 would be loaded later.
>  	 */
> -	kvm_put_guest_xcr0(vcpu);
>  	vcpu->guest_fpu_loaded = 1;
>  	__kernel_fpu_begin();
>  	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
> @@ -7323,8 +7323,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>  
>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -	kvm_put_guest_xcr0(vcpu);
> -
>  	if (!vcpu->guest_fpu_loaded) {
>  		vcpu->fpu_counter = 0;
>  		return;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini April 5, 2016, 11:28 a.m. UTC | #2
On 30/03/2016 21:24, David Matlack wrote:
> An interrupt handler that uses the fpu can kill a KVM VM, if it runs
> under the following conditions:
>  - the guest's xcr0 register is loaded on the cpu
>  - the guest's fpu context is not loaded
>  - the host is using eagerfpu
> 
> Note that the guest's xcr0 register and fpu context are not loaded as
> part of the atomic world switch into "guest mode". They are loaded by
> KVM while the cpu is still in "host mode".
> 
> Usage of the fpu in interrupt context is gated by irq_fpu_usable(). The
> interrupt handler will look something like this:
> 
> if (irq_fpu_usable()) {
>         kernel_fpu_begin();
> 
>         [... code that uses the fpu ...]
> 
>         kernel_fpu_end();
> }
> 
> As long as the guest's fpu is not loaded and the host is using eager
> fpu, irq_fpu_usable() returns true (interrupted_kernel_fpu_idle()
> returns true). The interrupt handler proceeds to use the fpu with
> the guest's xcr0 live.
> 
> kernel_fpu_begin() saves the current fpu context. If this uses
> XSAVE[OPT], it may leave the xsave area in an undesirable state.
> According to the SDM, during XSAVE bit i of XSTATE_BV is not modified
> if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and
> xcr0[i] == 0 following an XSAVE.
> 
> kernel_fpu_end() restores the fpu context. Now if any bit i in
> XSTATE_BV == 1 while xcr0[i] == 0, XRSTOR generates a #GP. The
> fault is trapped and SIGSEGV is delivered to the current process.
> 
> Only pre-4.2 kernels appear to be vulnerable to this sequence of
> events. Commit 653f52c ("kvm,x86: load guest FPU context more eagerly")
> from 4.2 forces the guest's fpu to always be loaded on eagerfpu hosts.
> 
> This patch fixes the bug by keeping the host's xcr0 loaded outside
> of the interrupts-disabled region where KVM switches into guest mode.
> 
> Cc: stable@vger.kernel.org
> Suggested-by: Andy Lutomirski <luto@amacapital.net>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/x86.c | 10 ++++------
>  1 file changed, 4 insertions(+), 6 deletions(-)

While running my acceptance tests, in one case I got one CPU whose xcr0 
had leaked into the host.  This showed up as a SIGILL in strncasecmp's 
AVX code, and a simple program confirmed it:

    $ cat xgetbv.c
    #include <stdio.h>
    int main(void)
    {
        unsigned xcr0_h, xcr0_l;
        asm("xgetbv" : "=d"(xcr0_h), "=a"(xcr0_l) : "c"(0));
        printf("%08x:%08x\n", xcr0_h, xcr0_l);
    }
    $ gcc xgetbv.c -O2
    $ for i in `seq 0 55`; do echo $i `taskset -c $i ./a.out`; done|grep -v 007
    19 00000000:00000003

I'm going to rerun the tests without this patch, as it seems the most
likely culprit, and leave it out of the pull request if they pass.

Paolo

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e260ccb..8df1167 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -700,7 +700,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
>  		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
>  			return 1;
>  	}
> -	kvm_put_guest_xcr0(vcpu);
>  	vcpu->arch.xcr0 = xcr0;
>  
>  	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
> @@ -6590,8 +6589,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->prepare_guest_switch(vcpu);
>  	if (vcpu->fpu_active)
>  		kvm_load_guest_fpu(vcpu);
> -	kvm_load_guest_xcr0(vcpu);
> -
>  	vcpu->mode = IN_GUEST_MODE;
>  
>  	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> @@ -6607,6 +6604,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	local_irq_disable();
>  
> +	kvm_load_guest_xcr0(vcpu);
> +
>  	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
>  	    || need_resched() || signal_pending(current)) {
>  		vcpu->mode = OUTSIDE_GUEST_MODE;
> @@ -6667,6 +6666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	vcpu->mode = OUTSIDE_GUEST_MODE;
>  	smp_wmb();
>  
> +	kvm_put_guest_xcr0(vcpu);
> +
>  	/* Interrupt is enabled by handle_external_intr() */
>  	kvm_x86_ops->handle_external_intr(vcpu);
>  
> @@ -7314,7 +7315,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>  	 * and assume host would use all available bits.
>  	 * Guest xcr0 would be loaded later.
>  	 */
> -	kvm_put_guest_xcr0(vcpu);
>  	vcpu->guest_fpu_loaded = 1;
>  	__kernel_fpu_begin();
>  	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
> @@ -7323,8 +7323,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>  
>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -	kvm_put_guest_xcr0(vcpu);
> -
>  	if (!vcpu->guest_fpu_loaded) {
>  		vcpu->fpu_counter = 0;
>  		return;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack April 5, 2016, 3:56 p.m. UTC | #3
On Tue, Apr 5, 2016 at 4:28 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
...
>
> While running my acceptance tests, in one case I got one CPU whose xcr0
> had leaked into the host.  This showed up as a SIGILL in strncasecmp's
> AVX code, and a simple program confirmed it:
>
>     $ cat xgetbv.c
>     #include <stdio.h>
>     int main(void)
>     {
>         unsigned xcr0_h, xcr0_l;
>         asm("xgetbv" : "=d"(xcr0_h), "=a"(xcr0_l) : "c"(0));
>         printf("%08x:%08x\n", xcr0_h, xcr0_l);
>     }
>     $ gcc xgetbv.c -O2
>     $ for i in `seq 0 55`; do echo $i `taskset -c $i ./a.out`; done|grep -v 007
>     19 00000000:00000003
>
> I'm going to rerun the tests without this patch, as it seems the most
> likely culprit, and leave it out of the pull request if they pass.

Agreed this is a very likely culprit. I think I see one way the
guest's xcr0 can leak into the host. I will do some testing an send
another version. Thanks.

>
> Paolo
>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index e260ccb..8df1167 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -700,7 +700,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
>>               if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
>>                       return 1;
>>       }
>> -     kvm_put_guest_xcr0(vcpu);
>>       vcpu->arch.xcr0 = xcr0;
>>
>>       if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
>> @@ -6590,8 +6589,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>       kvm_x86_ops->prepare_guest_switch(vcpu);
>>       if (vcpu->fpu_active)
>>               kvm_load_guest_fpu(vcpu);
>> -     kvm_load_guest_xcr0(vcpu);
>> -
>>       vcpu->mode = IN_GUEST_MODE;
>>
>>       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
>> @@ -6607,6 +6604,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>
>>       local_irq_disable();
>>
>> +     kvm_load_guest_xcr0(vcpu);
>> +
>>       if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
>>           || need_resched() || signal_pending(current)) {

Here, after we've loaded the guest xcr0, if we enter this if
statement, we return from vcpu_enter_guest with the guest's xcr0 still
loaded.

>>               vcpu->mode = OUTSIDE_GUEST_MODE;
>> @@ -6667,6 +6666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>       vcpu->mode = OUTSIDE_GUEST_MODE;
>>       smp_wmb();
>>
>> +     kvm_put_guest_xcr0(vcpu);
>> +
>>       /* Interrupt is enabled by handle_external_intr() */
>>       kvm_x86_ops->handle_external_intr(vcpu);
>>
>> @@ -7314,7 +7315,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>>        * and assume host would use all available bits.
>>        * Guest xcr0 would be loaded later.
>>        */
>> -     kvm_put_guest_xcr0(vcpu);
>>       vcpu->guest_fpu_loaded = 1;
>>       __kernel_fpu_begin();
>>       __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
>> @@ -7323,8 +7323,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>>
>>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>>  {
>> -     kvm_put_guest_xcr0(vcpu);
>> -
>>       if (!vcpu->guest_fpu_loaded) {
>>               vcpu->fpu_counter = 0;
>>               return;
>>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini April 5, 2016, 4:07 p.m. UTC | #4
On 05/04/2016 17:56, David Matlack wrote:
> > I'm going to rerun the tests without this patch, as it seems the most
> > likely culprit, and leave it out of the pull request if they pass.
>
> Agreed this is a very likely culprit. I think I see one way the
> guest's xcr0 can leak into the host. I will do some testing an send
> another version. Thanks.

Tests passed without this patch.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini April 7, 2016, 9:08 a.m. UTC | #5
On 05/04/2016 17:56, David Matlack wrote:
> On Tue, Apr 5, 2016 at 4:28 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
> ...
>>
>> While running my acceptance tests, in one case I got one CPU whose xcr0
>> had leaked into the host.  This showed up as a SIGILL in strncasecmp's
>> AVX code, and a simple program confirmed it:
>>
>>     $ cat xgetbv.c
>>     #include <stdio.h>
>>     int main(void)
>>     {
>>         unsigned xcr0_h, xcr0_l;
>>         asm("xgetbv" : "=d"(xcr0_h), "=a"(xcr0_l) : "c"(0));
>>         printf("%08x:%08x\n", xcr0_h, xcr0_l);
>>     }
>>     $ gcc xgetbv.c -O2
>>     $ for i in `seq 0 55`; do echo $i `taskset -c $i ./a.out`; done|grep -v 007
>>     19 00000000:00000003
>>
>> I'm going to rerun the tests without this patch, as it seems the most
>> likely culprit, and leave it out of the pull request if they pass.
> 
> Agreed this is a very likely culprit. I think I see one way the
> guest's xcr0 can leak into the host.

That's cancel_injection, right?  If it's just about moving the load call
below, I can do that.  Hmm, I will even test that today. :)

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack April 7, 2016, 6:18 p.m. UTC | #6
On Thu, Apr 7, 2016 at 2:08 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 05/04/2016 17:56, David Matlack wrote:
>> On Tue, Apr 5, 2016 at 4:28 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>> ...
>>>
>>> While running my acceptance tests, in one case I got one CPU whose xcr0
>>> had leaked into the host.  This showed up as a SIGILL in strncasecmp's
>>> AVX code, and a simple program confirmed it:
>>>
>>>     $ cat xgetbv.c
>>>     #include <stdio.h>
>>>     int main(void)
>>>     {
>>>         unsigned xcr0_h, xcr0_l;
>>>         asm("xgetbv" : "=d"(xcr0_h), "=a"(xcr0_l) : "c"(0));
>>>         printf("%08x:%08x\n", xcr0_h, xcr0_l);
>>>     }
>>>     $ gcc xgetbv.c -O2
>>>     $ for i in `seq 0 55`; do echo $i `taskset -c $i ./a.out`; done|grep -v 007
>>>     19 00000000:00000003
>>>
>>> I'm going to rerun the tests without this patch, as it seems the most
>>> likely culprit, and leave it out of the pull request if they pass.
>>
>> Agreed this is a very likely culprit. I think I see one way the
>> guest's xcr0 can leak into the host.
>
> That's cancel_injection, right?  If it's just about moving the load call
> below, I can do that.  Hmm, I will even test that today. :)

Yes that's what I was thinking, move kvm_load_guest_xcr0 below that if.

Thank you :). Let me know how testing goes.

>
> Paolo
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini April 7, 2016, 7:03 p.m. UTC | #7
----- Original Message -----
> >>> While running my acceptance tests, in one case I got one CPU whose xcr0
> >>> had leaked into the host.  This showed up as a SIGILL in strncasecmp's
> >>> AVX code, and a simple program confirmed it:
> >>>
> >>>     $ cat xgetbv.c
> >>>     #include <stdio.h>
> >>>     int main(void)
> >>>     {
> >>>         unsigned xcr0_h, xcr0_l;
> >>>         asm("xgetbv" : "=d"(xcr0_h), "=a"(xcr0_l) : "c"(0));
> >>>         printf("%08x:%08x\n", xcr0_h, xcr0_l);
> >>>     }
> >>>     $ gcc xgetbv.c -O2
> >>>     $ for i in `seq 0 55`; do echo $i `taskset -c $i ./a.out`; done|grep
> >>>     -v 007
> >>>     19 00000000:00000003
> >>>
> >>> I'm going to rerun the tests without this patch, as it seems the most
> >>> likely culprit, and leave it out of the pull request if they pass.
> >>
> >> Agreed this is a very likely culprit. I think I see one way the
> >> guest's xcr0 can leak into the host.
> >
> > That's cancel_injection, right?  If it's just about moving the load call
> > below, I can do that.  Hmm, I will even test that today. :)
> 
> Yes that's what I was thinking, move kvm_load_guest_xcr0 below that if.
> 
> Thank you :). Let me know how testing goes.

It went well.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack April 8, 2016, 4:25 p.m. UTC | #8
On Thu, Apr 7, 2016 at 12:03 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> Thank you :). Let me know how testing goes.
>
> It went well.

Great! How should we proceed?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini April 8, 2016, 4:50 p.m. UTC | #9
On 08/04/2016 18:25, David Matlack wrote:
> On Thu, Apr 7, 2016 at 12:03 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>> Thank you :). Let me know how testing goes.
>>
>> It went well.
> 
> Great! How should we proceed?

It will appear very soon on kvm/next and Radim will send the pull
request to Linus next week (I'm having him practice before I go on
vacation ;)).

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack April 8, 2016, 4:54 p.m. UTC | #10
On Fri, Apr 8, 2016 at 9:50 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 08/04/2016 18:25, David Matlack wrote:
>> On Thu, Apr 7, 2016 at 12:03 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>>
>>>> Thank you :). Let me know how testing goes.
>>>
>>> It went well.
>>
>> Great! How should we proceed?
>
> It will appear very soon on kvm/next and Radim will send the pull
> request to Linus next week (I'm having him practice before I go on
> vacation ;)).

Makes sense. Thanks for taking care of it!

>
> Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li April 22, 2016, 7:30 a.m. UTC | #11
Hi Paolo and David,
2016-03-31 3:24 GMT+08:00 David Matlack <dmatlack@google.com>:
> An interrupt handler that uses the fpu can kill a KVM VM, if it runs
> under the following conditions:
>  - the guest's xcr0 register is loaded on the cpu
>  - the guest's fpu context is not loaded
>  - the host is using eagerfpu
>
> Note that the guest's xcr0 register and fpu context are not loaded as
> part of the atomic world switch into "guest mode". They are loaded by
> KVM while the cpu is still in "host mode".
>
> Usage of the fpu in interrupt context is gated by irq_fpu_usable(). The
> interrupt handler will look something like this:
>
> if (irq_fpu_usable()) {
>         kernel_fpu_begin();
>
>         [... code that uses the fpu ...]
>
>         kernel_fpu_end();
> }
>
> As long as the guest's fpu is not loaded and the host is using eager
> fpu, irq_fpu_usable() returns true (interrupted_kernel_fpu_idle()
> returns true). The interrupt handler proceeds to use the fpu with
> the guest's xcr0 live.
>
> kernel_fpu_begin() saves the current fpu context. If this uses
> XSAVE[OPT], it may leave the xsave area in an undesirable state.
> According to the SDM, during XSAVE bit i of XSTATE_BV is not modified
> if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and
> xcr0[i] == 0 following an XSAVE.

How XSAVE save bit i since SDM mentioned that "XSAVE saves state
component i if and only if RFBM[i] = 1. "?  RFBM[i] will be 0 if
XSTATE_BV[i] == 1 && guest xcr0[i] == 0.

Regards,
Wanpeng Li

>
> kernel_fpu_end() restores the fpu context. Now if any bit i in
> XSTATE_BV == 1 while xcr0[i] == 0, XRSTOR generates a #GP. The
> fault is trapped and SIGSEGV is delivered to the current process.
>
> Only pre-4.2 kernels appear to be vulnerable to this sequence of
> events. Commit 653f52c ("kvm,x86: load guest FPU context more eagerly")
> from 4.2 forces the guest's fpu to always be loaded on eagerfpu hosts.
>
> This patch fixes the bug by keeping the host's xcr0 loaded outside
> of the interrupts-disabled region where KVM switches into guest mode.
>
> Cc: stable@vger.kernel.org
> Suggested-by: Andy Lutomirski <luto@amacapital.net>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/x86.c | 10 ++++------
>  1 file changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e260ccb..8df1167 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -700,7 +700,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
>                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
>                         return 1;
>         }
> -       kvm_put_guest_xcr0(vcpu);
>         vcpu->arch.xcr0 = xcr0;
>
>         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
> @@ -6590,8 +6589,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         kvm_x86_ops->prepare_guest_switch(vcpu);
>         if (vcpu->fpu_active)
>                 kvm_load_guest_fpu(vcpu);
> -       kvm_load_guest_xcr0(vcpu);
> -
>         vcpu->mode = IN_GUEST_MODE;
>
>         srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> @@ -6607,6 +6604,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
>         local_irq_disable();
>
> +       kvm_load_guest_xcr0(vcpu);
> +
>         if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
>             || need_resched() || signal_pending(current)) {
>                 vcpu->mode = OUTSIDE_GUEST_MODE;
> @@ -6667,6 +6666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         vcpu->mode = OUTSIDE_GUEST_MODE;
>         smp_wmb();
>
> +       kvm_put_guest_xcr0(vcpu);
> +
>         /* Interrupt is enabled by handle_external_intr() */
>         kvm_x86_ops->handle_external_intr(vcpu);
>
> @@ -7314,7 +7315,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>          * and assume host would use all available bits.
>          * Guest xcr0 would be loaded later.
>          */
> -       kvm_put_guest_xcr0(vcpu);
>         vcpu->guest_fpu_loaded = 1;
>         __kernel_fpu_begin();
>         __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
> @@ -7323,8 +7323,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>
>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -       kvm_put_guest_xcr0(vcpu);
> -
>         if (!vcpu->guest_fpu_loaded) {
>                 vcpu->fpu_counter = 0;
>                 return;
> --
> 2.8.0.rc3.226.g39d4020
>
David Matlack April 22, 2016, 5:21 p.m. UTC | #12
On Fri, Apr 22, 2016 at 12:30 AM, Wanpeng Li <kernellwp@gmail.com> wrote:
> Hi Paolo and David,
> 2016-03-31 3:24 GMT+08:00 David Matlack <dmatlack@google.com>:
>>
>> kernel_fpu_begin() saves the current fpu context. If this uses
>> XSAVE[OPT], it may leave the xsave area in an undesirable state.
>> According to the SDM, during XSAVE bit i of XSTATE_BV is not modified
>> if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and
>> xcr0[i] == 0 following an XSAVE.
>
> How XSAVE save bit i since SDM mentioned that "XSAVE saves state
> component i if and only if RFBM[i] = 1. "?  RFBM[i] will be 0 if
> XSTATE_BV[i] == 1 && guest xcr0[i] == 0.

You are correct, RFBM[i] will be 0 and XSAVE does not save state
component i in this case. However, XSTATE_BV[i] is left untouched by
XSAVE (left as 1). On XRSTOR, the CPU checks if XSTATE_BV[i] == 1 &&
xcr0[i] == 0, and if so delivers a #GP.

If you are wondering how XSTATE_BV[i] could be 1 in the first place, I
suspect it is left over from a previous XSAVE (which sets XSTATE_BV[i]
to the value in XINUSE[i]).

>
> Regards,
> Wanpeng Li
>
>>
>> kernel_fpu_end() restores the fpu context. Now if any bit i in
>> XSTATE_BV == 1 while xcr0[i] == 0, XRSTOR generates a #GP. The
>> fault is trapped and SIGSEGV is delivered to the current process.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li April 23, 2016, 10:37 p.m. UTC | #13
2016-04-23 1:21 GMT+08:00 David Matlack <dmatlack@google.com>:
> On Fri, Apr 22, 2016 at 12:30 AM, Wanpeng Li <kernellwp@gmail.com> wrote:
>> Hi Paolo and David,
>> 2016-03-31 3:24 GMT+08:00 David Matlack <dmatlack@google.com>:
>>>
>>> kernel_fpu_begin() saves the current fpu context. If this uses
>>> XSAVE[OPT], it may leave the xsave area in an undesirable state.
>>> According to the SDM, during XSAVE bit i of XSTATE_BV is not modified
>>> if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and
>>> xcr0[i] == 0 following an XSAVE.
>>
>> How XSAVE save bit i since SDM mentioned that "XSAVE saves state
>> component i if and only if RFBM[i] = 1. "?  RFBM[i] will be 0 if
>> XSTATE_BV[i] == 1 && guest xcr0[i] == 0.
>
> You are correct, RFBM[i] will be 0 and XSAVE does not save state
> component i in this case. However, XSTATE_BV[i] is left untouched by
> XSAVE (left as 1). On XRSTOR, the CPU checks if XSTATE_BV[i] == 1 &&
> xcr0[i] == 0, and if so delivers a #GP.

However, SDM also mentioned that "If RFBM[i] = 0, XRSTOR does not
update state component i." So we #GP on a don't need restore bit i if
XSTATE_BV[I] == 1 && xcr0[0] ==0. That's where I miss I think, thanks
for your explanation.

Regard,
Wanpeng Li

>
> If you are wondering how XSTATE_BV[i] could be 1 in the first place, I
> suspect it is left over from a previous XSAVE (which sets XSTATE_BV[i]
> to the value in XINUSE[i]).
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e260ccb..8df1167 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -700,7 +700,6 @@  static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
 			return 1;
 	}
-	kvm_put_guest_xcr0(vcpu);
 	vcpu->arch.xcr0 = xcr0;
 
 	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -6590,8 +6589,6 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	if (vcpu->fpu_active)
 		kvm_load_guest_fpu(vcpu);
-	kvm_load_guest_xcr0(vcpu);
-
 	vcpu->mode = IN_GUEST_MODE;
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -6607,6 +6604,8 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	local_irq_disable();
 
+	kvm_load_guest_xcr0(vcpu);
+
 	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
 	    || need_resched() || signal_pending(current)) {
 		vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -6667,6 +6666,8 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
+	kvm_put_guest_xcr0(vcpu);
+
 	/* Interrupt is enabled by handle_external_intr() */
 	kvm_x86_ops->handle_external_intr(vcpu);
 
@@ -7314,7 +7315,6 @@  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 	 * and assume host would use all available bits.
 	 * Guest xcr0 would be loaded later.
 	 */
-	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
 	__kernel_fpu_begin();
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
@@ -7323,8 +7323,6 @@  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	kvm_put_guest_xcr0(vcpu);
-
 	if (!vcpu->guest_fpu_loaded) {
 		vcpu->fpu_counter = 0;
 		return;