diff mbox

x86/kvm/vmx: don't read current->thread.{fs,gs}base of legacy tasks

Message ID 20180711173718.8850-1-vkuznets@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Vitaly Kuznetsov July 11, 2018, 5:37 p.m. UTC
When we switched from doing rdmsr() to reading FS/GS base values from
current->thread we completely forgot about legacy 32-bit userspaces which
we still support in KVM (why?). task->thread.{fsbase,gsbase} are only
synced for 64-bit processes, calling save_fsgs_for_kvm() and using
its result from current is illegal for legacy processes.

There's no ARCH_SET_FS/GS prctls for legacy applications. Base MSRs are,
however, not always equal to zero. Intel's manual says (3.4.4 Segment
Loading Instructions in IA-32e Mode):

"In order to set up compatibility mode for an application, segment-load
instructions (MOV to Sreg, POP Sreg) work normally in 64-bit mode. An
entry is read from the system descriptor table (GDT or LDT) and is loaded
in the hidden portion of the segment register.
...
The hidden descriptor register fields for FS.base and GS.base are
physically mapped to MSRs in order to load all address bits supported by
a 64-bit implementation.
"

The issue was found by strace test suite where 32-bit ioctl_kvm_run test
started segfaulting.

Reported-by: Dmitry V. Levin <ldv@altlinux.org>
Bisected-by: Masatake YAMATO <yamato@redhat.com>
Fixes: 42b933b59721 ("x86/kvm/vmx: read MSR_{FS,KERNEL_GS}_BASE from current->thread")
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/vmx.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

Comments

Wanpeng Li July 12, 2018, 1:39 a.m. UTC | #1
On Thu, 12 Jul 2018 at 08:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>
> When we switched from doing rdmsr() to reading FS/GS base values from
> current->thread we completely forgot about legacy 32-bit userspaces which
> we still support in KVM (why?). task->thread.{fsbase,gsbase} are only
> synced for 64-bit processes, calling save_fsgs_for_kvm() and using
> its result from current is illegal for legacy processes.
>
> There's no ARCH_SET_FS/GS prctls for legacy applications. Base MSRs are,
> however, not always equal to zero. Intel's manual says (3.4.4 Segment
> Loading Instructions in IA-32e Mode):
>
> "In order to set up compatibility mode for an application, segment-load
> instructions (MOV to Sreg, POP Sreg) work normally in 64-bit mode. An
> entry is read from the system descriptor table (GDT or LDT) and is loaded
> in the hidden portion of the segment register.
> ...
> The hidden descriptor register fields for FS.base and GS.base are
> physically mapped to MSRs in order to load all address bits supported by
> a 64-bit implementation.
> "
>
> The issue was found by strace test suite where 32-bit ioctl_kvm_run test
> started segfaulting.

Test suite: MSR switch
PASS: VM entry MSR load
PASS: VM exit MSR store
PASS: VM exit MSR load
FAIL: VM entry MSR load: try to load FS_BASE
SUMMARY: 4 tests, 1 unexpected failures

kvm-unit-tests fails w/ and w/o the patch, maybe it is another issue,
i didn't dig further, you can have a look if you are interested in. :)

Regards,
Wanpeng Li

>
> Reported-by: Dmitry V. Levin <ldv@altlinux.org>
> Bisected-by: Masatake YAMATO <yamato@redhat.com>
> Fixes: 42b933b59721 ("x86/kvm/vmx: read MSR_{FS,KERNEL_GS}_BASE from current->thread")
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..65968649b365 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2560,6 +2560,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
>  #ifdef CONFIG_X86_64
>         int cpu = raw_smp_processor_id();
> +       unsigned long fsbase, kernel_gsbase;
>  #endif
>         int i;
>
> @@ -2575,12 +2576,20 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
>
>  #ifdef CONFIG_X86_64
> -       save_fsgs_for_kvm();
> -       vmx->host_state.fs_sel = current->thread.fsindex;
> -       vmx->host_state.gs_sel = current->thread.gsindex;
> -#else
> -       savesegment(fs, vmx->host_state.fs_sel);
> -       savesegment(gs, vmx->host_state.gs_sel);
> +       if (likely(is_64bit_mm(current->mm))) {
> +               save_fsgs_for_kvm();
> +               vmx->host_state.fs_sel = current->thread.fsindex;
> +               vmx->host_state.gs_sel = current->thread.gsindex;
> +               fsbase = current->thread.fsbase;
> +               kernel_gsbase = current->thread.gsbase;
> +       } else {
> +#endif
> +               savesegment(fs, vmx->host_state.fs_sel);
> +               savesegment(gs, vmx->host_state.gs_sel);
> +#ifdef CONFIG_X86_64
> +               fsbase = read_msr(MSR_FS_BASE);
> +               kernel_gsbase = read_msr(MSR_KERNEL_GS_BASE);
> +       }
>  #endif
>         if (!(vmx->host_state.fs_sel & 7)) {
>                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
> @@ -2600,10 +2609,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>         savesegment(ds, vmx->host_state.ds_sel);
>         savesegment(es, vmx->host_state.es_sel);
>
> -       vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
> +       vmcs_writel(HOST_FS_BASE, fsbase);
>         vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
>
> -       vmx->msr_host_kernel_gs_base = current->thread.gsbase;
> +       vmx->msr_host_kernel_gs_base = kernel_gsbase;
>         if (is_long_mode(&vmx->vcpu))
>                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
>  #else
> --
> 2.14.4
>
Vitaly Kuznetsov July 12, 2018, 9:31 a.m. UTC | #2
Wanpeng Li <kernellwp@gmail.com> writes:

> On Thu, 12 Jul 2018 at 08:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>
>> When we switched from doing rdmsr() to reading FS/GS base values from
>> current->thread we completely forgot about legacy 32-bit userspaces which
>> we still support in KVM (why?). task->thread.{fsbase,gsbase} are only
>> synced for 64-bit processes, calling save_fsgs_for_kvm() and using
>> its result from current is illegal for legacy processes.
>>
>> There's no ARCH_SET_FS/GS prctls for legacy applications. Base MSRs are,
>> however, not always equal to zero. Intel's manual says (3.4.4 Segment
>> Loading Instructions in IA-32e Mode):
>>
>> "In order to set up compatibility mode for an application, segment-load
>> instructions (MOV to Sreg, POP Sreg) work normally in 64-bit mode. An
>> entry is read from the system descriptor table (GDT or LDT) and is loaded
>> in the hidden portion of the segment register.
>> ...
>> The hidden descriptor register fields for FS.base and GS.base are
>> physically mapped to MSRs in order to load all address bits supported by
>> a 64-bit implementation.
>> "
>>
>> The issue was found by strace test suite where 32-bit ioctl_kvm_run test
>> started segfaulting.
>
> Test suite: MSR switch
> PASS: VM entry MSR load
> PASS: VM exit MSR store
> PASS: VM exit MSR load
> FAIL: VM entry MSR load: try to load FS_BASE
> SUMMARY: 4 tests, 1 unexpected failures
>
> kvm-unit-tests fails w/ and w/o the patch, maybe it is another issue,
> i didn't dig further, you can have a look if you are interested in. :)

The patch only changes the behavior for legacy userspaces and I can
reproduce the failure on native x86_64, it is something different. I'm,
however, interested so stay tuned :-)
Vitaly Kuznetsov July 12, 2018, 11:23 a.m. UTC | #3
Vitaly Kuznetsov <vkuznets@redhat.com> writes:

> Wanpeng Li <kernellwp@gmail.com> writes:
>
>> Test suite: MSR switch
>> PASS: VM entry MSR load
>> PASS: VM exit MSR store
>> PASS: VM exit MSR load
>> FAIL: VM entry MSR load: try to load FS_BASE
>> SUMMARY: 4 tests, 1 unexpected failures
>>
>> kvm-unit-tests fails w/ and w/o the patch, maybe it is another issue,
>> i didn't dig further, you can have a look if you are interested in. :)
>
> The patch only changes the behavior for legacy userspaces and I can
> reproduce the failure on native x86_64, it is something different. I'm,
> however, interested so stay tuned :-)

Yes,

the regression was introduced by 

commit e79f245ddec17bbd89d73cd0169dba4be46c9b55
Author: KarimAllah Ahmed <karahmed@amazon.de>
Date:   Sat Apr 14 05:10:52 2018 +0200

    X86/KVM: Properly update 'tsc_offset' to represent the running guest

basically, when nested_vmx_load_msr() fails we don't set
exit_qualification accordingly.

The fix is simple:

@@ -11720,8 +11721,10 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
        msr_entry_idx = nested_vmx_load_msr(vcpu,
                                            vmcs12->vm_entry_msr_load_addr,
                                            vmcs12->vm_entry_msr_load_count);
-       if (msr_entry_idx)
+       if (msr_entry_idx) {
+               exit_qual = msr_entry_idx;
                goto fail;
+       }
 
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point

I'll be sending a patch out shortly. But this is completely orthogonal
to the 'legacy' issue ;-)
Sean Christopherson July 13, 2018, 4:46 p.m. UTC | #4
On Wed, Jul 11, 2018 at 07:37:18PM +0200, Vitaly Kuznetsov wrote:
> When we switched from doing rdmsr() to reading FS/GS base values from
> current->thread we completely forgot about legacy 32-bit userspaces which
> we still support in KVM (why?). task->thread.{fsbase,gsbase} are only
> synced for 64-bit processes, calling save_fsgs_for_kvm() and using
> its result from current is illegal for legacy processes.
> 
> There's no ARCH_SET_FS/GS prctls for legacy applications. Base MSRs are,
> however, not always equal to zero. Intel's manual says (3.4.4 Segment
> Loading Instructions in IA-32e Mode):
> 
> "In order to set up compatibility mode for an application, segment-load
> instructions (MOV to Sreg, POP Sreg) work normally in 64-bit mode. An
> entry is read from the system descriptor table (GDT or LDT) and is loaded
> in the hidden portion of the segment register.
> ...
> The hidden descriptor register fields for FS.base and GS.base are
> physically mapped to MSRs in order to load all address bits supported by
> a 64-bit implementation.
> "
> 
> The issue was found by strace test suite where 32-bit ioctl_kvm_run test
> started segfaulting.
> 
> Reported-by: Dmitry V. Levin <ldv@altlinux.org>
> Bisected-by: Masatake YAMATO <yamato@redhat.com>
> Fixes: 42b933b59721 ("x86/kvm/vmx: read MSR_{FS,KERNEL_GS}_BASE from current->thread")
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..65968649b365 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2560,6 +2560,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  #ifdef CONFIG_X86_64
>  	int cpu = raw_smp_processor_id();
> +	unsigned long fsbase, kernel_gsbase;

Because bikeshedding is fun, what do you think about using fs_base and
kernel_gs_base for these names?  I have a series that touches this
code and also adds local variables for {FS,GS}.base and {FS,GS}.sel.
I used {fs,gs}_base and {fs,gs}_sel to be consistent with the
vmx->host_state nomenclature (the local variables are used to update
the associated vmx->host_state variables), but I'll change my patches
if you have a strong preference for omitting the underscore.

>  #endif
>  	int i;
>  
> @@ -2575,12 +2576,20 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
>  
>  #ifdef CONFIG_X86_64
> -	save_fsgs_for_kvm();
> -	vmx->host_state.fs_sel = current->thread.fsindex;
> -	vmx->host_state.gs_sel = current->thread.gsindex;
> -#else
> -	savesegment(fs, vmx->host_state.fs_sel);
> -	savesegment(gs, vmx->host_state.gs_sel);
> +	if (likely(is_64bit_mm(current->mm))) {
> +		save_fsgs_for_kvm();
> +		vmx->host_state.fs_sel = current->thread.fsindex;
> +		vmx->host_state.gs_sel = current->thread.gsindex;
> +		fsbase = current->thread.fsbase;
> +		kernel_gsbase = current->thread.gsbase;
> +	} else {
> +#endif
> +		savesegment(fs, vmx->host_state.fs_sel);
> +		savesegment(gs, vmx->host_state.gs_sel);
> +#ifdef CONFIG_X86_64
> +		fsbase = read_msr(MSR_FS_BASE);
> +		kernel_gsbase = read_msr(MSR_KERNEL_GS_BASE);
> +	}
>  #endif
>  	if (!(vmx->host_state.fs_sel & 7)) {
>  		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
> @@ -2600,10 +2609,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  	savesegment(ds, vmx->host_state.ds_sel);
>  	savesegment(es, vmx->host_state.es_sel);
>  
> -	vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
> +	vmcs_writel(HOST_FS_BASE, fsbase);
>  	vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
>  
> -	vmx->msr_host_kernel_gs_base = current->thread.gsbase;
> +	vmx->msr_host_kernel_gs_base = kernel_gsbase;
>  	if (is_long_mode(&vmx->vcpu))
>  		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
>  #else
> -- 
> 2.14.4
>
Vitaly Kuznetsov July 13, 2018, 5:10 p.m. UTC | #5
Sean Christopherson <sean.j.christopherson@intel.com> writes:

> On Wed, Jul 11, 2018 at 07:37:18PM +0200, Vitaly Kuznetsov wrote:
>> When we switched from doing rdmsr() to reading FS/GS base values from
>> current->thread we completely forgot about legacy 32-bit userspaces which
>> we still support in KVM (why?). task->thread.{fsbase,gsbase} are only
>> synced for 64-bit processes, calling save_fsgs_for_kvm() and using
>> its result from current is illegal for legacy processes.
>> 
>> There's no ARCH_SET_FS/GS prctls for legacy applications. Base MSRs are,
>> however, not always equal to zero. Intel's manual says (3.4.4 Segment
>> Loading Instructions in IA-32e Mode):
>> 
>> "In order to set up compatibility mode for an application, segment-load
>> instructions (MOV to Sreg, POP Sreg) work normally in 64-bit mode. An
>> entry is read from the system descriptor table (GDT or LDT) and is loaded
>> in the hidden portion of the segment register.
>> ...
>> The hidden descriptor register fields for FS.base and GS.base are
>> physically mapped to MSRs in order to load all address bits supported by
>> a 64-bit implementation.
>> "
>> 
>> The issue was found by strace test suite where 32-bit ioctl_kvm_run test
>> started segfaulting.
>> 
>> Reported-by: Dmitry V. Levin <ldv@altlinux.org>
>> Bisected-by: Masatake YAMATO <yamato@redhat.com>
>> Fixes: 42b933b59721 ("x86/kvm/vmx: read MSR_{FS,KERNEL_GS}_BASE from current->thread")
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/kvm/vmx.c | 25 +++++++++++++++++--------
>>  1 file changed, 17 insertions(+), 8 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 559a12b6184d..65968649b365 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2560,6 +2560,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>  #ifdef CONFIG_X86_64
>>  	int cpu = raw_smp_processor_id();
>> +	unsigned long fsbase, kernel_gsbase;
>
> Because bikeshedding is fun, what do you think about using fs_base and
> kernel_gs_base for these names?  I have a series that touches this
> code and also adds local variables for {FS,GS}.base and {FS,GS}.sel.
> I used {fs,gs}_base and {fs,gs}_sel to be consistent with the
> vmx->host_state nomenclature (the local variables are used to update
> the associated vmx->host_state variables), but I'll change my patches
> if you have a strong preference for omitting the underscore.
>

I have nothing against underscores :-)

Hope this small change can be done by Paolo/Radim upon commit. Or I'll
send v2 if needed.
Paolo Bonzini July 15, 2018, 2:27 p.m. UTC | #6
On 13/07/2018 19:10, Vitaly Kuznetsov wrote:
>> Because bikeshedding is fun, what do you think about using fs_base and
>> kernel_gs_base for these names?  I have a series that touches this
>> code and also adds local variables for {FS,GS}.base and {FS,GS}.sel.
>> I used {fs,gs}_base and {fs,gs}_sel to be consistent with the
>> vmx->host_state nomenclature (the local variables are used to update
>> the associated vmx->host_state variables), but I'll change my patches
>> if you have a strong preference for omitting the underscore.
>>
> I have nothing against underscores :-)
> 
> Hope this small change can be done by Paolo/Radim upon commit. Or I'll
> send v2 if needed.

Yup, I've made the change and queued for 4.18-rc.

Paolo
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 559a12b6184d..65968649b365 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2560,6 +2560,7 @@  static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 #ifdef CONFIG_X86_64
 	int cpu = raw_smp_processor_id();
+	unsigned long fsbase, kernel_gsbase;
 #endif
 	int i;
 
@@ -2575,12 +2576,20 @@  static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 
 #ifdef CONFIG_X86_64
-	save_fsgs_for_kvm();
-	vmx->host_state.fs_sel = current->thread.fsindex;
-	vmx->host_state.gs_sel = current->thread.gsindex;
-#else
-	savesegment(fs, vmx->host_state.fs_sel);
-	savesegment(gs, vmx->host_state.gs_sel);
+	if (likely(is_64bit_mm(current->mm))) {
+		save_fsgs_for_kvm();
+		vmx->host_state.fs_sel = current->thread.fsindex;
+		vmx->host_state.gs_sel = current->thread.gsindex;
+		fsbase = current->thread.fsbase;
+		kernel_gsbase = current->thread.gsbase;
+	} else {
+#endif
+		savesegment(fs, vmx->host_state.fs_sel);
+		savesegment(gs, vmx->host_state.gs_sel);
+#ifdef CONFIG_X86_64
+		fsbase = read_msr(MSR_FS_BASE);
+		kernel_gsbase = read_msr(MSR_KERNEL_GS_BASE);
+	}
 #endif
 	if (!(vmx->host_state.fs_sel & 7)) {
 		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
@@ -2600,10 +2609,10 @@  static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	savesegment(ds, vmx->host_state.ds_sel);
 	savesegment(es, vmx->host_state.es_sel);
 
-	vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
+	vmcs_writel(HOST_FS_BASE, fsbase);
 	vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
 
-	vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+	vmx->msr_host_kernel_gs_base = kernel_gsbase;
 	if (is_long_mode(&vmx->vcpu))
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #else