x86/HVM: correct notion of new CPL in task switch emulation

Message ID 59302098020000780015EB40@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich June 1, 2017, 12:11 p.m. UTC
Commit aac1df3d03 ("x86/HVM: introduce hvm_get_cpl() and respective
hook") went too far in one aspect: When emulating a task switch we
really shouldn't be looking at what hvm_get_cpl() returns, as we're
switching all segment registers.

However, instead of reverting the relevant parts of that commit, have
the caller tell the segment loading function what the new CPL is. This
at once fixes the issue that ES, being loaded before CS, so far had
its checks done against the old CPL.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
An alternative to adding yet another parameter to the function would
be to have "cpl" have a special case value (e.g. negative) to indicate
VM86 mode. That would allow replacing the current "eflags" parameter.
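
For illustration only, a sketch of what that alternative might look
like (hypothetical, i.e. not what this patch does):

    /*
     * A negative "cpl" would denote VM86 mode, with callers passing
     *     tss.eflags & X86_EFLAGS_VM ? -1 : tss.cs & 3
     * so the separate "eflags" parameter could be dropped.
     */
    static int hvm_load_segment_selector(
        enum x86_segment seg, uint16_t sel, int cpl)
    {
        if ( cpl < 0 )
            return 0; /* VM86: base = sel << 4, no descriptor checks. */

        /* ... protected mode descriptor checks against cpl ... */
        return 0;
    }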

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2616,11 +2616,11 @@ static void hvm_unmap_entry(void *p)
 }
 
 static int hvm_load_segment_selector(
-    enum x86_segment seg, uint16_t sel, unsigned int eflags)
+    enum x86_segment seg, uint16_t sel, unsigned int cpl, unsigned int eflags)
 {
     struct segment_register desctab, segr;
     struct desc_struct *pdesc, desc;
-    u8 dpl, rpl, cpl;
+    u8 dpl, rpl;
     bool_t writable;
     int fault_type = TRAP_invalid_tss;
     struct vcpu *v = current;
@@ -2674,7 +2674,6 @@ static int hvm_load_segment_selector(
 
         dpl = (desc.b >> 13) & 3;
         rpl = sel & 3;
-        cpl = hvm_get_cpl(v);
 
         switch ( seg )
         {
@@ -2804,7 +2803,7 @@ void hvm_task_switch(
     struct segment_register gdt, tr, prev_tr, segr;
     struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
     bool_t otd_writable, ntd_writable;
-    unsigned int eflags;
+    unsigned int eflags, new_cpl;
     pagefault_info_t pfinfo;
     int exn_raised, rc;
     struct tss32 tss;
@@ -2918,7 +2917,9 @@ void hvm_task_switch(
     if ( rc != HVMCOPY_okay )
         goto out;
 
-    if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt, 0) )
+    new_cpl = tss.eflags & X86_EFLAGS_VM ? 3 : tss.cs & 3;
+
+    if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
         goto out;
 
     rc = hvm_set_cr3(tss.cr3, 1);
@@ -2939,12 +2940,12 @@ void hvm_task_switch(
     regs->rdi    = tss.edi;
 
     exn_raised = 0;
-    if ( hvm_load_segment_selector(x86_seg_es, tss.es, tss.eflags) ||
-         hvm_load_segment_selector(x86_seg_cs, tss.cs, tss.eflags) ||
-         hvm_load_segment_selector(x86_seg_ss, tss.ss, tss.eflags) ||
-         hvm_load_segment_selector(x86_seg_ds, tss.ds, tss.eflags) ||
-         hvm_load_segment_selector(x86_seg_fs, tss.fs, tss.eflags) ||
-         hvm_load_segment_selector(x86_seg_gs, tss.gs, tss.eflags) )
+    if ( hvm_load_segment_selector(x86_seg_es, tss.es, new_cpl, tss.eflags) ||
+         hvm_load_segment_selector(x86_seg_cs, tss.cs, new_cpl, tss.eflags) ||
+         hvm_load_segment_selector(x86_seg_ss, tss.ss, new_cpl, tss.eflags) ||
+         hvm_load_segment_selector(x86_seg_ds, tss.ds, new_cpl, tss.eflags) ||
+         hvm_load_segment_selector(x86_seg_fs, tss.fs, new_cpl, tss.eflags) ||
+         hvm_load_segment_selector(x86_seg_gs, tss.gs, new_cpl, tss.eflags) )
         exn_raised = 1;
 
     if ( taskswitch_reason == TSW_call_or_int )

Comments

Andrew Cooper June 2, 2017, 8:02 p.m. UTC | #1
On 01/06/17 13:11, Jan Beulich wrote:
> Commit aac1df3d03 ("x86/HVM: introduce hvm_get_cpl() and respective
> hook") went too far in one aspect: When emulating a task switch we
> really shouldn't be looking at what hvm_get_cpl() returns, as we're
> switching all segment registers.
>
> However, instead of reverting the relevant parts of that commit, have
> the caller tell the segment loading function what the new CPL is. This
> at once fixes the issue that ES, being loaded before CS, so far had
> its checks done against the old CPL.

I'd have an extra paragraph describing the symptoms in practice.  e.g.

This bug manifests as a vmentry failure for 32bit VMs which use task
gates to service interrupts/exceptions, in situations where delivering
the event interrupts user code, and a privilege increase is required.

?

>
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

I have finally managed to reproduce the original vmentry failure with an
XTF test.  This patch resolves the issue, so

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>

Julien: This really should be taken in 4.9, otherwise 32bit VMs will
sporadically crash, especially Windows, which uses this mechanism to
handle NMIs.

> ---
> An alternative to adding yet another parameter to the function would
> be to have "cpl" have a special case value (e.g. negative) to indicate
> VM86 mode. That would allow replacing the current "eflags" parameter.

Keeping the parameters separate is clearer.  It is not like this is a
hotpath we need to optimise.

~Andrew
Andrew Cooper June 2, 2017, 8:33 p.m. UTC | #2
On 02/06/17 21:02, Andrew Cooper wrote:
> On 01/06/17 13:11, Jan Beulich wrote:
>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> I have finally managed to reproduce the original vmentry failure with an
> XTF test.

FWIW, the vmentry failure is quite subtle.

%es gets reloaded first.  If the new TSS uses RPL0 data selectors, the
load fails, and #TS[%es] is yielded.

(d3) Going to userspace
(XEN) ** d3v0 Inject event { v 0x02, t 2, ec ffffffff }
(XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
(XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
(XEN) d3v0 Triple fault - invoking HVM shutdown action 1
(XEN) *** Dumping Dom3 vcpu#0 state: ***
(XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----

For some reason I haven't gotten to the bottom of yet, we end up
calling __vmx_inject_exception() twice while handling the task switch
path.  We shouldn't be.

If however the TSS uses RPL3 data selectors, the %es load succeeds, then
the %cs load succeeds (rpl is 0 as this is an external interrupt
delivery).  The %ss load then fails because the old cpl is 3 (being in
userspace) but the new dpl is 0.  This then yields #TS[%ss], but the
VMCS is in a state with %cs and %ss disagreeing.
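
For reference, a minimal standalone sketch of the two checks in play,
simplified from the SDM rules (not the Xen code itself):

    #include <stdint.h>

    /* Data segments (%es/%ds/%fs/%gs): #TS unless DPL >= max(CPL, RPL). */
    static int data_seg_ok(uint8_t dpl, uint8_t rpl, uint8_t cpl)
    {
        return dpl >= (cpl > rpl ? cpl : rpl);
    }

    /* Stack segment (%ss): #TS unless RPL == CPL and DPL == CPL. */
    static int stack_seg_ok(uint8_t dpl, uint8_t rpl, uint8_t cpl)
    {
        return rpl == cpl && dpl == cpl;
    }

With the pre-patch code, the cpl used for these checks still comes from
hvm_get_cpl(), i.e. 3 while interrupting userspace: an RPL0/DPL0 %es
fails data_seg_ok(), and a DPL0 %ss fails stack_seg_ok() even after %cs
has been loaded.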

~Andrew
Andrew Cooper June 5, 2017, 1:06 p.m. UTC | #3
On 01/06/17 13:11, Jan Beulich wrote:
> Commit aac1df3d03 ("x86/HVM: introduce hvm_get_cpl() and respective
> hook") went too far in one aspect: When emulating a task switch we
> really shouldn't be looking at what hvm_get_cpl() returns, as we're
> switching all segment registers.
>
> However, instead of reverting the relevant parts of that commit, have
> the caller tell the segment loading function what the new CPL is. This
> at once fixes the issue that ES, being loaded before CS, so far had
> its checks done against the old CPL.
>
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

On further consideration, wouldn't it be better to audit all segment
registers, before updating any of them in the vmcs/vmcb?  This would
leave us with a far lower chance of other vmentry failures.

Loading the segment registers is beyond the commit point of a task
switch, and the manual says that the processor will try to skip further
segmentation checks in an attempt to deliver a fault in the new context.
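
Roughly this shape (a sketch only; validate_segment() and
commit_segment() are hypothetical helpers, not existing functions):

    struct pending_seg {
        enum x86_segment seg;
        uint16_t sel;
        struct desc_struct desc; /* descriptor fetched during the audit */
    };

    static int load_all_segments(struct pending_seg *segs, unsigned int n,
                                 unsigned int new_cpl)
    {
        unsigned int i;
        int rc;

        for ( i = 0; i < n; ++i ) /* audit phase: no side effects */
            if ( (rc = validate_segment(&segs[i], new_cpl)) != 0 )
                return rc; /* fault is raised in the new context */

        for ( i = 0; i < n; ++i ) /* commit phase: cannot fail */
            commit_segment(&segs[i]);

        return 0;
    }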

~Andrew
Jan Beulich June 6, 2017, 6:42 a.m. UTC | #4
>>> On 05.06.17 at 15:06, <andrew.cooper3@citrix.com> wrote:
> On 01/06/17 13:11, Jan Beulich wrote:
>> Commit aac1df3d03 ("x86/HVM: introduce hvm_get_cpl() and respective
>> hook") went too far in one aspect: When emulating a task switch we
>> really shouldn't be looking at what hvm_get_cpl() returns, as we're
>> switching all segment registers.
>>
>> However, instead of reverting the relevant parts of that commit, have
>> the caller tell the segment loading function what the new CPL is. This
>> at once fixes the issue that ES, being loaded before CS, so far had
>> its checks done against the old CPL.
>>
>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> On further consideration, wouldn't it be better to audit all segment
> registers, before updating any of them in the vmcs/vmcb?  This would
> leave us with a far lower chance of other vmentry failures.

Overall yes (and I did make a note on my todo list), but I think we
want to address the regression without any meaningful re-work right
now.

Jan
Jan Beulich June 6, 2017, 7:06 a.m. UTC | #5
>>> On 02.06.17 at 22:33, <andrew.cooper3@citrix.com> wrote:
> On 02/06/17 21:02, Andrew Cooper wrote:
>> On 01/06/17 13:11, Jan Beulich wrote:
>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> I have finally managed to reproduce the original vmentry failure with an
>> XTF test.
> 
> FWIW, the vmentry failure is quite subtle.
> 
> %es gets reloaded first.  If the new TSS uses RPL0 data selectors, the
> load fails, and #TS[%es] is yielded.
> 
> (d3) Going to userspace
> (XEN) ** d3v0 Inject event { v 0x02, t 2, ec ffffffff }
> (XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
> (XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
> (XEN) d3v0 Triple fault - invoking HVM shutdown action 1
> (XEN) *** Dumping Dom3 vcpu#0 state: ***
> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
> 
> For some reason I haven't gotten to the bottom of yet, we end up
> calling __vmx_inject_exception() twice while handling the task switch
> path.  We shouldn't be.

There's no sign of #DF above - how are you handling that? Is the
above perhaps a 2nd task switch to handle #DF?

Jan
Andrew Cooper June 6, 2017, 10:21 a.m. UTC | #6
On 06/06/17 08:06, Jan Beulich wrote:
>>>> On 02.06.17 at 22:33, <andrew.cooper3@citrix.com> wrote:
>> On 02/06/17 21:02, Andrew Cooper wrote:
>>> On 01/06/17 13:11, Jan Beulich wrote:
>>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> I have finally managed to reproduce the original vmentry failure with an
>>> XTF test.
>> FWIW, the vmentry failure is quite subtle.
>>
>> %es gets reloaded first.  If the new TSS uses RPL0 data selectors, the
>> load fails, and #TS[%es] is yielded.
>>
>> (d3) Going to userspace
>> (XEN) ** d3v0 Inject event { v 0x02, t 2, ec ffffffff }
>> (XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
>> (XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
>> (XEN) d3v0 Triple fault - invoking HVM shutdown action 1
>> (XEN) *** Dumping Dom3 vcpu#0 state: ***
>> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
>>
>> For some reason I haven't gotten to the bottom of yet, we end up
>> calling __vmx_inject_exception() twice while handling the task switch
>> path.  We shouldn't be.
> There's no sign of #DF above - how are you handling that? Is the
> above perhaps a 2nd task switch to handle #DF?

The sequence of events is:

d3v0 raises self NMI.
(XEN) ** d3v0 Inject event { v 0x02, t 2, ec ffffffff }
vmentry
vmexit(task_switch)
(XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
(XEN) ** d3v0 Inject event { v 0x0a, t 3, ec 0018 }
vmentry
vmexit(triple_fault)

I expect the triple fault has something to do with the fact that we had
an incomplete update of the segment registers.

~Andrew
Andrew Cooper June 6, 2017, 10:21 a.m. UTC | #7
On 06/06/17 07:42, Jan Beulich wrote:
>>>> On 05.06.17 at 15:06, <andrew.cooper3@citrix.com> wrote:
>> On 01/06/17 13:11, Jan Beulich wrote:
>>> Commit aac1df3d03 ("x86/HVM: introduce hvm_get_cpl() and respective
>>> hook") went too far in one aspect: When emulating a task switch we
>>> really shouldn't be looking at what hvm_get_cpl() returns, as we're
>>> switching all segment registers.
>>>
>>> However, instead of reverting the relevant parts of that commit, have
>>> the caller tell the segment loading function what the new CPL is. This
>>> at once fixes the issue that ES, being loaded before CS, so far had
>>> its checks done against the old CPL.
>>>
>>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> On further consideration, wouldn't it be better to audit all segment
>> registers, before updating any of them in the vmcs/vmcb?  This would
>> leave us with a far lower chance of other vmentry failures.
> Overall yes (and I did make a note on my todo list), but I think we
> want to address the regression without any meaningful re-work right
> now.

Entirely reasonable.

~Andrew
Julien Grall June 6, 2017, 5:14 p.m. UTC | #8
Hi Andrew,

On 02/06/17 21:02, Andrew Cooper wrote:
> On 01/06/17 13:11, Jan Beulich wrote:
>> Commit aac1df3d03 ("x86/HVM: introduce hvm_get_cpl() and respective
>> hook") went too far in one aspect: When emulating a task switch we
>> really shouldn't be looking at what hvm_get_cpl() returns, as we're
>> switching all segment registers.
>>
>> However, instead of reverting the relevant parts of that commit, have
>> the caller tell the segment loading function what the new CPL is. This
>> at once fixes the issue that ES, being loaded before CS, so far had
>> its checks done against the old CPL.
>
> I'd have an extra paragraph describing the symptoms in practice.  e.g.
>
> This bug manifests as a vmentry failure for 32bit VMs which use task
> gates to service interrupts/exceptions, in situations where delivering
> the event interrupts user code, and a privilege increase is required.
>
> ?
>
>>
>> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> I have finally managed to reproduce the original vmentry failure with an
> XTF test.  This patch resolves the issue, so
>
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
>
> Julien: This really should be taken in 4.9, otherwise 32bit VMs will
> sporadically crash, especially Windows, which uses this mechanism to
> handle NMIs.

Release-acked-by: Julien Grall <julien.grall@arm.com>

Cheers,

>
>> ---
>> An alternative to adding yet another parameter to the function would
>> be to have "cpl" have a special case value (e.g. negative) to indicate
>> VM86 mode. That would allow replacing the current "eflags" parameter.
>
> Keeping the parameters separate is clearer.  It is not like this is a
> hotpath we need to optimise.
>
> ~Andrew
>