
[2/2] xen/x86: Introduce a new VMASSIST for architectural behaviour of iopl

Message ID 1458158749-21846-3-git-send-email-andrew.cooper3@citrix.com (mailing list archive)
State New, archived

Commit Message

Andrew Cooper March 16, 2016, 8:05 p.m. UTC
The existing vIOPL interface is hard to use, and need not be.

Introduce a VMASSIST with which a guest can opt in to having vIOPL behaviour
consistent with native hardware (see the opt-in sketch below).

Specifically:
 - virtual iopl updated from do_iret() hypercalls.
 - virtual iopl reported in bounce frames.
 - guest kernels assumed to be level 0 for the purpose of iopl checks.
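
For illustration only (not part of this patch), a guest kernel opts in via the
existing vm_assist hypercall, roughly as in the sketch below.  It assumes the
guest's usual HYPERVISOR_vm_assist() hypercall wrapper and public header
definitions; xen_enable_architectural_iopl() is a hypothetical name.

static bool xen_enable_architectural_iopl(void)
{
    /*
     * vm_assist returns 0 on success.  A non-zero result means the
     * hypervisor does not know VMASST_TYPE_architectural_iopl, so the
     * guest must keep using the existing vIOPL semantics.
     */
    return HYPERVISOR_vm_assist(VMASST_CMD_enable,
                                VMASST_TYPE_architectural_iopl) == 0;
}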

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
---
 xen/arch/x86/domain.c              | 10 +++++++---
 xen/arch/x86/physdev.c             |  2 +-
 xen/arch/x86/traps.c               |  8 ++++++--
 xen/arch/x86/x86_64/asm-offsets.c  |  3 +++
 xen/arch/x86/x86_64/compat/entry.S |  7 ++++++-
 xen/arch/x86/x86_64/compat/traps.c |  4 ++++
 xen/arch/x86/x86_64/entry.S        |  7 ++++++-
 xen/arch/x86/x86_64/traps.c        |  3 +++
 xen/include/asm-x86/config.h       |  1 +
 xen/include/asm-x86/domain.h       |  3 ++-
 xen/include/public/xen.h           |  8 ++++++++
 11 files changed, 47 insertions(+), 9 deletions(-)

Comments

Jan Beulich March 17, 2016, 10:25 a.m. UTC | #1
>>> On 16.03.16 at 21:05, <andrew.cooper3@citrix.com> wrote:
> @@ -1742,8 +1742,10 @@ static void load_segments(struct vcpu *n)
>              cs_and_mask = (unsigned short)regs->cs |
>                  ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
>              /* Fold upcall mask into RFLAGS.IF. */
> -            eflags  = regs->_eflags & ~X86_EFLAGS_IF;
> +            eflags  = regs->_eflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);

This and ...

> @@ -1788,8 +1790,10 @@ static void load_segments(struct vcpu *n)
>              ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
>  
>          /* Fold upcall mask into RFLAGS.IF. */
> -        rflags  = regs->rflags & ~X86_EFLAGS_IF;
> +        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);

... this is not really necessary (but also not wrong) - the actual
EFLAGS.IOPL is always zero (and assumed to be so by code
further down from the respective adjustments you make). For
consistency's sake it might be better to either drop the changes
here, or also adjust the two places masking regs->eflags.

> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -1806,7 +1806,9 @@ static int guest_io_okay(
>  #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
>  
>      if ( !vm86_mode(regs) &&
> -         (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
> +         (MASK_EXTR(v->arch.pv_vcpu.iopl, X86_EFLAGS_IOPL) >=
> +          (guest_kernel_mode(v, regs) ?
> +           (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3)) )
>          return 1;
>  
>      if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
> @@ -2367,7 +2369,9 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
>  
>      case 0xfa: /* CLI */
>      case 0xfb: /* STI */
> -        if ( v->arch.pv_vcpu.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
> +        if ( MASK_EXTR(v->arch.pv_vcpu.iopl, X86_EFLAGS_IOPL) <
> +             (guest_kernel_mode(v, regs) ?
> +              (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3) )
>              goto fail;

The similarity of the two together with the growing complexity
suggests making this a macro or inline function. Additionally, the
resulting binary code would likely be better if you compared
v->arch.pv_vcpu.iopl with MASK_INSR(<literal value>,
X86_EFLAGS_IOPL), even if that means having three
MASK_INSR() (albeit those perhaps again would be hidden in
a macro, e.g.

#define IOPL(n) MASK_INSR(n, X86_EFLAGS_IOPL)

).
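
For concreteness, a minimal sketch of such a helper (the name iopl_ok() is
hypothetical, and this is an illustration of the suggestion rather than the
final code):

#define IOPL(n) MASK_INSR(n, X86_EFLAGS_IOPL)

/* Does the vcpu's current vIOPL (stored pre-shifted) permit the access? */
static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
{
    unsigned int required = guest_kernel_mode(v, regs)
        ? (VM_ASSIST(v->domain, architectural_iopl) ? IOPL(0) : IOPL(1))
        : IOPL(3);

    return v->arch.pv_vcpu.iopl >= required;
}

guest_io_okay() would then simply test iopl_ok(v, regs), and the CLI/STI case
would take its "goto fail" on !iopl_ok(v, regs).  Because v->arch.pv_vcpu.iopl
is already stored MASK_INSR()'d, the MASK_EXTR() shift disappears from the
generated comparison.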

> --- a/xen/arch/x86/x86_64/compat/entry.S
> +++ b/xen/arch/x86/x86_64/compat/entry.S
> @@ -277,9 +277,14 @@ compat_create_bounce_frame:
>          testb %al,%al                   # Bits 0-7: saved_upcall_mask
>          setz  %ch                       # %ch == !saved_upcall_mask
>          movl  UREGS_eflags+8(%rsp),%eax
> -        andl  $~X86_EFLAGS_IF,%eax
> +        andl  $~(X86_EFLAGS_IF|X86_EFLAGS_IOPL),%eax

See earlier comment.

>          addb  %ch,%ch                   # Bit 9 (EFLAGS.IF)
>          orb   %ch,%ah                   # Fold EFLAGS.IF into %eax
> +        movq  VCPU_domain(%rbx),%rcx    # if ( VM_ASSIST(v->domain, architectural_iopl) )

If you used another register, this could be pulled up quite a bit,
to hide the latency of the load before the loaded value gets used.

> +        testb $1 << VMASST_TYPE_architectural_iopl,DOMAIN_vm_assist(%rcx)
> +        jz    .Lft6
> +        movzwl VCPU_iopl(%rbx),%ecx     # Bits 13:12 (EFLAGS.IOPL)

Why not just MOVL?

> +        orl   %ecx,%eax                 # Fold EFLAGS.IOPL into %eax

Also I continue to think this would better be done with CMOVcc,
avoiding yet another conditional branch here.

Jan
Andrew Cooper March 17, 2016, 10:45 a.m. UTC | #2
On 17/03/16 10:25, Jan Beulich wrote:
>>>> On 16.03.16 at 21:05, <andrew.cooper3@citrix.com> wrote:
>> @@ -1742,8 +1742,10 @@ static void load_segments(struct vcpu *n)
>>              cs_and_mask = (unsigned short)regs->cs |
>>                  ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
>>              /* Fold upcall mask into RFLAGS.IF. */
>> -            eflags  = regs->_eflags & ~X86_EFLAGS_IF;
>> +            eflags  = regs->_eflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
> This and ...
>
>> @@ -1788,8 +1790,10 @@ static void load_segments(struct vcpu *n)
>>              ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
>>  
>>          /* Fold upcall mask into RFLAGS.IF. */
>> -        rflags  = regs->rflags & ~X86_EFLAGS_IF;
>> +        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
> ... this is not really necessary (but also not wrong) - the actual
> EFLAGS.IOPL is always zero (and assumed to be so by code
> further down from the respective adjustments you make). For
> consistency's sake it might be better to either drop the changes
> here, or also adjust the two places masking regs->eflags.

I will adjust the others.  I would prefer not to rely on the assumption
that it is actually 0.

>
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -1806,7 +1806,9 @@ static int guest_io_okay(
>>  #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
>>  
>>      if ( !vm86_mode(regs) &&
>> -         (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
>> +         (MASK_EXTR(v->arch.pv_vcpu.iopl, X86_EFLAGS_IOPL) >=
>> +          (guest_kernel_mode(v, regs) ?
>> +           (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3)) )
>>          return 1;
>>  
>>      if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
>> @@ -2367,7 +2369,9 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
>>  
>>      case 0xfa: /* CLI */
>>      case 0xfb: /* STI */
>> -        if ( v->arch.pv_vcpu.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
>> +        if ( MASK_EXTR(v->arch.pv_vcpu.iopl, X86_EFLAGS_IOPL) <
>> +             (guest_kernel_mode(v, regs) ?
>> +              (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3) )
>>              goto fail;
> The similarity of the two together with the growing complexity
> suggests making this a macro or inline function. Additionally, the
> resulting binary code would likely be better if you compared
> v->arch.pv_vcpu.iopl with MASK_INSR(<literal value>,
> X86_EFLAGS_IOPL), even if that means having three
> MASK_INSR() (albeit those perhaps again would be hidden in
> a macro, e.g.
>
> #define IOPL(n) MASK_INSR(n, X86_EFLAGS_IOPL)

I will see what I can do.

>
> ).
>
>> --- a/xen/arch/x86/x86_64/compat/entry.S
>> +++ b/xen/arch/x86/x86_64/compat/entry.S
>> @@ -277,9 +277,14 @@ compat_create_bounce_frame:
>>          testb %al,%al                   # Bits 0-7: saved_upcall_mask
>>          setz  %ch                       # %ch == !saved_upcall_mask
>>          movl  UREGS_eflags+8(%rsp),%eax
>> -        andl  $~X86_EFLAGS_IF,%eax
>> +        andl  $~(X86_EFLAGS_IF|X86_EFLAGS_IOPL),%eax
> See earlier comment.

I hope I have suitably answered why this is staying.

>
>>          addb  %ch,%ch                   # Bit 9 (EFLAGS.IF)
>>          orb   %ch,%ah                   # Fold EFLAGS.IF into %eax
>> +        movq  VCPU_domain(%rbx),%rcx    # if ( VM_ASSIST(v->domain, architectural_iopl) )
> If you used another register, this could be pulled up quite a bit,
> to hide the latency of the load before the loaded value gets used.

Can do, but given all pipeline stalls which currently exist in this
code, I doubt it will make any observable difference.

>
>> +        testb $1 << VMASST_TYPE_architectural_iopl,DOMAIN_vm_assist(%rcx)
>> +        jz    .Lft6
>> +        movzwl VCPU_iopl(%rbx),%ecx     # Bits 13:12 (EFLAGS.IOPL)
> Why not just MOVL?

Ah yes - this is leftover from the first iteration.

~Andrew
Jan Beulich March 17, 2016, 11 a.m. UTC | #3
>>> On 17.03.16 at 11:45, <andrew.cooper3@citrix.com> wrote:
> On 17/03/16 10:25, Jan Beulich wrote:
>>>>> On 16.03.16 at 21:05, <andrew.cooper3@citrix.com> wrote:
>>> @@ -1742,8 +1742,10 @@ static void load_segments(struct vcpu *n)
>>>              cs_and_mask = (unsigned short)regs->cs |
>>>                  ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
>>>              /* Fold upcall mask into RFLAGS.IF. */
>>> -            eflags  = regs->_eflags & ~X86_EFLAGS_IF;
>>> +            eflags  = regs->_eflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
>> This and ...
>>
>>> @@ -1788,8 +1790,10 @@ static void load_segments(struct vcpu *n)
>>>              ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
>>>  
>>>          /* Fold upcall mask into RFLAGS.IF. */
>>> -        rflags  = regs->rflags & ~X86_EFLAGS_IF;
>>> +        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
>> ... this is not really necessary (but also not wrong) - the actual
>> EFLAGS.IOPL is always zero (and assumed to be so by code
>> further down from the respective adjustments you make). For
>> consistency's sake it might be better to either drop the changes
>> here, or also adjust the two places masking regs->eflags.
> 
> I will adjust the others.  I would prefer not to rely on the assumption
> that it is actually 0.

But you realize that if it wasn't zero, we'd have a security issue?
(This notwithstanding I'm fine with both directions, as indicated
before.)

Jan
Andrew Cooper March 17, 2016, 11:05 a.m. UTC | #4
On 17/03/16 11:00, Jan Beulich wrote:
>>>> On 17.03.16 at 11:45, <andrew.cooper3@citrix.com> wrote:
>> On 17/03/16 10:25, Jan Beulich wrote:
>>>>>> On 16.03.16 at 21:05, <andrew.cooper3@citrix.com> wrote:
>>>> @@ -1742,8 +1742,10 @@ static void load_segments(struct vcpu *n)
>>>>              cs_and_mask = (unsigned short)regs->cs |
>>>>                  ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
>>>>              /* Fold upcall mask into RFLAGS.IF. */
>>>> -            eflags  = regs->_eflags & ~X86_EFLAGS_IF;
>>>> +            eflags  = regs->_eflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
>>> This and ...
>>>
>>>> @@ -1788,8 +1790,10 @@ static void load_segments(struct vcpu *n)
>>>>              ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
>>>>  
>>>>          /* Fold upcall mask into RFLAGS.IF. */
>>>> -        rflags  = regs->rflags & ~X86_EFLAGS_IF;
>>>> +        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
>>> ... this is not really necessary (but also not wrong) - the actual
>>> EFLAGS.IOPL is always zero (and assumed to be so by code
>>> further down from the respective adjustments you make). For
>>> consistency's sake it might be better to either drop the changes
>>> here, or also adjust the two places masking regs->eflags.
>> I will adjust the others.  I would prefer not to rely on the assumption
>> that it is actually 0.
> But you realize that if it wasn't zero, we'd have a security issue?

Indeed.  But this adjustment is literally free for us to use, making
Xen a little more robust in the (hopefully never) case where IOPL ends
up not being 0.

~Andrew

> (This notwithstanding I'm fine with both directions, as indicated
> before.)
>
> Jan
>

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index a6d721b..36b9aaa 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1001,7 +1001,7 @@  int arch_set_info_guest(
     init_int80_direct_trap(v);
 
     /* IOPL privileges are virtualised. */
-    v->arch.pv_vcpu.iopl = (v->arch.user_regs.eflags >> 12) & 3;
+    v->arch.pv_vcpu.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL;
     v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;
 
     /* Ensure real hardware interrupts are enabled. */
@@ -1742,8 +1742,10 @@  static void load_segments(struct vcpu *n)
             cs_and_mask = (unsigned short)regs->cs |
                 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
             /* Fold upcall mask into RFLAGS.IF. */
-            eflags  = regs->_eflags & ~X86_EFLAGS_IF;
+            eflags  = regs->_eflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
             eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
+            if ( VM_ASSIST(n->domain, architectural_iopl) )
+                eflags |= n->arch.pv_vcpu.iopl;
 
             if ( !ring_1(regs) )
             {
@@ -1788,8 +1790,10 @@  static void load_segments(struct vcpu *n)
             ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
 
         /* Fold upcall mask into RFLAGS.IF. */
-        rflags  = regs->rflags & ~X86_EFLAGS_IF;
+        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
         rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
+        if ( VM_ASSIST(n->domain, architectural_iopl) )
+            rflags |= n->arch.pv_vcpu.iopl;
 
         if ( put_user(regs->ss,            rsp- 1) |
              put_user(regs->rsp,           rsp- 2) |
diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
index 1cb9b58..5a49796 100644
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -529,7 +529,7 @@  ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         if ( set_iopl.iopl > 3 )
             break;
         ret = 0;
-        curr->arch.pv_vcpu.iopl = set_iopl.iopl;
+        curr->arch.pv_vcpu.iopl = MASK_INSR(set_iopl.iopl, X86_EFLAGS_IOPL);
         break;
     }
 
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 564a107..9754a2f 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1806,7 +1806,9 @@  static int guest_io_okay(
 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
 
     if ( !vm86_mode(regs) &&
-         (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
+         (MASK_EXTR(v->arch.pv_vcpu.iopl, X86_EFLAGS_IOPL) >=
+          (guest_kernel_mode(v, regs) ?
+           (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3)) )
         return 1;
 
     if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
@@ -2367,7 +2369,9 @@  static int emulate_privileged_op(struct cpu_user_regs *regs)
 
     case 0xfa: /* CLI */
     case 0xfb: /* STI */
-        if ( v->arch.pv_vcpu.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
+        if ( MASK_EXTR(v->arch.pv_vcpu.iopl, X86_EFLAGS_IOPL) <
+             (guest_kernel_mode(v, regs) ?
+              (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3) )
             goto fail;
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 447c650..fa37ee0 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -86,6 +86,7 @@  void __dummy__(void)
     OFFSET(VCPU_trap_ctxt, struct vcpu, arch.pv_vcpu.trap_ctxt);
     OFFSET(VCPU_kernel_sp, struct vcpu, arch.pv_vcpu.kernel_sp);
     OFFSET(VCPU_kernel_ss, struct vcpu, arch.pv_vcpu.kernel_ss);
+    OFFSET(VCPU_iopl, struct vcpu, arch.pv_vcpu.iopl);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.vgc_flags);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
     OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
@@ -166,4 +167,6 @@  void __dummy__(void)
     OFFSET(MB_flags, multiboot_info_t, flags);
     OFFSET(MB_cmdline, multiboot_info_t, cmdline);
     OFFSET(MB_mem_lower, multiboot_info_t, mem_lower);
+
+    OFFSET(DOMAIN_vm_assist, struct domain, vm_assist);
 }
diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
index 36a8eae..45efd33 100644
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -277,9 +277,14 @@  compat_create_bounce_frame:
         testb %al,%al                   # Bits 0-7: saved_upcall_mask
         setz  %ch                       # %ch == !saved_upcall_mask
         movl  UREGS_eflags+8(%rsp),%eax
-        andl  $~X86_EFLAGS_IF,%eax
+        andl  $~(X86_EFLAGS_IF|X86_EFLAGS_IOPL),%eax
         addb  %ch,%ch                   # Bit 9 (EFLAGS.IF)
         orb   %ch,%ah                   # Fold EFLAGS.IF into %eax
+        movq  VCPU_domain(%rbx),%rcx    # if ( VM_ASSIST(v->domain, architectural_iopl) )
+        testb $1 << VMASST_TYPE_architectural_iopl,DOMAIN_vm_assist(%rcx)
+        jz    .Lft6
+        movzwl VCPU_iopl(%rbx),%ecx     # Bits 13:12 (EFLAGS.IOPL)
+        orl   %ecx,%eax                 # Fold EFLAGS.IOPL into %eax
 .Lft6:  movl  %eax,%fs:2*4(%rsi)        # EFLAGS
         movl  UREGS_rip+8(%rsp),%eax
 .Lft7:  movl  %eax,%fs:(%rsi)           # EIP
diff --git a/xen/arch/x86/x86_64/compat/traps.c b/xen/arch/x86/x86_64/compat/traps.c
index bbf18e4..a6afb26 100644
--- a/xen/arch/x86/x86_64/compat/traps.c
+++ b/xen/arch/x86/x86_64/compat/traps.c
@@ -99,6 +99,10 @@  unsigned int compat_iret(void)
         domain_crash(v->domain);
         return 0;
     }
+
+    if ( VM_ASSIST(v->domain, architectural_iopl) )
+        v->arch.pv_vcpu.iopl = eflags & X86_EFLAGS_IOPL;
+
     regs->_eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;
 
     if ( unlikely(eflags & X86_EFLAGS_VM) )
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 221de01..1d6dedc 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -362,9 +362,14 @@  __UNLIKELY_END(create_bounce_frame_bad_sp)
         testb $0xFF,%al                 # Bits 0-7: saved_upcall_mask
         setz  %ch                       # %ch == !saved_upcall_mask
         movl  UREGS_eflags+8(%rsp),%eax
-        andl  $~X86_EFLAGS_IF,%eax
+        andl  $~(X86_EFLAGS_IF|X86_EFLAGS_IOPL),%eax
         addb  %ch,%ch                   # Bit 9 (EFLAGS.IF)
         orb   %ch,%ah                   # Fold EFLAGS.IF into %eax
+        movq  VCPU_domain(%rbx),%rcx    # if ( VM_ASSIST(v->domain, architectural_iopl) )
+        testb $1 << VMASST_TYPE_architectural_iopl,DOMAIN_vm_assist(%rcx)
+        jz    .Lft5
+        movzwl VCPU_iopl(%rbx),%ecx     # Bits 13:12 (EFLAGS.IOPL)
+        orl   %ecx,%eax                 # Fold EFLAGS.IOPL into %eax
 .Lft5:  movq  %rax,16(%rsi)             # RFLAGS
         movq  UREGS_rip+8(%rsp),%rax
 .Lft6:  movq  %rax,(%rsi)               # RIP
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 26e2dd7..19f58a1 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -310,6 +310,9 @@  unsigned long do_iret(void)
         toggle_guest_mode(v);
     }
 
+    if ( VM_ASSIST(v->domain, architectural_iopl) )
+        v->arch.pv_vcpu.iopl = iret_saved.rflags & X86_EFLAGS_IOPL;
+
     regs->rip    = iret_saved.rip;
     regs->cs     = iret_saved.cs | 3; /* force guest privilege */
     regs->rflags = ((iret_saved.rflags & ~(X86_EFLAGS_IOPL|X86_EFLAGS_VM))
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 4527ce3..4fe8a94 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -332,6 +332,7 @@  extern unsigned long xen_phys_start;
                                   (1UL << VMASST_TYPE_4gb_segments_notify) | \
                                   (1UL << VMASST_TYPE_writable_pagetables) | \
                                   (1UL << VMASST_TYPE_pae_extended_cr3)    | \
+                                  (1UL << VMASST_TYPE_architectural_iopl)  | \
                                   (1UL << VMASST_TYPE_m2p_strict))
 #define VM_ASSIST_VALID          NATIVE_VM_ASSIST_VALID
 #define COMPAT_VM_ASSIST_VALID   (NATIVE_VM_ASSIST_VALID & \
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index de60def..a6cbae0 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -470,7 +470,8 @@  struct pv_vcpu
     /* I/O-port access bitmap. */
     XEN_GUEST_HANDLE(uint8) iobmp; /* Guest kernel vaddr of the bitmap. */
     unsigned int iobmp_limit; /* Number of ports represented in the bitmap. */
-    unsigned int iopl;        /* Current IOPL for this VCPU. */
+    unsigned int iopl;        /* Current IOPL for this VCPU, shifted left by
+                               * 12 to match the eflags register. */
 
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index 64ba7ab..4f73ded 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -502,6 +502,14 @@  DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_pae_extended_cr3     3
 
 /*
+ * x86 guests: Sane behaviour for virtual iopl
+ *  - virtual iopl updated from do_iret() hypercalls.
+ *  - virtual iopl reported in bounce frames.
+ *  - guest kernels assumed to be level 0 for the purpose of iopl checks.
+ */
+#define VMASST_TYPE_architectural_iopl   4
+
+/*
  * x86/64 guests: strictly hide M2P from user mode.
  * This allows the guest to control respective hypervisor behavior:
  * - when not set, L4 tables get created with the respective slot blank,