
[for-next,4/7] x86/traps: move all PV emulation and hypercalls to pv/traps.c

Message ID 20170406171434.15484-5-wei.liu2@citrix.com (mailing list archive)
State New, archived

Commit Message

Wei Liu April 6, 2017, 5:14 p.m. UTC
Move the following emulation code:
1. invalid op
2. rdtsc
3. privileged instructions
4. gate operation

Move the following hypercalls:
1. do_set_trap_table
2. do_set_debugreg
3. do_get_debugreg
4. do_fpu_taskswitch

Some helper functions which are PV only are also moved.

The code movement requires making a few functions non-static. Their
declarations are added to traps.h.

No functional change.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 xen/arch/x86/pv/Makefile    |    1 +
 xen/arch/x86/pv/traps.c     | 2152 +++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/traps.c        | 2357 +++----------------------------------------
 xen/include/asm-x86/traps.h |   19 +
 4 files changed, 2295 insertions(+), 2234 deletions(-)
 create mode 100644 xen/arch/x86/pv/traps.c

Comments

Jan Beulich April 21, 2017, 9:25 a.m. UTC | #1
>>> On 06.04.17 at 19:14, <wei.liu2@citrix.com> wrote:
> @@ -1576,1963 +1358,173 @@ void __init do_early_page_fault(struct cpu_user_regs *regs)
>      }
>  }
>  
> -long do_fpu_taskswitch(int set)
> +void do_general_protection(struct cpu_user_regs *regs)
>  {
>      struct vcpu *v = current;
> +    unsigned long fixup;

Same comment as on one of the patches in the other series: For
reviewers' sake, if the diff can't be produced such that plain
removals also come out as such, please split this into smaller steps.

> --- a/xen/include/asm-x86/traps.h
> +++ b/xen/include/asm-x86/traps.h
> @@ -51,4 +51,23 @@ uint32_t guest_io_read(unsigned int port, unsigned int bytes,
>  void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
>                      struct domain *);
>  
> +int emulate_privileged_op(struct cpu_user_regs *regs);
> +int emulate_invalid_rdtscp(struct cpu_user_regs *regs);
> +int emulate_forced_invalid_op(struct cpu_user_regs *regs);
> +void emulate_gate_op(struct cpu_user_regs *regs);
> +
> +static inline const char *trapstr(unsigned int trapnr)
> +{
> +    static const char * const strings[] = {
> +        "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
> +        "invalid opcode", "device not available", "double fault",
> +        "coprocessor segment", "invalid tss", "segment not found",
> +        "stack error", "general protection fault", "page fault",
> +        "spurious interrupt", "coprocessor error", "alignment check",
> +        "machine check", "simd error", "virtualisation exception"
> +    };
> +
> +    return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
> +}

This doesn't belong in a header - using it from more than one CU
would mean multiple instances of the static array and its strings.
This isn't C++, where they can be folded by the linker. We don't need
this on any fast paths (the intention here is to provide input to
printk(), which is inherently slow), so there's no reason not to make
this an out-of-line function.

Jan
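
A minimal sketch of the out-of-line variant Jan asks for - the header
keeps only a declaration, and the single definition (together with its
string table) lives in one compilation unit. File placement and the
exact split are assumptions for illustration, not part of this patch:

    /* xen/include/asm-x86/traps.h: declaration only. */
    const char *trapstr(unsigned int trapnr);

    /*
     * In a single .c file (placement assumed), so the array and its
     * strings are emitted exactly once in the final binary.
     */
    const char *trapstr(unsigned int trapnr)
    {
        static const char * const strings[] = {
            "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
            "invalid opcode", "device not available", "double fault",
            "coprocessor segment", "invalid tss", "segment not found",
            "stack error", "general protection fault", "page fault",
            "spurious interrupt", "coprocessor error", "alignment check",
            "machine check", "simd error", "virtualisation exception"
        };

        return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
    }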

Patch

diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile
index d8fc13f6fe..c996dcc149 100644
--- a/xen/arch/x86/pv/Makefile
+++ b/xen/arch/x86/pv/Makefile
@@ -3,3 +3,4 @@  subdir-y += compat
 obj-y += hypercall.o
 obj-bin-y += dom0_build.init.o
 obj-bin-y += entry.o
+obj-y += traps.o
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
new file mode 100644
index 0000000000..0f09d858f6
--- /dev/null
+++ b/xen/arch/x86/pv/traps.c
@@ -0,0 +1,2152 @@ 
+/******************************************************************************
+ * arch/x86/pv/traps.c
+ *
+ * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <acpi/acpi.h>
+
+#include <xen/event.h>
+#include <xen/guest_access.h>
+#include <xen/iocap.h>
+#include <xen/lib.h>
+#include <xen/paging.h>
+#include <xen/spinlock.h>
+#include <xen/types.h>
+#include <xen/trace.h>
+#include <xsm/xsm.h>
+
+#include <asm/debugreg.h>
+#include <asm/hpet.h>
+#include <asm/mc146818rtc.h>
+#include <asm/regs.h>
+#include <asm/shared.h>
+#include <asm/traps.h>
+#include <asm/x86_emulate.h>
+
+long unregister_guest_nmi_callback(void)
+{
+    struct vcpu *v = current;
+    struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi];
+
+    memset(t, 0, sizeof(*t));
+
+    return 0;
+}
+
+int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
+                            unsigned int trap_nr)
+{
+    struct vcpu *v;
+    struct trap_info *t;
+
+    BUG_ON(d == NULL);
+    BUG_ON(vcpuid >= d->max_vcpus);
+
+    /* Sanity check - XXX should be more fine grained. */
+    BUG_ON(trap_nr >= NR_VECTORS);
+
+    v = d->vcpu[vcpuid];
+    t = &v->arch.pv_vcpu.trap_ctxt[trap_nr];
+
+    return (t->address != 0);
+}
+
+void pv_inject_event(const struct x86_event *event)
+{
+    struct vcpu *v = current;
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct trap_bounce *tb;
+    const struct trap_info *ti;
+    const uint8_t vector = event->vector;
+    const bool use_error_code =
+        ((vector < 32) && (TRAP_HAVE_EC & (1u << vector)));
+    unsigned int error_code = event->error_code;
+
+    ASSERT(vector == event->vector); /* Confirm no truncation. */
+    if ( use_error_code )
+        ASSERT(error_code != X86_EVENT_NO_EC);
+    else
+        ASSERT(error_code == X86_EVENT_NO_EC);
+
+    tb = &v->arch.pv_vcpu.trap_bounce;
+    ti = &v->arch.pv_vcpu.trap_ctxt[vector];
+
+    tb->flags = TBF_EXCEPTION;
+    tb->cs    = ti->cs;
+    tb->eip   = ti->address;
+
+    if ( vector == TRAP_page_fault )
+    {
+        v->arch.pv_vcpu.ctrlreg[2] = event->cr2;
+        arch_set_cr2(v, event->cr2);
+
+        /* Re-set error_code.user flag appropriately for the guest. */
+        error_code &= ~PFEC_user_mode;
+        if ( !guest_kernel_mode(v, regs) )
+            error_code |= PFEC_user_mode;
+
+        trace_pv_page_fault(event->cr2, error_code);
+    }
+    else
+        trace_pv_trap(vector, regs->rip, use_error_code, error_code);
+
+    if ( use_error_code )
+    {
+        tb->flags |= TBF_EXCEPTION_ERRCODE;
+        tb->error_code = error_code;
+    }
+
+    if ( TI_GET_IF(ti) )
+        tb->flags |= TBF_INTERRUPT;
+
+    if ( unlikely(null_trap_bounce(v, tb)) )
+    {
+        gprintk(XENLOG_WARNING,
+                "Unhandled %s fault/trap [#%d, ec=%04x]\n",
+                trapstr(vector), vector, error_code);
+
+        if ( vector == TRAP_page_fault )
+            show_page_walk(event->cr2);
+    }
+}
+
+static void instruction_done(struct cpu_user_regs *regs, unsigned long rip)
+{
+    regs->rip = rip;
+    regs->eflags &= ~X86_EFLAGS_RF;
+    if ( regs->eflags & X86_EFLAGS_TF )
+    {
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
+        pv_inject_guest_trap(TRAP_debug, regs);
+    }
+}
+
+int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
+{
+    char opcode[3];
+    unsigned long eip, rc;
+    struct vcpu *v = current;
+
+    eip = regs->rip;
+    if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
+    {
+        pv_inject_page_fault(0, eip + sizeof(opcode) - rc);
+        return EXCRET_fault_fixed;
+    }
+    if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
+        return 0;
+    eip += sizeof(opcode);
+    pv_soft_rdtsc(v, regs, 1);
+    instruction_done(regs, eip);
+    return EXCRET_fault_fixed;
+}
+
+int emulate_forced_invalid_op(struct cpu_user_regs *regs)
+{
+    char sig[5], instr[2];
+    unsigned long eip, rc;
+    struct cpuid_leaf res;
+
+    eip = regs->rip;
+
+    /* Check for forced emulation signature: ud2 ; .ascii "xen". */
+    if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
+    {
+        pv_inject_page_fault(0, eip + sizeof(sig) - rc);
+        return EXCRET_fault_fixed;
+    }
+    if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
+        return 0;
+    eip += sizeof(sig);
+
+    /* We only emulate CPUID. */
+    if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
+    {
+        pv_inject_page_fault(0, eip + sizeof(instr) - rc);
+        return EXCRET_fault_fixed;
+    }
+    if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
+        return 0;
+
+    /* If cpuid faulting is enabled and CPL>0 inject a #GP in place of #UD. */
+    if ( current->arch.cpuid_faulting && !guest_kernel_mode(current, regs) )
+    {
+        regs->rip = eip;
+        pv_inject_guest_trap(TRAP_gp_fault, regs);
+        return EXCRET_fault_fixed;
+    }
+
+    eip += sizeof(instr);
+
+    guest_cpuid(current, regs->eax, regs->ecx, &res);
+
+    regs->rax = res.a;
+    regs->rbx = res.b;
+    regs->rcx = res.c;
+    regs->rdx = res.d;
+
+    instruction_done(regs, eip);
+
+    trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->rip);
+
+    return EXCRET_fault_fixed;
+}
+
+static unsigned int check_guest_io_breakpoint(struct vcpu *v,
+    unsigned int port, unsigned int len)
+{
+    unsigned int width, i, match = 0;
+    unsigned long start;
+
+    if ( !(v->arch.debugreg[5]) ||
+         !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
+        return 0;
+
+    for ( i = 0; i < 4; i++ )
+    {
+        if ( !(v->arch.debugreg[5] &
+               (3 << (i * DR_ENABLE_SIZE))) )
+            continue;
+
+        start = v->arch.debugreg[i];
+        width = 0;
+
+        switch ( (v->arch.debugreg[7] >>
+                  (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
+        {
+        case DR_LEN_1: width = 1; break;
+        case DR_LEN_2: width = 2; break;
+        case DR_LEN_4: width = 4; break;
+        case DR_LEN_8: width = 8; break;
+        }
+
+        if ( (start < (port + len)) && ((start + width) > port) )
+            match |= 1 << i;
+    }
+
+    return match;
+}
+
+/*
+ * Called from asm to set up the MCE trapbounce info.
+ * Returns 0 if no callback is set up, else 1.
+ */
+int set_guest_machinecheck_trapbounce(void)
+{
+    struct vcpu *v = current;
+    struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
+
+    pv_inject_guest_trap(TRAP_machine_check, guest_cpu_user_regs());
+    tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
+    return !null_trap_bounce(v, tb);
+}
+
+/*
+ * Called from asm to set up the NMI trapbounce info.
+ * Returns 0 if no callback is set up, else 1.
+ */
+int set_guest_nmi_trapbounce(void)
+{
+    struct vcpu *v = current;
+    struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
+    pv_inject_guest_trap(TRAP_nmi, guest_cpu_user_regs());
+    tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
+    return !null_trap_bounce(v, tb);
+}
+
+long do_set_trap_table(XEN_GUEST_HANDLE_PARAM(const_trap_info_t) traps)
+{
+    struct trap_info cur;
+    struct vcpu *curr = current;
+    struct trap_info *dst = curr->arch.pv_vcpu.trap_ctxt;
+    long rc = 0;
+
+    /* If no table is presented then clear the entire virtual IDT. */
+    if ( guest_handle_is_null(traps) )
+    {
+        memset(dst, 0, NR_VECTORS * sizeof(*dst));
+        init_int80_direct_trap(curr);
+        return 0;
+    }
+
+    for ( ; ; )
+    {
+        if ( copy_from_guest(&cur, traps, 1) )
+        {
+            rc = -EFAULT;
+            break;
+        }
+
+        if ( cur.address == 0 )
+            break;
+
+        if ( !is_canonical_address(cur.address) )
+            return -EINVAL;
+
+        fixup_guest_code_selector(curr->domain, cur.cs);
+
+        memcpy(&dst[cur.vector], &cur, sizeof(cur));
+
+        if ( cur.vector == 0x80 )
+            init_int80_direct_trap(curr);
+
+        guest_handle_add_offset(traps, 1);
+
+        if ( hypercall_preempt_check() )
+        {
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_set_trap_table, "h", traps);
+            break;
+        }
+    }
+
+    return rc;
+}
+
+long do_set_debugreg(int reg, unsigned long value)
+{
+    return set_debugreg(current, reg, value);
+}
+
+unsigned long do_get_debugreg(int reg)
+{
+    struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0 ... 3:
+    case 6:
+        return curr->arch.debugreg[reg];
+    case 7:
+        return (curr->arch.debugreg[7] |
+                curr->arch.debugreg[5]);
+    case 4 ... 5:
+        return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
+                curr->arch.debugreg[reg + 2] : 0);
+    }
+
+    return -EINVAL;
+}
+
+long do_fpu_taskswitch(int set)
+{
+    struct vcpu *v = current;
+
+    if ( set )
+    {
+        v->arch.pv_vcpu.ctrlreg[0] |= X86_CR0_TS;
+        stts();
+    }
+    else
+    {
+        v->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
+        if ( v->fpu_dirtied )
+            clts();
+    }
+
+    return 0;
+}
+
+static int read_descriptor(unsigned int sel,
+                           const struct vcpu *v,
+                           unsigned long *base,
+                           unsigned long *limit,
+                           unsigned int *ar,
+                           bool_t insn_fetch)
+{
+    struct desc_struct desc;
+
+    if ( sel < 4)
+        desc.b = desc.a = 0;
+    else if ( __get_user(desc,
+                         (const struct desc_struct *)(!(sel & 4)
+                                                      ? GDT_VIRT_START(v)
+                                                      : LDT_VIRT_START(v))
+                         + (sel >> 3)) )
+        return 0;
+    if ( !insn_fetch )
+        desc.b &= ~_SEGMENT_L;
+
+    *ar = desc.b & 0x00f0ff00;
+    if ( !(desc.b & _SEGMENT_L) )
+    {
+        *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
+                 (desc.b & 0xff000000));
+        *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
+        if ( desc.b & _SEGMENT_G )
+            *limit = ((*limit + 1) << 12) - 1;
+#ifndef NDEBUG
+        if ( sel > 3 )
+        {
+            unsigned int a, l;
+            unsigned char valid;
+
+            asm volatile (
+                "larl %2,%0 ; setz %1"
+                : "=r" (a), "=qm" (valid) : "rm" (sel));
+            BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
+            asm volatile (
+                "lsll %2,%0 ; setz %1"
+                : "=r" (l), "=qm" (valid) : "rm" (sel));
+            BUG_ON(valid && (l != *limit));
+        }
+#endif
+    }
+    else
+    {
+        *base = 0UL;
+        *limit = ~0UL;
+    }
+
+    return 1;
+}
+
+static int read_gate_descriptor(unsigned int gate_sel,
+                                const struct vcpu *v,
+                                unsigned int *sel,
+                                unsigned long *off,
+                                unsigned int *ar)
+{
+    struct desc_struct desc;
+    const struct desc_struct *pdesc;
+
+
+    pdesc = (const struct desc_struct *)
+        (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
+        + (gate_sel >> 3);
+    if ( (gate_sel < 4) ||
+         ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
+         __get_user(desc, pdesc) )
+        return 0;
+
+    *sel = (desc.a >> 16) & 0x0000fffc;
+    *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
+    *ar = desc.b & 0x0000ffff;
+
+    /*
+     * check_descriptor() clears the DPL field and stores the
+     * guest requested DPL in the selector's RPL field.
+     */
+    if ( *ar & _SEGMENT_DPL )
+        return 0;
+    *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+
+    if ( !is_pv_32bit_vcpu(v) )
+    {
+        if ( (*ar & 0x1f00) != 0x0c00 ||
+             (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
+             __get_user(desc, pdesc + 1) ||
+             (desc.b & 0x1f00) )
+            return 0;
+
+        *off |= (unsigned long)desc.a << 32;
+        return 1;
+    }
+
+    switch ( *ar & 0x1f00 )
+    {
+    case 0x0400:
+        *off &= 0xffff;
+        break;
+    case 0x0c00:
+        break;
+    default:
+        return 0;
+    }
+
+    return 1;
+}
+
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+                                  unsigned int bytes, unsigned long limit,
+                                  enum x86_segment seg,
+                                  struct x86_emulate_ctxt *ctxt,
+                                  unsigned long *addr)
+{
+    int rc = X86EMUL_OKAY;
+
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 64 )
+    {
+        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+            rc = X86EMUL_EXCEPTION;
+        *addr = (uint32_t)*addr;
+    }
+    else if ( !__addr_ok(*addr) )
+        rc = X86EMUL_EXCEPTION;
+
+    if ( unlikely(rc == X86EMUL_EXCEPTION) )
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+
+    return rc;
+}
+
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+                              unsigned long offset,
+                              void *p_data,
+                              unsigned int bytes,
+                              struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                                x86_seg_cs, ctxt, &addr);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        /*
+         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+         * cpu_has_nx, but we'd then need a "fetch" variant of
+         * __copy_from_user() respecting NX, SMEP, and protection keys.
+         */
+        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    /* Check if this is an attempt to access the I/O bitmap. */
+    if ( seg == x86_seg_tr )
+    {
+        switch ( ctxt->opcode )
+        {
+        case 0x6c ... 0x6f: /* ins / outs */
+        case 0xe4 ... 0xe7: /* in / out (immediate port) */
+        case 0xec ... 0xef: /* in / out (port in %dx) */
+            /* Defer the check to priv_op_{read,write}_io(). */
+            return X86EMUL_DONE;
+        }
+    }
+
+    if ( ctxt->addr_size < 64 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        default: return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            if ( !is_x86_user_segment(seg) )
+                return X86EMUL_UNHANDLEABLE;
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+        {
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+            reg->attr.fields.l = 1;
+        }
+        else
+            reg->attr.fields.db = 1;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
+/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
+static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
+{
+    unsigned int cpl = guest_kernel_mode(v, regs) ?
+        (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
+
+    ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
+
+    return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
+}
+
+/* Has the guest requested sufficient permission for this I/O access? */
+static int guest_io_okay(
+    unsigned int port, unsigned int bytes,
+    struct vcpu *v, struct cpu_user_regs *regs)
+{
+    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
+    int user_mode = !(v->arch.flags & TF_kernel_mode);
+#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
+
+    if ( iopl_ok(v, regs) )
+        return 1;
+
+    if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
+    {
+        union { uint8_t bytes[2]; uint16_t mask; } x;
+
+        /*
+         * Grab permission bytes from guest space. Inaccessible bytes are
+         * read as 0xff (no access allowed).
+         */
+        TOGGLE_MODE();
+        switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
+                                          port>>3, 2) )
+        {
+        default: x.bytes[0] = ~0;
+            /* fallthrough */
+        case 1:  x.bytes[1] = ~0;
+            /* fallthrough */
+        case 0:  break;
+        }
+        TOGGLE_MODE();
+
+        if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
+            return 1;
+    }
+
+    return 0;
+}
+
+/* Has the administrator granted sufficient permission for this I/O access? */
+static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
+                            const struct domain *d)
+{
+    /*
+     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
+     * We never permit direct access to that register.
+     */
+    if ( (port == 0xcf8) && (bytes == 4) )
+        return 0;
+
+    /* We also never permit direct access to the RTC/CMOS registers. */
+    if ( ((port & ~1) == RTC_PORT(0)) )
+        return 0;
+
+    return ioports_access_permitted(d, port, port + bytes - 1);
+}
+
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
+                         unsigned int size, uint32_t *write)
+{
+    uint32_t machine_bdf;
+
+    if ( !is_hardware_domain(currd) )
+        return 0;
+
+    if ( !CF8_ENABLED(currd->arch.pci_cf8) )
+        return 1;
+
+    machine_bdf = CF8_BDF(currd->arch.pci_cf8);
+    if ( write )
+    {
+        const unsigned long *ro_map = pci_get_ro_map(0);
+
+        if ( ro_map && test_bit(machine_bdf, ro_map) )
+            return 0;
+    }
+    start |= CF8_ADDR_LO(currd->arch.pci_cf8);
+    /* AMD extended configuration space access? */
+    if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
+         boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+         boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
+    {
+        uint64_t msr_val;
+
+        if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
+            return 0;
+        if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
+            start |= CF8_ADDR_HI(currd->arch.pci_cf8);
+    }
+
+    return !write ?
+           xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+                                     start, start + size - 1, 0) == 0 :
+           pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
+}
+
+uint32_t guest_io_read(unsigned int port, unsigned int bytes,
+                       struct domain *currd)
+{
+    uint32_t data = 0;
+    unsigned int shift = 0;
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        switch ( bytes )
+        {
+        case 1: return inb(port);
+        case 2: return inw(port);
+        case 4: return inl(port);
+        }
+    }
+
+    while ( bytes != 0 )
+    {
+        unsigned int size = 1;
+        uint32_t sub_data = ~0;
+
+        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+        {
+            sub_data = pv_pit_handler(port, 0, 0);
+        }
+        else if ( port == RTC_PORT(0) )
+        {
+            sub_data = currd->arch.cmos_idx;
+        }
+        else if ( (port == RTC_PORT(1)) &&
+                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+        {
+            unsigned long flags;
+
+            spin_lock_irqsave(&rtc_lock, flags);
+            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
+            sub_data = inb(RTC_PORT(1));
+            spin_unlock_irqrestore(&rtc_lock, flags);
+        }
+        else if ( (port == 0xcf8) && (bytes == 4) )
+        {
+            size = 4;
+            sub_data = currd->arch.pci_cf8;
+        }
+        else if ( (port & 0xfffc) == 0xcfc )
+        {
+            size = min(bytes, 4 - (port & 3));
+            if ( size == 3 )
+                size = 2;
+            if ( pci_cfg_ok(currd, port & 3, size, NULL) )
+                sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
+        }
+
+        if ( size == 4 )
+            return sub_data;
+
+        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
+        shift += size * 8;
+        port += size;
+        bytes -= size;
+    }
+
+    return data;
+}
+
+void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
+                    struct domain *currd)
+{
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        switch ( bytes ) {
+        case 1:
+            outb((uint8_t)data, port);
+            if ( pv_post_outb_hook )
+                pv_post_outb_hook(port, (uint8_t)data);
+            break;
+        case 2:
+            outw((uint16_t)data, port);
+            break;
+        case 4:
+            outl(data, port);
+            break;
+        }
+        return;
+    }
+
+    while ( bytes != 0 )
+    {
+        unsigned int size = 1;
+
+        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+        {
+            pv_pit_handler(port, (uint8_t)data, 1);
+        }
+        else if ( port == RTC_PORT(0) )
+        {
+            currd->arch.cmos_idx = data;
+        }
+        else if ( (port == RTC_PORT(1)) &&
+                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+        {
+            unsigned long flags;
+
+            if ( pv_rtc_handler )
+                pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
+            spin_lock_irqsave(&rtc_lock, flags);
+            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
+            outb(data, RTC_PORT(1));
+            spin_unlock_irqrestore(&rtc_lock, flags);
+        }
+        else if ( (port == 0xcf8) && (bytes == 4) )
+        {
+            size = 4;
+            currd->arch.pci_cf8 = data;
+        }
+        else if ( (port & 0xfffc) == 0xcfc )
+        {
+            size = min(bytes, 4 - (port & 3));
+            if ( size == 3 )
+                size = 2;
+            if ( pci_cfg_ok(currd, port & 3, size, &data) )
+                pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
+        }
+
+        if ( size == 4 )
+            return;
+
+        port += size;
+        bytes -= size;
+        data >>= size * 8;
+    }
+}
+
+/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
+void host_to_guest_gpr_switch(struct cpu_user_regs *);
+unsigned long guest_to_host_gpr_switch(unsigned long);
+
+void (*pv_post_outb_hook)(unsigned int port, u8 value);
+
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, x86_seg_es, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(PFEC_write_access,
+                               addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, seg, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Read CR0 */
+        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
+        return X86EMUL_OKAY;
+
+    case 2: /* Read CR2 */
+    case 4: /* Read CR4 */
+        *val = curr->arch.pv_vcpu.ctrlreg[reg];
+        return X86EMUL_OKAY;
+
+    case 3: /* Read CR3 */
+    {
+        const struct domain *currd = curr->domain;
+        unsigned long mfn;
+
+        if ( !is_pv_32bit_domain(currd) )
+        {
+            mfn = pagetable_get_pfn(curr->arch.guest_table);
+            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        else
+        {
+            l4_pgentry_t *pl4e =
+                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+            mfn = l4e_get_pfn(*pl4e);
+            unmap_domain_page(pl4e);
+            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        /* PTs should not be shared */
+        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+        return X86EMUL_OKAY;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Write CR0 */
+        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
+        {
+            gdprintk(XENLOG_WARNING,
+                    "Attempt to change unmodifiable CR0 flags\n");
+            break;
+        }
+        do_fpu_taskswitch(!!(val & X86_CR0_TS));
+        return X86EMUL_OKAY;
+
+    case 2: /* Write CR2 */
+        curr->arch.pv_vcpu.ctrlreg[2] = val;
+        arch_set_cr2(curr, val);
+        return X86EMUL_OKAY;
+
+    case 3: /* Write CR3 */
+    {
+        struct domain *currd = curr->domain;
+        unsigned long gfn;
+        struct page_info *page;
+        int rc;
+
+        gfn = !is_pv_32bit_domain(currd)
+              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+        if ( !page )
+            break;
+        rc = new_guest_cr3(page_to_mfn(page));
+        put_page(page);
+
+        switch ( rc )
+        {
+        case 0:
+            return X86EMUL_OKAY;
+        case -ERESTART: /* retry after preemption */
+            return X86EMUL_RETRY;
+        }
+        break;
+    }
+
+    case 4: /* Write CR4 */
+        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+        write_cr4(pv_guest_cr4_to_real_cr4(curr));
+        ctxt_switch_levelling(curr);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_read_dr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    unsigned long res = do_get_debugreg(reg);
+
+    if ( IS_ERR_VALUE(res) )
+        return X86EMUL_UNHANDLEABLE;
+
+    *val = res;
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_dr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    return do_set_debugreg(reg, val) == 0
+           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+}
+
+static inline uint64_t guest_misc_enable(uint64_t val)
+{
+    val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+             MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
+    val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+           MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
+    return val;
+}
+
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    const struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdgsbase()
+                                : curr->arch.pv_vcpu.gs_base_kernel;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = curr->arch.pv_vcpu.gs_base_user;
+        return X86EMUL_OKAY;
+
+    /*
+     * In order to fully retain original behavior, defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_EFER:
+        *val = read_efer();
+        if ( is_pv_32bit_domain(currd) )
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( unlikely(is_cpufreq_controller(currd)) )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_UCODE_REV:
+        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+                break;
+            /* As documented in the SDM: Do a CPUID 1 here */
+            cpuid_eax(1);
+        }
+        goto normal;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        *val = guest_misc_enable(*val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[0];
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_PERF_CAPABILITIES:
+        /* No extra capabilities are supported. */
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_PLATFORM_INFO:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
+            break;
+        *val = 0;
+        if ( this_cpu(cpuid_faulting_enabled) )
+            *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_MISC_FEATURES_ENABLES:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
+            break;
+        *val = 0;
+        if ( curr->arch.cpuid_faulting )
+            *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+            /* fall through */
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( vpmu_do_rdmsr(reg, val) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( rdmsr_hypervisor_regs(reg, val) )
+            return X86EMUL_OKAY;
+
+        rc = vmce_rdmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+        /* fall through */
+    normal:
+        /* Everyone can read the MSR space. */
+        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+#include "../x86_64/mmconfig.h"
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+                             struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        uint64_t temp;
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+            break;
+        wrfsbase(val);
+        curr->arch.pv_vcpu.fs_base = val;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+            break;
+        wrgsbase(val);
+        curr->arch.pv_vcpu.gs_base_kernel = val;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+            break;
+        wrmsrl(MSR_SHADOW_GS_BASE, val);
+        curr->arch.pv_vcpu.gs_base_user = val;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+    case MSR_K8_HWCR:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_NB_CFG:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_FAM10H_MMIO_CONF_BASE:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+            break;
+        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+             temp != val :
+             ((temp ^ val) &
+              ~(FAM10H_MMIO_CONF_ENABLE |
+                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_UCODE_REV:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val != guest_misc_enable(temp) )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MPERF:
+    case MSR_IA32_APERF:
+        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_PERF_CTL:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_THERM_CONTROL:
+    case MSR_IA32_ENERGY_PERF_BIAS:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[0] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(reg, val);
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_PLATFORM_INFO:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
+            break;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_MISC_FEATURES_ENABLES:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
+             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
+            break;
+        if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
+             !this_cpu(cpuid_faulting_enabled) )
+            break;
+        curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    return X86EMUL_OKAY;
+
+                if ( vpmu_do_wrmsr(reg, val, 0) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+            return X86EMUL_OKAY;
+
+        rc = vmce_wrmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+
+        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+    invalid:
+            gdprintk(XENLOG_WARNING,
+                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+                     reg, temp, val);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
+
+    return X86EMUL_OKAY;
+}
+
+int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
+                  struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
+{
+    guest_cpuid(current, leaf, subleaf, res);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_validate(const struct x86_emulate_state *state,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    switch ( ctxt->opcode )
+    {
+    case 0x6c ... 0x6f: /* ins / outs */
+    case 0xe4 ... 0xe7: /* in / out (immediate port) */
+    case 0xec ... 0xef: /* in / out (port in %dx) */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x20) ...
+         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+        return X86EMUL_OKAY;
+
+    case 0xfa: case 0xfb: /* cli / sti */
+        if ( !iopl_ok(current, ctxt->regs) )
+            break;
+        /*
+         * This is just too dangerous to allow, in my opinion. Consider if the
+         * caller then tries to reenable interrupts using POPF: we can't trap
+         * that and we'll end up with hard-to-debug lockups. Fast & loose will
+         * do for us. :-)
+        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
+         */
+        return X86EMUL_DONE;
+
+    case X86EMUL_OPC(0x0f, 0x01):
+    {
+        unsigned int modrm_rm, modrm_reg;
+
+        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+             (modrm_rm & 7) != 1 )
+            break;
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* xsetbv */
+        case 7: /* rdtscp */
+            return X86EMUL_OKAY;
+        }
+        break;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .validate            = priv_op_validate,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = pv_emul_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+};
+
+int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = {
+        .ctxt.regs = regs,
+        .ctxt.vendor = currd->arch.cpuid->x86_vendor,
+    };
+    int rc;
+    unsigned int eflags, ar;
+
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
+
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
+    regs->eflags |= curr->arch.pv_vcpu.iopl;
+    eflags = regs->eflags;
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
+
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
+
+    /*
+     * Un-mirror virtualized state from EFLAGS.
+     * Nothing we allow to be emulated can change anything other than the
+     * arithmetic bits, and the resume flag.
+     */
+    ASSERT(!((regs->eflags ^ eflags) &
+             ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
+    regs->eflags |= X86_EFLAGS_IF;
+    regs->eflags &= ~X86_EFLAGS_IOPL;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
+                msr_split(regs, rdtsc());
+        }
+
+        if ( ctxt.ctxt.retire.singlestep )
+            ctxt.bpmatch |= DR_STEP;
+        if ( ctxt.bpmatch )
+        {
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                pv_inject_guest_trap(TRAP_debug, regs);
+        }
+        /* fall through */
+    case X86EMUL_RETRY:
+        return EXCRET_fault_fixed;
+
+    case X86EMUL_EXCEPTION:
+        pv_inject_event(&ctxt.ctxt.event);
+        return EXCRET_fault_fixed;
+    }
+
+    return 0;
+}
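
A minimal sketch of the mirror/un-mirror contract above (illustrative only, not part of the patch): the guest's virtual interrupt flag lives in evtchn_upcall_mask and its virtual IOPL in arch.pv_vcpu.iopl, and both are folded into regs->eflags only while x86_emulate() runs:

    /* entry: fold virtualized state in */
    if ( vcpu_info(curr, evtchn_upcall_mask) )
        regs->eflags &= ~X86_EFLAGS_IF;       /* virtual IF clear */
    else
        regs->eflags |= X86_EFLAGS_IF;        /* virtual IF set */
    regs->eflags |= curr->arch.pv_vcpu.iopl;  /* virtual IOPL */
    /* ... x86_emulate(): may alter only arithmetic flags and RF ... */
    /* exit: strip the virtual state back out */
    regs->eflags |= X86_EFLAGS_IF;            /* real IF is always set for PV */
    regs->eflags &= ~X86_EFLAGS_IOPL;         /* real IOPL stays 0 */

The ASSERT on the eflags delta is what enforces the "arithmetic flags and RF only" rule.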
+
+
+static inline int check_stack_limit(unsigned int ar, unsigned int limit,
+                                    unsigned int esp, unsigned int decr)
+{
+    return (((esp - decr) < (esp - 1)) &&
+            (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
+}
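
A worked example of check_stack_limit() (illustrative only, not part of the patch): _SEGMENT_EC distinguishes expand-down from expand-up stack segments, and the (esp - decr) < (esp - 1) term rejects wraparound below zero:

    /* expand-up, limit = 0xffff: valid bytes are [0, limit]              */
    /*   esp = 0x1000, decr = 16: highest byte 0x0fff <= 0xffff -> OK     */
    /* expand-down, limit = 0x0fff: valid bytes lie above the limit       */
    /*   esp = 0x1000, decr = 16: lowest byte 0x0ff0 > 0x0fff? no -> deny */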
+
+struct gate_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    bool insn_fetch;
+};
+
+static int gate_op_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct gate_op_ctxt *goc =
+        container_of(ctxt, struct gate_op_ctxt, ctxt);
+    unsigned int rc = bytes, sel = 0;
+    unsigned long addr = offset, limit = 0;
+
+    switch ( seg )
+    {
+    case x86_seg_cs:
+        addr += goc->cs.base;
+        limit = goc->cs.limit;
+        break;
+    case x86_seg_ds:
+        sel = read_sreg(ds);
+        break;
+    case x86_seg_es:
+        sel = read_sreg(es);
+        break;
+    case x86_seg_fs:
+        sel = read_sreg(fs);
+        break;
+    case x86_seg_gs:
+        sel = read_sreg(gs);
+        break;
+    case x86_seg_ss:
+        sel = ctxt->regs->ss;
+        break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    if ( sel )
+    {
+        unsigned int ar;
+
+        ASSERT(!goc->insn_fetch);
+        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+             !(ar & _SEGMENT_S) ||
+             !(ar & _SEGMENT_P) ||
+             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+            return X86EMUL_UNHANDLEABLE;
+        addr += offset;
+    }
+    else if ( seg != x86_seg_cs )
+        return X86EMUL_UNHANDLEABLE;
+
+    /* We don't mean to emulate any branches. */
+    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+        return X86EMUL_UNHANDLEABLE;
+
+    addr = (uint32_t)addr;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+    {
+        /*
+         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+         * cpu_has_nx, but we'd then need a "fetch" variant of
+         * __copy_from_user() respecting NX, SMEP, and protection keys.
+         */
+        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
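
The two-part bounds check above is phrased to avoid integer wraparound in offset + bytes - 1; a quick example (illustrative only):

    /* limit = 0xffff, bytes = 4:                                     */
    /*   offset = 0xfffc: 0xfffc <= 0xfffc (limit - bytes + 1) -> OK  */
    /*   offset = 0xfffd: 0xfffd >  0xfffc -> X86EMUL_UNHANDLEABLE    */
    /* limit = 2, bytes = 4: limit < bytes - 1 -> always rejected     */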
+
+void emulate_gate_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    unsigned int sel, ar, dpl, nparm, insn_len;
+    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+    struct x86_emulate_state *state;
+    unsigned long off, base, limit;
+    uint16_t opnd_sel = 0;
+    int jump = -1, rc = X86EMUL_OKAY;
+
+    /* Check whether this fault is due to the use of a call gate. */
+    if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
+         (((ar >> 13) & 3) < (regs->cs & 3)) ||
+         ((ar & _SEGMENT_TYPE) != 0xc00) )
+    {
+        pv_inject_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+    if ( !(ar & _SEGMENT_P) )
+    {
+        pv_inject_guest_trap(TRAP_no_segment, regs);
+        return;
+    }
+    dpl = (ar >> 13) & 3;
+    nparm = ar & 0x1f;
+
+    /*
+     * Decode instruction (and perhaps operand) to determine RPL,
+     * whether this is a jump or a call, and the call return offset.
+     */
+    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 0) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+    {
+        pv_inject_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+    ctxt.insn_fetch = false;
+    if ( IS_ERR_OR_NULL(state) )
+    {
+        if ( PTR_ERR(state) == -X86EMUL_EXCEPTION )
+        {
+            ASSERT(ctxt.ctxt.event_pending);
+            pv_inject_event(&ctxt.ctxt.event);
+        }
+        else
+        {
+            ASSERT(!ctxt.ctxt.event_pending);
+            pv_inject_guest_trap(TRAP_gp_fault, regs);
+        }
+        return;
+    }
+
+    switch ( ctxt.ctxt.opcode )
+    {
+        unsigned int modrm_345;
+
+    case 0xea:
+        ++jump;
+        /* fall through */
+    case 0x9a:
+        ++jump;
+        opnd_sel = x86_insn_immediate(state, 1);
+        break;
+    case 0xff:
+        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
+            break;
+        switch ( modrm_345 & 7 )
+        {
+            enum x86_segment seg;
+
+        case 5:
+            ++jump;
+            /* fall through */
+        case 3:
+            ++jump;
+            base = x86_insn_operand_ea(state, &seg);
+            rc = gate_op_read(seg,
+                              base + (x86_insn_opsize(state) >> 3),
+                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
+            break;
+        }
+        break;
+    }
+
+    insn_len = x86_insn_length(state, &ctxt.ctxt);
+    x86_emulate_free_state(state);
+
+    if ( rc == X86EMUL_EXCEPTION )
+    {
+        ASSERT(ctxt.ctxt.event_pending);
+        pv_inject_event(&ctxt.ctxt.event);
+        return;
+    }
+
+    ASSERT(!ctxt.ctxt.event_pending);
+
+    if ( rc != X86EMUL_OKAY ||
+         jump < 0 ||
+         (opnd_sel & ~3) != regs->error_code ||
+         dpl < (opnd_sel & 3) )
+    {
+        pv_inject_guest_trap(TRAP_gp_fault, regs);
+        return;
+    }
+
+    if ( !read_descriptor(sel, v, &base, &limit, &ar, 0) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_CODE) ||
+         (!jump || (ar & _SEGMENT_EC) ?
+          ((ar >> 13) & 3) > (regs->cs & 3) :
+          ((ar >> 13) & 3) != (regs->cs & 3)) )
+    {
+        pv_inject_hw_exception(TRAP_gp_fault, sel);
+        return;
+    }
+    if ( !(ar & _SEGMENT_P) )
+    {
+        pv_inject_hw_exception(TRAP_no_segment, sel);
+        return;
+    }
+    if ( off > limit )
+    {
+        pv_inject_hw_exception(TRAP_gp_fault, 0);
+        return;
+    }
+
+    if ( !jump )
+    {
+        unsigned int ss, esp, *stkp;
+        int rc;
+#define push(item) do \
+        { \
+            --stkp; \
+            esp -= 4; \
+            rc = __put_user(item, stkp); \
+            if ( rc ) \
+            { \
+                pv_inject_page_fault(PFEC_write_access, \
+                                     (unsigned long)(stkp + 1) - rc); \
+                return; \
+            } \
+        } while ( 0 )
+
+        if ( ((ar >> 13) & 3) < (regs->cs & 3) )
+        {
+            sel |= (ar >> 13) & 3;
+            /* Inner stack known only for kernel ring. */
+            if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
+            {
+                pv_inject_guest_trap(TRAP_gp_fault, regs);
+                return;
+            }
+            esp = v->arch.pv_vcpu.kernel_sp;
+            ss = v->arch.pv_vcpu.kernel_ss;
+            if ( (ss & 3) != (sel & 3) ||
+                 !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
+                 ((ar >> 13) & 3) != (sel & 3) ||
+                 !(ar & _SEGMENT_S) ||
+                 (ar & _SEGMENT_CODE) ||
+                 !(ar & _SEGMENT_WR) )
+            {
+                pv_inject_hw_exception(TRAP_invalid_tss, ss & ~3);
+                return;
+            }
+            if ( !(ar & _SEGMENT_P) ||
+                 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
+            {
+                pv_inject_hw_exception(TRAP_stack_error, ss & ~3);
+                return;
+            }
+            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+            if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
+            {
+                pv_inject_guest_trap(TRAP_gp_fault, regs);
+                return;
+            }
+            push(regs->ss);
+            push(regs->rsp);
+            if ( nparm )
+            {
+                const unsigned int *ustkp;
+
+                if ( !read_descriptor(regs->ss, v, &base, &limit, &ar, 0) ||
+                     ((ar >> 13) & 3) != (regs->cs & 3) ||
+                     !(ar & _SEGMENT_S) ||
+                     (ar & _SEGMENT_CODE) ||
+                     !(ar & _SEGMENT_WR) ||
+                     !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
+                    return pv_inject_guest_trap(TRAP_gp_fault, regs);
+                ustkp = (unsigned int *)(unsigned long)
+                        ((unsigned int)base + regs->esp + nparm * 4);
+                if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
+                {
+                    pv_inject_guest_trap(TRAP_gp_fault, regs);
+                    return;
+                }
+                do
+                {
+                    unsigned int parm;
+
+                    --ustkp;
+                    rc = __get_user(parm, ustkp);
+                    if ( rc )
+                    {
+                        pv_inject_page_fault(0,
+                                             (unsigned long)(ustkp + 1) - rc);
+                        return;
+                    }
+                    push(parm);
+                } while ( --nparm );
+            }
+        }
+        else
+        {
+            sel |= (regs->cs & 3);
+            esp = regs->rsp;
+            ss = regs->ss;
+            if ( !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
+                 ((ar >> 13) & 3) != (sel & 3) )
+            {
+                pv_inject_guest_trap(TRAP_gp_fault, regs);
+                return;
+            }
+            if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
+            {
+                pv_inject_hw_exception(TRAP_stack_error, 0);
+                return;
+            }
+            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+            if ( !compat_access_ok(stkp - 2, 2 * 4) )
+            {
+                pv_inject_guest_trap(TRAP_gp_fault, regs);
+                return;
+            }
+        }
+        push(regs->cs);
+        push(regs->rip + insn_len);
+#undef push
+        regs->rsp = esp;
+        regs->ss = ss;
+    }
+    else
+        sel |= (regs->cs & 3);
+
+    regs->cs = sel;
+    instruction_done(regs, off);
+}
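
For a privilege-increasing CALL through the gate (the !jump path with a ring change), the push() sequence above builds the inner-ring stack frame the hardware would have built; a sketch for a 32-bit guest with nparm gate parameters (illustrative only):

    /* higher addresses                                          */
    /*   old SS                                                  */
    /*   old ESP                                                 */
    /*   param[nparm-1] ... param[0]  (copied from outer stack)  */
    /*   old CS                                                  */
    /*   return EIP = rip + insn_len                             */
    /* lower addresses  <- new ESP                               */

A JMP through the gate (jump != 0) skips the stack switch entirely and only loads the new CS, with execution continuing at the gate's target offset via instruction_done(regs, off).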
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index a1981289a4..351bb950d8 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -65,7 +65,6 @@ 
 #include <asm/debugger.h>
 #include <asm/msr.h>
 #include <asm/shared.h>
-#include <asm/x86_emulate.h>
 #include <asm/traps.h>
 #include <asm/hvm/vpt.h>
 #include <asm/hypercall.h>
@@ -544,20 +543,6 @@  static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
     return 1;
 }
 
-static const char *trapstr(unsigned int trapnr)
-{
-    static const char * const strings[] = {
-        "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
-        "invalid opcode", "device not available", "double fault",
-        "coprocessor segment", "invalid tss", "segment not found",
-        "stack error", "general protection fault", "page fault",
-        "spurious interrupt", "coprocessor error", "alignment check",
-        "machine check", "simd error", "virtualisation exception"
-    };
-
-    return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
-}
-
 /*
  * This is called for faults at very unexpected times (e.g., when interrupts
  * are disabled). In such situations we can't do much that is safe. We try to
@@ -625,138 +610,6 @@  void fatal_trap(const struct cpu_user_regs *regs, bool_t show_remote)
           (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
 }
 
-void pv_inject_event(const struct x86_event *event)
-{
-    struct vcpu *v = current;
-    struct cpu_user_regs *regs = guest_cpu_user_regs();
-    struct trap_bounce *tb;
-    const struct trap_info *ti;
-    const uint8_t vector = event->vector;
-    const bool use_error_code =
-        ((vector < 32) && (TRAP_HAVE_EC & (1u << vector)));
-    unsigned int error_code = event->error_code;
-
-    ASSERT(vector == event->vector); /* Confirm no truncation. */
-    if ( use_error_code )
-        ASSERT(error_code != X86_EVENT_NO_EC);
-    else
-        ASSERT(error_code == X86_EVENT_NO_EC);
-
-    tb = &v->arch.pv_vcpu.trap_bounce;
-    ti = &v->arch.pv_vcpu.trap_ctxt[vector];
-
-    tb->flags = TBF_EXCEPTION;
-    tb->cs    = ti->cs;
-    tb->eip   = ti->address;
-
-    if ( vector == TRAP_page_fault )
-    {
-        v->arch.pv_vcpu.ctrlreg[2] = event->cr2;
-        arch_set_cr2(v, event->cr2);
-
-        /* Re-set error_code.user flag appropriately for the guest. */
-        error_code &= ~PFEC_user_mode;
-        if ( !guest_kernel_mode(v, regs) )
-            error_code |= PFEC_user_mode;
-
-        trace_pv_page_fault(event->cr2, error_code);
-    }
-    else
-        trace_pv_trap(vector, regs->rip, use_error_code, error_code);
-
-    if ( use_error_code )
-    {
-        tb->flags |= TBF_EXCEPTION_ERRCODE;
-        tb->error_code = error_code;
-    }
-
-    if ( TI_GET_IF(ti) )
-        tb->flags |= TBF_INTERRUPT;
-
-    if ( unlikely(null_trap_bounce(v, tb)) )
-    {
-        gprintk(XENLOG_WARNING,
-                "Unhandled %s fault/trap [#%d, ec=%04x]\n",
-                trapstr(vector), vector, error_code);
-
-        if ( vector == TRAP_page_fault )
-            show_page_walk(event->cr2);
-    }
-}
-
-static void instruction_done(struct cpu_user_regs *regs, unsigned long rip)
-{
-    regs->rip = rip;
-    regs->eflags &= ~X86_EFLAGS_RF;
-    if ( regs->eflags & X86_EFLAGS_TF )
-    {
-        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
-        pv_inject_guest_trap(TRAP_debug, regs);
-    }
-}
-
-static unsigned int check_guest_io_breakpoint(struct vcpu *v,
-    unsigned int port, unsigned int len)
-{
-    unsigned int width, i, match = 0;
-    unsigned long start;
-
-    if ( !(v->arch.debugreg[5]) ||
-         !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
-        return 0;
-
-    for ( i = 0; i < 4; i++ )
-    {
-        if ( !(v->arch.debugreg[5] &
-               (3 << (i * DR_ENABLE_SIZE))) )
-            continue;
-
-        start = v->arch.debugreg[i];
-        width = 0;
-
-        switch ( (v->arch.debugreg[7] >>
-                  (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
-        {
-        case DR_LEN_1: width = 1; break;
-        case DR_LEN_2: width = 2; break;
-        case DR_LEN_4: width = 4; break;
-        case DR_LEN_8: width = 8; break;
-        }
-
-        if ( (start < (port + len)) && ((start + width) > port) )
-            match |= 1 << i;
-    }
-
-    return match;
-}
-
-/*
- * Called from asm to set up the MCE trapbounce info.
- * Returns 0 if no callback is set up, else 1.
- */
-int set_guest_machinecheck_trapbounce(void)
-{
-    struct vcpu *v = current;
-    struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
- 
-    pv_inject_guest_trap(TRAP_machine_check, guest_cpu_user_regs());
-    tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
-    return !null_trap_bounce(v, tb);
-}
-
-/*
- * Called from asm to set up the NMI trapbounce info.
- * Returns 0 if no callback is set up, else 1.
- */
-int set_guest_nmi_trapbounce(void)
-{
-    struct vcpu *v = current;
-    struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce;
-    pv_inject_guest_trap(TRAP_nmi, guest_cpu_user_regs());
-    tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
-    return !null_trap_bounce(v, tb);
-}
-
 void do_reserved_trap(struct cpu_user_regs *regs)
 {
     unsigned int trapnr = regs->entry_vector;
@@ -997,77 +850,6 @@  void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
     }
 }
 
-static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
-{
-    char opcode[3];
-    unsigned long eip, rc;
-    struct vcpu *v = current;
-
-    eip = regs->rip;
-    if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
-    {
-        pv_inject_page_fault(0, eip + sizeof(opcode) - rc);
-        return EXCRET_fault_fixed;
-    }
-    if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
-        return 0;
-    eip += sizeof(opcode);
-    pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip);
-    return EXCRET_fault_fixed;
-}
-
-static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
-{
-    char sig[5], instr[2];
-    unsigned long eip, rc;
-    struct cpuid_leaf res;
-
-    eip = regs->rip;
-
-    /* Check for forced emulation signature: ud2 ; .ascii "xen". */
-    if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
-    {
-        pv_inject_page_fault(0, eip + sizeof(sig) - rc);
-        return EXCRET_fault_fixed;
-    }
-    if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
-        return 0;
-    eip += sizeof(sig);
-
-    /* We only emulate CPUID. */
-    if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
-    {
-        pv_inject_page_fault(0, eip + sizeof(instr) - rc);
-        return EXCRET_fault_fixed;
-    }
-    if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
-        return 0;
-
-    /* If cpuid faulting is enabled and CPL>0 inject a #GP in place of #UD. */
-    if ( current->arch.cpuid_faulting && !guest_kernel_mode(current, regs) )
-    {
-        regs->rip = eip;
-        pv_inject_guest_trap(TRAP_gp_fault, regs);
-        return EXCRET_fault_fixed;
-    }
-
-    eip += sizeof(instr);
-
-    guest_cpuid(current, regs->eax, regs->ecx, &res);
-
-    regs->rax = res.a;
-    regs->rbx = res.b;
-    regs->rcx = res.c;
-    regs->rdx = res.d;
-
-    instruction_done(regs, eip);
-
-    trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->rip);
-
-    return EXCRET_fault_fixed;
-}
-
 void do_invalid_op(struct cpu_user_regs *regs)
 {
     const struct bug_frame *bug = NULL;
@@ -1576,1963 +1358,173 @@  void __init do_early_page_fault(struct cpu_user_regs *regs)
     }
 }
 
-long do_fpu_taskswitch(int set)
+void do_general_protection(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
+    unsigned long fixup;
 
-    if ( set )
-    {
-        v->arch.pv_vcpu.ctrlreg[0] |= X86_CR0_TS;
-        stts();
-    }
-    else
-    {
-        v->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
-        if ( v->fpu_dirtied )
-            clts();
-    }
+    if ( debugger_trap_entry(TRAP_gp_fault, regs) )
+        return;
 
-    return 0;
-}
+    if ( regs->error_code & X86_XEC_EXT )
+        goto hardware_gp;
 
-static int read_descriptor(unsigned int sel,
-                           const struct vcpu *v,
-                           unsigned long *base,
-                           unsigned long *limit,
-                           unsigned int *ar,
-                           bool_t insn_fetch)
-{
-    struct desc_struct desc;
-
-    if ( sel < 4)
-        desc.b = desc.a = 0;
-    else if ( __get_user(desc,
-                         (const struct desc_struct *)(!(sel & 4)
-                                                      ? GDT_VIRT_START(v)
-                                                      : LDT_VIRT_START(v))
-                         + (sel >> 3)) )
-        return 0;
-    if ( !insn_fetch )
-        desc.b &= ~_SEGMENT_L;
+    if ( !guest_mode(regs) )
+        goto gp_in_kernel;
 
-    *ar = desc.b & 0x00f0ff00;
-    if ( !(desc.b & _SEGMENT_L) )
+    /*
+     * Cunning trick to allow arbitrary "INT n" handling.
+     *
+     * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
+     * instruction from trapping to the appropriate vector, when that might not
+     * be expected by Xen or the guest OS. For example, that entry might be for
+     * a fault handler (unlike traps, faults don't increment EIP), or might
+     * expect an error code on the stack (which a software trap never
+     * provides), or might be a hardware interrupt handler that doesn't like
+     * being called spuriously.
+     *
+     * Instead, a GPF occurs with the faulting IDT vector in the error code.
+     * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
+     * clear (which was already checked above) to indicate that it's a software
+     * fault, not a hardware one.
+     *
+     * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
+     * okay because they can only be triggered by an explicit DPL-checked
+     * instruction. The DPL specified by the guest OS for these vectors is NOT
+     * CHECKED!!
+     */
+    if ( regs->error_code & X86_XEC_IDT )
     {
-        *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
-                 (desc.b & 0xff000000));
-        *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
-        if ( desc.b & _SEGMENT_G )
-            *limit = ((*limit + 1) << 12) - 1;
-#ifndef NDEBUG
-        if ( sel > 3 )
+        /* This fault must be due to <INT n> instruction. */
+        const struct trap_info *ti;
+        unsigned char vector = regs->error_code >> 3;
+        ti = &v->arch.pv_vcpu.trap_ctxt[vector];
+        if ( permit_softint(TI_GET_DPL(ti), v, regs) )
         {
-            unsigned int a, l;
-            unsigned char valid;
-
-            asm volatile (
-                "larl %2,%0 ; setz %1"
-                : "=r" (a), "=qm" (valid) : "rm" (sel));
-            BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
-            asm volatile (
-                "lsll %2,%0 ; setz %1"
-                : "=r" (l), "=qm" (valid) : "rm" (sel));
-            BUG_ON(valid && (l != *limit));
+            regs->rip += 2;
+            pv_inject_guest_trap(vector, regs);
+            return;
         }
-#endif
     }
-    else
+    else if ( is_pv_32bit_vcpu(v) && regs->error_code )
     {
-        *base = 0UL;
-        *limit = ~0UL;
+        emulate_gate_op(regs);
+        return;
     }
 
-    return 1;
-}
-
-static int read_gate_descriptor(unsigned int gate_sel,
-                                const struct vcpu *v,
-                                unsigned int *sel,
-                                unsigned long *off,
-                                unsigned int *ar)
-{
-    struct desc_struct desc;
-    const struct desc_struct *pdesc;
-
-
-    pdesc = (const struct desc_struct *)
-        (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
-        + (gate_sel >> 3);
-    if ( (gate_sel < 4) ||
-         ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
-         __get_user(desc, pdesc) )
-        return 0;
+    /* Emulate some simple privileged and I/O instructions. */
+    if ( (regs->error_code == 0) &&
+         emulate_privileged_op(regs) )
+    {
+        trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
+        return;
+    }
 
-    *sel = (desc.a >> 16) & 0x0000fffc;
-    *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
-    *ar = desc.b & 0x0000ffff;
+    /* Pass on GPF as is. */
+    pv_inject_guest_trap(TRAP_gp_fault, regs);
+    return;
 
-    /*
-     * check_descriptor() clears the DPL field and stores the
-     * guest requested DPL in the selector's RPL field.
-     */
-    if ( *ar & _SEGMENT_DPL )
-        return 0;
-    *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+ gp_in_kernel:
 
-    if ( !is_pv_32bit_vcpu(v) )
+    if ( likely((fixup = search_exception_table(regs)) != 0) )
     {
-        if ( (*ar & 0x1f00) != 0x0c00 ||
-             (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
-             __get_user(desc, pdesc + 1) ||
-             (desc.b & 0x1f00) )
-            return 0;
-
-        *off |= (unsigned long)desc.a << 32;
-        return 1;
+        dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
+                regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
+        this_cpu(last_extable_addr) = regs->rip;
+        regs->rip = fixup;
+        return;
     }
 
-    switch ( *ar & 0x1f00 )
-    {
-    case 0x0400:
-        *off &= 0xffff;
-        break;
-    case 0x0c00:
-        break;
-    default:
-        return 0;
-    }
+ hardware_gp:
+    if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
+        return;
 
-    return 1;
+    show_execution_state(regs);
+    panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
 }
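
To make the "INT n" path above concrete (a worked example, not part of the patch): a guest executing int $0x80 with every IDT DPL forced to 0 takes #GP with error code (0x80 << 3) | X86_XEC_IDT = 0x402. The handler recovers vector = 0x402 >> 3 = 0x80, and if permit_softint() accepts the DPL the guest registered for that vector, it steps rip past the two-byte cd 80 instruction and bounces vector 0x80 to the guest's handler.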
 
-static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
-                                  unsigned int bytes, unsigned long limit,
-                                  enum x86_segment seg,
-                                  struct x86_emulate_ctxt *ctxt,
-                                  unsigned long *addr)
+static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
+
+static void nmi_mce_softirq(void)
 {
-    int rc = X86EMUL_OKAY;
+    int cpu = smp_processor_id();
+    struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
 
-    *addr = base + offset;
+    BUG_ON(st->vcpu == NULL);
+
+    /* Set the tmp value unconditionally, so that
+     * the check in the iret hypercall works. */
+    cpumask_copy(st->vcpu->cpu_hard_affinity_tmp,
+                 st->vcpu->cpu_hard_affinity);
 
-    if ( ctxt->addr_size < 64 )
+    if ((cpu != st->processor)
+       || (st->processor != st->vcpu->processor))
     {
-        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
-            rc = X86EMUL_EXCEPTION;
-        *addr = (uint32_t)*addr;
-    }
-    else if ( !__addr_ok(*addr) )
-        rc = X86EMUL_EXCEPTION;
+        /* We are on a different physical cpu.
+         * Make sure to wakeup the vcpu on the
+         * specified processor.
+         */
+        vcpu_set_hard_affinity(st->vcpu, cpumask_of(st->processor));
 
-    if ( unlikely(rc == X86EMUL_EXCEPTION) )
-        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
-                                                : TRAP_stack_error,
-                              0, ctxt);
+        /* Affinity is restored in the iret hypercall. */
+    }
 
-    return rc;
+    /* Only used to defer wakeup of domain/vcpu to
+     * a safe (non-NMI/MCE) context.
+     */
+    vcpu_kick(st->vcpu);
+    st->vcpu = NULL;
 }
 
-struct priv_op_ctxt {
-    struct x86_emulate_ctxt ctxt;
-    struct {
-        unsigned long base, limit;
-    } cs;
-    char *io_emul_stub;
-    unsigned int bpmatch;
-    unsigned int tsc;
-#define TSC_BASE 1
-#define TSC_AUX 2
-};
-
-static int priv_op_insn_fetch(enum x86_segment seg,
-                              unsigned long offset,
-                              void *p_data,
-                              unsigned int bytes,
-                              struct x86_emulate_ctxt *ctxt)
+static void pci_serr_softirq(void)
 {
-    const struct priv_op_ctxt *poc =
-        container_of(ctxt, struct priv_op_ctxt, ctxt);
-    unsigned int rc;
-    unsigned long addr = poc->cs.base + offset;
+    printk("\n\nNMI - PCI system error (SERR)\n");
+    outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
+}
 
-    ASSERT(seg == x86_seg_cs);
+void async_exception_cleanup(struct vcpu *curr)
+{
+    int trap;
 
-    /* We don't mean to emulate any branches. */
-    if ( !bytes )
-        return X86EMUL_UNHANDLEABLE;
+    if ( !curr->async_exception_mask )
+        return;
 
-    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
-                                x86_seg_cs, ctxt, &addr);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
+    /* Restore affinity.  */
+    if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
+         !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
+    {
+        vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
+        cpumask_clear(curr->cpu_hard_affinity_tmp);
+    }
 
-    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
+        trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
+    else
+        for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
+            if ( (curr->async_exception_mask ^
+                  curr->async_exception_state(trap).old_mask) == (1 << trap) )
+                break;
+    if ( unlikely(trap > VCPU_TRAP_LAST) )
     {
-        /*
-         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
-         * cpu_has_nx, but we'd then need a "fetch" variant of
-         * __copy_from_user() respecting NX, SMEP, and protection keys.
-         */
-        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
-        return X86EMUL_EXCEPTION;
+        ASSERT_UNREACHABLE();
+        return;
     }
 
-    return X86EMUL_OKAY;
+    /* Restore previous asynchronous exception mask. */
+    curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
 }
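
The !(m & (m - 1)) test above is the usual "at most one bit set" trick: clearing the lowest set bit leaves zero exactly when no other bit was set. For example (illustrative only):

    /* mask = 0b010, one event pending: 0b010 & 0b001 == 0, so       */
    /*   trap = __scanbit(mask, VCPU_TRAP_NONE) yields its index.    */
    /* mask = 0b110, two events pending: 0b110 & 0b101 == 0b100, so  */
    /*   walk the traps and match against the saved old_mask.        */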
 
-static int priv_op_read_segment(enum x86_segment seg,
-                                struct segment_register *reg,
-                                struct x86_emulate_ctxt *ctxt)
+static void nmi_hwdom_report(unsigned int reason_idx)
 {
-    /* Check if this is an attempt to access the I/O bitmap. */
-    if ( seg == x86_seg_tr )
-    {
-        switch ( ctxt->opcode )
-        {
-        case 0x6c ... 0x6f: /* ins / outs */
-        case 0xe4 ... 0xe7: /* in / out (immediate port) */
-        case 0xec ... 0xef: /* in / out (port in %dx) */
-            /* Defer the check to priv_op_{read,write}_io(). */
-            return X86EMUL_DONE;
-        }
-    }
+    struct domain *d = hardware_domain;
 
-    if ( ctxt->addr_size < 64 )
-    {
-        unsigned long limit;
-        unsigned int sel, ar;
+    if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
+        return;
 
-        switch ( seg )
-        {
-        case x86_seg_cs: sel = ctxt->regs->cs; break;
-        case x86_seg_ds: sel = read_sreg(ds);  break;
-        case x86_seg_es: sel = read_sreg(es);  break;
-        case x86_seg_fs: sel = read_sreg(fs);  break;
-        case x86_seg_gs: sel = read_sreg(gs);  break;
-        case x86_seg_ss: sel = ctxt->regs->ss; break;
-        default: return X86EMUL_UNHANDLEABLE;
-        }
+    set_bit(reason_idx, nmi_reason(d));
 
-        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
-            return X86EMUL_UNHANDLEABLE;
-
-        reg->limit = limit;
-        reg->attr.bytes = ar >> 8;
-    }
-    else
-    {
-        switch ( seg )
-        {
-        default:
-            if ( !is_x86_user_segment(seg) )
-                return X86EMUL_UNHANDLEABLE;
-            reg->base = 0;
-            break;
-        case x86_seg_fs:
-            reg->base = rdfsbase();
-            break;
-        case x86_seg_gs:
-            reg->base = rdgsbase();
-            break;
-        }
-
-        reg->limit = ~0U;
-
-        reg->attr.bytes = 0;
-        reg->attr.fields.type = _SEGMENT_WR >> 8;
-        if ( seg == x86_seg_cs )
-        {
-            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
-            reg->attr.fields.l = 1;
-        }
-        else
-            reg->attr.fields.db = 1;
-        reg->attr.fields.s   = 1;
-        reg->attr.fields.dpl = 3;
-        reg->attr.fields.p   = 1;
-        reg->attr.fields.g   = 1;
-    }
-
-    /*
-     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
-     * Also do this for consistency for non-conforming code segments.
-     */
-    if ( (seg == x86_seg_ss ||
-          (seg == x86_seg_cs &&
-           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
-         guest_kernel_mode(current, ctxt->regs) )
-        reg->attr.fields.dpl = 0;
-
-    return X86EMUL_OKAY;
-}
-
-/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
-static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
-{
-    unsigned int cpl = guest_kernel_mode(v, regs) ?
-        (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
-
-    ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
-
-    return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
-}
-
-/* Has the guest requested sufficient permission for this I/O access? */
-static int guest_io_okay(
-    unsigned int port, unsigned int bytes,
-    struct vcpu *v, struct cpu_user_regs *regs)
-{
-    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
-    int user_mode = !(v->arch.flags & TF_kernel_mode);
-#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
-
-    if ( iopl_ok(v, regs) )
-        return 1;
-
-    if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
-    {
-        union { uint8_t bytes[2]; uint16_t mask; } x;
-
-        /*
-         * Grab permission bytes from guest space. Inaccessible bytes are
-         * read as 0xff (no access allowed).
-         */
-        TOGGLE_MODE();
-        switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
-                                          port>>3, 2) )
-        {
-        default: x.bytes[0] = ~0;
-            /* fallthrough */
-        case 1:  x.bytes[1] = ~0;
-            /* fallthrough */
-        case 0:  break;
-        }
-        TOGGLE_MODE();
-
-        if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
-            return 1;
-    }
-
-    return 0;
-}
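
The bitmap test in guest_io_okay() follows the hardware TSS I/O-permission-bitmap convention; a worked example (illustrative only):

    /* inb from port 0x61, bytes = 1:                                */
    /*   fetch two bytes at iobmp + (0x61 >> 3) = iobmp + 0x0c;      */
    /*   mask = ((1 << 1) - 1) << (0x61 & 7) = 0x02;                 */
    /*   the access is allowed iff that bit is clear in the bitmap.  */

Two bytes are always fetched so that accesses straddling a byte boundary are handled, and inaccessible bytes read as all ones, i.e. deny.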
-
-/* Has the administrator granted sufficient permission for this I/O access? */
-static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
-                            const struct domain *d)
-{
-    /*
-     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
-     * We never permit direct access to that register.
-     */
-    if ( (port == 0xcf8) && (bytes == 4) )
-        return 0;
-
-    /* We also never permit direct access to the RTC/CMOS registers. */
-    if ( ((port & ~1) == RTC_PORT(0)) )
-        return 0;
-
-    return ioports_access_permitted(d, port, port + bytes - 1);
-}
-
-static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
-                         unsigned int size, uint32_t *write)
-{
-    uint32_t machine_bdf;
-
-    if ( !is_hardware_domain(currd) )
-        return 0;
-
-    if ( !CF8_ENABLED(currd->arch.pci_cf8) )
-        return 1;
-
-    machine_bdf = CF8_BDF(currd->arch.pci_cf8);
-    if ( write )
-    {
-        const unsigned long *ro_map = pci_get_ro_map(0);
-
-        if ( ro_map && test_bit(machine_bdf, ro_map) )
-            return 0;
-    }
-    start |= CF8_ADDR_LO(currd->arch.pci_cf8);
-    /* AMD extended configuration space access? */
-    if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
-         boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-         boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
-    {
-        uint64_t msr_val;
-
-        if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
-            return 0;
-        if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
-            start |= CF8_ADDR_HI(currd->arch.pci_cf8);
-    }
-
-    return !write ?
-           xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
-                                     start, start + size - 1, 0) == 0 :
-           pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
-}
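
Background for the latched-CF8 logic above (standard PCI configuration mechanism #1; a sketch, not part of the patch): the guest writes a config address to port 0xcf8 and then accesses the selected register through ports 0xcfc-0xcff:

    outl(0x80000000 |                  /* bit 31: enable */
         (bus << 16) | (dev << 11) | (fn << 8) | (reg & 0xfc),
         0xcf8);
    val = inl(0xcfc);                  /* read the selected dword */

pci_cfg_ok() re-derives the BDF and register offset from the latched currd->arch.pci_cf8 so that the read-only-device map and XSM can be consulted before the access is forwarded.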
-
-uint32_t guest_io_read(unsigned int port, unsigned int bytes,
-                       struct domain *currd)
-{
-    uint32_t data = 0;
-    unsigned int shift = 0;
-
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        switch ( bytes )
-        {
-        case 1: return inb(port);
-        case 2: return inw(port);
-        case 4: return inl(port);
-        }
-    }
-
-    while ( bytes != 0 )
-    {
-        unsigned int size = 1;
-        uint32_t sub_data = ~0;
-
-        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
-        {
-            sub_data = pv_pit_handler(port, 0, 0);
-        }
-        else if ( port == RTC_PORT(0) )
-        {
-            sub_data = currd->arch.cmos_idx;
-        }
-        else if ( (port == RTC_PORT(1)) &&
-                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
-        {
-            unsigned long flags;
-
-            spin_lock_irqsave(&rtc_lock, flags);
-            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
-            sub_data = inb(RTC_PORT(1));
-            spin_unlock_irqrestore(&rtc_lock, flags);
-        }
-        else if ( (port == 0xcf8) && (bytes == 4) )
-        {
-            size = 4;
-            sub_data = currd->arch.pci_cf8;
-        }
-        else if ( (port & 0xfffc) == 0xcfc )
-        {
-            size = min(bytes, 4 - (port & 3));
-            if ( size == 3 )
-                size = 2;
-            if ( pci_cfg_ok(currd, port & 3, size, NULL) )
-                sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
-        }
-
-        if ( size == 4 )
-            return sub_data;
-
-        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
-        shift += size * 8;
-        port += size;
-        bytes -= size;
-    }
-
-    return data;
-}
-
-void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
-                    struct domain *currd)
-{
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        switch ( bytes ) {
-        case 1:
-            outb((uint8_t)data, port);
-            if ( pv_post_outb_hook )
-                pv_post_outb_hook(port, (uint8_t)data);
-            break;
-        case 2:
-            outw((uint16_t)data, port);
-            break;
-        case 4:
-            outl(data, port);
-            break;
-        }
-        return;
-    }
-
-    while ( bytes != 0 )
-    {
-        unsigned int size = 1;
-
-        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
-        {
-            pv_pit_handler(port, (uint8_t)data, 1);
-        }
-        else if ( port == RTC_PORT(0) )
-        {
-            currd->arch.cmos_idx = data;
-        }
-        else if ( (port == RTC_PORT(1)) &&
-                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
-        {
-            unsigned long flags;
-
-            if ( pv_rtc_handler )
-                pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
-            spin_lock_irqsave(&rtc_lock, flags);
-            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
-            outb(data, RTC_PORT(1));
-            spin_unlock_irqrestore(&rtc_lock, flags);
-        }
-        else if ( (port == 0xcf8) && (bytes == 4) )
-        {
-            size = 4;
-            currd->arch.pci_cf8 = data;
-        }
-        else if ( (port & 0xfffc) == 0xcfc )
-        {
-            size = min(bytes, 4 - (port & 3));
-            if ( size == 3 )
-                size = 2;
-            if ( pci_cfg_ok(currd, port & 3, size, &data) )
-                pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
-        }
-
-        if ( size == 4 )
-            return;
-
-        port += size;
-        bytes -= size;
-        data >>= size * 8;
-    }
-}
-
-/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
-void host_to_guest_gpr_switch(struct cpu_user_regs *);
-unsigned long guest_to_host_gpr_switch(unsigned long);
-
-void (*pv_post_outb_hook)(unsigned int port, u8 value);
-
-typedef void io_emul_stub_t(struct cpu_user_regs *);
-
-static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
-                                          unsigned int port, unsigned int bytes)
-{
-    if ( !ctxt->io_emul_stub )
-        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                                             (this_cpu(stubs.addr) &
-                                              ~PAGE_MASK) +
-                                             STUB_BUF_SIZE / 2;
-
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    ctxt->io_emul_stub[0] = 0x48;
-    ctxt->io_emul_stub[1] = 0xb9;
-    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    ctxt->io_emul_stub[10] = 0xff;
-    ctxt->io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    ctxt->io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    ctxt->io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
-
-    /* Handy function-typed pointer to the stub. */
-    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-}
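
For reference, the generated stub comes out as (byte offsets per the code above):

    48 b9 <imm64>    movq $host_to_guest_gpr_switch, %rcx
    ff d1            callq *%rcx        /* load the guest's GPRs */
    66 or 90         data16 prefix for 2-byte accesses, else nop
    <opcode>         e.g. ec = in %dx, %al
    <imm8> or 90     immediate port for e4-e7, nop for the %dx forms
    c3               ret (routed to guest_to_host_gpr_switch, per the
                     comment above, restoring the host's GPRs)

ioemul_handle_quirk() may then patch the stub starting at the data16/nop slot.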
-
-static int priv_op_read_io(unsigned int port, unsigned int bytes,
-                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-
-    /* INS must not come here. */
-    ASSERT((ctxt->opcode & ~9) == 0xe4);
-
-    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        io_emul_stub_t *io_emul =
-            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
-        mark_regs_dirty(ctxt->regs);
-        io_emul(ctxt->regs);
-        return X86EMUL_DONE;
-    }
-
-    *val = guest_io_read(port, bytes, currd);
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_write_io(unsigned int port, unsigned int bytes,
-                            unsigned long val, struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-
-    /* OUTS must not come here. */
-    ASSERT((ctxt->opcode & ~9) == 0xe6);
-
-    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        io_emul_stub_t *io_emul =
-            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
-        mark_regs_dirty(ctxt->regs);
-        io_emul(ctxt->regs);
-        if ( (bytes == 1) && pv_post_outb_hook )
-            pv_post_outb_hook(port, val);
-        return X86EMUL_DONE;
-    }
-
-    guest_io_write(port, bytes, val, currd);
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_ins(uint16_t port,
-                           enum x86_segment seg, unsigned long offset,
-                           unsigned int bytes_per_rep, unsigned long *reps,
-                           struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-    unsigned long goal = *reps;
-    struct segment_register sreg;
-    int rc;
-
-    ASSERT(seg == x86_seg_es);
-
-    *reps = 0;
-
-    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
-
-    if ( !sreg.attr.fields.p )
-        return X86EMUL_UNHANDLEABLE;
-    if ( !sreg.attr.fields.s ||
-         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
-         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
-    {
-        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
-    while ( *reps < goal )
-    {
-        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
-        unsigned long addr;
-
-        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
-                                    sreg.limit, x86_seg_es, ctxt, &addr);
-        if ( rc != X86EMUL_OKAY )
-            return rc;
-
-        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
-        {
-            x86_emul_pagefault(PFEC_write_access,
-                               addr + bytes_per_rep - rc, ctxt);
-            return X86EMUL_EXCEPTION;
-        }
-
-        ++*reps;
-
-        if ( poc->bpmatch || hypercall_preempt_check() )
-            break;
-
-        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
-        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
-            offset -= bytes_per_rep;
-        else
-            offset += bytes_per_rep;
-    }
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
-                            uint16_t port,
-                            unsigned int bytes_per_rep, unsigned long *reps,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-    unsigned long goal = *reps;
-    struct segment_register sreg;
-    int rc;
-
-    *reps = 0;
-
-    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    rc = priv_op_read_segment(seg, &sreg, ctxt);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
-
-    if ( !sreg.attr.fields.p )
-        return X86EMUL_UNHANDLEABLE;
-    if ( !sreg.attr.fields.s ||
-         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
-          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
-    {
-        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
-                                                : TRAP_stack_error,
-                              0, ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
-    while ( *reps < goal )
-    {
-        unsigned int data = 0;
-        unsigned long addr;
-
-        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
-                                    sreg.limit, seg, ctxt, &addr);
-        if ( rc != X86EMUL_OKAY )
-            return rc;
-
-        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
-        {
-            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
-            return X86EMUL_EXCEPTION;
-        }
-
-        guest_io_write(port, bytes_per_rep, data, currd);
-
-        ++*reps;
-
-        if ( poc->bpmatch || hypercall_preempt_check() )
-            break;
-
-        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
-        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
-            offset -= bytes_per_rep;
-        else
-            offset += bytes_per_rep;
-    }
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_read_cr(unsigned int reg, unsigned long *val,
-                           struct x86_emulate_ctxt *ctxt)
-{
-    const struct vcpu *curr = current;
-
-    switch ( reg )
-    {
-    case 0: /* Read CR0 */
-        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
-        return X86EMUL_OKAY;
-
-    case 2: /* Read CR2 */
-    case 4: /* Read CR4 */
-        *val = curr->arch.pv_vcpu.ctrlreg[reg];
-        return X86EMUL_OKAY;
-
-    case 3: /* Read CR3 */
-    {
-        const struct domain *currd = curr->domain;
-        unsigned long mfn;
-
-        if ( !is_pv_32bit_domain(currd) )
-        {
-            mfn = pagetable_get_pfn(curr->arch.guest_table);
-            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-        }
-        else
-        {
-            l4_pgentry_t *pl4e =
-                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
-
-            mfn = l4e_get_pfn(*pl4e);
-            unmap_domain_page(pl4e);
-            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-        }
-        /* PTs should not be shared */
-        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
-        return X86EMUL_OKAY;
-    }
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_write_cr(unsigned int reg, unsigned long val,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    struct vcpu *curr = current;
-
-    switch ( reg )
-    {
-    case 0: /* Write CR0 */
-        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
-        {
-            gdprintk(XENLOG_WARNING,
-                    "Attempt to change unmodifiable CR0 flags\n");
-            break;
-        }
-        do_fpu_taskswitch(!!(val & X86_CR0_TS));
-        return X86EMUL_OKAY;
-
-    case 2: /* Write CR2 */
-        curr->arch.pv_vcpu.ctrlreg[2] = val;
-        arch_set_cr2(curr, val);
-        return X86EMUL_OKAY;
-
-    case 3: /* Write CR3 */
-    {
-        struct domain *currd = curr->domain;
-        unsigned long gfn;
-        struct page_info *page;
-        int rc;
-
-        gfn = !is_pv_32bit_domain(currd)
-              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
-        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
-        if ( !page )
-            break;
-        rc = new_guest_cr3(page_to_mfn(page));
-        put_page(page);
-
-        switch ( rc )
-        {
-        case 0:
-            return X86EMUL_OKAY;
-        case -ERESTART: /* retry after preemption */
-            return X86EMUL_RETRY;
-        }
-        break;
-    }
-
-    case 4: /* Write CR4 */
-        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
-        write_cr4(pv_guest_cr4_to_real_cr4(curr));
-        ctxt_switch_levelling(curr);
-        return X86EMUL_OKAY;
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_read_dr(unsigned int reg, unsigned long *val,
-                           struct x86_emulate_ctxt *ctxt)
-{
-    unsigned long res = do_get_debugreg(reg);
-
-    if ( IS_ERR_VALUE(res) )
-        return X86EMUL_UNHANDLEABLE;
-
-    *val = res;
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_write_dr(unsigned int reg, unsigned long val,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    return do_set_debugreg(reg, val) == 0
-           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
-}
-
-static inline uint64_t guest_misc_enable(uint64_t val)
-{
-    val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
-             MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
-    val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
-           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
-           MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
-    return val;
-}
-
-static inline bool is_cpufreq_controller(const struct domain *d)
-{
-    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
-            is_hardware_domain(d));
-}
-
-static int priv_op_read_msr(unsigned int reg, uint64_t *val,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    const struct vcpu *curr = current;
-    const struct domain *currd = curr->domain;
-    bool vpmu_msr = false;
-
-    switch ( reg )
-    {
-        int rc;
-
-    case MSR_FS_BASE:
-        if ( is_pv_32bit_domain(currd) )
-            break;
-        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
-        return X86EMUL_OKAY;
-
-    case MSR_GS_BASE:
-        if ( is_pv_32bit_domain(currd) )
-            break;
-        *val = cpu_has_fsgsbase ? __rdgsbase()
-                                : curr->arch.pv_vcpu.gs_base_kernel;
-        return X86EMUL_OKAY;
-
-    case MSR_SHADOW_GS_BASE:
-        if ( is_pv_32bit_domain(currd) )
-            break;
-        *val = curr->arch.pv_vcpu.gs_base_user;
-        return X86EMUL_OKAY;
-
-    /*
-     * In order to fully retain original behavior, defer calling
-     * pv_soft_rdtsc() until after emulation. This may want/need to be
-     * reconsidered.
-     */
-    case MSR_IA32_TSC:
-        poc->tsc |= TSC_BASE;
-        goto normal;
-
-    case MSR_TSC_AUX:
-        poc->tsc |= TSC_AUX;
-        if ( cpu_has_rdtscp )
-            goto normal;
-        *val = 0;
-        return X86EMUL_OKAY;
-
-    case MSR_EFER:
-        *val = read_efer();
-        if ( is_pv_32bit_domain(currd) )
-            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
-        return X86EMUL_OKAY;
-
-    case MSR_K7_FID_VID_CTL:
-    case MSR_K7_FID_VID_STATUS:
-    case MSR_K8_PSTATE_LIMIT:
-    case MSR_K8_PSTATE_CTRL:
-    case MSR_K8_PSTATE_STATUS:
-    case MSR_K8_PSTATE0:
-    case MSR_K8_PSTATE1:
-    case MSR_K8_PSTATE2:
-    case MSR_K8_PSTATE3:
-    case MSR_K8_PSTATE4:
-    case MSR_K8_PSTATE5:
-    case MSR_K8_PSTATE6:
-    case MSR_K8_PSTATE7:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-            break;
-        if ( unlikely(is_cpufreq_controller(currd)) )
-            goto normal;
-        *val = 0;
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_UCODE_REV:
-        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
-        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-        {
-            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
-                break;
-            /* As documented in the SDM: Do a CPUID 1 here */
-            cpuid_eax(1);
-        }
-        goto normal;
-
-    case MSR_IA32_MISC_ENABLE:
-        if ( rdmsr_safe(reg, *val) )
-            break;
-        *val = guest_misc_enable(*val);
-        return X86EMUL_OKAY;
-
-    case MSR_AMD64_DR0_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-            break;
-        *val = curr->arch.pv_vcpu.dr_mask[0];
-        return X86EMUL_OKAY;
-
-    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-            break;
-        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_PERF_CAPABILITIES:
-        /* No extra capabilities are supported. */
-        *val = 0;
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_PLATFORM_INFO:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
-            break;
-        *val = 0;
-        if ( this_cpu(cpuid_faulting_enabled) )
-            *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_MISC_FEATURES_ENABLES:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
-            break;
-        *val = 0;
-        if ( curr->arch.cpuid_faulting )
-            *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
-        return X86EMUL_OKAY;
-
-    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-        {
-            vpmu_msr = true;
-            /* fall through */
-    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-            {
-                if ( vpmu_do_rdmsr(reg, val) )
-                    break;
-                return X86EMUL_OKAY;
-            }
-        }
-        /* fall through */
-    default:
-        if ( rdmsr_hypervisor_regs(reg, val) )
-            return X86EMUL_OKAY;
-
-        rc = vmce_rdmsr(reg, val);
-        if ( rc < 0 )
-            break;
-        if ( rc )
-            return X86EMUL_OKAY;
-        /* fall through */
-    normal:
-        /* Everyone can read the MSR space. */
-        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
-        if ( rdmsr_safe(reg, *val) )
-            break;
-        return X86EMUL_OKAY;
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
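For readers skimming the moved code: every arm of the switch in priv_op_read_msr() either fills *val and returns X86EMUL_OKAY, or breaks out to the X86EMUL_UNHANDLEABLE tail, which ultimately ends in a #GP for the guest (see do_general_protection() further down). A minimal, self-contained sketch of that shape; the stub enum, the MSR index and the function name are illustrative, not Xen's real definitions:

    #include <stdint.h>

    enum { X86EMUL_OKAY, X86EMUL_UNHANDLEABLE };

    #define MSR_EXAMPLE_FEATURE 0x0000008b   /* illustrative index only */

    /* Whitelist known MSRs; everything else is left unhandled. */
    static int demo_read_msr(unsigned int reg, uint64_t *val)
    {
        switch ( reg )
        {
        case MSR_EXAMPLE_FEATURE:
            *val = 0;                 /* hide the feature from the guest */
            return X86EMUL_OKAY;
        }

        /* Unknown MSR: the caller injects #GP. */
        return X86EMUL_UNHANDLEABLE;
    }

    int main(void)
    {
        uint64_t val;

        return demo_read_msr(MSR_EXAMPLE_FEATURE, &val); /* 0 == handled */
    }
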
-#include "x86_64/mmconfig.h"
-
-static int priv_op_write_msr(unsigned int reg, uint64_t val,
-                             struct x86_emulate_ctxt *ctxt)
-{
-    struct vcpu *curr = current;
-    const struct domain *currd = curr->domain;
-    bool vpmu_msr = false;
-
-    switch ( reg )
-    {
-        uint64_t temp;
-        int rc;
-
-    case MSR_FS_BASE:
-        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
-            break;
-        wrfsbase(val);
-        curr->arch.pv_vcpu.fs_base = val;
-        return X86EMUL_OKAY;
-
-    case MSR_GS_BASE:
-        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
-            break;
-        wrgsbase(val);
-        curr->arch.pv_vcpu.gs_base_kernel = val;
-        return X86EMUL_OKAY;
-
-    case MSR_SHADOW_GS_BASE:
-        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
-            break;
-        wrmsrl(MSR_SHADOW_GS_BASE, val);
-        curr->arch.pv_vcpu.gs_base_user = val;
-        return X86EMUL_OKAY;
-
-    case MSR_K7_FID_VID_STATUS:
-    case MSR_K7_FID_VID_CTL:
-    case MSR_K8_PSTATE_LIMIT:
-    case MSR_K8_PSTATE_CTRL:
-    case MSR_K8_PSTATE_STATUS:
-    case MSR_K8_PSTATE0:
-    case MSR_K8_PSTATE1:
-    case MSR_K8_PSTATE2:
-    case MSR_K8_PSTATE3:
-    case MSR_K8_PSTATE4:
-    case MSR_K8_PSTATE5:
-    case MSR_K8_PSTATE6:
-    case MSR_K8_PSTATE7:
-    case MSR_K8_HWCR:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-            break;
-        if ( likely(!is_cpufreq_controller(currd)) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_AMD64_NB_CFG:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
-            return X86EMUL_OKAY;
-        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
-             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
-            goto invalid;
-        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_FAM10H_MMIO_CONF_BASE:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
-            return X86EMUL_OKAY;
-        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
-            break;
-        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
-             temp != val :
-             ((temp ^ val) &
-              ~(FAM10H_MMIO_CONF_ENABLE |
-                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
-                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
-                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
-                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
-            goto invalid;
-        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_IA32_UCODE_REV:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
-            return X86EMUL_OKAY;
-        if ( rdmsr_safe(reg, temp) )
-            break;
-        if ( val )
-            goto invalid;
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_MISC_ENABLE:
-        if ( rdmsr_safe(reg, temp) )
-            break;
-        if ( val != guest_misc_enable(temp) )
-            goto invalid;
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_MPERF:
-    case MSR_IA32_APERF:
-        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
-             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
-            break;
-        if ( likely(!is_cpufreq_controller(currd)) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_IA32_PERF_CTL:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-        if ( likely(!is_cpufreq_controller(currd)) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_IA32_THERM_CONTROL:
-    case MSR_IA32_ENERGY_PERF_BIAS:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_AMD64_DR0_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
-            break;
-        curr->arch.pv_vcpu.dr_mask[0] = val;
-        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
-            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
-        return X86EMUL_OKAY;
-
-    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
-            break;
-        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
-        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
-            wrmsrl(reg, val);
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_PLATFORM_INFO:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
-            break;
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_MISC_FEATURES_ENABLES:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
-             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
-            break;
-        if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
-             !this_cpu(cpuid_faulting_enabled) )
-            break;
-        curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
-        return X86EMUL_OKAY;
-
-    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-        {
-            vpmu_msr = true;
-    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-            {
-                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                     !is_hardware_domain(currd) )
-                    return X86EMUL_OKAY;
-
-                if ( vpmu_do_wrmsr(reg, val, 0) )
-                    break;
-                return X86EMUL_OKAY;
-            }
-        }
-        /* fall through */
-    default:
-        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
-            return X86EMUL_OKAY;
-
-        rc = vmce_wrmsr(reg, val);
-        if ( rc < 0 )
-            break;
-        if ( rc )
-            return X86EMUL_OKAY;
-
-        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
-    invalid:
-            gdprintk(XENLOG_WARNING,
-                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
-                     reg, temp, val);
-        return X86EMUL_OKAY;
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
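The default arm above amounts to a "verify and discard" policy: a WRMSR that leaves the MSR at its current value is accepted silently, anything else is logged and dropped, and in neither case does the write reach hardware. A stripped-down sketch of that policy, where rdmsr_stub() and its fixed return value stand in for rdmsr_safe() and Xen's logging; all names here are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    enum { X86EMUL_OKAY };

    /* Stand-in for rdmsr_safe(); returns 0 on success. */
    static int rdmsr_stub(unsigned int reg, uint64_t *val)
    {
        (void)reg;
        *val = 0;
        return 0;
    }

    /* Accept writes that change nothing; warn about (and drop) the rest. */
    static int demo_write_msr(unsigned int reg, uint64_t val)
    {
        uint64_t cur;

        if ( rdmsr_stub(reg, &cur) != 0 || val != cur )
            fprintf(stderr, "dropped WRMSR %08x: %016llx -> %016llx\n",
                    reg, (unsigned long long)cur, (unsigned long long)val);

        return X86EMUL_OKAY;   /* never forwarded to real hardware */
    }

    int main(void)
    {
        return demo_write_msr(0x10, 42) == X86EMUL_OKAY ? 0 : 1;
    }
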
-static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
-{
-    /* Ignore the instruction if unprivileged. */
-    if ( !cache_flush_permitted(current->domain) )
-        /*
-         * Non-physdev domain attempted WBINVD; ignore for now since
-         * newer linux uses this in some start-of-day timing loops.
-         */
-        ;
-    else
-        wbinvd();
-
-    return X86EMUL_OKAY;
-}
-
-int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
-                  struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
-{
-    guest_cpuid(current, leaf, subleaf, res);
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_validate(const struct x86_emulate_state *state,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    switch ( ctxt->opcode )
-    {
-    case 0x6c ... 0x6f: /* ins / outs */
-    case 0xe4 ... 0xe7: /* in / out (immediate port) */
-    case 0xec ... 0xef: /* in / out (port in %dx) */
-    case X86EMUL_OPC(0x0f, 0x06): /* clts */
-    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
-    case X86EMUL_OPC(0x0f, 0x20) ...
-         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
-    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
-    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
-    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
-    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
-        return X86EMUL_OKAY;
-
-    case 0xfa: case 0xfb: /* cli / sti */
-        if ( !iopl_ok(current, ctxt->regs) )
-            break;
-        /*
-         * This is just too dangerous to allow, in my opinion. Consider if the
-         * caller then tries to reenable interrupts using POPF: we can't trap
-         * that and we'll end up with hard-to-debug lockups. Fast & loose will
-         * do for us. :-)
-        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
-         */
-        return X86EMUL_DONE;
-
-    case X86EMUL_OPC(0x0f, 0x01):
-    {
-        unsigned int modrm_rm, modrm_reg;
-
-        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
-             (modrm_rm & 7) != 1 )
-            break;
-        switch ( modrm_reg & 7 )
-        {
-        case 2: /* xsetbv */
-        case 7: /* rdtscp */
-            return X86EMUL_OKAY;
-        }
-        break;
-    }
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static const struct x86_emulate_ops priv_op_ops = {
-    .insn_fetch          = priv_op_insn_fetch,
-    .read                = x86emul_unhandleable_rw,
-    .validate            = priv_op_validate,
-    .read_io             = priv_op_read_io,
-    .write_io            = priv_op_write_io,
-    .rep_ins             = priv_op_rep_ins,
-    .rep_outs            = priv_op_rep_outs,
-    .read_segment        = priv_op_read_segment,
-    .read_cr             = priv_op_read_cr,
-    .write_cr            = priv_op_write_cr,
-    .read_dr             = priv_op_read_dr,
-    .write_dr            = priv_op_write_dr,
-    .read_msr            = priv_op_read_msr,
-    .write_msr           = priv_op_write_msr,
-    .cpuid               = pv_emul_cpuid,
-    .wbinvd              = priv_op_wbinvd,
-};
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
-{
-    struct vcpu *curr = current;
-    struct domain *currd = curr->domain;
-    struct priv_op_ctxt ctxt = {
-        .ctxt.regs = regs,
-        .ctxt.vendor = currd->arch.cpuid->x86_vendor,
-    };
-    int rc;
-    unsigned int eflags, ar;
-
-    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
-                          &ar, 1) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        return 0;
-
-    /* Mirror virtualized state into EFLAGS. */
-    ASSERT(regs->eflags & X86_EFLAGS_IF);
-    if ( vcpu_info(curr, evtchn_upcall_mask) )
-        regs->eflags &= ~X86_EFLAGS_IF;
-    else
-        regs->eflags |= X86_EFLAGS_IF;
-    ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
-    regs->eflags |= curr->arch.pv_vcpu.iopl;
-    eflags = regs->eflags;
-
-    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
-    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
-    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
-
-    if ( ctxt.io_emul_stub )
-        unmap_domain_page(ctxt.io_emul_stub);
-
-    /*
-     * Un-mirror virtualized state from EFLAGS.
-     * Nothing we allow to be emulated can change anything other than the
-     * arithmetic bits, and the resume flag.
-     */
-    ASSERT(!((regs->eflags ^ eflags) &
-             ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
-    regs->eflags |= X86_EFLAGS_IF;
-    regs->eflags &= ~X86_EFLAGS_IOPL;
-
-    switch ( rc )
-    {
-    case X86EMUL_OKAY:
-        if ( ctxt.tsc & TSC_BASE )
-        {
-            if ( ctxt.tsc & TSC_AUX )
-                pv_soft_rdtsc(curr, regs, 1);
-            else if ( currd->arch.vtsc )
-                pv_soft_rdtsc(curr, regs, 0);
-            else
-                msr_split(regs, rdtsc());
-        }
-
-        if ( ctxt.ctxt.retire.singlestep )
-            ctxt.bpmatch |= DR_STEP;
-        if ( ctxt.bpmatch )
-        {
-            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
-            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
-                pv_inject_guest_trap(TRAP_debug, regs);
-        }
-        /* fall through */
-    case X86EMUL_RETRY:
-        return EXCRET_fault_fixed;
-
-    case X86EMUL_EXCEPTION:
-        pv_inject_event(&ctxt.ctxt.event);
-        return EXCRET_fault_fixed;
-    }
-
-    return 0;
-}
-
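The EFLAGS handling around x86_emulate() in emulate_privileged_op() is the subtle part: the guest's virtual interrupt flag (the inverse of evtchn_upcall_mask) and its virtual IOPL are mirrored into the real EFLAGS image so the emulator sees the state the guest believes it runs with, then stripped back out afterwards. A toy illustration of just the mirror/un-mirror step; the flag values are the architectural ones, but struct demo_vcpu is made up:

    #include <assert.h>
    #include <stdbool.h>

    #define X86_EFLAGS_IF   0x00000200u
    #define X86_EFLAGS_IOPL 0x00003000u

    struct demo_vcpu {            /* illustrative stand-in, not Xen's layout */
        bool upcall_masked;       /* guest's virtual IF is the inverse */
        unsigned int iopl;        /* virtual IOPL, pre-shifted to bits 12-13 */
    };

    static void mirror_eflags(unsigned long *eflags, const struct demo_vcpu *v)
    {
        assert(*eflags & X86_EFLAGS_IF);    /* real IF is always set here */
        if ( v->upcall_masked )
            *eflags &= ~X86_EFLAGS_IF;      /* present the virtual IF */
        assert(!(*eflags & X86_EFLAGS_IOPL));
        *eflags |= v->iopl;                 /* present the virtual IOPL */
    }

    static void unmirror_eflags(unsigned long *eflags)
    {
        *eflags |= X86_EFLAGS_IF;           /* restore Xen's real values */
        *eflags &= ~X86_EFLAGS_IOPL;
    }

    int main(void)
    {
        struct demo_vcpu v = { .upcall_masked = true, .iopl = 0 };
        unsigned long eflags = X86_EFLAGS_IF;

        mirror_eflags(&eflags, &v);    /* emulator now sees the guest's view */
        unmirror_eflags(&eflags);
        return !(eflags & X86_EFLAGS_IF);
    }
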
-static inline int check_stack_limit(unsigned int ar, unsigned int limit,
-                                    unsigned int esp, unsigned int decr)
-{
-    return (((esp - decr) < (esp - 1)) &&
-            (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
-}
-
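check_stack_limit() is dense enough to deserve a gloss: the first comparison, (esp - decr) < (esp - 1), rejects pushes that would wrap below offset zero, and the ternary handles the two segment growth directions, expand-up (the highest byte written must stay at or below the limit) versus expand-down via _SEGMENT_EC (the lowest byte written must stay strictly above it). The same logic written out long-hand as a compilable sketch; SEG_EXPAND_DOWN is an illustrative bit, not necessarily Xen's _SEGMENT_EC value:

    #include <assert.h>
    #include <stdbool.h>

    #define SEG_EXPAND_DOWN 0x0400u   /* illustrative, cf. _SEGMENT_EC */

    /* Long-hand check_stack_limit(): can a push of 'decr' bytes, touching
     * offsets [esp - decr, esp - 1], proceed on this stack segment? */
    static bool stack_push_fits(unsigned int ar, unsigned int limit,
                                unsigned int esp, unsigned int decr)
    {
        /* Reject pushes that would wrap below offset zero. */
        if ( (esp - decr) >= (esp - 1) )
            return false;

        if ( ar & SEG_EXPAND_DOWN )
            /* Expand-down: valid offsets lie strictly above the limit. */
            return (esp - decr) > limit;

        /* Expand-up: the highest byte written must not exceed the limit. */
        return (esp - 1) <= limit;
    }

    int main(void)
    {
        /* Expand-up segment, limit 0xfff: an 8-byte push at esp 0x10 fits... */
        assert(stack_push_fits(0, 0xfff, 0x10, 8));
        /* ...but wraps, and is refused, if esp is only 4. */
        assert(!stack_push_fits(0, 0xfff, 4, 8));
        return 0;
    }
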
-struct gate_op_ctxt {
-    struct x86_emulate_ctxt ctxt;
-    struct {
-        unsigned long base, limit;
-    } cs;
-    bool insn_fetch;
-};
-
-static int gate_op_read(
-    enum x86_segment seg,
-    unsigned long offset,
-    void *p_data,
-    unsigned int bytes,
-    struct x86_emulate_ctxt *ctxt)
-{
-    const struct gate_op_ctxt *goc =
-        container_of(ctxt, struct gate_op_ctxt, ctxt);
-    unsigned int rc = bytes, sel = 0;
-    unsigned long addr = offset, limit = 0;
-
-    switch ( seg )
-    {
-    case x86_seg_cs:
-        addr += goc->cs.base;
-        limit = goc->cs.limit;
-        break;
-    case x86_seg_ds:
-        sel = read_sreg(ds);
-        break;
-    case x86_seg_es:
-        sel = read_sreg(es);
-        break;
-    case x86_seg_fs:
-        sel = read_sreg(fs);
-        break;
-    case x86_seg_gs:
-        sel = read_sreg(gs);
-        break;
-    case x86_seg_ss:
-        sel = ctxt->regs->ss;
-        break;
-    default:
-        return X86EMUL_UNHANDLEABLE;
-    }
-    if ( sel )
-    {
-        unsigned int ar;
-
-        ASSERT(!goc->insn_fetch);
-        if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
-             !(ar & _SEGMENT_S) ||
-             !(ar & _SEGMENT_P) ||
-             ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
-            return X86EMUL_UNHANDLEABLE;
-        addr += offset;
-    }
-    else if ( seg != x86_seg_cs )
-        return X86EMUL_UNHANDLEABLE;
-
-    /* We don't mean to emulate any branches. */
-    if ( limit < bytes - 1 || offset > limit - bytes + 1 )
-        return X86EMUL_UNHANDLEABLE;
-
-    addr = (uint32_t)addr;
-
-    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
-    {
-        /*
-         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
-         * cpu_has_nx, but we'd then need a "fetch" variant of
-         * __copy_from_user() respecting NX, SMEP, and protection keys.
-         */
-        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    return X86EMUL_OKAY;
-}
-
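One detail worth noting in gate_op_read(): the bounds check is written as limit < bytes - 1 || offset > limit - bytes + 1 rather than the naive offset + bytes - 1 > limit, so that neither subtraction can wrap once the first clause has ruled out limit < bytes - 1 (bytes being a small non-zero access size). A short standalone check of the equivalence against 64-bit arithmetic:

    #include <assert.h>
    #include <stdint.h>

    /* Wrap-free form of "does [offset, offset + bytes - 1] exceed limit?"
     * as used in gate_op_read(); bytes is a small non-zero access size. */
    static int exceeds_limit(uint32_t offset, uint32_t bytes, uint32_t limit)
    {
        return limit < bytes - 1 || offset > limit - bytes + 1;
    }

    int main(void)
    {
        /* Compare against 64-bit arithmetic, where nothing can wrap. */
        static const uint32_t cases[][3] = {   /* { offset, bytes, limit } */
            { 0, 4, 3 }, { 0, 4, 2 }, { 0xfffffffc, 4, 0xffffffff },
            { 0xfffffffd, 4, 0xffffffff }, { 100, 2, 100 },
        };

        for ( unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++ )
        {
            uint64_t off = cases[i][0], b = cases[i][1], lim = cases[i][2];

            assert(exceeds_limit(cases[i][0], cases[i][1], cases[i][2]) ==
                   (off + b - 1 > lim));
        }

        return 0;
    }
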
-static void emulate_gate_op(struct cpu_user_regs *regs)
-{
-    struct vcpu *v = current;
-    unsigned int sel, ar, dpl, nparm, insn_len;
-    struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
-    struct x86_emulate_state *state;
-    unsigned long off, base, limit;
-    uint16_t opnd_sel = 0;
-    int jump = -1, rc = X86EMUL_OKAY;
-
-    /* Check whether this fault is due to the use of a call gate. */
-    if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
-         (((ar >> 13) & 3) < (regs->cs & 3)) ||
-         ((ar & _SEGMENT_TYPE) != 0xc00) )
-    {
-        pv_inject_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
-    if ( !(ar & _SEGMENT_P) )
-    {
-        pv_inject_guest_trap(TRAP_no_segment, regs);
-        return;
-    }
-    dpl = (ar >> 13) & 3;
-    nparm = ar & 0x1f;
-
-    /*
-     * Decode instruction (and perhaps operand) to determine RPL,
-     * whether this is a jump or a call, and the call return offset.
-     */
-    if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
-                          &ar, 0) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-    {
-        pv_inject_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
-
-    ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
-    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
-    state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
-    ctxt.insn_fetch = false;
-    if ( IS_ERR_OR_NULL(state) )
-    {
-        if ( PTR_ERR(state) == -X86EMUL_EXCEPTION )
-        {
-            ASSERT(ctxt.ctxt.event_pending);
-            pv_inject_event(&ctxt.ctxt.event);
-        }
-        else
-        {
-            ASSERT(!ctxt.ctxt.event_pending);
-            pv_inject_guest_trap(TRAP_gp_fault, regs);
-        }
-        return;
-    }
-
-    switch ( ctxt.ctxt.opcode )
-    {
-        unsigned int modrm_345;
-
-    case 0xea:
-        ++jump;
-        /* fall through */
-    case 0x9a:
-        ++jump;
-        opnd_sel = x86_insn_immediate(state, 1);
-        break;
-    case 0xff:
-        if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
-            break;
-        switch ( modrm_345 & 7 )
-        {
-            enum x86_segment seg;
-
-        case 5:
-            ++jump;
-            /* fall through */
-        case 3:
-            ++jump;
-            base = x86_insn_operand_ea(state, &seg);
-            rc = gate_op_read(seg,
-                              base + (x86_insn_opsize(state) >> 3),
-                              &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
-            break;
-        }
-        break;
-    }
-
-    insn_len = x86_insn_length(state, &ctxt.ctxt);
-    x86_emulate_free_state(state);
-
-    if ( rc == X86EMUL_EXCEPTION )
-    {
-        ASSERT(ctxt.ctxt.event_pending);
-        pv_inject_event(&ctxt.ctxt.event);
-        return;
-    }
-
-    ASSERT(!ctxt.ctxt.event_pending);
-
-    if ( rc != X86EMUL_OKAY ||
-         jump < 0 ||
-         (opnd_sel & ~3) != regs->error_code ||
-         dpl < (opnd_sel & 3) )
-    {
-        pv_inject_guest_trap(TRAP_gp_fault, regs);
-        return;
-    }
-
-    if ( !read_descriptor(sel, v, &base, &limit, &ar, 0) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_CODE) ||
-         (!jump || (ar & _SEGMENT_EC) ?
-          ((ar >> 13) & 3) > (regs->cs & 3) :
-          ((ar >> 13) & 3) != (regs->cs & 3)) )
-    {
-        pv_inject_hw_exception(TRAP_gp_fault, sel);
-        return;
-    }
-    if ( !(ar & _SEGMENT_P) )
-    {
-        pv_inject_hw_exception(TRAP_no_segment, sel);
-        return;
-    }
-    if ( off > limit )
-    {
-        pv_inject_hw_exception(TRAP_gp_fault, 0);
-        return;
-    }
-
-    if ( !jump )
-    {
-        unsigned int ss, esp, *stkp;
-        int rc;
-#define push(item) do \
-        { \
-            --stkp; \
-            esp -= 4; \
-            rc = __put_user(item, stkp); \
-            if ( rc ) \
-            { \
-                pv_inject_page_fault(PFEC_write_access, \
-                                     (unsigned long)(stkp + 1) - rc); \
-                return; \
-            } \
-        } while ( 0 )
-
-        if ( ((ar >> 13) & 3) < (regs->cs & 3) )
-        {
-            sel |= (ar >> 13) & 3;
-            /* Inner stack known only for kernel ring. */
-            if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
-            {
-                pv_inject_guest_trap(TRAP_gp_fault, regs);
-                return;
-            }
-            esp = v->arch.pv_vcpu.kernel_sp;
-            ss = v->arch.pv_vcpu.kernel_ss;
-            if ( (ss & 3) != (sel & 3) ||
-                 !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
-                 ((ar >> 13) & 3) != (sel & 3) ||
-                 !(ar & _SEGMENT_S) ||
-                 (ar & _SEGMENT_CODE) ||
-                 !(ar & _SEGMENT_WR) )
-            {
-                pv_inject_hw_exception(TRAP_invalid_tss, ss & ~3);
-                return;
-            }
-            if ( !(ar & _SEGMENT_P) ||
-                 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
-            {
-                pv_inject_hw_exception(TRAP_stack_error, ss & ~3);
-                return;
-            }
-            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
-            if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
-            {
-                pv_inject_guest_trap(TRAP_gp_fault, regs);
-                return;
-            }
-            push(regs->ss);
-            push(regs->rsp);
-            if ( nparm )
-            {
-                const unsigned int *ustkp;
-
-                if ( !read_descriptor(regs->ss, v, &base, &limit, &ar, 0) ||
-                     ((ar >> 13) & 3) != (regs->cs & 3) ||
-                     !(ar & _SEGMENT_S) ||
-                     (ar & _SEGMENT_CODE) ||
-                     !(ar & _SEGMENT_WR) ||
-                     !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
-                    return pv_inject_guest_trap(TRAP_gp_fault, regs);
-                ustkp = (unsigned int *)(unsigned long)
-                        ((unsigned int)base + regs->esp + nparm * 4);
-                if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
-                {
-                    pv_inject_guest_trap(TRAP_gp_fault, regs);
-                    return;
-                }
-                do
-                {
-                    unsigned int parm;
-
-                    --ustkp;
-                    rc = __get_user(parm, ustkp);
-                    if ( rc )
-                    {
-                        pv_inject_page_fault(0,
-                                             (unsigned long)(ustkp + 1) - rc);
-                        return;
-                    }
-                    push(parm);
-                } while ( --nparm );
-            }
-        }
-        else
-        {
-            sel |= (regs->cs & 3);
-            esp = regs->rsp;
-            ss = regs->ss;
-            if ( !read_descriptor(ss, v, &base, &limit, &ar, 0) ||
-                 ((ar >> 13) & 3) != (sel & 3) )
-            {
-                pv_inject_guest_trap(TRAP_gp_fault, regs);
-                return;
-            }
-            if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
-            {
-                pv_inject_hw_exception(TRAP_stack_error, 0);
-                return;
-            }
-            stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
-            if ( !compat_access_ok(stkp - 2, 2 * 4) )
-            {
-                pv_inject_guest_trap(TRAP_gp_fault, regs);
-                return;
-            }
-        }
-        push(regs->cs);
-        push(regs->rip + insn_len);
-#undef push
-        regs->rsp = esp;
-        regs->ss = ss;
-    }
-    else
-        sel |= (regs->cs & 3);
-
-    regs->cs = sel;
-    instruction_done(regs, off);
-}
-
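emulate_gate_op() keys almost everything off the access-rights word returned by read_gate_descriptor()/read_descriptor(): the type nibble (0xc00 above being a 32-bit call gate), the S bit, DPL in bits 13-14 (hence the recurring (ar >> 13) & 3), and the present bit. A compact decoder that makes those extractions explicit; the SEG_* names are local stand-ins for Xen's _SEGMENT_* constants, using what I believe are the same bit positions:

    #include <stdio.h>

    /* x86 descriptor attribute bits, shifted up by 8 as in the code above. */
    #define SEG_TYPE (15u << 8)   /* descriptor type nibble */
    #define SEG_S    (1u << 12)   /* 1 = code/data, 0 = system */
    #define SEG_P    (1u << 15)   /* present */

    static void decode_ar(unsigned int ar)
    {
        printf("type %#x, %s, dpl %u, %spresent\n",
               (ar & SEG_TYPE) >> 8,
               (ar & SEG_S) ? "code/data" : "system",
               (ar >> 13) & 3,              /* DPL, as in the code above */
               (ar & SEG_P) ? "" : "not ");
    }

    int main(void)
    {
        decode_ar(0x8c00);   /* present 32-bit call gate (type 0xc), dpl 0 */
        return 0;
    }
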
-void do_general_protection(struct cpu_user_regs *regs)
-{
-    struct vcpu *v = current;
-    unsigned long fixup;
-
-    if ( debugger_trap_entry(TRAP_gp_fault, regs) )
-        return;
-
-    if ( regs->error_code & X86_XEC_EXT )
-        goto hardware_gp;
-
-    if ( !guest_mode(regs) )
-        goto gp_in_kernel;
-
-    /*
-     * Cunning trick to allow arbitrary "INT n" handling.
-     * 
-     * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
-     * instruction from trapping to the appropriate vector, when that might not
-     * be expected by Xen or the guest OS. For example, that entry might be for
-     * a fault handler (unlike traps, faults don't increment EIP), or might
-     * expect an error code on the stack (which a software trap never
-     * provides), or might be a hardware interrupt handler that doesn't like
-     * being called spuriously.
-     * 
-     * Instead, a GPF occurs with the faulting IDT vector in the error code.
-     * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is 
-     * clear (which got already checked above) to indicate that it's a software
-     * fault, not a hardware one.
-     * 
-     * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
-     * okay because they can only be triggered by an explicit DPL-checked
-     * instruction. The DPL specified by the guest OS for these vectors is NOT
-     * CHECKED!!
-     */
-    if ( regs->error_code & X86_XEC_IDT )
-    {
-        /* This fault must be due to <INT n> instruction. */
-        const struct trap_info *ti;
-        unsigned char vector = regs->error_code >> 3;
-        ti = &v->arch.pv_vcpu.trap_ctxt[vector];
-        if ( permit_softint(TI_GET_DPL(ti), v, regs) )
-        {
-            regs->rip += 2;
-            pv_inject_guest_trap(vector, regs);
-            return;
-        }
-    }
-    else if ( is_pv_32bit_vcpu(v) && regs->error_code )
-    {
-        emulate_gate_op(regs);
-        return;
-    }
-
-    /* Emulate some simple privileged and I/O instructions. */
-    if ( (regs->error_code == 0) &&
-         emulate_privileged_op(regs) )
-    {
-        trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
-        return;
-    }
-
-    /* Pass on GPF as is. */
-    pv_inject_guest_trap(TRAP_gp_fault, regs);
-    return;
-
- gp_in_kernel:
-
-    if ( likely((fixup = search_exception_table(regs)) != 0) )
-    {
-        dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
-                regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
-        this_cpu(last_extable_addr) = regs->rip;
-        regs->rip = fixup;
-        return;
-    }
-
- hardware_gp:
-    if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
-        return;
-
-    show_execution_state(regs);
-    panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
-}
-
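All of the routing in do_general_protection() is driven by the #GP error code layout spelled out in the comment above: bit 0 (X86_XEC_EXT) marks a hardware-originated event, bit 1 (X86_XEC_IDT) marks a fault raised through an IDT descriptor, in which case the vector sits at bits 3 and up; any other non-zero code names a selector. A small standalone decoder of that layout (XEC_* values as used above):

    #include <stdio.h>

    #define XEC_EXT 0x1u   /* external (hardware) event */
    #define XEC_IDT 0x2u   /* error code references the IDT */

    static void decode_gp_error_code(unsigned int ec)
    {
        if ( ec & XEC_EXT )
            printf("hardware-originated #GP\n");
        else if ( ec & XEC_IDT )
            printf("software INT to IDT vector %u\n", ec >> 3);
        else if ( ec )
            printf("selector fault, selector %#x\n", ec & ~3u);
        else
            printf("no selector involved (e.g. privileged instruction)\n");
    }

    int main(void)
    {
        decode_gp_error_code((0x80 << 3) | XEC_IDT);   /* guest INT 0x80 */
        return 0;
    }
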
-static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
-
-static void nmi_mce_softirq(void)
-{
-    int cpu = smp_processor_id();
-    struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
-
-    BUG_ON(st->vcpu == NULL);
-
-    /* Set the tmp value unconditionally, so that
-     * the check in the iret hypercall works. */
-    cpumask_copy(st->vcpu->cpu_hard_affinity_tmp,
-                 st->vcpu->cpu_hard_affinity);
-
-    if ((cpu != st->processor)
-       || (st->processor != st->vcpu->processor))
-    {
-        /* We are on a different physical cpu.
-         * Make sure to wakeup the vcpu on the
-         * specified processor.
-         */
-        vcpu_set_hard_affinity(st->vcpu, cpumask_of(st->processor));
-
-        /* Affinity is restored in the iret hypercall. */
-    }
-
-    /* Only used to defer wakeup of domain/vcpu to
-     * a safe (non-NMI/MCE) context.
-     */
-    vcpu_kick(st->vcpu);
-    st->vcpu = NULL;
-}
-
-static void pci_serr_softirq(void)
-{
-    printk("\n\nNMI - PCI system error (SERR)\n");
-    outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
-}
-
-void async_exception_cleanup(struct vcpu *curr)
-{
-    int trap;
-
-    if ( !curr->async_exception_mask )
-        return;
-
-    /* Restore affinity.  */
-    if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
-         !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
-    {
-        vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
-        cpumask_clear(curr->cpu_hard_affinity_tmp);
-    }
-
-    if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
-        trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
-    else
-        for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
-            if ( (curr->async_exception_mask ^
-                  curr->async_exception_state(trap).old_mask) == (1 << trap) )
-                break;
-    if ( unlikely(trap > VCPU_TRAP_LAST) )
-    {
-        ASSERT_UNREACHABLE();
-        return;
-    }
-
-    /* Restore previous asynchronous exception mask. */
-    curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
-}
-
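The mask & (mask - 1) test in async_exception_cleanup() is the standard "at most one bit set" trick: when it evaluates to zero, a single bit scan names the pending trap, and only the multi-bit case needs the comparison loop. The fast path in isolation, with the GCC-style builtin standing in for Xen's __scanbit():

    #include <assert.h>

    /* Fast path of async_exception_cleanup(): with at most one bit set in
     * the mask, a single bit scan identifies the pending trap. */
    static int single_pending_trap(unsigned int mask)
    {
        assert(mask && !(mask & (mask - 1)));   /* exactly one bit set */
        return __builtin_ctz(mask);             /* cf. Xen's __scanbit() */
    }

    int main(void)
    {
        assert(single_pending_trap(1u << 2) == 2);
        return 0;
    }
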
-static void nmi_hwdom_report(unsigned int reason_idx)
-{
-    struct domain *d = hardware_domain;
-
-    if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
-        return;
-
-    set_bit(reason_idx, nmi_reason(d));
-
-    send_guest_trap(d, 0, TRAP_nmi);
-}
+    send_guest_trap(d, 0, TRAP_nmi);
+}
 
 static void pci_serr_error(const struct cpu_user_regs *regs)
 {
@@ -3909,34 +1901,6 @@  long register_guest_nmi_callback(unsigned long address)
     return 0;
 }
 
-long unregister_guest_nmi_callback(void)
-{
-    struct vcpu *v = current;
-    struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi];
-
-    memset(t, 0, sizeof(*t));
-
-    return 0;
-}
-
-int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
-{
-    struct vcpu *v;
-    struct trap_info *t;
-
-    BUG_ON(d == NULL);
-    BUG_ON(vcpuid >= d->max_vcpus);
-
-    /* Sanity check - XXX should be more fine grained. */
-    BUG_ON(trap_nr >= NR_VECTORS);
-
-    v = d->vcpu[vcpuid];
-    t = &v->arch.pv_vcpu.trap_ctxt[trap_nr];
-
-    return (t->address != 0);
-}
-
-
 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
 {
     struct vcpu *v;
@@ -3984,56 +1948,6 @@  int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
     return -EIO;
 }
 
-
-long do_set_trap_table(XEN_GUEST_HANDLE_PARAM(const_trap_info_t) traps)
-{
-    struct trap_info cur;
-    struct vcpu *curr = current;
-    struct trap_info *dst = curr->arch.pv_vcpu.trap_ctxt;
-    long rc = 0;
-
-    /* If no table is presented then clear the entire virtual IDT. */
-    if ( guest_handle_is_null(traps) )
-    {
-        memset(dst, 0, NR_VECTORS * sizeof(*dst));
-        init_int80_direct_trap(curr);
-        return 0;
-    }
-
-    for ( ; ; )
-    {
-        if ( copy_from_guest(&cur, traps, 1) )
-        {
-            rc = -EFAULT;
-            break;
-        }
-
-        if ( cur.address == 0 )
-            break;
-
-        if ( !is_canonical_address(cur.address) )
-            return -EINVAL;
-
-        fixup_guest_code_selector(curr->domain, cur.cs);
-
-        memcpy(&dst[cur.vector], &cur, sizeof(cur));
-
-        if ( cur.vector == 0x80 )
-            init_int80_direct_trap(curr);
-
-        guest_handle_add_offset(traps, 1);
-
-        if ( hypercall_preempt_check() )
-        {
-            rc = hypercall_create_continuation(
-                __HYPERVISOR_set_trap_table, "h", traps);
-            break;
-        }
-    }
-
-    return rc;
-}
-
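do_set_trap_table() also shows Xen's stock preemptible-hypercall pattern: consume one guest-supplied entry at a time, advance the guest handle, and when the preemption check fires, package the remaining work as a continuation so the hypercall restarts transparently where it stopped. The skeleton of that loop, with the Xen plumbing replaced by illustrative stubs (copy_one(), preempt_check(), make_continuation() and struct item are all made up):

    #include <stdbool.h>

    struct item { unsigned long address; };   /* illustrative payload */

    /* Stubs for copy_from_guest(), hypercall_preempt_check() and
     * hypercall_create_continuation(). */
    static int copy_one(struct item *dst, const struct item *src)
    {
        *dst = *src;
        return 0;
    }

    static bool preempt_check(void)
    {
        return false;
    }

    static long make_continuation(const struct item *resume_at)
    {
        (void)resume_at;          /* would re-queue the hypercall here */
        return 0;
    }

    static void process(const struct item *it)
    {
        (void)it;
    }

    /* Skeleton of the loop in do_set_trap_table(). */
    long preemptible_loop(const struct item *guest_table)
    {
        long rc = 0;

        for ( ; ; )
        {
            struct item cur;

            if ( copy_one(&cur, guest_table) )
            {
                rc = -14;                  /* -EFAULT */
                break;
            }

            if ( cur.address == 0 )        /* terminator entry */
                break;

            process(&cur);
            ++guest_table;                 /* cf. guest_handle_add_offset() */

            if ( preempt_check() )
            {
                rc = make_continuation(guest_table);
                break;
            }
        }

        return rc;
    }

    int main(void)
    {
        struct item table[] = { { 0xdeadbeef }, { 0 } };

        return preemptible_loop(table) != 0;
    }
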
 void activate_debugregs(const struct vcpu *curr)
 {
     ASSERT(curr == current);
@@ -4157,31 +2071,6 @@  long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
     return 0;
 }
 
-long do_set_debugreg(int reg, unsigned long value)
-{
-    return set_debugreg(current, reg, value);
-}
-
-unsigned long do_get_debugreg(int reg)
-{
-    struct vcpu *curr = current;
-
-    switch ( reg )
-    {
-    case 0 ... 3:
-    case 6:
-        return curr->arch.debugreg[reg];
-    case 7:
-        return (curr->arch.debugreg[7] |
-                curr->arch.debugreg[5]);
-    case 4 ... 5:
-        return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
-                curr->arch.debugreg[reg + 2] : 0);
-    }
-
-    return -EINVAL;
-}
-
 void asm_domain_crash_synchronous(unsigned long addr)
 {
     /*
diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
index e3884d8406..a8656c3574 100644
--- a/xen/include/asm-x86/traps.h
+++ b/xen/include/asm-x86/traps.h
@@ -51,4 +51,23 @@  uint32_t guest_io_read(unsigned int port, unsigned int bytes,
 void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
                     struct domain *);
 
+int emulate_privileged_op(struct cpu_user_regs *regs);
+int emulate_invalid_rdtscp(struct cpu_user_regs *regs);
+int emulate_forced_invalid_op(struct cpu_user_regs *regs);
+void emulate_gate_op(struct cpu_user_regs *regs);
+
+static inline const char *trapstr(unsigned int trapnr)
+{
+    static const char * const strings[] = {
+        "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
+        "invalid opcode", "device not available", "double fault",
+        "coprocessor segment", "invalid tss", "segment not found",
+        "stack error", "general protection fault", "page fault",
+        "spurious interrupt", "coprocessor error", "alignment check",
+        "machine check", "simd error", "virtualisation exception"
+    };
+
+    return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
+}
+
 #endif /* ASM_TRAP_H */
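
As a sanity check on the new helper, here is how trapstr() behaves at a call site, reduced to a standalone program with printf() in place of a Xen printk() and the string table truncated:

    #include <stdio.h>

    /* trapstr() as added above, truncated to the first few entries. */
    static const char *trapstr_demo(unsigned int trapnr)
    {
        static const char *const strings[] = {
            "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
        };

        return trapnr < sizeof(strings) / sizeof(strings[0])
               ? strings[trapnr] : "???";
    }

    int main(void)
    {
        printf("trap 3 is \"%s\"\n", trapstr_demo(3));    /* "bkpt" */
        printf("trap 99 is \"%s\"\n", trapstr_demo(99));  /* "???" */
        return 0;
    }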