[09/17] SVM: use generic instruction decoding

Message ID 57D1806F020000780010D18F@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich Sept. 8, 2016, 1:14 p.m. UTC
... instead of custom handling. To facilitate this, break out the init code
from _hvm_emulate_one() into the new hvm_emulate_init(), and make
hvmemul_insn_fetch() globally available.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -835,7 +835,7 @@ static int hvmemul_read(
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
-static int hvmemul_insn_fetch(
+int hvmemul_insn_fetch(
     enum x86_segment seg,
     unsigned long offset,
     void *p_data,
@@ -1765,15 +1765,14 @@ static const struct x86_emulate_ops hvm_
     .vmfunc        = hvmemul_vmfunc,
 };
 
-static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
-    const struct x86_emulate_ops *ops)
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes)
 {
-    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
-    uint32_t new_intr_shadow, pfec = PFEC_page_present;
-    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    unsigned int pfec = PFEC_page_present;
     unsigned long addr;
-    int rc;
 
     if ( hvm_long_mode_enabled(curr) &&
          hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
@@ -1791,14 +1790,14 @@ static int _hvm_emulate_one(struct hvm_e
     if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
         pfec |= PFEC_user_mode;
 
-    hvmemul_ctxt->insn_buf_eip = regs->eip;
-    if ( !vio->mmio_insn_bytes )
+    hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->eip;
+    if ( !insn_bytes )
     {
         hvmemul_ctxt->insn_buf_bytes =
             hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
             (hvm_virtual_to_linear_addr(x86_seg_cs,
                                         &hvmemul_ctxt->seg_reg[x86_seg_cs],
-                                        regs->eip,
+                                        hvmemul_ctxt->insn_buf_eip,
                                         sizeof(hvmemul_ctxt->insn_buf),
                                         hvm_access_insn_fetch,
                                         hvmemul_ctxt->ctxt.addr_size,
@@ -1810,11 +1809,24 @@ static int _hvm_emulate_one(struct hvm_e
     }
     else
     {
-        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
-        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+        hvmemul_ctxt->insn_buf_bytes = insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
     }
 
     hvmemul_ctxt->exn_pending = 0;
+}
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
+    struct vcpu *curr = current;
+    uint32_t new_intr_shadow;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    int rc;
+
+    hvm_emulate_init(hvmemul_ctxt, vio->mmio_insn, vio->mmio_insn_bytes);
+
     vio->mmio_retry = 0;
 
     if ( cpu_has_vmx )
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -15,7 +15,7 @@
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <xen/config.h>
+#include <xen/err.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -26,41 +26,6 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 
-static unsigned int is_prefix(u8 opc)
-{
-    switch ( opc )
-    {
-    case 0x66:
-    case 0x67:
-    case 0x2E:
-    case 0x3E:
-    case 0x26:
-    case 0x64:
-    case 0x65:
-    case 0x36:
-    case 0xF0:
-    case 0xF3:
-    case 0xF2:
-    case 0x40 ... 0x4f:
-        return 1;
-    }
-    return 0;
-}
-
-static unsigned long svm_rip2pointer(struct vcpu *v, unsigned long *limit)
-{
-    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned long p = vmcb->cs.base + vmcb->rip;
-
-    if ( !(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) )
-    {
-        *limit = vmcb->cs.limit;
-        return (u32)p; /* mask to 32 bits */
-    }
-    *limit = ~0UL;
-    return p;
-}
-
 static unsigned long svm_nextrip_insn_length(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le
     return vmcb->nextrip - vmcb->rip;
 }
 
-/* First byte: Length. Following bytes: Opcode bytes. */
-#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ }
-MAKE_INSTR(INVD,   2, 0x0f, 0x08);
-MAKE_INSTR(WBINVD, 2, 0x0f, 0x09);
-MAKE_INSTR(CPUID,  2, 0x0f, 0xa2);
-MAKE_INSTR(RDMSR,  2, 0x0f, 0x32);
-MAKE_INSTR(WRMSR,  2, 0x0f, 0x30);
-MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
-MAKE_INSTR(HLT,    1, 0xf4);
-MAKE_INSTR(INT3,   1, 0xcc);
-MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
-MAKE_INSTR(PAUSE,  1, 0x90);
-MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
-MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
-MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
-MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
-MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
-MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
-MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf);
-
-static const u8 *const opc_bytes[INSTR_MAX_COUNT] =
-{
-    [INSTR_INVD]   = OPCODE_INVD,
-    [INSTR_WBINVD] = OPCODE_WBINVD,
-    [INSTR_CPUID]  = OPCODE_CPUID,
-    [INSTR_RDMSR]  = OPCODE_RDMSR,
-    [INSTR_WRMSR]  = OPCODE_WRMSR,
-    [INSTR_VMCALL] = OPCODE_VMCALL,
-    [INSTR_HLT]    = OPCODE_HLT,
-    [INSTR_INT3]   = OPCODE_INT3,
-    [INSTR_RDTSC]  = OPCODE_RDTSC,
-    [INSTR_PAUSE]  = OPCODE_PAUSE,
-    [INSTR_XSETBV] = OPCODE_XSETBV,
-    [INSTR_VMRUN]  = OPCODE_VMRUN,
-    [INSTR_VMLOAD] = OPCODE_VMLOAD,
-    [INSTR_VMSAVE] = OPCODE_VMSAVE,
-    [INSTR_STGI]   = OPCODE_STGI,
-    [INSTR_CLGI]   = OPCODE_CLGI,
-    [INSTR_INVLPGA] = OPCODE_INVLPGA,
+static const struct {
+    unsigned int opcode;
+    struct {
+        unsigned int rm:3;
+        unsigned int reg:3;
+        unsigned int mod:2;
+#define MODRM(mod, reg, rm) { rm, reg, mod }
+    } modrm;
+} const opc_tab[INSTR_MAX_COUNT] = {
+    [INSTR_PAUSE]  = { X86EMUL_OPC_F3(0, 0x90) },
+    [INSTR_INT3]   = { X86EMUL_OPC(   0, 0xcc) },
+    [INSTR_HLT]    = { X86EMUL_OPC(   0, 0xf4) },
+    [INSTR_XSETBV] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) },
+    [INSTR_VMRUN]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) },
+    [INSTR_VMCALL] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) },
+    [INSTR_VMLOAD] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) },
+    [INSTR_VMSAVE] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) },
+    [INSTR_STGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) },
+    [INSTR_CLGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) },
+    [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) },
+    [INSTR_INVD]   = { X86EMUL_OPC(0x0f, 0x08) },
+    [INSTR_WBINVD] = { X86EMUL_OPC(0x0f, 0x09) },
+    [INSTR_WRMSR]  = { X86EMUL_OPC(0x0f, 0x30) },
+    [INSTR_RDTSC]  = { X86EMUL_OPC(0x0f, 0x31) },
+    [INSTR_RDMSR]  = { X86EMUL_OPC(0x0f, 0x32) },
+    [INSTR_CPUID]  = { X86EMUL_OPC(0x0f, 0xa2) },
 };
 
-static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf,
-                    unsigned long addr, unsigned int len)
-{
-    uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0;
-
-    switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) )
-    {
-    case HVMCOPY_okay:
-        break;
-    case HVMCOPY_bad_gva_to_gfn:
-        /* OK just to give up; we'll have injected #PF already */
-        return 0;
-    default:
-        /* Not OK: fetches from non-RAM pages are not supportable. */
-        gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n",
-                 vmcb->rip, addr);
-        hvm_inject_hw_exception(TRAP_gp_fault, 0);
-        return 0;
-    }
-    return 1;
-}
-
 int __get_instruction_length_from_list(struct vcpu *v,
         const enum instruction_index *list, unsigned int list_count)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int i, j, inst_len = 0;
-    enum instruction_index instr = 0;
-    u8 buf[MAX_INST_LEN];
-    const u8 *opcode = NULL;
-    unsigned long fetch_addr, fetch_limit;
-    unsigned int fetch_len, max_len;
+    struct hvm_emulate_ctxt ctxt;
+    struct x86_emulate_state *state;
+    unsigned int inst_len, j, modrm_rm, modrm_reg;
+    int modrm_mod;
 
+#ifdef NDEBUG
     if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
         return inst_len;
 
     if ( vmcb->exitcode == VMEXIT_IOIO )
         return vmcb->exitinfo2 - vmcb->rip;
+#endif
 
-    /* Fetch up to the next page break; we'll fetch from the next page
-     * later if we have to. */
-    fetch_addr = svm_rip2pointer(v, &fetch_limit);
-    if ( vmcb->rip > fetch_limit )
-        return 0;
-    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
-    fetch_len = min_t(unsigned int, max_len,
-                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
-    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
+    ASSERT(v == current);
+    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
+    hvm_emulate_init(&ctxt, NULL, 0);
+    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
+    if ( IS_ERR_OR_NULL(state) )
         return 0;
 
-    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
-    {
-        inst_len++;
-        if ( inst_len >= fetch_len )
-        {
-            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                        max_len - fetch_len) )
-                return 0;
-            fetch_len = max_len;
-        }
+    inst_len = x86_insn_length(state, &ctxt.ctxt);
+    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
+    x86_emulate_free_state(state);
+#ifndef NDEBUG
+    if ( vmcb->exitcode == VMEXIT_IOIO )
+        j = vmcb->exitinfo2 - vmcb->rip;
+    else
+        j = svm_nextrip_insn_length(v);
+    if ( j && j != inst_len )
+    {
+        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
+                ctxt.ctxt.opcode, inst_len, j);
+        return j;
     }
+#endif
 
     for ( j = 0; j < list_count; j++ )
     {
-        instr = list[j];
-        opcode = opc_bytes[instr];
+        enum instruction_index instr = list[j];
 
-        for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ )
+        ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab));
+        if ( opc_tab[instr].opcode == ctxt.ctxt.opcode )
         {
-            if ( (inst_len + i) >= fetch_len ) 
-            {
-                if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
-                            max_len - fetch_len) )
-                    return 0;
-                fetch_len = max_len;
-            }
+            if ( !opc_tab[instr].modrm.mod )
+                return inst_len;
 
-            if ( buf[inst_len+i] != opcode[i+1] )
-                goto mismatch;
+            if ( modrm_mod == opc_tab[instr].modrm.mod &&
+                 (modrm_rm & 7) == opc_tab[instr].modrm.rm &&
+                 (modrm_reg & 7) == opc_tab[instr].modrm.reg )
+                return inst_len;
         }
-        goto done;
-    mismatch: ;
     }
 
     gdprintk(XENLOG_WARNING,
-             "%s: Mismatch between expected and actual instruction bytes: "
+             "%s: Mismatch between expected and actual instruction: "
              "eip = %lx\n",  __func__, (unsigned long)vmcb->rip);
     hvm_inject_hw_exception(TRAP_gp_fault, 0);
     return 0;
-
- done:
-    inst_len += opcode[0];
-    ASSERT(inst_len <= max_len);
-    return inst_len;
 }
 
 /*
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -382,7 +382,7 @@ struct operand {
     } mem;
 };
 #ifdef __x86_64__
-#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
+#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */
 #else
 #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
@@ -1631,6 +1631,10 @@ struct x86_emulate_state {
 
     unsigned long eip;
     struct cpu_user_regs *regs;
+
+#ifndef NDEBUG
+    void *caller;
+#endif
 };
 
 /* Helper definitions. */
@@ -1658,6 +1662,11 @@ x86_decode_base(
 
     switch ( ctxt->opcode )
     {
+    case 0x90: /* nop / pause */
+        if ( repe_prefix() )
+            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
+        break;
+
     case 0x9a: /* call (far, absolute) */
     case 0xea: /* jmp (far, absolute) */
         generate_exception_if(mode_64bit(), EXC_UD, -1);
@@ -2852,8 +2861,9 @@ x86_emulate(
         break;
 
     case 0x90: /* nop / xchg %%r8,%%rax */
+    case X86EMUL_OPC_F3(0, 0x90): /* pause / xchg %%r8,%%rax */
         if ( !(rex_prefix & 1) )
-            break; /* nop */
+            break; /* nop / pause */
         /* fall through */
 
     case 0x91 ... 0x97: /* xchg reg,%%rax */
@@ -5200,3 +5210,89 @@ x86_emulate(
 #undef vex
 #undef override_seg
 #undef ea
+
+#ifdef __XEN__
+
+#include <xen/err.h>
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt))
+{
+    static DEFINE_PER_CPU(struct x86_emulate_state, state);
+    struct x86_emulate_state *state = &this_cpu(state);
+    const struct x86_emulate_ops ops = {
+        .insn_fetch = insn_fetch,
+        .read       = x86emul_unhandleable_rw,
+        .write      = REG_POISON,
+        .cmpxchg    = REG_POISON,
+    };
+    int rc = x86_decode(state, ctxt, &ops);
+
+    if ( unlikely(rc != X86EMUL_OKAY) )
+        return ERR_PTR(-rc);
+
+#ifndef NDEBUG
+    /*
+     * While we avoid memory allocation (by use of per-CPU data) above,
+     * nevertheless make sure callers properly release the state structure
+     * for forward compatibility.
+     */
+    if ( state->caller )
+    {
+        printk(XENLOG_ERR "Unreleased emulation state acquired by %ps\n",
+               state->caller);
+        dump_execution_state();
+    }
+    state->caller = __builtin_return_address(0);
+#endif
+
+    return state;
+}
+
+static inline void check_state(const struct x86_emulate_state *state)
+{
+#ifndef NDEBUG
+    ASSERT(state->caller);
+#endif
+}
+
+#ifndef NDEBUG
+void x86_emulate_free_state(struct x86_emulate_state *state)
+{
+    check_state(state);
+    state->caller = NULL;
+}
+#endif
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg)
+{
+    check_state(state);
+
+    if ( !(state->desc & ModRM) )
+        return -EINVAL;
+
+    if ( rm )
+        *rm = state->modrm_rm;
+    if ( reg )
+        *reg = state->modrm_reg;
+
+    return state->modrm_mod;
+}
+
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt)
+{
+    check_state(state);
+
+    return state->eip - ctxt->regs->eip;
+}
+
+#endif
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -523,4 +523,29 @@ x86emul_unhandleable_rw(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
 
+#ifdef __XEN__
+
+struct x86_emulate_state *
+x86_decode_insn(
+    struct x86_emulate_ctxt *ctxt,
+    int (*insn_fetch)(
+        enum x86_segment seg, unsigned long offset,
+        void *p_data, unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt));
+
+int
+x86_insn_modrm(const struct x86_emulate_state *state,
+               unsigned int *rm, unsigned int *reg);
+unsigned int
+x86_insn_length(const struct x86_emulate_state *state,
+                const struct x86_emulate_ctxt *ctxt);
+
+#ifdef NDEBUG
+static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
+#else
+void x86_emulate_free_state(struct x86_emulate_state *state);
+#endif
+
+#endif
+
 #endif /* __X86_EMULATE_H__ */
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -53,6 +53,10 @@ void hvm_mem_access_emulate_one(enum emu
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
+void hvm_emulate_init(
+    struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const unsigned char *insn_buf,
+    unsigned int insn_bytes);
 void hvm_emulate_writeback(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 struct segment_register *hvmemul_get_seg_reg(
@@ -60,6 +64,11 @@ struct segment_register *hvmemul_get_seg
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla);
 
+int hvmemul_insn_fetch(enum x86_segment seg,
+                       unsigned long offset,
+                       void *p_data,
+                       unsigned int bytes,
+                       struct x86_emulate_ctxt *ctxt);
 int hvmemul_do_pio_buffer(uint16_t port,
                           unsigned int size,
                           uint8_t dir,

Comments

Andrew Cooper Sept. 14, 2016, 5:56 p.m. UTC | #1
On 08/09/16 14:14, Jan Beulich wrote:
> @@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le
>      return vmcb->nextrip - vmcb->rip;
>  }
>  
> -/* First byte: Length. Following bytes: Opcode bytes. */
> -#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ }
> -MAKE_INSTR(INVD,   2, 0x0f, 0x08);
> -MAKE_INSTR(WBINVD, 2, 0x0f, 0x09);
> -MAKE_INSTR(CPUID,  2, 0x0f, 0xa2);
> -MAKE_INSTR(RDMSR,  2, 0x0f, 0x32);
> -MAKE_INSTR(WRMSR,  2, 0x0f, 0x30);
> -MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
> -MAKE_INSTR(HLT,    1, 0xf4);
> -MAKE_INSTR(INT3,   1, 0xcc);
> -MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
> -MAKE_INSTR(PAUSE,  1, 0x90);
> -MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
> -MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
> -MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
> -MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
> -MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
> -MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
> -MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf);
> -
> -static const u8 *const opc_bytes[INSTR_MAX_COUNT] =
> -{
> -    [INSTR_INVD]   = OPCODE_INVD,
> -    [INSTR_WBINVD] = OPCODE_WBINVD,
> -    [INSTR_CPUID]  = OPCODE_CPUID,
> -    [INSTR_RDMSR]  = OPCODE_RDMSR,
> -    [INSTR_WRMSR]  = OPCODE_WRMSR,
> -    [INSTR_VMCALL] = OPCODE_VMCALL,
> -    [INSTR_HLT]    = OPCODE_HLT,
> -    [INSTR_INT3]   = OPCODE_INT3,
> -    [INSTR_RDTSC]  = OPCODE_RDTSC,
> -    [INSTR_PAUSE]  = OPCODE_PAUSE,
> -    [INSTR_XSETBV] = OPCODE_XSETBV,
> -    [INSTR_VMRUN]  = OPCODE_VMRUN,
> -    [INSTR_VMLOAD] = OPCODE_VMLOAD,
> -    [INSTR_VMSAVE] = OPCODE_VMSAVE,
> -    [INSTR_STGI]   = OPCODE_STGI,
> -    [INSTR_CLGI]   = OPCODE_CLGI,
> -    [INSTR_INVLPGA] = OPCODE_INVLPGA,
> +static const struct {
> +    unsigned int opcode;
> +    struct {
> +        unsigned int rm:3;
> +        unsigned int reg:3;
> +        unsigned int mod:2;
> +#define MODRM(mod, reg, rm) { rm, reg, mod }
> +    } modrm;
> +} const opc_tab[INSTR_MAX_COUNT] = {
> +    [INSTR_PAUSE]  = { X86EMUL_OPC_F3(0, 0x90) },
> +    [INSTR_INT3]   = { X86EMUL_OPC(   0, 0xcc) },
> +    [INSTR_HLT]    = { X86EMUL_OPC(   0, 0xf4) },
> +    [INSTR_XSETBV] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) },
> +    [INSTR_VMRUN]  = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) },
> +    [INSTR_VMCALL] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) },
> +    [INSTR_VMLOAD] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) },
> +    [INSTR_VMSAVE] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) },
> +    [INSTR_STGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) },
> +    [INSTR_CLGI]   = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) },
> +    [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) },
> +    [INSTR_INVD]   = { X86EMUL_OPC(0x0f, 0x08) },
> +    [INSTR_WBINVD] = { X86EMUL_OPC(0x0f, 0x09) },
> +    [INSTR_WRMSR]  = { X86EMUL_OPC(0x0f, 0x30) },
> +    [INSTR_RDTSC]  = { X86EMUL_OPC(0x0f, 0x31) },
> +    [INSTR_RDMSR]  = { X86EMUL_OPC(0x0f, 0x32) },
> +    [INSTR_CPUID]  = { X86EMUL_OPC(0x0f, 0xa2) },
>  };
>  
> -static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf,
> -                    unsigned long addr, unsigned int len)
> -{
> -    uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0;
> -
> -    switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) )
> -    {
> -    case HVMCOPY_okay:
> -        break;
> -    case HVMCOPY_bad_gva_to_gfn:
> -        /* OK just to give up; we'll have injected #PF already */
> -        return 0;
> -    default:
> -        /* Not OK: fetches from non-RAM pages are not supportable. */
> -        gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n",
> -                 vmcb->rip, addr);
> -        hvm_inject_hw_exception(TRAP_gp_fault, 0);
> -        return 0;
> -    }
> -    return 1;
> -}
> -
>  int __get_instruction_length_from_list(struct vcpu *v,
>          const enum instruction_index *list, unsigned int list_count)
>  {
>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> -    unsigned int i, j, inst_len = 0;
> -    enum instruction_index instr = 0;
> -    u8 buf[MAX_INST_LEN];
> -    const u8 *opcode = NULL;
> -    unsigned long fetch_addr, fetch_limit;
> -    unsigned int fetch_len, max_len;
> +    struct hvm_emulate_ctxt ctxt;
> +    struct x86_emulate_state *state;
> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
> +    int modrm_mod;
>  
> +#ifdef NDEBUG

Presumably this is just for your testing?

>      if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
>          return inst_len;
>  
>      if ( vmcb->exitcode == VMEXIT_IOIO )
>          return vmcb->exitinfo2 - vmcb->rip;
> +#endif
>  
> -    /* Fetch up to the next page break; we'll fetch from the next page
> -     * later if we have to. */
> -    fetch_addr = svm_rip2pointer(v, &fetch_limit);
> -    if ( vmcb->rip > fetch_limit )
> -        return 0;
> -    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
> -    fetch_len = min_t(unsigned int, max_len,
> -                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
> -    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
> +    ASSERT(v == current);
> +    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
> +    hvm_emulate_init(&ctxt, NULL, 0);
> +    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
> +    if ( IS_ERR_OR_NULL(state) )
>          return 0;
>  
> -    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
> -    {
> -        inst_len++;
> -        if ( inst_len >= fetch_len )
> -        {
> -            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
> -                        max_len - fetch_len) )
> -                return 0;
> -            fetch_len = max_len;
> -        }
> +    inst_len = x86_insn_length(state, &ctxt.ctxt);
> +    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
> +    x86_emulate_free_state(state);

From an API point of view, it is weird to have x86_emulate_free_state()
without a matching allocation function.  Perhaps that is just me.

However, the x86_insn_modrm() API is definitely more weird.  Wouldn't it
be more natural to take optional pointers for the mod, rm and reg parts
individually?
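
I.e., just sketching the shape of the alternative I have in mind:

    int
    x86_insn_modrm(const struct x86_emulate_state *state,
                   unsigned int *mod, unsigned int *reg, unsigned int *rm);

with the return value then only signalling whether a ModRM byte is
present at all.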


> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -382,7 +382,7 @@ struct operand {
>      } mem;
>  };
>  #ifdef __x86_64__
> -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
> +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */
>  #else
>  #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>  #endif

Given that these are now used for general pointer poisoning, they should
be renamed.  There are only 3 instances.

> @@ -1631,6 +1631,10 @@ struct x86_emulate_state {
>  
>      unsigned long eip;
>      struct cpu_user_regs *regs;
> +
> +#ifndef NDEBUG
> +    void *caller;
> +#endif

Perhaps worth a comment here?  Its purpose is rather opaque.

>  };
>  
>  /* Helper definitions. */
> @@ -1658,6 +1662,11 @@ x86_decode_base(
>  
>      switch ( ctxt->opcode )
>      {
> +    case 0x90: /* nop / pause */
> +        if ( repe_prefix() )
> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
> +        break;

Why is it necessary to special case the rep prefix handling in this case?

> +int
> +x86_insn_modrm(const struct x86_emulate_state *state,
> +               unsigned int *rm, unsigned int *reg)
> +{
> +    check_state(state);
> +
> +    if ( !(state->desc & ModRM) )
> +        return -EINVAL;
> +
> +    if ( rm )
> +        *rm = state->modrm_rm;
> +    if ( reg )
> +        *reg = state->modrm_reg;
> +
> +    return state->modrm_mod;
> +}
> +
> +unsigned int
> +x86_insn_length(const struct x86_emulate_state *state,
> +                const struct x86_emulate_ctxt *ctxt)
> +{
> +    check_state(state);
> +
> +    return state->eip - ctxt->regs->eip;

Is it worth stashing a starting eip?  This calculation will go wrong
after the emulated state has been committed.

~Andrew
Jan Beulich Sept. 15, 2016, 6:55 a.m. UTC | #2
>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:14, Jan Beulich wrote:
>>  int __get_instruction_length_from_list(struct vcpu *v,
>>          const enum instruction_index *list, unsigned int list_count)
>>  {
>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>> -    unsigned int i, j, inst_len = 0;
>> -    enum instruction_index instr = 0;
>> -    u8 buf[MAX_INST_LEN];
>> -    const u8 *opcode = NULL;
>> -    unsigned long fetch_addr, fetch_limit;
>> -    unsigned int fetch_len, max_len;
>> +    struct hvm_emulate_ctxt ctxt;
>> +    struct x86_emulate_state *state;
>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>> +    int modrm_mod;
>>  
>> +#ifdef NDEBUG
> 
> Presumably this is just for your testing?

No, I actually meant it to stay that way. Along the lines of the extra
debugging code we have in map_domain_page().

>>      if ( (inst_len = svm_nextrip_insn_length(v)) != 0 )
>>          return inst_len;
>>  
>>      if ( vmcb->exitcode == VMEXIT_IOIO )
>>          return vmcb->exitinfo2 - vmcb->rip;
>> +#endif
>>  
>> -    /* Fetch up to the next page break; we'll fetch from the next page
>> -     * later if we have to. */
>> -    fetch_addr = svm_rip2pointer(v, &fetch_limit);
>> -    if ( vmcb->rip > fetch_limit )
>> -        return 0;
>> -    max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL);
>> -    fetch_len = min_t(unsigned int, max_len,
>> -                      PAGE_SIZE - (fetch_addr & ~PAGE_MASK));
>> -    if ( !fetch(vmcb, buf, fetch_addr, fetch_len) )
>> +    ASSERT(v == current);
>> +    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
>> +    hvm_emulate_init(&ctxt, NULL, 0);
>> +    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch);
>> +    if ( IS_ERR_OR_NULL(state) )
>>          return 0;
>>  
>> -    while ( (inst_len < max_len) && is_prefix(buf[inst_len]) )
>> -    {
>> -        inst_len++;
>> -        if ( inst_len >= fetch_len )
>> -        {
>> -            if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len,
>> -                        max_len - fetch_len) )
>> -                return 0;
>> -            fetch_len = max_len;
>> -        }
>> +    inst_len = x86_insn_length(state, &ctxt.ctxt);
>> +    modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg);
>> +    x86_emulate_free_state(state);
> 
> From an API point of view, it is weird to have x86_emulate_free_state()
> without a matching allocation function.  Perhaps that is just me.

With x86_decode_insn() returning the state, that to me _is_ the
allocation function.
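
I.e. the pairing is (sketch, following the SVM caller in this patch):

    state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch); /* "allocate" */
    if ( IS_ERR_OR_NULL(state) )
        return 0;
    /* ... inspect the state ... */
    x86_emulate_free_state(state);                           /* release */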

> However, the x86_insn_modrm() API is definitely more weird.  Wouldn't it
> be more natural to take optional pointers for the mod, rm and reg parts
> individually?

I could change it to that, but I did it this way because without mod
at least rm is meaningless. Or said differently, I can't really see there
being a caller not caring about mod.
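
To illustrate with the patch itself: the (mod, reg, rm) triple is what
tells apart the 0f 01 group. VMMCALL, for example, is 0f 01 d9, whose
ModRM byte 0xd9 == 0b11011001 decomposes into mod=3, reg=3, rm=1,
matching the MODRM(3, 3, 1) entry for INSTR_VMCALL in opc_tab[]. And
without mod a caller can't even tell whether rm names a register or a
memory operand.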

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -382,7 +382,7 @@ struct operand {
>>      } mem;
>>  };
>>  #ifdef __x86_64__
>> -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */
>> +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */
>>  #else
>>  #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>>  #endif
> 
> Given that these are now used for general pointer poisoning, they should
> be renamed.  There are only 3 instances.

Okay. I'll make them PTR_POISON then.

>> @@ -1658,6 +1662,11 @@ x86_decode_base(
>>  
>>      switch ( ctxt->opcode )
>>      {
>> +    case 0x90: /* nop / pause */
>> +        if ( repe_prefix() )
>> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
>> +        break;
> 
> Why is it necessary to special case the rep prefix handling in this case?

Because SVM's pause intercept should not mistakenly also accept a
plain NOP.
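
Spelled out (this is what the hunk above arranges for):

    /*
     * bytes 90    -> ctxt->opcode == X86EMUL_OPC(   0, 0x90)  (NOP)
     * bytes f3 90 -> ctxt->opcode == X86EMUL_OPC_F3(0, 0x90)  (PAUSE)
     */

Only the latter matches opc_tab[INSTR_PAUSE].opcode; a bare 0x90 no
longer would.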

>> +unsigned int
>> +x86_insn_length(const struct x86_emulate_state *state,
>> +                const struct x86_emulate_ctxt *ctxt)
>> +{
>> +    check_state(state);
>> +
>> +    return state->eip - ctxt->regs->eip;
> 
> Is it worth stashing a starting eip?  This calculation will go wrong
> after the emulated state has been committed.

This function (taking a state parameter) can't be called by users of
x86_emulate(), and I don't think we need to cater for callers
committing state themselves - they should clearly use the result of
this function for what to commit in the first place.
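
I.e. (sketch of the intended order):

    inst_len = x86_insn_length(state, &ctxt.ctxt);
    x86_emulate_free_state(state);
    /* Only then, if at all, advance rip by inst_len. */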

Jan
Andrew Cooper Sept. 27, 2016, 1:42 p.m. UTC | #3
On 15/09/16 07:55, Jan Beulich wrote:
>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
>> On 08/09/16 14:14, Jan Beulich wrote:
>>>  int __get_instruction_length_from_list(struct vcpu *v,
>>>          const enum instruction_index *list, unsigned int list_count)
>>>  {
>>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>>> -    unsigned int i, j, inst_len = 0;
>>> -    enum instruction_index instr = 0;
>>> -    u8 buf[MAX_INST_LEN];
>>> -    const u8 *opcode = NULL;
>>> -    unsigned long fetch_addr, fetch_limit;
>>> -    unsigned int fetch_len, max_len;
>>> +    struct hvm_emulate_ctxt ctxt;
>>> +    struct x86_emulate_state *state;
>>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>>> +    int modrm_mod;
>>>  
>>> +#ifdef NDEBUG
>> Presumably this is just for your testing?
> No, I actually meant it to stay that way. Along the lines of the extra
> debugging code we have in map_domain_page().

I was never very happy with the older version of this debugging.  Surely
in a case like this, we should use the intercept information when
available, and check it against the emulator in a debug build.

That way, we don't entirely change the underlying logic in this function
between a debug and non debug build.

>>> @@ -1658,6 +1662,11 @@ x86_decode_base(
>>>  
>>>      switch ( ctxt->opcode )
>>>      {
>>> +    case 0x90: /* nop / pause */
>>> +        if ( repe_prefix() )
>>> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
>>> +        break;
>> Why is it necessary to special case the rep prefix handling in this case?
> Because SVM's pause intercept should not mistakenly also accept a
> plain NOP.

How about a comment to this effect:

/* Note: Prefixes such as rep/repne are only encoded when semantically
meaningful to the instruction, to reduce the complexity of interpreting
this opcode representation. */

~Andrew
Jan Beulich Sept. 27, 2016, 1:56 p.m. UTC | #4
>>> On 27.09.16 at 15:42, <andrew.cooper3@citrix.com> wrote:
> On 15/09/16 07:55, Jan Beulich wrote:
>>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
>>> On 08/09/16 14:14, Jan Beulich wrote:
>>>>  int __get_instruction_length_from_list(struct vcpu *v,
>>>>          const enum instruction_index *list, unsigned int list_count)
>>>>  {
>>>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>>>> -    unsigned int i, j, inst_len = 0;
>>>> -    enum instruction_index instr = 0;
>>>> -    u8 buf[MAX_INST_LEN];
>>>> -    const u8 *opcode = NULL;
>>>> -    unsigned long fetch_addr, fetch_limit;
>>>> -    unsigned int fetch_len, max_len;
>>>> +    struct hvm_emulate_ctxt ctxt;
>>>> +    struct x86_emulate_state *state;
>>>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>>>> +    int modrm_mod;
>>>>  
>>>> +#ifdef NDEBUG
>>> Presumably this is just for your testing?
>> No, I actually meant it to stay that way. Along the lines of the extra
>> debugging code we have in map_domain_page().
> 
> I was never very happy with the older version of this debugging.  Surely
> in a case like this, we should use the intercept information when
> available, and check it against the emulator in a debug build.
> 
> That way, we don't entirely change the underlying logic in this function
> between a debug and non debug build.

But that is exactly what the code is doing:

#ifndef NDEBUG
    if ( vmcb->exitcode == VMEXIT_IOIO )
        j = vmcb->exitinfo2 - vmcb->rip;
    else
        j = svm_nextrip_insn_length(v);
    if ( j && j != inst_len )
    {
        gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
                ctxt.ctxt.opcode, inst_len, j);
        return j;
    }
#endif

I.e. in case of a mismatch we use the data from hardware, plus a
message gets logged. In case of a match we further exercise the
opcode lookup logic, which non-debug builds would never hit on
capable hardware.

>>>> @@ -1658,6 +1662,11 @@ x86_decode_base(
>>>>  
>>>>      switch ( ctxt->opcode )
>>>>      {
>>>> +    case 0x90: /* nop / pause */
>>>> +        if ( repe_prefix() )
>>>> +            ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
>>>> +        break;
>>> Why is it necessary to special case the rep prefix handling in this case?
>> Because SVM's pause intercept should not mistakenly also accept a
>> plain NOP.
> 
> How about a comment to this effect:
> 
> /* Note: Prefixes such as rep/repne are only encoded when semantically
> meaningful to the instruction, to reduce the complexity of interpreting
> this opcode representation. */

Yes, I've added a slight variation of this to the definition of
X86EMUL_OPC_PFX_MASK.

Jan
Andrew Cooper Sept. 27, 2016, 3:53 p.m. UTC | #5
On 27/09/16 14:56, Jan Beulich wrote:
>>>> On 27.09.16 at 15:42, <andrew.cooper3@citrix.com> wrote:
>> On 15/09/16 07:55, Jan Beulich wrote:
>>>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote:
>>>> On 08/09/16 14:14, Jan Beulich wrote:
>>>>>  int __get_instruction_length_from_list(struct vcpu *v,
>>>>>          const enum instruction_index *list, unsigned int list_count)
>>>>>  {
>>>>>      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
>>>>> -    unsigned int i, j, inst_len = 0;
>>>>> -    enum instruction_index instr = 0;
>>>>> -    u8 buf[MAX_INST_LEN];
>>>>> -    const u8 *opcode = NULL;
>>>>> -    unsigned long fetch_addr, fetch_limit;
>>>>> -    unsigned int fetch_len, max_len;
>>>>> +    struct hvm_emulate_ctxt ctxt;
>>>>> +    struct x86_emulate_state *state;
>>>>> +    unsigned int inst_len, j, modrm_rm, modrm_reg;
>>>>> +    int modrm_mod;
>>>>>  
>>>>> +#ifdef NDEBUG
>>>> Presumably this is just for your testing?
>>> No, I actually meant it to stay that way. Along the lines of the extra
>>> debugging code we have in map_domain_page().
>> I was never very happy with the older version of this debugging.  Surely
>> in a case like this, we should use the intercept information when
>> available, and check it against the emulator in a debug build.
>>
>> That way, we don't entirely change the underlying logic in this function
>> between a debug and non debug build.
> But that is exactly what the code is doing:
>
> #ifndef NDEBUG
>     if ( vmcb->exitcode == VMEXIT_IOIO )
>         j = vmcb->exitinfo2 - vmcb->rip;
>     else
>         j = svm_nextrip_insn_length(v);
>     if ( j && j != inst_len )
>     {
>         gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n",
>                 ctxt.ctxt.opcode, inst_len, j);
>         return j;
>     }
> #endif
>
> I.e. in case of a mismatch we use the data from hardware, plus a
> message gets logged. In case of a match we further exercise the
> opcode lookup logic, which non-debug builds would never hit on
> capable hardware.

Ah yes - I see now.  The split between #ifdef NDEBUG and #ifndef NDEBUG
is the confusing factor.

~Andrew