
[5/5] x86/emulate: add support for {,v}movd {,x}mm,r/m32 and {,v}movq {,x}mm,r/m64

Message ID 57D187D4020000780010D28E@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich Sept. 8, 2016, 1:46 p.m. UTC
From: Zhi Wang <zhi.a.wang@intel.com>

Found that a Windows driver was using the SSE2 instruction MOVD.

Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Re-base on decoding changes. Address Andrew's and my own review
    comments (where still applicable). #UD when vex.l is set. Various
    adjustments to the test tool change.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -973,6 +973,296 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing movd %%mm3,32(%%ecx)...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        decl_insn(movd_to_mem);
+
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       put_insn(movd_to_mem, "movd %%mm3, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(movd_to_mem);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(movd_to_mem) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movd %%xmm2,32(%%edx)...");
+    if ( stack_exec && cpu_has_sse2 )
+    {
+        decl_insn(movd_to_mem2);
+
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(movd_to_mem2, "movd %%xmm2, 32(%0)")
+                       :: "d" (NULL) );
+
+        memset(res, 0xdb, 64);
+        set_insn(movd_to_mem2);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(movd_to_mem2) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovd %%xmm1,32(%%ecx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovd_to_mem);
+
+        asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+                       put_insn(vmovd_to_mem, "vmovd %%xmm1, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(vmovd_to_mem);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovd_to_mem) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movd %%mm3,%%ebx...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        decl_insn(movd_to_reg);
+
+        /*
+         * Intentionally not specifying "b" as an input (or even output) here
+         * to not keep the compiler from using the variable, which in turn
+         * allows noticing whether the emulator touches the actual register
+         * instead of the regs field.
+         */
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       put_insn(movd_to_reg, "movd %%mm3, %%ebx")
+                       :: );
+
+        set_insn(movd_to_reg);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(movd_to_reg) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movd %%xmm2,%%ebx...");
+    if ( stack_exec && cpu_has_sse2 )
+    {
+        decl_insn(movd_to_reg2);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(movd_to_reg2, "movd %%xmm2, %%ebx")
+                       :: );
+
+        set_insn(movd_to_reg2);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(movd_to_reg2) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovd %%xmm1,%%ebx...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovd_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+                       put_insn(vmovd_to_reg, "vmovd %%xmm1, %%ebx")
+                       :: );
+
+        set_insn(vmovd_to_reg);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovd_to_reg) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+#ifdef __x86_64__
+    printf("%-40s", "Testing movq %%mm3,32(%%ecx)...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        decl_insn(movq_to_mem3);
+
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       put_insn(movq_to_mem3, "rex64 movd %%mm3, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(movq_to_mem3);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(movq_to_mem3) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movq %%xmm2,32(%%edx)...");
+    if ( stack_exec )
+    {
+        decl_insn(movq_to_mem4);
+
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(movq_to_mem4, "rex64 movd %%xmm2, 32(%0)")
+                       :: "d" (NULL) );
+
+        memset(res, 0xdb, 64);
+        set_insn(movq_to_mem4);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(movq_to_mem4) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %%xmm1,32(%%ecx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovq_to_mem2);
+
+        asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+#if 0 /* This doesn't work, as the assembler will pick opcode D6. */
+                       put_insn(vmovq_to_mem2, "vmovq %%xmm1, 32(%0)")
+#else
+                       put_insn(vmovq_to_mem2, ".byte 0xc4, 0xe1, 0xf9, 0x7e, 0x49, 0x20")
+#endif
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(vmovq_to_mem2);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovq_to_mem2) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movq %%mm3,%%rbx...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        decl_insn(movq_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       put_insn(movq_to_reg, "movq %%mm3, %%rbx")
+                       :: );
+
+        set_insn(movq_to_reg);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || regs.rbx + 1 || !check_eip(movq_to_reg) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movq %%xmm2,%%rbx...");
+    if ( stack_exec )
+    {
+        decl_insn(movq_to_reg2);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(movq_to_reg2, "movq %%xmm2, %%rbx")
+                       :: );
+
+        set_insn(movq_to_reg2);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || regs.rbx + 1 || !check_eip(movq_to_reg2) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %%xmm1,%%rbx...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovq_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+                       put_insn(vmovq_to_reg, "vmovq %%xmm1, %%rbx")
+                       :: );
+
+        set_insn(vmovq_to_reg);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || regs.rbx + 1 || !check_eip(vmovq_to_reg) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+#endif
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -223,7 +223,7 @@ static const opcode_desc_t twobyte_table
     /* 0x70 - 0x7F */
     SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
     ModRM, ModRM, ModRM, ImplicitOps,
-    ModRM, ModRM, 0, 0, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
+    ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
     /* 0x80 - 0x87 */
     DstImplicit|SrcImm, DstImplicit|SrcImm,
     DstImplicit|SrcImm, DstImplicit|SrcImm,
@@ -2291,6 +2291,10 @@ x86_decode(
         return X86EMUL_UNHANDLEABLE;
     }
 
+    if ( op_bytes == 2 &&
+         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+        op_bytes = 4;
+
  done:
     return rc;
 }
@@ -4772,6 +4776,12 @@ x86_emulate(
                                          /* vmovdqa ymm/m256,ymm */
     case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
                                          /* vmovdqu ymm/m256,ymm */
+    case X86EMUL_OPC(0x0f, 0x7e):        /* movd mm,r/m32 */
+                                         /* movq mm,r/m64 */
+    case X86EMUL_OPC_66(0x0f, 0x7e):     /* movd xmm,r/m32 */
+                                         /* movq xmm,r/m64 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmovd xmm,r/m32 */
+                                         /* vmovq xmm,r/m64 */
     case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
     case X86EMUL_OPC_66(0x0f, 0x7f):     /* movdqa xmm,xmm/m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
@@ -4822,10 +4832,16 @@ x86_emulate(
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
         }
-        if ( b == 0xd6 )
+        switch ( b )
         {
+        case 0x7e:
+            generate_exception_if(vex.l, EXC_UD, -1);
+            ea.bytes = op_bytes;
+            break;
+        case 0xd6:
             generate_exception_if(vex.l, EXC_UD, -1);
             ea.bytes = 8;
+            break;
         }
         if ( ea.type == OP_MEM )
         {
@@ -4836,15 +4852,22 @@ x86_emulate(
             if ( b == 0x6f )
                 rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
                                ea.bytes, ctxt);
-            /* convert memory operand to (%rAX) */
+        }
+        if ( ea.type == OP_MEM || b == 0x7e )
+        {
+            /* Convert memory operand or GPR destination to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
             buf[4] &= 0x38;
+            if ( ea.type == OP_MEM )
+                ea.reg = (void *)mmvalp;
+            else /* Ensure zero-extension of a 32-bit result. */
+                *ea.reg = 0;
         }
         if ( !rc )
         {
            copy_REX_VEX(buf, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
+           asm volatile ( "call *%0" : : "r" (stub.func), "a" (ea.reg)
                                      : "memory" );
         }
         put_fpu(&fic);
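
For reference, the operand rewrite in the last hunk above works on the ModRM byte (buf[4]): masking with 0x38 keeps only the reg field (bits 5-3) and zeroes mod and rm, which together with dropping any REX.B/VEX.B register extension turns the memory operand or GPR destination into plain (%rax), as the patch comment says. A minimal worked example (illustration only, not part of the patch), using the ModRM byte 0x49 from the open-coded vmovq encoding in the test above:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* ModRM layout: mod[7:6] | reg[5:3] | rm[2:0]. */
        uint8_t modrm = 0x49;   /* from the test's vmovq: mod=01, reg=001 (xmm1), rm=001, disp8 follows */

        modrm &= 0x38;          /* keep reg; mod=00, rm=000 now select (%rax) */
        printf("%#x\n", (unsigned)modrm);   /* prints 0x8, i.e. vmovq %xmm1, (%rax) */
        return 0;
    }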

Comments

Andrew Cooper Sept. 30, 2016, 11:59 a.m. UTC | #1
On 08/09/16 14:46, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -223,7 +223,7 @@ static const opcode_desc_t twobyte_table
>      /* 0x70 - 0x7F */
>      SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
>      ModRM, ModRM, ModRM, ImplicitOps,
> -    ModRM, ModRM, 0, 0, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
> +    ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
>      /* 0x80 - 0x87 */
>      DstImplicit|SrcImm, DstImplicit|SrcImm,
>      DstImplicit|SrcImm, DstImplicit|SrcImm,
> @@ -2291,6 +2291,10 @@ x86_decode(
>          return X86EMUL_UNHANDLEABLE;
>      }
>  
> +    if ( op_bytes == 2 &&
> +         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
> +        op_bytes = 4;

What is this change for?  I presume it is to undo the effect of the
operand size override prefix when we have decided that the prefix
actually had an alternate meaning?

If so, can we have a comment to this effect?

Everything else looks ok.

~Andrew
Jan Beulich Sept. 30, 2016, 12:11 p.m. UTC | #2
>>> On 30.09.16 at 13:59, <andrew.cooper3@citrix.com> wrote:
> On 08/09/16 14:46, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -223,7 +223,7 @@ static const opcode_desc_t twobyte_table
>>      /* 0x70 - 0x7F */
>>      SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
>>      ModRM, ModRM, ModRM, ImplicitOps,
>> -    ModRM, ModRM, 0, 0, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
>> +    ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
>>      /* 0x80 - 0x87 */
>>      DstImplicit|SrcImm, DstImplicit|SrcImm,
>>      DstImplicit|SrcImm, DstImplicit|SrcImm,
>> @@ -2291,6 +2291,10 @@ x86_decode(
>>          return X86EMUL_UNHANDLEABLE;
>>      }
>>  
>> +    if ( op_bytes == 2 &&
>> +         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
>> +        op_bytes = 4;
> 
> What is this change for?  I presume it is to undo the effect of the
> operand size override prefix when we have decided that the prefix
> actually had an alternate meaning?

Yes.

> If so, can we have a comment to this effect?

+    /*
+     * Undo the operand-size override effect of prefix 66 when it was
+     * determined to have another meaning.
+     */

> Everything else looks ok.

Can I take this as R-b then with the comment added?

Jan
Andrew Cooper Sept. 30, 2016, 12:12 p.m. UTC | #3
On 30/09/16 13:11, Jan Beulich wrote:
>>>> On 30.09.16 at 13:59, <andrew.cooper3@citrix.com> wrote:
>> On 08/09/16 14:46, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -223,7 +223,7 @@ static const opcode_desc_t twobyte_table
>>>      /* 0x70 - 0x7F */
>>>      SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
>>>      ModRM, ModRM, ModRM, ImplicitOps,
>>> -    ModRM, ModRM, 0, 0, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
>>> +    ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
>>>      /* 0x80 - 0x87 */
>>>      DstImplicit|SrcImm, DstImplicit|SrcImm,
>>>      DstImplicit|SrcImm, DstImplicit|SrcImm,
>>> @@ -2291,6 +2291,10 @@ x86_decode(
>>>          return X86EMUL_UNHANDLEABLE;
>>>      }
>>>  
>>> +    if ( op_bytes == 2 &&
>>> +         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
>>> +        op_bytes = 4;
>> What is this change for?  I presume it is to undo the effect of the
>> operand size override prefix when we have decided that the prefix
>> actually had an alternate meaning?
> Yes.
>
>> If so, can we have a comment to this effect?
> +    /*
> +     * Undo the operand-size override effect of prefix 66 when it was
> +     * determined to have another meaning.
> +     */
>
>> Everything else looks ok.
> Can I take this as R-b then with the comment added?

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
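
With the review settled, a minimal sketch of how the x86_decode() hunk reads once Jan's proposed comment is folded in (the code is taken from the patch; the exact placement of the comment is an assumption):

    /*
     * Undo the operand-size override effect of prefix 66 when it was
     * determined to have another meaning.
     */
    if ( op_bytes == 2 &&
         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
        op_bytes = 4;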