diff mbox series

[v8,03/50] x86emul: support AVX512{F, BW, DQ} insert insns

Message ID 5C8B8060020000780021F11C@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich March 15, 2019, 10:37 a.m. UTC
Also correct the comment of the AVX form of VINSERTPS.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v7: Re-base.
v6: Don't refuse to emulate VINSERTPS without AVX512VL.
v4: Make use of d8s_dq64.
v3: New.
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -213,6 +213,7 @@  static const struct test avx512f_all[] =
 
 static const struct test avx512f_128[] = {
     INSN(extractps, 66, 0f3a, 17, el,    d, el),
+    INSN(insertps,  66, 0f3a, 21, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
     INSN(movq,      f3,   0f, 7e, el,    q, el),
@@ -224,12 +225,16 @@  static const struct test avx512f_no128[]
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
     INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
+    INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
+    INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
 };
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
     INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
     INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
+    INSN(insertf64x4,    66, 0f3a, 1a, el_4, q, vl),
+    INSN(inserti64x4,    66, 0f3a, 3a, el_4, q, vl),
 };
 
 static const struct test avx512bw_all[] = {
@@ -289,6 +294,8 @@  static const struct test avx512bw_128[]
     INSN(pextrb, 66, 0f3a, 14, el, b, el),
 //       pextrw, 66,   0f, c5,     w
     INSN(pextrw, 66, 0f3a, 15, el, w, el),
+    INSN(pinsrb, 66, 0f3a, 20, el, b, el),
+    INSN(pinsrw, 66,   0f, c4, el, w, el),
 };
 
 static const struct test avx512dq_all[] = {
@@ -301,6 +308,7 @@  static const struct test avx512dq_all[]
 
 static const struct test avx512dq_128[] = {
     INSN(pextr, 66, 0f3a, 16, el, dq64, el),
+    INSN(pinsr, 66, 0f3a, 22, el, dq64, el),
 };
 
 static const struct test avx512dq_no128[] = {
@@ -308,12 +316,16 @@  static const struct test avx512dq_no128[
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
     INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
     INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
+    INSN(insertf64x2,    66, 0f3a, 18, el_2, q, vl),
+    INSN(inserti64x2,    66, 0f3a, 38, el_2, q, vl),
 };
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
     INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
     INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
+    INSN(insertf32x8,    66, 0f3a, 1a, el_8, d, vl),
+    INSN(inserti32x8,    66, 0f3a, 3a, el_8, d, vl),
 };
 
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -360,7 +360,7 @@  static const struct twobyte_table {
     [0xc1] = { DstMem|SrcReg|ModRM },
     [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp, d8s_vl },
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
-    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int, 1 },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
     [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp, d8s_vl },
     [0xc7] = { ImplicitOps|ModRM },
@@ -516,17 +516,19 @@  static const struct ext0f3a_table {
     [0x15] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 1 },
     [0x16] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = d8s_dq64 },
     [0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 2 },
-    [0x18] = { .simd_size = simd_128 },
+    [0x18] = { .simd_size = simd_128, .d8s = 4 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x20] = { .simd_size = simd_none },
-    [0x21] = { .simd_size = simd_other },
-    [0x22] = { .simd_size = simd_none },
+    [0x20] = { .simd_size = simd_none, .d8s = 0 },
+    [0x21] = { .simd_size = simd_other, .d8s = 2 },
+    [0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
-    [0x38] = { .simd_size = simd_128 },
+    [0x38] = { .simd_size = simd_128, .d8s = 4 },
+    [0x3a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
     [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
@@ -2586,6 +2588,7 @@  x86_decode_twobyte(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
     case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+    case X86EMUL_OPC_EVEX_66(0, 0xc4): /* vpinsrw */
         state->desc = DstReg | SrcMem16;
         break;
 
@@ -2688,6 +2691,7 @@  x86_decode_0f3a(
 
     case X86EMUL_OPC_66(0, 0x20):     /* pinsrb */
     case X86EMUL_OPC_VEX_66(0, 0x20): /* vpinsrb */
+    case X86EMUL_OPC_EVEX_66(0, 0x20): /* vpinsrb */
         state->desc = DstImplicit | SrcMem;
         if ( modrm_mod != 3 )
             state->desc |= ByteOp;
@@ -2695,6 +2699,7 @@  x86_decode_0f3a(
 
     case X86EMUL_OPC_66(0, 0x22):     /* pinsr{d,q} */
     case X86EMUL_OPC_VEX_66(0, 0x22): /* vpinsr{d,q} */
+    case X86EMUL_OPC_EVEX_66(0, 0x22): /* vpinsr{d,q} */
         state->desc = DstImplicit | SrcMem;
         break;
 
@@ -7735,6 +7740,23 @@  x86_emulate(
         ea.type = OP_MEM;
         goto simd_0f_int_imm8;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
+        generate_exception_if(evex.lr || evex.opmsk || evex.brs, EXC_UD);
+        if ( b & 2 )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        if ( !mode_64bit() )
+            evex.w = 0;
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto avx512f_imm8_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0xc5):      /* pextrw $imm8,{,x}mm,reg */
     case X86EMUL_OPC_VEX_66(0x0f, 0xc5):   /* vpextrw $imm8,xmm,reg */
         generate_exception_if(vex.l, EXC_UD);
@@ -8951,8 +8973,12 @@  x86_emulate(
         opc = init_evex(stub);
         goto pextr;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x18): /* vinsertf32x4 $imm8,xmm/m128,{y,z}mm{k} */
+                                            /* vinsertf64x2 $imm8,xmm/m128,{y,z}mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x19): /* vextractf32x4 $imm8,{y,z}mm,xmm/m128{k} */
                                             /* vextractf64x2 $imm8,{y,z}mm,xmm/m128{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x38): /* vinserti32x4 $imm8,xmm/m128,{y,z}mm{k} */
+                                            /* vinserti64x2 $imm8,xmm/m128,{y,z}mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x39): /* vextracti32x4 $imm8,{y,z}mm,xmm/m128{k} */
                                             /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
         if ( evex.w )
@@ -8961,8 +8987,12 @@  x86_emulate(
         fault_suppression = false;
         goto avx512f_imm8_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1a): /* vinsertf32x4 $imm8,ymm/m256,zmm{k} */
+                                            /* vinsertf64x2 $imm8,ymm/m256,zmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1b): /* vextractf32x8 $imm8,zmm,ymm/m256{k} */
                                             /* vextractf64x4 $imm8,zmm,ymm/m256{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3a): /* vinserti32x4 $imm8,ymm/m256,zmm{k} */
+                                            /* vinserti64x2 $imm8,ymm/m256,zmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3b): /* vextracti32x8 $imm8,zmm,ymm/m256{k} */
                                             /* vextracti64x4 $imm8,zmm,ymm/m256{k} */
         if ( !evex.w )
@@ -9055,13 +9085,20 @@  x86_emulate(
         op_bytes = 4;
         goto simd_0f3a_common;
 
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m32,xmm,xmm */
         op_bytes = 4;
         /* fall through */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x41): /* vdppd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m32,xmm,xmm */
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(evex.lr || evex.w || evex.opmsk || evex.brs,
+                              EXC_UD);
+        op_bytes = 4;
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
         if ( !vex.w )