diff mbox series

[v8,07/50] x86emul: support AVX512{F, BW} zero- and sign-extending moves

Message ID 5C8B810A020000780021F128@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich March 15, 2019, 10:40 a.m. UTC
Note that the testing in simd.c doesn't really follow the ISA extension
pattern: to fit the scheme, extensions from byte- and word-granular
vectors can (currently) sensibly only happen in the AVX512BW case (and
hence the respective abstraction macros will be added there rather than
here).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Raise #UD when EVEX.b is set. Re-base.
v3: New.

Comments

Andrew Cooper March 15, 2019, 6:02 p.m. UTC | #1
On 15/03/2019 10:40, Jan Beulich wrote:
> Note that the testing in simd.c doesn't really follow the ISA extension
> pattern - to fit the scheme, extensions from byte and word granular
> vectors can (currently) sensibly only happen in the AVX512BW case (and
> hence respective abstraction macros will be added there rather than
> here).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -177,6 +177,16 @@  static const struct test avx512f_all[] =
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
+    INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
+    INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
+    INSN(pmovzxwq,     66, 0f38, 34,    vl_4,    w, vl),
+    INSN(pmovzxdq,     66, 0f38, 35,    vl_2, d_nb, vl),
     INSN(pmuldq,       66, 0f38, 28,    vl,      q, vl),
     INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
     INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
@@ -274,6 +284,8 @@  static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
     INSN(pminub,      66,   0f, da,    vl,    b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+    INSN(pmovsxbw,    66, 0f38, 20,    vl_2,  b, vl),
+    INSN(pmovzxbw,    66, 0f38, 30,    vl_2,  b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,    w, vl),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -443,13 +443,23 @@  static const struct ext0f38_table {
     [0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
     [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
+    [0x20] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x21] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x22] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
+    [0x23] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x24] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x25] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
-    [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
+    [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x31] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x32] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
+    [0x33] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x34] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x35] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -8349,6 +8359,25 @@  x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x25): /* vpmovsxdq {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.brs || (evex.w && (b & 7) == 5), EXC_UD);
+        op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
+        elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -311,10 +311,12 @@  static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
 #  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
 #  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovsxdq, _mask, x, (vdi_t)undef(), ~0))
 # elif UINT_SIZE == 4
 #  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), (vdi_t)undef(), ~0))
 # elif INT_SIZE == 8
 #  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -222,6 +222,16 @@  REN(pxor, , d);
 #  endif
 OVR(movntdq);
 OVR(movntdqa);
+OVR(pmovsxbd);
+OVR(pmovsxbq);
+OVR(pmovsxdq);
+OVR(pmovsxwd);
+OVR(pmovsxwq);
+OVR(pmovzxbd);
+OVR(pmovzxbq);
+OVR(pmovzxdq);
+OVR(pmovzxwd);
+OVR(pmovzxwq);
 OVR(pmulld);
 OVR(pmuldq);
 OVR(pmuludq);