diff mbox series

[v8,08/50] x86emul: support AVX512{F, BW} down conversion moves

Message ID 5C8B8122020000780021F161@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich March 15, 2019, 10:40 a.m. UTC
Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are
slightly different from what one would expect, due to them requiring
EVEX.W to be zero.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Adjustment for XSA-289: Use XOR instead of ADD when fiddling with b
    as an array index.
v7: ea.type == OP_* -> ea.type != OP_*. Re-base over change in previous
    patch. Re-base.
v5: Also adjust x86_insn_is_mem_write().
v4: Also #UD when evex.z is set with a memory operand.
v3: New.

Comments

Andrew Cooper March 15, 2019, 6:10 p.m. UTC | #1
On 15/03/2019 10:40, Jan Beulich wrote:
> Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are
> slightly different from what one would expect, due to them requiring
> EVEX.W to be zero.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -177,11 +177,26 @@  static const struct test avx512f_all[] =
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovdb,       f3, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovdw,       f3, 0f38, 33,    vl_2,    b, vl),
+    INSN(pmovqb,       f3, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovqd,       f3, 0f38, 35,    vl_2, d_nb, vl),
+    INSN(pmovqw,       f3, 0f38, 34,    vl_4,    b, vl),
+    INSN(pmovsdb,      f3, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsdw,      f3, 0f38, 23,    vl_2,    b, vl),
+    INSN(pmovsqb,      f3, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsqd,      f3, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovsqw,      f3, 0f38, 24,    vl_4,    b, vl),
     INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
     INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
     INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
     INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
     INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovusdb,     f3, 0f38, 11,    vl_4,    b, vl),
+    INSN(pmovusdw,     f3, 0f38, 13,    vl_2,    b, vl),
+    INSN(pmovusqb,     f3, 0f38, 12,    vl_8,    b, vl),
+    INSN(pmovusqd,     f3, 0f38, 15,    vl_2, d_nb, vl),
+    INSN(pmovusqw,     f3, 0f38, 14,    vl_4,    b, vl),
     INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
     INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
     INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
@@ -284,7 +299,10 @@  static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
     INSN(pminub,      66,   0f, da,    vl,    b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+    INSN(pmovswb,     f3, 0f38, 20,    vl_2,  b, vl),
     INSN(pmovsxbw,    66, 0f38, 20,    vl_2,  b, vl),
+    INSN(pmovuswb,    f3, 0f38, 10,    vl_2,  b, vl),
+    INSN(pmovwb,      f3, 0f38, 30,    vl_2,  b, vl),
     INSN(pmovzxbw,    66, 0f38, 30,    vl_2,  b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -277,6 +277,17 @@  static inline bool _to_bool(byte_vec_t b
 #endif
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
+     (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
+     (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_half(x) ({ /* low half of x via vextracti<W>x<N> $0 */ \
+    half_t t_; /* filled in by the asm through its "=m" output */ \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; /* value of the statement expression */ \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -291,6 +302,7 @@  static inline bool _to_bool(byte_vec_t b
 })
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+#  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -720,6 +732,27 @@  static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
+#if VEC_SIZE >= 16
+
+# if !defined(low_half) && defined(HALF_SIZE)
+static inline half_t low_half(vec_t x) /* used when no low_half() macro exists */
+{
+#  if HALF_SIZE < VEC_SIZE
+    half_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 2; ++i ) /* copy the lower elements */
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x; /* HALF_SIZE is not below VEC_SIZE: hand x back as is */
+#  endif
+}
+# endif
+
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1087,6 +1120,21 @@  int simd_test(void)
 
 #endif
 
+#if defined(widen1) && defined(shrink1)
+    {
+        half_t aux1 = low_half(src), aux2;
+
+        touch(aux1);
+        x = widen1(aux1);
+        touch(x);
+        aux2 = shrink1(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 2; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
 #ifdef dup_lo
     touch(src);
     x = dup_lo(src);
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,6 +70,23 @@  typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE >= 16
+
+# if ELEM_COUNT >= 2 /* half-vector types need at least two elements */
+#  if VEC_SIZE > 32
+#   define HALF_SIZE (VEC_SIZE / 2) /* size in bytes of the half-vector types */
+#  else
+#   define HALF_SIZE 16 /* half-vector types are never below 16 bytes */
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
+typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
+typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
+typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
+typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+# endif
+
+#endif
+
 #if VEC_SIZE == 16
 # define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
 # define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3068,7 +3068,22 @@  x86_decode(
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
             if ( evex_encoded() )
-                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+            {
+                /*
+                 * VPMOVUS* are identical to VPMOVS* Disp8-scaling-wise, but
+                 * their attributes don't match those of the vex_66 encoded
+                 * insns with the same base opcodes. Rather than adding new
+                 * columns to the table, handle this here for now.
+                 */
+                if ( evex.pfx != vex_f3 || (b & 0xf8) != 0x10 )
+                    disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+                else
+                {
+                    disp8scale = decode_disp8scale(ext0f38_table[b ^ 0x30].d8s,
+                                                   state);
+                    state->simd_size = simd_other;
+                }
+            }
             break;
 
         case ext_0f3a:
@@ -8359,10 +8374,14 @@  x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30): /* vpmovwb [xyz]mm,{x,y}mm/mem{k} */
         host_and_vcpu_must_have(avx512bw);
-        /* fall through */
+        if ( evex.pfx != vex_f3 )
+        {
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
@@ -8373,7 +8392,29 @@  x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
-        generate_exception_if(evex.brs || (evex.w && (b & 7) == 5), EXC_UD);
+            generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+        }
+        else
+        {
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x11): /* vpmovusdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x12): /* vpmovusqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x13): /* vpmovusdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x14): /* vpmovusqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* vpmovusqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x21): /* vpmovsdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x22): /* vpmovsqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x23): /* vpmovsdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x24): /* vpmovsqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* vpmovsqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x31): /* vpmovdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x32): /* vpmovqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x33): /* vpmovdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x34): /* vpmovqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* vpmovqd [xyz]mm,{x,y}mm/mem{k} */
+            generate_exception_if(evex.w || (ea.type != OP_REG && evex.z), EXC_UD);
+            d = DstMem | SrcReg | TwoOp;
+        }
+        generate_exception_if(evex.brs, EXC_UD);
         op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
         elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
         goto avx512f_no_sae;
@@ -10212,6 +10253,12 @@  x86_insn_is_mem_write(const struct x86_e
     case X86EMUL_OPC(0x0f, 0xab):        /* BTS */
     case X86EMUL_OPC(0x0f, 0xb3):        /* BTR */
     case X86EMUL_OPC(0x0f, 0xbb):        /* BTC */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10) ...
+         X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* VPMOVUS* */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20) ...
+         X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* VPMOVS* */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30) ...
+         X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* VPMOV{D,Q,W}* */
         return true;
 
     case 0xd9: