@@ -177,11 +177,26 @@ static const struct test avx512f_all[] =
INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl),
INSN(pmins, 66, 0f38, 39, vl, dq, vl),
INSN(pminu, 66, 0f38, 3b, vl, dq, vl),
+ INSN(pmovdb, f3, 0f38, 31, vl_4, b, vl),
+ INSN(pmovdw, f3, 0f38, 33, vl_2, b, vl),
+ INSN(pmovqb, f3, 0f38, 32, vl_8, b, vl),
+ INSN(pmovqd, f3, 0f38, 35, vl_2, d_nb, vl),
+ INSN(pmovqw, f3, 0f38, 34, vl_4, b, vl),
+ INSN(pmovsdb, f3, 0f38, 21, vl_4, b, vl),
+ INSN(pmovsdw, f3, 0f38, 23, vl_2, b, vl),
+ INSN(pmovsqb, f3, 0f38, 22, vl_8, b, vl),
+ INSN(pmovsqd, f3, 0f38, 25, vl_2, d_nb, vl),
+ INSN(pmovsqw, f3, 0f38, 24, vl_4, b, vl),
INSN(pmovsxbd, 66, 0f38, 21, vl_4, b, vl),
INSN(pmovsxbq, 66, 0f38, 22, vl_8, b, vl),
INSN(pmovsxwd, 66, 0f38, 23, vl_2, w, vl),
INSN(pmovsxwq, 66, 0f38, 24, vl_4, w, vl),
INSN(pmovsxdq, 66, 0f38, 25, vl_2, d_nb, vl),
+ INSN(pmovusdb, f3, 0f38, 11, vl_4, b, vl),
+ INSN(pmovusdw, f3, 0f38, 13, vl_2, b, vl),
+ INSN(pmovusqb, f3, 0f38, 12, vl_8, b, vl),
+ INSN(pmovusqd, f3, 0f38, 15, vl_2, d_nb, vl),
+ INSN(pmovusqw, f3, 0f38, 14, vl_4, b, vl),
INSN(pmovzxbd, 66, 0f38, 31, vl_4, b, vl),
INSN(pmovzxbq, 66, 0f38, 32, vl_8, b, vl),
INSN(pmovzxwd, 66, 0f38, 33, vl_2, w, vl),
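
The F3-encoded table entries added above come in three flavours that differ only in how a too-wide source element is narrowed: plain truncation (vpmovdb and friends), signed saturation (vpmovs*) and unsigned saturation (vpmovus*). A minimal scalar sketch of the dword-to-byte case, standalone and not part of the test harness:

    #include <stdint.h>
    #include <stdio.h>

    /* Truncation: keep the low byte, as VPMOVDB does per element. */
    static uint8_t trunc_db(uint32_t v) { return (uint8_t)v; }

    /* Signed saturation: clamp to [-128, 127], as VPMOVSDB does. */
    static int8_t sat_s_db(int32_t v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    /* Unsigned saturation: clamp to [0, 255], as VPMOVUSDB does. */
    static uint8_t sat_us_db(uint32_t v) { return v > 255 ? 255 : (uint8_t)v; }

    int main(void)
    {
        /* 300 narrows to 44 (truncated), 127 (signed sat) and 255 (unsigned sat). */
        printf("%d %d %d\n", trunc_db(300), sat_s_db(300), sat_us_db(300));
        return 0;
    }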
@@ -284,7 +299,10 @@ static const struct test avx512bw_all[]
INSN(pminsw, 66, 0f, ea, vl, w, vl),
INSN(pminub, 66, 0f, da, vl, b, vl),
INSN(pminuw, 66, 0f38, 3a, vl, w, vl),
+ INSN(pmovswb, f3, 0f38, 20, vl_2, b, vl),
INSN(pmovsxbw, 66, 0f38, 20, vl_2, b, vl),
+ INSN(pmovuswb, f3, 0f38, 10, vl_2, b, vl),
+ INSN(pmovwb, f3, 0f38, 30, vl_2, b, vl),
INSN(pmovzxbw, 66, 0f38, 30, vl_2, b, vl),
INSN(pmulhuw, 66, 0f, e4, vl, w, vl),
INSN(pmulhw, 66, 0f, e5, vl, w, vl),
@@ -277,6 +277,17 @@ static inline bool _to_bool(byte_vec_t b
#endif
#if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
+ (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
+ (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+# define low_half(x) ({ \
+ half_t t_; \
+ asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+ : [d] "=m" (t_) \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+ t_; \
+})
+# endif
# if INT_SIZE == 4 || UINT_SIZE == 4
# define broadcast(x) ({ \
vec_t t_; \
@@ -291,6 +302,7 @@ static inline bool _to_bool(byte_vec_t b
})
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+# define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
# elif INT_SIZE == 8 || UINT_SIZE == 8
# define broadcast(x) ({ \
vec_t t_; \
@@ -720,6 +732,27 @@ static inline bool _to_bool(byte_vec_t b
# endif
#endif
+#if VEC_SIZE >= 16
+
+# if !defined(low_half) && defined(HALF_SIZE)
+static inline half_t low_half(vec_t x)
+{
+# if HALF_SIZE < VEC_SIZE
+ half_t y;
+ unsigned int i;
+
+ for ( i = 0; i < ELEM_COUNT / 2; ++i )
+ y[i] = x[i];
+
+ return y;
+# else
+ return x;
+# endif
+}
+# endif
+
+#endif
+
#if defined(__AVX512F__) && defined(FLOAT_SIZE)
# include "simd-fma.c"
#endif
@@ -1087,6 +1120,21 @@ int simd_test(void)
#endif
+#if defined(widen1) && defined(shrink1)
+ {
+ half_t aux1 = low_half(src), aux2;
+
+ touch(aux1);
+ x = widen1(aux1);
+ touch(x);
+ aux2 = shrink1(x);
+ touch(aux2);
+ for ( i = 0; i < ELEM_COUNT / 2; ++i )
+ if ( aux2[i] != src[i] )
+ return __LINE__;
+ }
+#endif
+
#ifdef dup_lo
touch(src);
x = dup_lo(src);
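
The widen1()/shrink1() round trip added above is exercised with the harness' own intrinsics-based macros; the same idea can be sketched standalone with GCC/Clang vector extensions (assuming a compiler providing __builtin_convertvector, i.e. GCC 9+ or Clang), where element-wise widening and truncation mirror what VPMOVSXDQ/VPMOVZXDQ and VPMOVQD do in hardware:

    typedef int v4si __attribute__((vector_size(16)));
    typedef long long v4di __attribute__((vector_size(32)));

    int pmov_round_trip(void)
    {
        v4si src = { 1, -2, 3, -4 }, back;
        v4di wide = __builtin_convertvector(src, v4di); /* widen: dword -> qword */
        unsigned int i;

        back = __builtin_convertvector(wide, v4si);     /* narrow: qword -> dword */

        for ( i = 0; i < 4; ++i )
            if ( back[i] != src[i] )
                return 0;                               /* lossy - not expected here */

        return 1;                                       /* values survived the round trip */
    }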
@@ -70,6 +70,23 @@ typedef int __attribute__((vector_size(V
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
#endif
+#if VEC_SIZE >= 16
+
+# if ELEM_COUNT >= 2
+# if VEC_SIZE > 32
+# define HALF_SIZE (VEC_SIZE / 2)
+# else
+# define HALF_SIZE 16
+# endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
+typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
+typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
+typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
+typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+# endif
+
+#endif
+
#if VEC_SIZE == 16
# define B(n, s, a...) __builtin_ia32_ ## n ## 128 ## s(a)
# define B_(n, s, a...) __builtin_ia32_ ## n ## s(a)
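
For illustration only (not harness code): with a 64-byte vec_t of 32-bit ints, the definitions added above yield HALF_SIZE == 32, so half_t holds the 8 low elements that the down-converting stores produce. A tiny standalone check:

    #include <stdio.h>

    typedef int zmm_vec_t __attribute__((vector_size(64)));  /* stands in for vec_t */
    typedef int zmm_half_t __attribute__((vector_size(32))); /* stands in for half_t */

    int main(void)
    {
        printf("vec: %zu bytes, half: %zu bytes (%zu elements)\n",
               sizeof(zmm_vec_t), sizeof(zmm_half_t),
               sizeof(zmm_half_t) / sizeof(int));
        return 0;
    }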
@@ -3068,7 +3068,22 @@ x86_decode(
d |= vSIB;
state->simd_size = ext0f38_table[b].simd_size;
if ( evex_encoded() )
- disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+ {
+ /*
+ * VPMOVUS* are identical to VPMOVS* Disp8-scaling-wise, but
+ * their attributes don't match those of the vex_66 encoded
+ * insns with the same base opcodes. Rather than adding new
+ * columns to the table, handle this here for now.
+ */
+ if ( evex.pfx != vex_f3 || (b & 0xf8) != 0x10 )
+ disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+ else
+ {
+ disp8scale = decode_disp8scale(ext0f38_table[b ^ 0x30].d8s,
+ state);
+ state->simd_size = simd_other;
+ }
+ }
break;
case ext_0f3a:
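
As the comment in the hunk above notes, the F3-encoded VPMOVUS* opcodes (0x10-0x15) reuse the Disp8-scaling attributes of the corresponding VPMOVS* rows; the b ^ 0x30 index achieves that mapping. A trivial standalone illustration of the index arithmetic (not emulator code):

    #include <stdio.h>

    int main(void)
    {
        unsigned int b;

        /* 0x10 (vpmovuswb) -> row 0x20 (vpmovswb), ..., 0x15 (vpmovusqd) -> row 0x25 */
        for ( b = 0x10; b <= 0x15; ++b )
            printf("opcode %#04x borrows d8s attributes from row %#04x\n", b, b ^ 0x30);

        return 0;
    }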
@@ -8359,10 +8374,14 @@ x86_emulate(
op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
goto simd_0f_int;
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30): /* vpmovwb [xyz]mm,{x,y}mm/mem{k} */
host_and_vcpu_must_have(avx512bw);
- /* fall through */
+ if ( evex.pfx != vex_f3 )
+ {
case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
@@ -8373,7 +8392,29 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
- generate_exception_if(evex.brs || (evex.w && (b & 7) == 5), EXC_UD);
+ generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+ }
+ else
+ {
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x11): /* vpmovusdb [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x12): /* vpmovusqb [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x13): /* vpmovusdw [xyz]mm,{x,y}mm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x14): /* vpmovusqw [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* vpmovusqd [xyz]mm,{x,y}mm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x21): /* vpmovsdb [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x22): /* vpmovsqb [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x23): /* vpmovsdw [xyz]mm,{x,y}mm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x24): /* vpmovsqw [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* vpmovsqd [xyz]mm,{x,y}mm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x31): /* vpmovdb [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x32): /* vpmovqb [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x33): /* vpmovdw [xyz]mm,{x,y}mm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x34): /* vpmovqw [xyz]mm,xmm/mem{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* vpmovqd [xyz]mm,{x,y}mm/mem{k} */
+ generate_exception_if(evex.w || (ea.type != OP_REG && evex.z), EXC_UD);
+ d = DstMem | SrcReg | TwoOp;
+ }
+ generate_exception_if(evex.brs, EXC_UD);
op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
goto avx512f_no_sae;
@@ -10212,6 +10253,12 @@ x86_insn_is_mem_write(const struct x86_e
case X86EMUL_OPC(0x0f, 0xab): /* BTS */
case X86EMUL_OPC(0x0f, 0xb3): /* BTR */
case X86EMUL_OPC(0x0f, 0xbb): /* BTC */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10) ...
+ X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* VPMOVUS* */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20) ...
+ X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* VPMOVS* */
+ case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30) ...
+ X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* VPMOV{D,Q,W}* */
return true;
case 0xd9:
Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are
slightly different from what one would expect, due to them requiring
EVEX.W to be zero.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Adjustment for XSA-289: Use XOR instead of ADD when fiddling with b
    as an array index.
v7: ea.type == OP_* -> ea.type != OP_*. Re-base over change in previous
    patch. Re-base.
v5: Also adjust x86_insn_is_mem_write().
v4: Also #UD when evex.z is set with a memory operand.
v3: New.