@@ -178,6 +178,10 @@ static const struct test avx512f_all[] =
INSN(pcmpu, 66, 0f3a, 1e, vl, dq, vl),
INSN(permi2, 66, 0f38, 76, vl, dq, vl),
INSN(permi2, 66, 0f38, 77, vl, sd, vl),
+ INSN(permilpd, 66, 0f38, 0d, vl, q, vl),
+ INSN(permilpd, 66, 0f3a, 05, vl, q, vl),
+ INSN(permilps, 66, 0f38, 0c, vl, d, vl),
+ INSN(permilps, 66, 0f3a, 04, vl, d, vl),
INSN(permt2, 66, 0f38, 7e, vl, dq, vl),
INSN(permt2, 66, 0f38, 7f, vl, sd, vl),
INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl),
@@ -278,6 +282,10 @@ static const struct test avx512f_no128[]
INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl),
INSN(insertf32x4, 66, 0f3a, 18, el_4, d, vl),
INSN(inserti32x4, 66, 0f3a, 38, el_4, d, vl),
+ INSN(perm, 66, 0f38, 36, vl, dq, vl),
+ INSN(perm, 66, 0f38, 16, vl, sd, vl),
+ INSN(permpd, 66, 0f3a, 01, vl, q, vl),
+ INSN(permq, 66, 0f3a, 00, vl, q, vl),
INSN(shuff32x4, 66, 0f3a, 23, vl, d, vl),
INSN(shuff64x2, 66, 0f3a, 23, vl, q, vl),
INSN(shufi32x4, 66, 0f3a, 43, vl, d, vl),
@@ -316,6 +324,7 @@ static const struct test avx512bw_all[]
INSN(pcmpgtb, 66, 0f, 64, vl, b, vl),
INSN(pcmpgtw, 66, 0f, 65, vl, w, vl),
INSN(pcmpu, 66, 0f3a, 3e, vl, bw, vl),
+ INSN(permw, 66, 0f38, 8d, vl, w, vl),
INSN(permi2w, 66, 0f38, 75, vl, w, vl),
INSN(permt2w, 66, 0f38, 7d, vl, w, vl),
INSN(pmaddwd, 66, 0f, f5, vl, w, vl),
@@ -412,6 +421,7 @@ static const struct test avx512dq_512[]
};
static const struct test avx512_vbmi_all[] = {
+ INSN(permb, 66, 0f38, 8d, vl, b, vl),
INSN(permi2b, 66, 0f38, 75, vl, b, vl),
INSN(permt2b, 66, 0f38, 7d, vl, b, vl),
};
@@ -186,6 +186,7 @@ static inline bool _to_bool(byte_vec_t b
# define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
# define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
# define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+# define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0)
# else
# define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
# define insert_pair(x, y, p) \
@@ -200,6 +201,10 @@ static inline bool _to_bool(byte_vec_t b
vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
})
+# define swap2(x) B(vpermilps, _mask, \
+ B(shuf_f32x4_, _mask, x, x, \
+ VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
+ 0b00011011, undef(), ~0)
# endif
# elif FLOAT_SIZE == 8
# if VEC_SIZE >= 32
@@ -233,6 +238,7 @@ static inline bool _to_bool(byte_vec_t b
# define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
# define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
# define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+# define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0)
# else
# define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
@@ -240,6 +246,10 @@ static inline bool _to_bool(byte_vec_t b
vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
})
+# define swap2(x) B(vpermilpd, _mask, \
+ B(shuf_f64x2_, _mask, x, x, \
+ VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
+ 0b01010101, undef(), ~0)
# endif
# endif
#elif FLOAT_SIZE == 4 && defined(__SSE__)
@@ -405,6 +415,7 @@ static inline bool _to_bool(byte_vec_t b
B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
0b00011011, (vsi_t)undef(), ~0))
+# define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -442,8 +453,17 @@ static inline bool _to_bool(byte_vec_t b
(vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
0b01001110, (vsi_t)undef(), ~0))
+# define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# if VEC_SIZE == 32
+# define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
+# elif VEC_SIZE == 64
+# define swap3(x) ({ \
+ vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \
+ B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \
+})
+# endif
# endif
# if INT_SIZE == 4
# define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
@@ -489,6 +509,9 @@ static inline bool _to_bool(byte_vec_t b
# define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
# define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
# define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
+# ifdef __AVX512VBMI__
+# define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
+# endif
# elif INT_SIZE == 2 || UINT_SIZE == 2
# define broadcast(x) ({ \
vec_t t_; \
@@ -517,6 +540,7 @@ static inline bool _to_bool(byte_vec_t b
(0b01010101010101010101010101010101 & ALL_TRUE)))
# define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
# define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
+# define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
# endif
# if INT_SIZE == 1
# define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
@@ -1325,6 +1349,12 @@ int simd_test(void)
if ( !eq(swap2(src), inv) ) return __LINE__;
#endif
+#ifdef swap3
+ touch(src);
+ if ( !eq(swap3(src), inv) ) return __LINE__;
+ touch(src);
+#endif
+
#ifdef broadcast
if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
#endif
@@ -275,6 +275,8 @@ OVR(movlps);
OVR_VFP(movnt);
OVR_VFP(movu);
OVR_FP(mul);
+OVR_VFP(perm);
+OVR_VFP(permil);
OVR_VFP(shuf);
OVR_INT(sll);
OVR_DQ(sllv);
@@ -331,6 +333,8 @@ OVR(movntdq);
OVR(movntdqa);
OVR(movshdup);
OVR(movsldup);
+OVR(permd);
+OVR(permq);
OVR(pmovsxbd);
OVR(pmovsxbq);
OVR(pmovsxdq);
@@ -434,7 +434,8 @@ static const struct ext0f38_table {
} ext0f38_table[256] = {
[0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x01 ... 0x0b] = { .simd_size = simd_packed_int },
- [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
+ [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+ [0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x13] = { .simd_size = simd_other, .two_op = 1 },
[0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -477,6 +478,7 @@ static const struct ext0f38_table {
[0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x8c] = { .simd_size = simd_packed_int },
+ [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
[0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
[0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -522,10 +524,10 @@ static const struct ext0f3a_table {
uint8_t four_op:1;
disp8scale_t d8s:4;
} ext0f3a_table[256] = {
- [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
- [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
+ [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x02] = { .simd_size = simd_packed_int },
- [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x06] = { .simd_size = simd_packed_fp },
[0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
[0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
@@ -8102,6 +8104,9 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */
generate_exception_if(evex.brs, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x0c): /* vpermilps [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x0d): /* vpermilpd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
fault_suppression = false;
if ( b == 0xe2 )
goto avx512f_no_sae;
@@ -8447,6 +8452,12 @@ x86_emulate(
generate_exception_if(!vex.l || vex.w, EXC_UD);
goto simd_0f_avx2;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ generate_exception_if(!evex.lr, EXC_UD);
+ fault_suppression = false;
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -8652,6 +8663,7 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
if ( !evex.w )
host_and_vcpu_must_have(avx512_vbmi);
else
@@ -9077,6 +9089,12 @@ x86_emulate(
generate_exception_if(!vex.l || !vex.w, EXC_UD);
goto simd_0f_imm8_avx2;
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x00): /* vpermq $imm8,{y,z}mm/mem,{y,z}mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x01): /* vpermpd $imm8,{y,z}mm/mem,{y,z}mm{k} */
+ generate_exception_if(!evex.lr || !evex.w, EXC_UD);
+ fault_suppression = false;
+ goto avx512f_imm8_no_sae;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
@@ -9096,6 +9114,12 @@ x86_emulate(
generate_exception_if(vex.w, EXC_UD);
goto simd_0f_imm8_avx;
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x04): /* vpermilps $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ generate_exception_if(evex.w != (b & 1), EXC_UD);
+ fault_suppression = false;
+ goto avx512f_imm8_no_sae;
+
case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Re-base.
v5: Re-base over changes earlier in the series.
v4: New.