@@ -105,6 +105,8 @@ enum esz {
static const struct test avx512f_all[] = {
INSN_FP(add, 0f, 58),
+ INSN(align, 66, 0f3a, 03, vl, dq, vl),
+ INSN(blendm, 66, 0f38, 65, vl, sd, vl),
INSN(broadcastss, 66, 0f38, 18, el, d, el),
INSN_FP(cmp, 0f, c2),
INSN(comisd, 66, 0f, 2f, el, q, el),
@@ -207,6 +209,7 @@ static const struct test avx512f_all[] =
INSN(paddq, 66, 0f, d4, vl, q, vl),
INSN(pand, 66, 0f, db, vl, dq, vl),
INSN(pandn, 66, 0f, df, vl, dq, vl),
+ INSN(pblendm, 66, 0f38, 64, vl, dq, vl),
// pbroadcast, 66, 0f38, 7c, dq64
INSN(pbroadcastd, 66, 0f38, 58, el, d, el),
INSN(pbroadcastq, 66, 0f38, 59, el, q, el),
@@ -354,6 +357,7 @@ static const struct test avx512f_512[] =
};
static const struct test avx512bw_all[] = {
+ INSN(dbpsadbw, 66, 0f3a, 42, vl, b, vl),
INSN(movdqu8, f2, 0f, 6f, vl, b, vl),
INSN(movdqu8, f2, 0f, 7f, vl, b, vl),
INSN(movdqu16, f2, 0f, 6f, vl, w, vl),
@@ -373,6 +377,7 @@ static const struct test avx512bw_all[]
INSN(palignr, 66, 0f3a, 0f, vl, b, vl),
INSN(pavgb, 66, 0f, e0, vl, b, vl),
INSN(pavgw, 66, 0f, e3, vl, w, vl),
+ INSN(pblendm, 66, 0f38, 66, vl, bw, vl),
INSN(pbroadcastb, 66, 0f38, 78, el, b, el),
// pbroadcastb, 66, 0f38, 7a, b
INSN(pbroadcastw, 66, 0f38, 79, el_2, b, vl),
@@ -297,7 +297,7 @@ static inline vec_t movlhps(vec_t x, vec
# define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
# endif
-# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+# define mix(x, y) B(blendmps_, _mask, x, y, (0b1010101010101010 & ALL_TRUE))
# define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0)
# if VEC_SIZE == 64 && defined(__AVX512ER__)
# define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
@@ -370,7 +370,7 @@ static inline vec_t movlhps(vec_t x, vec
# define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
# endif
-# define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+# define mix(x, y) B(blendmpd_, _mask, x, y, 0b10101010)
# define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0)
# if VEC_SIZE == 64 && defined(__AVX512ER__)
# define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
@@ -564,8 +564,9 @@ static inline vec_t movlhps(vec_t x, vec
0b00011011, (vsi_t)undef(), ~0))
# define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
# endif
-# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
- (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+# define mix(x, y) ((vec_t)B(blendmd_, _mask, (vsi_t)(x), (vsi_t)(y), \
+ (0b1010101010101010 & ((1 << ELEM_COUNT) - 1))))
+# define rotr(x, n) ((vec_t)B(alignd, _mask, (vsi_t)(x), (vsi_t)(x), n, (vsi_t)undef(), ~0))
# define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
# elif INT_SIZE == 8 || UINT_SIZE == 8
# define broadcast(x) ({ \
@@ -602,7 +603,8 @@ static inline vec_t movlhps(vec_t x, vec
0b01001110, (vsi_t)undef(), ~0))
# define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
# endif
-# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# define mix(x, y) ((vec_t)B(blendmq_, _mask, (vdi_t)(x), (vdi_t)(y), 0b10101010))
+# define rotr(x, n) ((vec_t)B(alignq, _mask, (vdi_t)(x), (vdi_t)(x), n, (vdi_t)undef(), ~0))
# if VEC_SIZE == 32
# define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
# elif VEC_SIZE == 64
@@ -654,8 +656,8 @@ static inline vec_t movlhps(vec_t x, vec
# define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
# endif
-# define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
- (0b0101010101010101010101010101010101010101010101010101010101010101LL & ALL_TRUE)))
+# define mix(x, y) ((vec_t)B(blendmb_, _mask, (vqi_t)(x), (vqi_t)(y), \
+ (0b1010101010101010101010101010101010101010101010101010101010101010LL & ALL_TRUE)))
# define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
# define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
# define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
@@ -687,8 +689,8 @@ static inline vec_t movlhps(vec_t x, vec
# define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
# endif
-# define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
- (0b01010101010101010101010101010101 & ALL_TRUE)))
+# define mix(x, y) ((vec_t)B(blendmw_, _mask, (vhi_t)(x), (vhi_t)(y), \
+ (0b10101010101010101010101010101010 & ALL_TRUE)))
# define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
# define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
# define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
@@ -484,6 +484,7 @@ static const struct ext0f38_table {
[0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x62] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_bw },
[0x63] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_bw },
+ [0x64 ... 0x66] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x78] = { .simd_size = simd_other, .two_op = 1 },
@@ -550,6 +551,7 @@ static const struct ext0f3a_table {
[0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
[0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x02] = { .simd_size = simd_packed_int },
+ [0x03] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x06] = { .simd_size = simd_packed_fp },
[0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
@@ -581,8 +583,7 @@ static const struct ext0f3a_table {
[0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
- [0x42] = { .simd_size = simd_packed_int },
- [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+ [0x42 ... 0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x44] = { .simd_size = simd_packed_int },
[0x46] = { .simd_size = simd_packed_int },
[0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6204,6 +6205,8 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f38, 0x47): /* vpsllv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x4c): /* vrcp14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x4e): /* vrsqrt14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x64): /* vpblendm{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x65): /* vblendmp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
avx512f_no_sae:
host_and_vcpu_must_have(avx512f);
generate_exception_if(ea.type != OP_MEM && evex.brs, EXC_UD);
@@ -6961,6 +6964,7 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f38, 0x0b): /* vpmulhrsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x1c): /* vpabsb [xyz]mm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x1d): /* vpabsw [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x66): /* vpblendm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
host_and_vcpu_must_have(avx512bw);
generate_exception_if(evex.brs, EXC_UD);
elem_bytes = 1 << (b & 1);
@@ -8130,10 +8134,12 @@ x86_emulate(
goto simd_0f_to_gpr;
CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- fault_suppression = false;
generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
EXC_UD);
/* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x03): /* valign{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ fault_suppression = false;
+ /* fall through */
case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
avx512f_imm8_no_sae:
host_and_vcpu_must_have(avx512f);
@@ -9471,6 +9477,9 @@ x86_emulate(
insn_bytes = PFX_BYTES + 4;
break;
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x42): /* vdbpsadbw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ generate_exception_if(evex.w, EXC_UD);
+ /* fall through */
case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
fault_suppression = false;
goto avx512bw_imm;
This completes support of AVX512BW in the insn emulator, and leaves only the scatter/gather insns of the AVX512F set unimplemented. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v5: New. --- TBD: The *blendm* inline functions don't reliably produce the intended insns: when the compiler looks for a match for the intended operation, the respective masked moves are about as good a fit as the blend insns themselves. We'd need to switch to inline assembly if we wanted to guarantee that those insns actually get tested. Thoughts?