@@ -229,6 +229,10 @@ static const struct test avx512f_all[] =
INSN(pternlog, 66, 0f3a, 25, vl, dq, vl),
INSN(ptestm, 66, 0f38, 27, vl, dq, vl),
INSN(ptestnm, f3, 0f38, 27, vl, dq, vl),
+ INSN(punpckhdq, 66, 0f, 6a, vl, d, vl),
+ INSN(punpckhqdq, 66, 0f, 6d, vl, q, vl),
+ INSN(punpckldq, 66, 0f, 62, vl, d, vl),
+ INSN(punpcklqdq, 66, 0f, 6c, vl, q, vl),
INSN(pxor, 66, 0f, ef, vl, dq, vl),
INSN_PFP(shuf, 0f, c6),
INSN_FP(sqrt, 0f, 51),
@@ -327,6 +331,10 @@ static const struct test avx512bw_all[]
INSN(psubw, 66, 0f, f9, vl, w, vl),
INSN(ptestm, 66, 0f38, 26, vl, bw, vl),
INSN(ptestnm, f3, 0f38, 26, vl, bw, vl),
+ INSN(punpckhbw, 66, 0f, 68, vl, b, vl),
+ INSN(punpckhwd, 66, 0f, 69, vl, w, vl),
+ INSN(punpcklbw, 66, 0f, 60, vl, b, vl),
+ INSN(punpcklwd, 66, 0f, 61, vl, w, vl),
};
static const struct test avx512bw_128[] = {
@@ -300,6 +300,10 @@ static inline bool _to_bool(byte_vec_t b
asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
t_; \
})
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# endif
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
# define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
@@ -317,6 +321,10 @@ static inline bool _to_bool(byte_vec_t b
t_; \
})
# endif
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# endif
# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
# endif
# if INT_SIZE == 4
@@ -252,6 +252,10 @@ OVR(pmovzxwq);
OVR(pmulld);
OVR(pmuldq);
OVR(pmuludq);
+OVR(punpckhdq);
+OVR(punpckhqdq);
+OVR(punpckldq);
+OVR(punpcklqdq);
# endif
# undef OVR_VFP
@@ -312,10 +312,10 @@ static const struct twobyte_table {
[0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
[0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
- [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
[0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
- [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
- [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
+ [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
[0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
[0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
@@ -6681,6 +6681,12 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
generate_exception_if(evex.opmsk, EXC_UD);
/* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ op_bytes = 16 << evex.lr;
+ /* fall through */
case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -6708,6 +6714,13 @@ x86_emulate(
elem_bytes = 1 << (b & 1);
goto avx512f_no_sae;
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x62): /* vpunpckldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x6a): /* vpunpckhdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ generate_exception_if(evex.w, EXC_UD);
+ fault_suppression = false;
+ op_bytes = 16 << evex.lr;
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
op_bytes = 16 << evex.lr;
@@ -6734,6 +6747,10 @@ x86_emulate(
avx512_vlen_check(false);
goto simd_zmm;
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x6c): /* vpunpcklqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x6d): /* vpunpckhqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ fault_suppression = false;
+ /* fall through */
case X86EMUL_OPC_EVEX_66(0x0f, 0xd4): /* vpaddq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf4): /* vpmuludq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x28): /* vpmuldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
There's once again one extra twobyte_table[] entry which gets its Disp8
shift value set right away without getting support implemented just yet,
again to avoid needlessly splitting groups of entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Re-base.
v6: Re-base over changes earlier in the series.
v4: Move OVR() additions into __AVX512VL__ conditional.
v3: New.