@@ -214,6 +214,7 @@ static const struct test avx512f_all[] =
INSN(prolv, 66, 0f38, 15, vl, dq, vl),
INSNX(pror, 66, 0f, 72, 0, vl, dq, vl),
INSN(prorv, 66, 0f38, 14, vl, dq, vl),
+ INSN(pshufd, 66, 0f, 70, vl, d, vl),
INSN(pslld, 66, 0f, f2, el_4, d, vl),
INSNX(pslld, 66, 0f, 72, 6, vl, d, vl),
INSN(psllq, 66, 0f, f3, el_2, q, vl),
@@ -264,6 +265,10 @@ static const struct test avx512f_no128[]
INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl),
INSN(insertf32x4, 66, 0f3a, 18, el_4, d, vl),
INSN(inserti32x4, 66, 0f3a, 38, el_4, d, vl),
+ INSN(shuff32x4, 66, 0f3a, 23, vl, d, vl),
+ INSN(shuff64x2, 66, 0f3a, 23, vl, q, vl),
+ INSN(shufi32x4, 66, 0f3a, 43, vl, d, vl),
+ INSN(shufi64x2, 66, 0f3a, 43, vl, q, vl),
};

static const struct test avx512f_512[] = {
@@ -318,6 +323,9 @@ static const struct test avx512bw_all[]
INSN(pmulhw, 66, 0f, e5, vl, w, vl),
INSN(pmullw, 66, 0f, d5, vl, w, vl),
INSN(psadbw, 66, 0f, f6, vl, b, vl),
+ INSN(pshufb, 66, 0f38, 00, vl, b, vl),
+ INSN(pshufhw, f3, 0f, 70, vl, w, vl),
+ INSN(pshuflw, f2, 0f, 70, vl, w, vl),
INSNX(pslldq, 66, 0f, 73, 7, vl, b, vl),
INSN(psllvw, 66, 0f38, 12, vl, w, vl),
INSN(psllw, 66, 0f, f1, el_8, w, vl),
@@ -153,6 +153,10 @@ static inline bool _to_bool(byte_vec_t b
# else
# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
+# define swap(x) ({ \
+ vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+ B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
+})
# endif
# elif FLOAT_SIZE == 8
# if VEC_SIZE >= 32
@@ -181,6 +185,10 @@ static inline bool _to_bool(byte_vec_t b
# else
# define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
+# define swap(x) ({ \
+ vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+ B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
+})
# endif
# endif
#elif FLOAT_SIZE == 4 && defined(__SSE__)
@@ -309,9 +317,14 @@ static inline bool _to_bool(byte_vec_t b
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
# else
# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, \
+ B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
+ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
+ 0b00011011, (vsi_t)undef(), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -333,9 +346,14 @@ static inline bool _to_bool(byte_vec_t b
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
# else
# define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, \
+ (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
+ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
+ 0b01001110, (vsi_t)undef(), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
# endif
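
The swap() fallbacks added above all follow the same two-step pattern:
first reverse the order of the 128-bit lanes with vshuff32x4 /
vshuff64x2 / vshufi{32x4,64x2}, then reverse the elements within each
lane with an in-lane shuffle (vshufps, vshufpd, or vpshufd). Below is a
minimal standalone sketch of the same idea for a 512-bit float vector,
using the plain <immintrin.h> intrinsics instead of the harness's B()
wrappers; illustrative only, not part of the patch (build with
-mavx512f; for 256-bit vectors the lane-swap immediate would be 0b01,
as in the macros above):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    float in[16], out[16];

    for ( unsigned int i = 0; i < 16; ++i )
        in[i] = i;

    __m512 x = _mm512_loadu_ps(in);
    /* Step 1: reverse the four 128-bit lanes (vshuff32x4 $0b00011011). */
    __m512 t = _mm512_shuffle_f32x4(x, x, 0b00011011);
    /* Step 2: reverse the four elements within each lane (vshufps). */
    __m512 r = _mm512_shuffle_ps(t, t, 0b00011011);
    _mm512_storeu_ps(out, r);

    for ( unsigned int i = 0; i < 16; ++i )
        printf("%g ", out[i]); /* prints 15 14 13 ... 1 0 */
    printf("\n");

    return 0;
}
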
@@ -119,6 +119,12 @@ typedef long long __attribute__((vector_
#ifdef __AVX512F__
+/* Sadly there are a few exceptions to the general naming rules. */
+# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
+# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
+# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
+# define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask
+
# if VEC_SIZE > ELEM_SIZE && (defined(VEC_MAX) ? VEC_MAX : VEC_SIZE) < 64
# pragma GCC target ( "avx512vl" )
# endif
@@ -262,6 +268,7 @@ OVR(pmovzxwq);
OVR(pmulld);
OVR(pmuldq);
OVR(pmuludq);
+OVR(pshufd);
OVR(punpckhdq);
OVR(punpckhqdq);
OVR(punpckldq);
@@ -318,7 +318,7 @@ static const struct twobyte_table {
[0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
[0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
- [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+ [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other, d8s_vl },
[0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x77] = { DstImplicit|SrcNone },
@@ -432,7 +432,8 @@ static const struct ext0f38_table {
uint8_t vsib:1;
disp8scale_t d8s:4;
} ext0f38_table[256] = {
- [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+ [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+ [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
[0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x13] = { .simd_size = simd_other, .two_op = 1 },
@@ -543,6 +544,7 @@ static const struct ext0f3a_table {
[0x20] = { .simd_size = simd_none, .d8s = 0 },
[0x21] = { .simd_size = simd_other, .d8s = 2 },
[0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 },
+ [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
[0x38] = { .simd_size = simd_128, .d8s = 4 },
@@ -552,6 +554,7 @@ static const struct ext0f3a_table {
[0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42] = { .simd_size = simd_packed_int },
+ [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x44] = { .simd_size = simd_packed_int },
[0x46] = { .simd_size = simd_packed_int },
[0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6701,6 +6704,7 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x00): /* vpshufb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
fault_suppression = false;
/* fall through */
case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -6955,6 +6959,21 @@ x86_emulate(
insn_bytes = PFX_BYTES + 3;
break;

+ case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ if ( evex.pfx == vex_66 )
+ generate_exception_if(evex.w, EXC_UD);
+ else
+ {
+ host_and_vcpu_must_have(avx512bw);
+ generate_exception_if(evex.brs, EXC_UD);
+ }
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = 16 << evex.lr;
+ fault_suppression = false;
+ goto avx512f_imm8_no_sae;
+
CASE_SIMD_PACKED_INT(0x0f, 0x71): /* Grp12 */
case X86EMUL_OPC_VEX_66(0x0f, 0x71):
CASE_SIMD_PACKED_INT(0x0f, 0x72): /* Grp13 */
@@ -9150,7 +9169,13 @@ x86_emulate(
/* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
if ( evex.w )
host_and_vcpu_must_have(avx512dq);
- generate_exception_if(!evex.lr || evex.brs, EXC_UD);
+ generate_exception_if(evex.brs, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x23): /* vshuff32x4 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ /* vshuff64x2 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x43): /* vshufi32x4 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ /* vshufi64x2 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ generate_exception_if(!evex.lr, EXC_UD);
fault_suppression = false;
goto avx512f_imm8_no_sae;

Also include vshuff{32x4,64x2} as being very similar to
vshufi{32x4,64x2}.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Re-base.
v7: Disable fault suppression for VPSHUF{D,{H,L}W}. Re-base.
v6: Re-base over changes earlier in the series.
v5: Re-base over changes earlier in the series.
v4: Move OVR() addition into __AVX512VL__ conditional. Correct comments.
v3: New.
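
A note on the d8s_vl values used in the table additions: with EVEX
encodings the 8-bit displacement addressing form is scaled by the size
of the memory operand ("disp8 * N"), and for full-vector operands such
as these N is the vector length, matching the op_bytes = 16 << evex.lr
calculation in the vpshuf{d,hw,lw} case block. A minimal sketch of
that scaling, with a hypothetical helper name, for illustration only:

#include <stdint.h>
#include <stdio.h>

/* disp8 * N scaling for full-vector (d8s_vl) operands: evex_lr selects
 * xmm (0, 16 bytes), ymm (1, 32 bytes), or zmm (2, 64 bytes). */
static int32_t effective_disp(int8_t disp8, unsigned int evex_lr)
{
    return (int32_t)disp8 * (16 << evex_lr);
}

int main(void)
{
    /* E.g. vpshufb with an encoded disp8 of 2 and a zmm memory operand
     * addresses base + 128. */
    printf("%d\n", effective_disp(2, 2)); /* 128 */
    return 0;
}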