@@ -164,6 +164,9 @@ static const struct test avx512f_all[] =
INSN(paddq, 66, 0f, d4, vl, q, vl),
INSN(pand, 66, 0f, db, vl, dq, vl),
INSN(pandn, 66, 0f, df, vl, dq, vl),
+// pbroadcast, 66, 0f38, 7c, dq64
+ INSN(pbroadcastd, 66, 0f38, 58, el, d, el),
+ INSN(pbroadcastq, 66, 0f38, 59, el, q, el),
INSN(pcmp, 66, 0f3a, 1f, vl, dq, vl),
INSN(pcmpeqd, 66, 0f, 76, vl, d, vl),
INSN(pcmpeqq, 66, 0f38, 29, vl, q, vl),
@@ -222,6 +225,7 @@ static const struct test avx512f_128[] =
static const struct test avx512f_no128[] = {
INSN(broadcastf32x4, 66, 0f38, 1a, el_4, d, vl),
+ INSN(broadcasti32x4, 66, 0f38, 5a, el_4, d, vl),
INSN(broadcastsd, 66, 0f38, 19, el, q, el),
INSN(extractf32x4, 66, 0f3a, 19, el_4, d, vl),
INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl),
@@ -231,6 +235,7 @@ static const struct test avx512f_no128[]
static const struct test avx512f_512[] = {
INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+ INSN(broadcasti64x4, 66, 0f38, 5b, el_4, q, vl),
INSN(extractf64x4, 66, 0f3a, 1b, el_4, q, vl),
INSN(extracti64x4, 66, 0f3a, 3b, el_4, q, vl),
INSN(insertf64x4, 66, 0f3a, 1a, el_4, q, vl),
@@ -250,6 +255,10 @@ static const struct test avx512bw_all[]
INSN(paddw, 66, 0f, fd, vl, w, vl),
INSN(pavgb, 66, 0f, e0, vl, b, vl),
INSN(pavgw, 66, 0f, e3, vl, w, vl),
+ INSN(pbroadcastb, 66, 0f38, 78, el, b, el),
+// pbroadcastb, 66, 0f38, 7a, b
+ INSN(pbroadcastw, 66, 0f38, 79, el_2, b, vl),
+// pbroadcastw, 66, 0f38, 7b, b
INSN(pcmp, 66, 0f3a, 3f, vl, bw, vl),
INSN(pcmpeqb, 66, 0f, 74, vl, b, vl),
INSN(pcmpeqw, 66, 0f, 75, vl, w, vl),
@@ -301,6 +310,7 @@ static const struct test avx512bw_128[]
static const struct test avx512dq_all[] = {
INSN_PFP(and, 0f, 54),
INSN_PFP(andn, 0f, 55),
+ INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl),
INSN_PFP(or, 0f, 56),
INSN(pmullq, 66, 0f38, 40, vl, q, vl),
INSN_PFP(xor, 0f, 57),
@@ -314,6 +324,7 @@ static const struct test avx512dq_128[]
static const struct test avx512dq_no128[] = {
INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+ INSN(broadcasti64x2, 66, 0f38, 5a, el_2, q, vl),
INSN(extractf64x2, 66, 0f3a, 19, el_2, q, vl),
INSN(extracti64x2, 66, 0f3a, 39, el_2, q, vl),
INSN(insertf64x2, 66, 0f3a, 18, el_2, q, vl),
@@ -322,6 +333,7 @@ static const struct test avx512dq_no128[
static const struct test avx512dq_512[] = {
INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+ INSN(broadcasti32x8, 66, 0f38, 5b, el_8, d, vl),
INSN(extractf32x8, 66, 0f3a, 1b, el_8, d, vl),
INSN(extracti32x8, 66, 0f3a, 3b, el_8, d, vl),
INSN(insertf32x8, 66, 0f3a, 1a, el_8, d, vl),
@@ -278,9 +278,33 @@ static inline bool _to_bool(byte_vec_t b
#if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
# if INT_SIZE == 4 || UINT_SIZE == 4
+# define broadcast(x) ({ /* broadcast scalar x using vpbroadcastd with a memory source */ \
+ vec_t t_; \
+ asm ( "%{evex%} vpbroadcastd %1, %0" /* {evex} pseudo-prefix: ask the assembler for the EVEX encoding */ \
+ : "=v" (t_) : "m" (*(int[1]){ x }) ); /* compound literal gives x a memory location for the m constraint */ \
+ t_; /* value of the statement expression */ \
+})
+# define broadcast2(x) ({ /* like broadcast(), but the source is a general-purpose register */ \
+ vec_t t_; \
+ asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); /* %k1 prints the 32-bit name of the register */ \
+ t_; /* value of the statement expression */ \
+})
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define broadcast(x) ({ /* broadcast scalar x using vpbroadcastq with a memory source */ \
+ vec_t t_; \
+ asm ( "%{evex%} vpbroadcastq %1, %0" /* {evex} pseudo-prefix: ask the assembler for the EVEX encoding */ \
+ : "=v" (t_) : "m" (*(long long[1]){ x }) ); /* compound literal gives x a 64-bit memory location */ \
+ t_; /* value of the statement expression */ \
+})
+# ifdef __x86_64__
+# define broadcast2(x) ({ /* like broadcast(), but the source is a general-purpose register */ \
+ vec_t t_; \
+ asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); /* + 0ULL converts x to unsigned long long, presumably so a 64-bit register is used */ \
+ t_; /* value of the statement expression */ \
+})
+# endif
# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
# endif
# if INT_SIZE == 4
@@ -977,10 +1001,14 @@ int simd_test(void)
if ( !eq(swap2(src), inv) ) return __LINE__;
#endif
-#if defined(broadcast)
+#ifdef broadcast
if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
#endif
+#ifdef broadcast2
+ if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
+#endif
+
#if defined(interleave_lo) && defined(interleave_hi)
touch(src);
x = interleave_lo(inv, src);
@@ -454,9 +454,13 @@ static const struct ext0f38_table {
[0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
- [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
- [0x5a] = { .simd_size = simd_128, .two_op = 1 },
- [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+ [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
+ [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
+ [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
+ [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+ [0x78] = { .simd_size = simd_other, .two_op = 1 },
+ [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
+ [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
[0x8c] = { .simd_size = simd_packed_int },
[0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
[0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
@@ -2636,6 +2640,11 @@ x86_decode_0f38(
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
+ case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */
+ case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */
+ case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */
+ break;
+
case 0xf0: /* movbe / crc32 */
state->desc |= repne_prefix() ? ByteOp : Mov;
if ( rep_prefix() )
@@ -8233,6 +8242,8 @@ x86_emulate(
goto avx512f_no_sae;
case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
+ op_bytes = elem_bytes;
generate_exception_if(evex.w || evex.brs, EXC_UD);
avx512_broadcast:
/*
@@ -8252,17 +8263,27 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
/* vbroadcastf64x4 m256,zmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
+ /* vbroadcasti64x4 m256,zmm{k} */
generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
/* fall through */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
/* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
- generate_exception_if(!evex.lr || evex.brs, EXC_UD);
+ generate_exception_if(!evex.lr, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
+ /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
+ if ( b == 0x59 )
+ op_bytes = 8;
+ generate_exception_if(evex.brs, EXC_UD);
if ( !evex.w )
host_and_vcpu_must_have(avx512dq);
goto avx512_broadcast;
case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
/* vbroadcastf64x2 m128,{y,z}mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
+ /* vbroadcasti64x2 m128,{y,z}mm{k} */
generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.brs,
EXC_UD);
if ( evex.w )
@@ -8456,6 +8477,45 @@ x86_emulate(
generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
goto simd_0f_avx2;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512bw);
+ generate_exception_if(evex.w || evex.brs, EXC_UD);
+ op_bytes = elem_bytes = 1 << (b & 1);
+ /* See the comment at the avx512_broadcast label. */
+ op_mask |= !(b & 1 ? !(uint32_t)op_mask : !op_mask);
+ goto avx512f_no_sae;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7a): /* vpbroadcastb r32,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7b): /* vpbroadcastw r32,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512bw);
+ generate_exception_if(evex.w, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7c): /* vpbroadcast{d,q} reg,[xyz]mm{k} */
+ generate_exception_if((ea.type != OP_REG || evex.brs ||
+ evex.reg != 0xf || !evex.RX),
+ EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ avx512_vlen_check(false);
+ get_fpu(X86EMUL_FPU_zmm);
+
+ opc = init_evex(stub);
+ opc[0] = b;
+ /* Convert GPR source to %rAX. */
+ evex.b = 1;
+ if ( !mode_64bit() )
+ evex.w = 0;
+ opc[1] = modrm & 0xf8;
+ insn_bytes = EVEX_PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_EVEX(opc, evex);
+ invoke_stub("", "", "=g" (dummy) : "a" (src.val));
+
+ put_stub(stub);
+ ASSERT(!state->simd_size);
+ break;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
generate_exception_if(ea.type != OP_MEM, EXC_UD);
Note that the pbroadcastw table entry in evex-disp8.c is slightly different from what one would expect, due to it requiring EVEX.W to be zero. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v7: Use dummy output in invoke_stub(). Re-base. v3: New.