@@ -270,6 +270,8 @@ static const struct test avx512f_all[] =
INSN(prolv, 66, 0f38, 15, vl, dq, vl),
INSNX(pror, 66, 0f, 72, 0, vl, dq, vl),
INSN(prorv, 66, 0f38, 14, vl, dq, vl),
+ INSN(pscatterd, 66, 0f38, a0, vl, dq, el),
+ INSN(pscatterq, 66, 0f38, a1, vl, dq, el),
INSN(pshufd, 66, 0f, 70, vl, d, vl),
INSN(pslld, 66, 0f, f2, el_4, d, vl),
INSNX(pslld, 66, 0f, 72, 6, vl, d, vl),
@@ -305,6 +307,8 @@ static const struct test avx512f_all[] =
INSN(rsqrt14, 66, 0f38, 4f, el, sd, el),
INSN(scalef, 66, 0f38, 2c, vl, sd, vl),
INSN(scalef, 66, 0f38, 2d, el, sd, el),
+ INSN(scatterd, 66, 0f38, a2, vl, sd, el),
+ INSN(scatterq, 66, 0f38, a3, vl, sd, el),
INSN_PFP(shuf, 0f, c6),
INSN_FP(sqrt, 0f, 51),
INSN_FP(sub, 0f, 5c),
@@ -48,10 +48,14 @@ typedef long long __attribute__((vector_
# endif
# define BG_(dt, it, reg, mem, idx, msk, scl) \
__builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+# define BS_(dt, it, mem, idx, reg, msk, scl) \
+ __builtin_ia32_scatter##it##dt(mem, to_mask(msk), idx, reg, scl)
# else
# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
# define BG_(dt, it, reg, mem, idx, msk, scl) \
__builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+# define BS_(dt, it, mem, idx, reg, msk, scl) \
+ __builtin_ia32_scatter##it##dt(mem, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), idx, reg, scl)
# endif
/*
* Instead of replicating the main IDX_SIZE conditional below three times, use
@@ -59,6 +63,7 @@ typedef long long __attribute__((vector_
* respective relevant macro argument tokens.
*/
# define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# define BS(dt, it, mem, idx, reg, msk, scl) BS_(dt, it##i, mem, idx, reg, msk, scl)
# if VEC_MAX < 64
/*
* The sub-512-bit built-ins have an extra "3" infix, presumably because the
@@ -82,22 +87,30 @@ typedef long long __attribute__((vector_
# if IDX_SIZE == 4
# if INT_SIZE == 4
# define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+# define scatter(mem, idx, reg, msk, scl) BS(v16si, s, mem, idx, reg, msk, scl)
# elif INT_SIZE == 8
# define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+# define scatter(mem, idx, reg, msk, scl) BS(v8di, s, mem, idx, (vdi_t)(reg), msk, scl)
# elif FLOAT_SIZE == 4
# define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+# define scatter(mem, idx, reg, msk, scl) BS(v16sf, s, mem, idx, reg, msk, scl)
# elif FLOAT_SIZE == 8
# define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+# define scatter(mem, idx, reg, msk, scl) BS(v8df, s, mem, idx, reg, msk, scl)
# endif
# elif IDX_SIZE == 8
# if INT_SIZE == 4
# define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+# define scatter(mem, idx, reg, msk, scl) BS(v16si, d, mem, (idi_t)(idx), reg, msk, scl)
# elif INT_SIZE == 8
# define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+# define scatter(mem, idx, reg, msk, scl) BS(v8di, d, mem, (idi_t)(idx), (vdi_t)(reg), msk, scl)
# elif FLOAT_SIZE == 4
# define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+# define scatter(mem, idx, reg, msk, scl) BS(v16sf, d, mem, (idi_t)(idx), reg, msk, scl)
# elif FLOAT_SIZE == 8
# define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+# define scatter(mem, idx, reg, msk, scl) BS(v8df, d, mem, (idi_t)(idx), reg, msk, scl)
# endif
# endif
#elif defined(__AVX2__)
@@ -195,6 +208,8 @@ const typeof((vec_t){}[0]) array[] = {
GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
};

+typeof((vec_t){}[0]) out[VEC_MAX * 2];
+
int sg_test(void)
{
unsigned int i;
@@ -275,5 +290,41 @@ int sg_test(void)
# endif
#endif

+#ifdef scatter
+
+ for ( i = 0; i < sizeof(out) / sizeof(*out); ++i )
+ out[i] = 0;
+
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ x[i] = i + 1;
+
+ touch(x);
+
+ scatter(out, (idx_t){}, x, (vec_t){ 1 } != 0, 1);
+ if ( out[0] != 1 )
+ return __LINE__;
+ for ( i = 1; i < ITEM_COUNT; ++i )
+ if ( out[i] )
+ return __LINE__;
+
+ scatter(out, (idx_t){}, x, full, 1);
+ if ( out[0] != ITEM_COUNT )
+ return __LINE__;
+ for ( i = 1; i < ITEM_COUNT; ++i )
+ if ( out[i] )
+ return __LINE__;
+
+ scatter(out, idx, x, full, ELEM_SIZE);
+ for ( i = 1; i <= ITEM_COUNT; ++i )
+ if ( out[i] != i )
+ return __LINE__;
+
+ scatter(out, inv, x, full, ELEM_SIZE);
+ for ( i = 1; i <= ITEM_COUNT; ++i )
+ if ( out[i] != ITEM_COUNT + 1 - i )
+ return __LINE__;
+
+#endif
+
return 0;
}
@@ -508,6 +508,7 @@ static const struct ext0f38_table {
[0x9d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0x9e] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x9f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+ [0xa0 ... 0xa3] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
[0xa6 ... 0xa8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0xa9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0xaa] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -9330,6 +9331,102 @@ x86_emulate(
avx512_vlen_check(true);
goto simd_zmm;

+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa0): /* vpscatterd{d,q} [xyz]mm,mem{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa1): /* vpscatterq{d,q} [xyz]mm,mem{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa2): /* vscatterdp{s,d} [xyz]mm,mem{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa3): /* vscatterqp{s,d} [xyz]mm,mem{k} */
+ {
+ typeof(evex) *pevex;
+ union {
+ int32_t dw[16];
+ int64_t qw[8];
+ } index;
+ bool done = false;
+
+ ASSERT(ea.type == OP_MEM);
+ fail_if(!ops->write);
+ generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+ evex.reg != 0xf ||
+ modrm_reg == state->sib_index),
+ EXC_UD);
+ avx512_vlen_check(false);
+ host_and_vcpu_must_have(avx512f);
+ get_fpu(X86EMUL_FPU_zmm);
+
+ /* Read source and index registers. */
+ opc = init_evex(stub);
+ pevex = copy_EVEX(opc, evex);
+ pevex->opcx = vex_0f;
+ opc[0] = 0x7f; /* vmovdqa{32,64} */
+ /* Use (%rax) as destination and modrm_reg as source. */
+ pevex->b = 1;
+ opc[1] = (modrm_reg & 7) << 3;
+ pevex->RX = 1;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+ pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+ pevex->w = b & 1;
+ /* Switch to sib_index as source. */
+ pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+ pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+ opc[1] = (state->sib_index & 7) << 3;
+
+ invoke_stub("", "", "=m" (index) : "a" (&index));
+ put_stub(stub);
+
+ /* Clear untouched parts of the mask value. */
+ n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+ op_bytes = 4 << evex.w;
+ op_mask &= (1 << n) - 1;
+
+ for ( i = 0; op_mask; ++i )
+ {
+ signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+ if ( !(op_mask & (1 << i)) )
+ continue;
+
+ rc = ops->write(ea.mem.seg,
+ truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+ (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ {
+ /* See comment in gather emulation. */
+ if ( rc != X86EMUL_EXCEPTION && done )
+ rc = X86EMUL_RETRY;
+ break;
+ }
+
+ op_mask &= ~(1 << i);
+ done = true;
+
+#ifdef __XEN__
+ if ( op_mask && local_events_need_delivery() )
+ {
+ rc = X86EMUL_RETRY;
+ break;
+ }
+#endif
+ }
+
+ /* Write mask register. See comment in gather emulation. */
+ opc = get_stub(stub);
+ opc[0] = 0xc5;
+ opc[1] = 0xf8;
+ opc[2] = 0x90;
+ /* Use (%rax) as source. */
+ opc[3] = evex.opmsk << 3;
+ opc[4] = 0xc3;
+
+ invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+ put_stub(stub);
+
+ state->simd_size = simd_none;
+ break;
+ }
+
case X86EMUL_OPC(0x0f38, 0xc8): /* sha1nexte xmm/m128,xmm */
case X86EMUL_OPC(0x0f38, 0xc9): /* sha1msg1 xmm/m128,xmm */
case X86EMUL_OPC(0x0f38, 0xca): /* sha1msg2 xmm/m128,xmm */
This completes support of AVX512F in the insn emulator.

Note that in the test harness there's a little bit of trickery needed
to get around the not fully consistent naming of AVX512VL gather and
scatter built-ins. To suppress expansion of the "di" and "si" tokens
they get constructed by token concatenation in BS(), which is
different from BG().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: I couldn't really decide whether to duplicate code or merge
     scatter into gather emulation.
---
v7: Re-base.
v6: New.
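
For illustration only (not part of the patch): a minimal, self-contained
sketch of the token concatenation trick, with hypothetical names. If "si"
happens to be a macro itself, a plain forwarding wrapper expands it before
the final paste, while constructing the token with ## suppresses that
expansion:

  #define si               oops                    /* hypothetical clash */
  #define CAT_(pfx, infix) pfx##infix
  /* Forwarding the ready-made token lets it expand first: */
  #define FWD(infix)       CAT_(scatter, infix)    /* FWD(si) -> scatteroops */
  /* Pasting it together defers to CAT_, where it is a ## operand and
   * hence not subject to expansion: */
  #define BUILD(stem)      CAT_(scatter, stem##i)  /* BUILD(s) -> scattersi */

This mirrors why BS() constructs "si" / "di" via "it##i" instead of
receiving them ready-made the way BG() does.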
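
Likewise for reference only (names hypothetical): the element count used
to trim op_mask in the emulation code,

  n = 1 << (2 + evex.lr - ((b & 1) | evex.w));

is just the vector width divided by the size of the wider of index and
data element:

  /* lr = EVEX.L'L (0/1/2 for 128/256/512 bits); iq/dq are 1 for 64-bit
   * index/data elements respectively. */
  static unsigned int scatter_elems(unsigned int lr, unsigned int iq,
                                    unsigned int dq)
  {
      return (16u << lr) / (4u << (iq | dq)); /* == 1 << (2 + lr - (iq | dq)) */
  }

E.g. a 512-bit vpscatterdd touches 16 elements, while vpscatterdq,
vpscatterqd and vpscatterqq touch 8 each.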