@@ -18,7 +18,7 @@ CFLAGS += $(CFLAGS_xeninclude)
SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er
FMA := fma4 fma
-SG := avx2-sg
+SG := avx2-sg avx512f-sg avx512vl-sg
TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
OPMASK := avx512f avx512dq avx512bw
@@ -66,6 +66,14 @@ xop-flts := $(avx-flts)
avx512f-vecs := 64 16 32
avx512f-ints := 4 8
avx512f-flts := 4 8
+avx512f-sg-vecs := 64
+avx512f-sg-idxs := 4 8
+avx512f-sg-ints := $(avx512f-ints)
+avx512f-sg-flts := $(avx512f-flts)
+avx512vl-sg-vecs := 16 32
+avx512vl-sg-idxs := $(avx512f-sg-idxs)
+avx512vl-sg-ints := $(avx512f-ints)
+avx512vl-sg-flts := $(avx512f-flts)
avx512bw-vecs := $(avx512f-vecs)
avx512bw-ints := 1 2
avx512bw-flts :=
@@ -176,6 +176,8 @@ static const struct test avx512f_all[] =
INSN(fnmsub213, 66, 0f38, af, el, sd, el),
INSN(fnmsub231, 66, 0f38, be, vl, sd, vl),
INSN(fnmsub231, 66, 0f38, bf, el, sd, el),
+ INSN(gatherd, 66, 0f38, 92, vl, sd, el),
+ INSN(gatherq, 66, 0f38, 93, vl, sd, el),
INSN(getexp, 66, 0f38, 42, vl, sd, vl),
INSN(getexp, 66, 0f38, 43, el, sd, el),
INSN(getmant, 66, 0f3a, 26, vl, sd, vl),
@@ -229,6 +231,8 @@ static const struct test avx512f_all[] =
INSN(permt2, 66, 0f38, 7e, vl, dq, vl),
INSN(permt2, 66, 0f38, 7f, vl, sd, vl),
INSN(pexpand, 66, 0f38, 89, vl, dq, el),
+ INSN(pgatherd, 66, 0f38, 90, vl, dq, el),
+ INSN(pgatherq, 66, 0f38, 91, vl, dq, el),
INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl),
INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl),
INSN(pmins, 66, 0f38, 39, vl, dq, vl),
@@ -698,7 +702,7 @@ static void test_one(const struct test *
instr[3] = evex.raw[2];
instr[4] = test->opc;
instr[5] = 0x44 | (test->ext << 3); /* ModR/M */
- instr[6] = 0x12; /* SIB: base rDX, index none / xMM4 */
+ instr[6] = 0x22; /* SIB: base rDX, index none / xMM4 */
instr[7] = 1; /* Disp8 */
instr[8] = 0; /* immediate, if any */
@@ -718,7 +722,8 @@ static void test_one(const struct test *
if ( accessed[i] )
goto fail;
for ( ; i < (test->scale == SC_vl ? vsz : esz) + (sg ? esz : vsz); ++i )
- if ( accessed[i] != (sg ? vsz / esz : 1) )
+ if ( accessed[i] != (sg ? (vsz / esz) >> (test->opc & 1 & !evex.w)
+ : 1) )
goto fail;
for ( ; i < ARRAY_SIZE(accessed); ++i )
if ( accessed[i] )
@@ -35,13 +35,78 @@ typedef long long __attribute__((vector_
#define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
-#if VEC_SIZE == 16
-# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
-#else
-# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
-#endif
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# if ELEM_SIZE == 4
+# if IDX_SIZE == 4 || defined(__AVX512VL__)
+# define to_mask(msk) B(ptestmd, , (vsi_t)(msk), (vsi_t)(msk), ~0)
+# define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
+# else
+# define widen(x) __builtin_ia32_pmovzxdq512_mask((vsi_t)(x), (idi_t){}, ~0)
+# define to_mask(msk) __builtin_ia32_ptestmq512(widen(msk), widen(msk), ~0)
+# define eq(x, y) (__builtin_ia32_pcmpeqq512_mask(widen(x), widen(y), ~0) == ALL_TRUE)
+# endif
+# define BG_(dt, it, reg, mem, idx, msk, scl) \
+ __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+# else
+# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# define BG_(dt, it, reg, mem, idx, msk, scl) \
+ __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+# endif
+/*
+ * Instead of replicating the main IDX_SIZE conditional below three times, use
+ * a double layer of macro invocations, allowing for substitution of the
+ * respective relevant macro argument tokens.
+ */
+# define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# if VEC_MAX < 64
+/*
+ * The sub-512-bit built-ins have an extra "3" infix, presumably because the
+ * 512-bit names were chosen without the AVX512VL extension in mind (and hence
+ * making the latter collide with the AVX2 ones).
+ */
+# define si 3si
+# define di 3di
+# endif
+# if VEC_MAX == 16
+# define v8df v2df
+# define v8di v2di
+# define v16sf v4sf
+# define v16si v4si
+# elif VEC_MAX == 32
+# define v8df v4df
+# define v8di v4di
+# define v16sf v8sf
+# define v16si v8si
+# endif
+# if IDX_SIZE == 4
+# if INT_SIZE == 4
+# define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+# elif INT_SIZE == 8
+# define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+# elif FLOAT_SIZE == 4
+# define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+# elif FLOAT_SIZE == 8
+# define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+# endif
+# elif IDX_SIZE == 8
+# if INT_SIZE == 4
+# define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+# elif INT_SIZE == 8
+# define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+# elif FLOAT_SIZE == 4
+# define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+# elif FLOAT_SIZE == 8
+# define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+# endif
+# endif
+#elif defined(__AVX2__)
+# if VEC_SIZE == 16
+# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+# else
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+# endif
-#if defined(__AVX2__)
# if VEC_MAX == 16
# if IDX_SIZE == 4
# if INT_SIZE == 4
@@ -111,6 +176,10 @@ typedef long long __attribute__((vector_
# endif
#endif
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
#define GLUE_(x, y) x ## y
#define GLUE(x, y) GLUE_(x, y)
@@ -119,6 +188,7 @@ typedef long long __attribute__((vector_
#define PUT8(n) PUT4(n), PUT4((n) + 4)
#define PUT16(n) PUT8(n), PUT8((n) + 8)
#define PUT32(n) PUT16(n), PUT16((n) + 16)
+#define PUT64(n) PUT32(n), PUT32((n) + 32)
const typeof((vec_t){}[0]) array[] = {
GLUE(PUT, VEC_MAX)(1),
@@ -174,7 +244,7 @@ int sg_test(void)
y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
#if ITEM_COUNT == ELEM_COUNT
- if ( !to_bool(y == x - 1) )
+ if ( !eq(y, x - 1) )
return __LINE__;
#else
for ( i = 0; i < ITEM_COUNT; ++i )
@@ -22,6 +22,8 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512dq-opmask.h"
#include "avx512bw-opmask.h"
#include "avx512f.h"
+#include "avx512f-sg.h"
+#include "avx512vl-sg.h"
#include "avx512bw.h"
#include "avx512dq.h"
#include "avx512er.h"
@@ -90,11 +92,13 @@ static bool simd_check_avx512f(void)
return cpu_has_avx512f;
}
#define simd_check_avx512f_opmask simd_check_avx512f
+#define simd_check_avx512f_sg simd_check_avx512f
static bool simd_check_avx512f_vl(void)
{
return cpu_has_avx512f && cpu_has_avx512vl;
}
+#define simd_check_avx512vl_sg simd_check_avx512f_vl
static bool simd_check_avx512dq(void)
{
@@ -291,6 +295,14 @@ static const struct {
SIMD(AVX512F u32x16, avx512f, 64u4),
SIMD(AVX512F s64x8, avx512f, 64i8),
SIMD(AVX512F u64x8, avx512f, 64u8),
+ SIMD(AVX512F S/G f32[16x32], avx512f_sg, 64x4f4),
+ SIMD(AVX512F S/G f64[ 8x32], avx512f_sg, 64x4f8),
+ SIMD(AVX512F S/G f32[ 8x64], avx512f_sg, 64x8f4),
+ SIMD(AVX512F S/G f64[ 8x64], avx512f_sg, 64x8f8),
+ SIMD(AVX512F S/G i32[16x32], avx512f_sg, 64x4i4),
+ SIMD(AVX512F S/G i64[ 8x32], avx512f_sg, 64x4i8),
+ SIMD(AVX512F S/G i32[ 8x64], avx512f_sg, 64x8i4),
+ SIMD(AVX512F S/G i64[ 8x64], avx512f_sg, 64x8i8),
AVX512VL(VL f32x4, avx512f, 16f4),
AVX512VL(VL f64x2, avx512f, 16f8),
AVX512VL(VL f32x8, avx512f, 32f4),
@@ -303,6 +315,22 @@ static const struct {
AVX512VL(VL u64x2, avx512f, 16u8),
AVX512VL(VL s64x4, avx512f, 32i8),
AVX512VL(VL u64x4, avx512f, 32u8),
+ SIMD(AVX512VL S/G f32[4x32], avx512vl_sg, 16x4f4),
+ SIMD(AVX512VL S/G f64[2x32], avx512vl_sg, 16x4f8),
+ SIMD(AVX512VL S/G f32[2x64], avx512vl_sg, 16x8f4),
+ SIMD(AVX512VL S/G f64[2x64], avx512vl_sg, 16x8f8),
+ SIMD(AVX512VL S/G f32[8x32], avx512vl_sg, 32x4f4),
+ SIMD(AVX512VL S/G f64[4x32], avx512vl_sg, 32x4f8),
+ SIMD(AVX512VL S/G f32[4x64], avx512vl_sg, 32x8f4),
+ SIMD(AVX512VL S/G f64[4x64], avx512vl_sg, 32x8f8),
+ SIMD(AVX512VL S/G i32[4x32], avx512vl_sg, 16x4i4),
+ SIMD(AVX512VL S/G i64[2x32], avx512vl_sg, 16x4i8),
+ SIMD(AVX512VL S/G i32[2x64], avx512vl_sg, 16x8i4),
+ SIMD(AVX512VL S/G i64[2x64], avx512vl_sg, 16x8i8),
+ SIMD(AVX512VL S/G i32[8x32], avx512vl_sg, 32x4i4),
+ SIMD(AVX512VL S/G i64[4x32], avx512vl_sg, 32x4i8),
+ SIMD(AVX512VL S/G i32[4x64], avx512vl_sg, 32x8i4),
+ SIMD(AVX512VL S/G i64[4x64], avx512vl_sg, 32x8i8),
SIMD(AVX512BW s8x64, avx512bw, 64i1),
SIMD(AVX512BW u8x64, avx512bw, 64u1),
SIMD(AVX512BW s16x32, avx512bw, 64i2),
@@ -4260,7 +4288,7 @@ int main(int argc, char **argv)
if ( !blobs[j].size )
{
- printf("%-39s n/a\n", blobs[j].name);
+ printf("%-39s n/a (%u-bit)\n", blobs[j].name, blobs[j].bitness);
continue;
}
@@ -499,7 +499,7 @@ static const struct ext0f38_table {
[0x8c] = { .simd_size = simd_packed_int },
[0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
- [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
+ [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
[0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0x9a] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -3054,7 +3054,8 @@ x86_decode(
d &= ~ModRM;
#undef ModRM /* Only its aliases are valid to use from here on. */
- modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
+ modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3) |
+ ((evex_encoded() && !evex.R) << 4);
modrm_rm = modrm & 0x07;
/*
@@ -3224,7 +3225,8 @@ x86_decode(
if ( modrm_mod == 3 )
{
generate_exception_if(d & vSIB, EXC_UD);
- modrm_rm |= (rex_prefix & 1) << 3;
+ modrm_rm |= ((rex_prefix & 1) << 3) |
+ (evex_encoded() && !evex.x) << 4;
ea.type = OP_REG;
}
else if ( ad_bytes == 2 )
@@ -3289,7 +3291,10 @@ x86_decode(
state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
state->sib_scale = (sib >> 6) & 3;
- if ( state->sib_index != 4 && !(d & vSIB) )
+ if ( unlikely(d & vSIB) )
+ state->sib_index |= (mode_64bit() && evex_encoded() &&
+ !evex.RX) << 4;
+ else if ( state->sib_index != 4 )
{
ea.mem.off = *decode_gpr(state->regs, state->sib_index);
ea.mem.off <<= state->sib_scale;
@@ -3592,7 +3597,7 @@ x86_emulate(
generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
if ( ea.type == OP_REG )
- ea.reg = _decode_gpr(&_regs, modrm_rm, (d & ByteOp) && !rex_prefix);
+ ea.reg = _decode_gpr(&_regs, modrm_rm, (d & ByteOp) && !rex_prefix && !vex.opcx);
memset(mmvalp, 0xaa /* arbitrary */, sizeof(*mmvalp));
@@ -3606,7 +3611,7 @@ x86_emulate(
src.type = OP_REG;
if ( d & ByteOp )
{
- src.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix);
+ src.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix && !vex.opcx);
src.val = *(uint8_t *)src.reg;
src.bytes = 1;
}
@@ -3704,7 +3709,7 @@ x86_emulate(
dst.type = OP_REG;
if ( d & ByteOp )
{
- dst.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix);
+ dst.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix && !vex.opcx);
dst.val = *(uint8_t *)dst.reg;
dst.bytes = 1;
}
@@ -9119,6 +9124,130 @@ x86_emulate(
put_stub(stub);
state->simd_size = simd_none;
+ break;
+ }
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
+ {
+ typeof(evex) *pevex;
+ union {
+ int32_t dw[16];
+ int64_t qw[8];
+ } index;
+ bool done = false;
+
+ ASSERT(ea.type == OP_MEM);
+ generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+ evex.reg != 0xf ||
+ modrm_reg == state->sib_index),
+ EXC_UD);
+ avx512_vlen_check(false);
+ host_and_vcpu_must_have(avx512f);
+ get_fpu(X86EMUL_FPU_zmm);
+
+ /* Read destination and index registers. */
+ opc = init_evex(stub);
+ pevex = copy_EVEX(opc, evex);
+ pevex->opcx = vex_0f;
+ opc[0] = 0x7f; /* vmovdqa{32,64} */
+ /*
+ * The register writeback below has to retain masked-off elements, but
+ * needs to clear upper portions in the index-wider-than-data cases.
+ * Therefore read (and write below) the full register. The alternative
+ * would have been to fiddle with the mask register used.
+ */
+ pevex->opmsk = 0;
+ /* Use (%rax) as destination and modrm_reg as source. */
+ pevex->b = 1;
+ opc[1] = (modrm_reg & 7) << 3;
+ pevex->RX = 1;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+ pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+ pevex->w = b & 1;
+ /* Switch to sib_index as source. */
+ pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+ pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+ opc[1] = (state->sib_index & 7) << 3;
+
+ invoke_stub("", "", "=m" (index) : "a" (&index));
+ put_stub(stub);
+
+ /* Clear untouched parts of the destination and mask values. */
+ n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+ op_bytes = 4 << evex.w;
+ memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
+ op_mask &= (1 << n) - 1;
+
+ for ( i = 0; op_mask; ++i )
+ {
+ signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+ if ( !(op_mask & (1 << i)) )
+ continue;
+
+ rc = ops->read(ea.mem.seg,
+ truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+ (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ {
+ /*
+ * If we've made some progress and the access did not fault,
+ * force a retry instead. This is for example necessary to
+ * cope with the limited capacity of HVM's MMIO cache.
+ */
+ if ( rc != X86EMUL_EXCEPTION && done )
+ rc = X86EMUL_RETRY;
+ break;
+ }
+
+ op_mask &= ~(1 << i);
+ done = true;
+
+#ifdef __XEN__
+ if ( op_mask && local_events_need_delivery() )
+ {
+ rc = X86EMUL_RETRY;
+ break;
+ }
+#endif
+ }
+
+ /* Write destination and mask registers. */
+ opc = init_evex(stub);
+ pevex = copy_EVEX(opc, evex);
+ pevex->opcx = vex_0f;
+ opc[0] = 0x6f; /* vmovdqa{32,64} */
+ pevex->opmsk = 0;
+ /* Use modrm_reg as destination and (%rax) as source. */
+ pevex->b = 1;
+ opc[1] = (modrm_reg & 7) << 3;
+ pevex->RX = 1;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+ /*
+ * kmovw: This is VEX-encoded, so we can't use pevex. Avoid copy_VEX() etc
+ * as well, since we can easily use the 2-byte VEX form here.
+ */
+ opc -= EVEX_PFX_BYTES;
+ opc[0] = 0xc5;
+ opc[1] = 0xf8;
+ opc[2] = 0x90;
+ /* Use (%rax) as source. */
+ opc[3] = evex.opmsk << 3;
+ opc[4] = 0xc3;
+
+ invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+ put_stub(stub);
+
+ state->simd_size = simd_none;
break;
}
@@ -662,8 +662,6 @@ static inline unsigned long *decode_gpr(
BUILD_BUG_ON(ARRAY_SIZE(cpu_user_regs_gpr_offsets) &
(ARRAY_SIZE(cpu_user_regs_gpr_offsets) - 1));
- ASSERT(modrm < ARRAY_SIZE(cpu_user_regs_gpr_offsets));
-
/* Note that this also acts as array_access_nospec() stand-in. */
modrm &= ARRAY_SIZE(cpu_user_regs_gpr_offsets) - 1;

This requires getting modrm_reg and sib_index set correctly in the EVEX
case, to account for the high 16 [XYZ]MM registers. Extend the adjustments
to modrm_rm as well, such that x86_insn_modrm() would correctly report
register numbers (this was a latent issue only as we don't currently have
callers of that function which would care about an EVEX case). The
adjustment in turn requires dropping the assertion from decode_gpr() as
well as re-introducing the explicit masking, as we now need to actively
mask off the high bit when a GPR is meant. _decode_gpr() invocations also
need slight adjustments, when invoked in generic code ahead of the main
switch(). All other uses of modrm_reg and modrm_rm already get suitably
masked where necessary.

There was also an encoding mistake in the EVEX Disp8 test code, which was
benign (due to %rdx getting set to zero) to all non-vSIB tests as it
mistakenly encoded <disp8>(%rdx,%rdx) instead of <disp8>(%rdx,%riz). In the
vSIB case this meant <disp8>(%rdx,%zmm2) instead of the intended
<disp8>(%rdx,%zmm4).

Likewise the access count check wasn't entirely correct for the S/G case:
In the quad-word-index but dword-data case only half the number of full
vector elements get accessed.

As an unrelated change in the main test harness source file distinguish the
"n/a" messages by bitness.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Re-base.
v7: Fix ByteOp register decode. Re-base.
v6: New.
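
Note (illustrative only, not part of the patch): a minimal standalone C
sketch decoding the two SIB bytes touched in the Disp8 test hunk, showing
why 0x12 named %rdx resp. %zmm2 as index while 0x22 yields "no index" resp.
%zmm4:

    #include <stdio.h>

    int main(void)
    {
        const unsigned char sibs[] = { 0x12 /* old */, 0x22 /* new */ };

        for ( unsigned int i = 0; i < sizeof(sibs); ++i )
        {
            unsigned int scale = sibs[i] >> 6;
            unsigned int index = (sibs[i] >> 3) & 7;
            unsigned int base  = sibs[i] & 7;

            /*
             * base 2 is %rdx; index 4 means "no index" for GPR forms but
             * selects xMM4 under vSIB addressing, whereas index 2 selects
             * %rdx resp. xMM2.
             */
            printf("SIB %#04x: scale=%u index=%u base=%u\n",
                   sibs[i], scale, index, base);
        }

        return 0;
    }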
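
Similarly, a small sketch (stand-in values, not part of the patch) of the
adjusted expected-access-count expression in the test_one() hunk: odd
opcodes (91/93) use quad-word indexes, so with dword data (EVEX.W clear)
only half of a full vector's worth of elements gets gathered:

    #include <stdbool.h>
    #include <stdio.h>

    /* vsz: vector size in bytes, esz: element size in bytes. */
    static unsigned int expected(unsigned int vsz, unsigned int esz,
                                 unsigned int opc, bool evex_w)
    {
        /*
         * Mirrors (vsz / esz) >> (test->opc & 1 & !evex.w): shift by one
         * in the quad-word-index but dword-data case.
         */
        return (vsz / esz) >> ((opc & 1) & !evex_w);
    }

    int main(void)
    {
        printf("vgatherdps: %u\n", expected(64, 4, 0x92, false)); /* 16 */
        printf("vgatherqps: %u\n", expected(64, 4, 0x93, false)); /*  8 */
        printf("vgatherqpd: %u\n", expected(64, 8, 0x93, true));  /*  8 */

        return 0;
    }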
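
Finally, a sketch (hypothetical constant and helper names, not part of the
patch) of why decode_gpr() now masks instead of asserting: with the EVEX
adjustments, modrm_reg/modrm_rm may carry 5-bit register numbers naming the
high 16 [XYZ]MM registers, while a GPR lookup must only ever use the low
four bits:

    #include <stdio.h>

    #define NR_GPRS 16 /* stand-in for ARRAY_SIZE(cpu_user_regs_gpr_offsets) */

    static unsigned int gpr_slot(unsigned int modrm)
    {
        /*
         * Masking (rather than asserting) also doubles as the
         * array_access_nospec() stand-in mentioned in the hunk above.
         */
        return modrm & (NR_GPRS - 1);
    }

    int main(void)
    {
        /*
         * A 5-bit register number such as 28 must not index past the
         * 16-entry GPR offset table when a GPR is meant.
         */
        printf("%u\n", gpr_slot(28)); /* 12 */

        return 0;
    }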
This requires getting modrm_reg and sib_index set correctly in the EVEX case, to account for the high 16 [XYZ]MM registers. Extend the adjustments to modrm_rm as well, such that x86_insn_modrm() would correctly report register numbers (this was a latent issue only as we don't currently have callers of that function which would care about an EVEX case). The adjustment in turn requires dropping the assertion from decode_gpr() as well as re-introducing the explicit masking, as we now need to actively mask off the high bit when a GPR is meant. _decode_gpr() invocations also need slight adjustments, when invoked in generic code ahead of the main switch(). All other uses of modrm_reg and modrm_rm already get suitably masked where necessary. There was also an encoding mistake in the EVEX Disp8 test code, which was benign (due to %rdx getting set to zero) to all non-vSIB tests as it mistakenly encoded <disp8>(%rdx,%rdx) instead of <disp8>(%rdx,%riz). In the vSIB case this meant <disp8>(%rdx,%zmm2) instead of the intended <disp8>(%rdx,%zmm4). Likewise the access count check wasn't entirely correct for the S/G case: In the quad-word-index but dword-data case only half the number of full vector elements get accessed. As an unrelated change in the main test harness source file distinguish the "n/a" messages by bitness. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v8: Re-base. v7: Fix ByteOp register decode. Re-base. v6: New.