@@ -13,7 +13,8 @@ run: $(TARGET)
SIMD := sse sse2 sse4 avx avx2
FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) $(FMA)
+SG := avx2-sg
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
blowfish-cflags := ""
blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -39,6 +40,10 @@ fma-flts := $(avx-flts)
avx2-vecs := $(avx-vecs)
avx2-ints := 1 2 4 8
avx2-flts := 4 8
+avx2-sg-vecs := $(avx2-vecs)
+avx2-sg-idxs := 4 8
+avx2-sg-ints := 4 8
+avx2-sg-flts := 4 8
# For AVX and later, have the compiler avoid XMM0 to widen coverage of
# the VEX.vvvv checks in the emulator.
@@ -55,8 +60,18 @@ $(1)-cflags := \
$(foreach flt,$($(1)-flts), \
"-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
endef
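+# Scatter/gather flavors get built once per (VEC_MAX, IDX_SIZE, element size)
+# combination, covering indexes both narrower and wider than the elements.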
+define simd-sg-defs
+$(1)-cflags := \
+ $(foreach vec,$($(1)-vecs), \
+ $(foreach idx,$($(1)-idxs), \
+ $(foreach int,$($(1)-ints), \
+ "-D_$(vec)x$(idx)i$(int) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DINT_SIZE=$(int)") \
+ $(foreach flt,$($(1)-flts), \
+ "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
+endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
+$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
$(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
rm -f $@.new $*.bin
@@ -78,7 +93,10 @@ $(addsuffix .c,$(SIMD)):
$(addsuffix .c,$(FMA)):
ln -sf simd-fma.c $@
-$(addsuffix .o,$(SIMD) $(FMA)): simd.h
+$(addsuffix .c,$(SG)):
+ ln -sf simd-sg.c $@
+
+$(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
$(TARGET): x86_emulate.o test_x86_emulator.o
$(HOSTCC) -o $@ $^
@@ -0,0 +1,209 @@
+#ifdef INT_SIZE
+# define ELEM_SIZE INT_SIZE
+#else
+# define ELEM_SIZE FLOAT_SIZE
+#endif
+
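+/*
+ * With indexes wider than the data elements, fewer elements fit the index
+ * vector, so the data vector shrinks below VEC_MAX (but never below the
+ * 16-byte XMM minimum).
+ */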
+#define VEC_SIZE (IDX_SIZE <= ELEM_SIZE ? VEC_MAX \
+ : VEC_MAX * ELEM_SIZE / IDX_SIZE)
+#if VEC_SIZE < 16
+# undef VEC_SIZE
+# define VEC_SIZE 16
+#endif
+
+#include "simd.h"
+
+ENTRY(sg_test);
+
+#undef MODE
+#if IDX_SIZE == 4
+# define MODE SI
+#elif IDX_SIZE == 8
+# define MODE DI
+#endif
+
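+/* Conversely, shrink the index vector when the data elements are wider. */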
+#define IVEC_SIZE (ELEM_SIZE <= IDX_SIZE ? VEC_MAX \
+ : VEC_MAX * IDX_SIZE / ELEM_SIZE)
+#if IVEC_SIZE < 16
+# undef IVEC_SIZE
+# define IVEC_SIZE 16
+#endif
+
+typedef signed int __attribute__((mode(MODE), vector_size(IVEC_SIZE))) idx_t;
+typedef long long __attribute__((vector_size(IVEC_SIZE))) idi_t;
+
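+/* Elements one gather can load: the lesser of the data and index counts. */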
+#define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
+ VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
+
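+/*
+ * PTEST sets CF when (~op1 & op2) is all zeroes.  Against an all-ones
+ * second operand, CF is thus set exactly when every compare lane is true.
+ */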
+#if VEC_SIZE == 16
+# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+#else
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+#endif
+
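+/*
+ * GCC's gather builtins are typed on the exact data/index vector shapes, so
+ * combinations with mismatched element and index widths need casts through
+ * vdi_t / idi_t, and integer destinations a cast of the result to vec_t.
+ */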
+#if defined(__AVX2__)
+# if VEC_MAX == 16
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv2df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv2df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# elif VEC_MAX == 32
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv4df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# endif
+#endif
+
+#define GLUE_(x, y) x ## y
+#define GLUE(x, y) GLUE_(x, y)
+
+#define PUT2(n) (n), (n) + 1
+#define PUT4(n) PUT2(n), PUT2((n) + 2)
+#define PUT8(n) PUT4(n), PUT4((n) + 4)
+#define PUT16(n) PUT8(n), PUT8((n) + 8)
+#define PUT32(n) PUT16(n), PUT16((n) + 16)
+
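+/*
+ * 2 * VEC_MAX entries with array[i] == i + 1, keeping the scaled and
+ * displaced gathers below within bounds.
+ */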
+const typeof((vec_t){}[0]) array[] = {
+ GLUE(PUT, VEC_MAX)(1),
+ GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
+};
+
+int sg_test(void)
+{
+ unsigned int i;
+    vec_t x, y, full = (vec_t){} == 0; /* all-ones mask */
+ idx_t idx, inv;
+
+ for ( i = 0; i < IVEC_SIZE / IDX_SIZE; ++i )
+ {
+ idx[i] = i + 1;
+ inv[i] = ITEM_COUNT - i;
+ }
+
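+    /* Prevent the compiler from treating the index vectors as constants. */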
+ touch(idx);
+ touch(inv);
+
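+    /* All-zero indexes: every lane fetches array[0] == 1. */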
+ x = gather(full, array, (idx_t){}, full, 1);
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( x[i] != 1 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( x[i] )
+ return __LINE__;
+
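+    /* idx[i] == i + 1 at natural scale: x[i] = array[i + 1]. */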
+ x = gather(full, array, idx, full, ELEM_SIZE);
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( x[i] != i + 2 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( x[i] )
+ return __LINE__;
+
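+    /* Byte-granular indexes times scale 2 select every second element. */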
+ x = gather(full, array, idx * ELEM_SIZE, full, 2);
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( x[i] != i * 2 + 3 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( x[i] )
+ return __LINE__;
+
+ x = gather(full, array, inv, full, ELEM_SIZE);
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( x[i] != inv[i] + 1 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( x[i] )
+ return __LINE__;
+
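+    /* Negative indexes relative to a base displaced by ITEM_COUNT. */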
+ y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
+#if ITEM_COUNT == ELEM_COUNT
+ if ( !to_bool(y == x - 1) )
+ return __LINE__;
+#else
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( y[i] != x[i] - 1 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( y[i] )
+ return __LINE__;
+#endif
+
+#if ELEM_SIZE > 1
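+    /* Doubled indexes at half the scale yield the same addresses. */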
+ x = gather(full, array, inv * 2, full, ELEM_SIZE / 2);
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( x[i] != inv[i] + 1 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( x[i] )
+ return __LINE__;
+
+# if ELEM_SIZE == IDX_SIZE
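+    /* Merging gather: lanes with a clear mask keep their prior x value. */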
+ y = gather(x, array, idx, (idx & inv) != 0, ELEM_SIZE);
+ for ( i = 0; i < ITEM_COUNT; ++i )
+ if ( y[i] != ((i + 1) & (ITEM_COUNT - i) ? idx : inv)[i] + 1 )
+ return __LINE__;
+ for ( ; i < ELEM_COUNT; ++i )
+ if ( y[i] )
+ return __LINE__;
+# endif
+#endif
+
+ return 0;
+}
@@ -12,6 +12,7 @@
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
+#include "avx2-sg.h"
#define verbose false /* Switch to true for far more logging. */
@@ -60,6 +61,7 @@ static bool simd_check_avx2(void)
{
return cpu_has_avx2;
}
+#define simd_check_avx2_sg simd_check_avx2
static void simd_set_regs(struct cpu_user_regs *regs)
{
@@ -173,6 +175,22 @@ static const struct {
SIMD(AVX2 u32x8, avx2, 32u4),
SIMD(AVX2 s64x4, avx2, 32i8),
SIMD(AVX2 u64x4, avx2, 32u8),
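+    /* S/G suffixes encode <VEC_MAX>x<IDX_SIZE><f|i><ELEM_SIZE>. */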
+ SIMD(AVX2 S/G f32[4x32], avx2_sg, 16x4f4),
+ SIMD(AVX2 S/G f64[2x32], avx2_sg, 16x4f8),
+ SIMD(AVX2 S/G f32[2x64], avx2_sg, 16x8f4),
+ SIMD(AVX2 S/G f64[2x64], avx2_sg, 16x8f8),
+ SIMD(AVX2 S/G f32[8x32], avx2_sg, 32x4f4),
+ SIMD(AVX2 S/G f64[4x32], avx2_sg, 32x4f8),
+ SIMD(AVX2 S/G f32[4x64], avx2_sg, 32x8f4),
+ SIMD(AVX2 S/G f64[4x64], avx2_sg, 32x8f8),
+ SIMD(AVX2 S/G i32[4x32], avx2_sg, 16x4i4),
+ SIMD(AVX2 S/G i64[2x32], avx2_sg, 16x4i8),
+ SIMD(AVX2 S/G i32[2x64], avx2_sg, 16x8i4),
+ SIMD(AVX2 S/G i64[2x64], avx2_sg, 16x8i8),
+ SIMD(AVX2 S/G i32[8x32], avx2_sg, 32x4i4),
+ SIMD(AVX2 S/G i64[4x32], avx2_sg, 32x4i8),
+ SIMD(AVX2 S/G i32[4x64], avx2_sg, 32x8i4),
+ SIMD(AVX2 S/G i64[4x64], avx2_sg, 32x8i8),
#undef SIMD_
#undef SIMD
};
@@ -391,6 +391,7 @@ static const struct {
[0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
[0x8c] = { .simd_size = simd_other },
[0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+ [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
[0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
[0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
[0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
@@ -598,6 +599,7 @@ struct x86_emulate_state {
ext_8f0a,
} ext;
uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
+ uint8_t sib_index, sib_scale;
uint8_t rex_prefix;
bool lock_prefix;
bool not_64bit; /* Instruction not available in 64bit. */
@@ -2409,7 +2411,7 @@ x86_decode(
struct x86_emulate_ctxt *ctxt,
const struct x86_emulate_ops *ops)
{
- uint8_t b, d, sib, sib_index, sib_base;
+ uint8_t b, d;
unsigned int def_op_bytes, def_ad_bytes, opcode;
enum x86_segment override_seg = x86_seg_none;
bool pc_rel = false;
@@ -2735,6 +2737,7 @@ x86_decode(
if ( modrm_mod == 3 )
{
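+        /* vSIB insns have no register operand encoding. */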
+ generate_exception_if(d & vSIB, EXC_UD);
modrm_rm |= (rex_prefix & 1) << 3;
ea.type = OP_REG;
}
@@ -2795,13 +2798,17 @@ x86_decode(
ea.type = OP_MEM;
if ( modrm_rm == 4 )
{
- sib = insn_fetch_type(uint8_t);
- sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
- sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
- if ( sib_index != 4 && !(d & vSIB) )
- ea.mem.off = *(long *)decode_register(sib_index,
+ uint8_t sib = insn_fetch_type(uint8_t);
+ uint8_t sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
+
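+            /* Latch index and scale for VSIB-using insns to consume later. */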
+ state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
+ state->sib_scale = (sib >> 6) & 3;
+ if ( state->sib_index != 4 && !(d & vSIB) )
+ {
+ ea.mem.off = *(long *)decode_register(state->sib_index,
state->regs, 0);
- ea.mem.off <<= (sib >> 6) & 3;
+ ea.mem.off <<= state->sib_scale;
+ }
if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
ea.mem.off += insn_fetch_type(int32_t);
else if ( sib_base == 4 )
@@ -7443,6 +7450,110 @@ x86_emulate(
break;
}
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x92): /* vgatherdp{s,d} {x,y}mm,mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x93): /* vgatherqp{s,d} {x,y}mm,mem,{x,y}mm */
+ {
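+        /* VEX.vvvv encodes the mask register, in inverted form. */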
+ unsigned int mask_reg = ~vex.reg & (mode_64bit() ? 0xf : 7);
+ typeof(vex) *pvex;
+ union {
+ int32_t dw[8];
+ int64_t qw[4];
+ } index, mask;
+
+ ASSERT(ea.type == OP_MEM);
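+        /* Destination, index, and mask registers must be pairwise distinct. */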
+ generate_exception_if(modrm_reg == state->sib_index ||
+ modrm_reg == mask_reg ||
+ state->sib_index == mask_reg, EXC_UD);
+ generate_exception_if(!cpu_has_avx, EXC_UD);
+ vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+
+ /* Read destination, index, and mask registers. */
+ opc = init_prefixes(stub);
+ pvex = copy_VEX(opc, vex);
+ pvex->opcx = vex_0f;
+ opc[0] = 0x7f; /* vmovdqa */
+ /* Use (%rax) as destination and modrm_reg as source. */
+ pvex->r = !mode_64bit() || !(modrm_reg & 8);
+ pvex->b = 1;
+ opc[1] = (modrm_reg & 7) << 3;
+ pvex->reg = 0xf;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+ pvex->pfx = vex_f3; /* vmovdqu */
+ /* Switch to sib_index as source. */
+ pvex->r = !mode_64bit() || !(state->sib_index & 8);
+ opc[1] = (state->sib_index & 7) << 3;
+
+ invoke_stub("", "", "=m" (index) : "a" (&index));
+
+ /* Switch to mask_reg as source. */
+ pvex->r = !mode_64bit() || !(mask_reg & 8);
+ opc[1] = (mask_reg & 7) << 3;
+
+ invoke_stub("", "", "=m" (mask) : "a" (&mask));
+ put_stub(stub);
+
+ /* Clear untouched parts of the destination and mask values. */
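+        /* Halve the element count for 64-bit indexes (b & 1) or data (vex.w). */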
+ n = 1 << (2 + vex.l - ((b & 1) | vex.w));
+ op_bytes = 4 << vex.w;
+ memset((void *)mmvalp + n * op_bytes, 0, 32 - n * op_bytes);
+ memset((void *)&mask + n * op_bytes, 0, 32 - n * op_bytes);
+
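+        /*
+         * Read one element at a time, clearing its mask lane on completion,
+         * so that a fault or X86EMUL_RETRY leaves architecturally
+         * restartable state.
+         */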
+ for ( i = 0; i < n && rc == X86EMUL_OKAY; ++i )
+ {
+ if ( (vex.w ? mask.qw[i] : mask.dw[i]) < 0 )
+ {
+ signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+ rc = ops->read(ea.mem.seg,
+ ea.mem.off + (idx << state->sib_scale),
+ (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ break;
+
+#ifdef __XEN__
+ if ( i + 1 < n && local_events_need_delivery() )
+ rc = X86EMUL_RETRY;
+#endif
+ }
+
+ if ( vex.w )
+ mask.qw[i] = 0;
+ else
+ mask.dw[i] = 0;
+ }
+
+ /* Write destination and mask registers. */
+ opc = init_prefixes(stub);
+ pvex = copy_VEX(opc, vex);
+ pvex->opcx = vex_0f;
+ opc[0] = 0x6f; /* vmovdqa */
+ /* Use modrm_reg as destination and (%rax) as source. */
+ pvex->r = !mode_64bit() || !(modrm_reg & 8);
+ pvex->b = 1;
+ opc[1] = (modrm_reg & 7) << 3;
+ pvex->reg = 0xf;
+ opc[2] = 0xc3;
+
+ invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+ pvex->pfx = vex_f3; /* vmovdqu */
+ /* Switch to mask_reg as destination. */
+ pvex->r = !mode_64bit() || !(mask_reg & 8);
+ opc[1] = (mask_reg & 7) << 3;
+
+ invoke_stub("", "", "+m" (mask) : "a" (&mask));
+ put_stub(stub);
+
+ state->simd_size = simd_none;
+ break;
+ }
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10,6 +10,7 @@
*/
#include <xen/domain_page.h>
+#include <xen/event.h> /* local_events_need_delivery() */
#include <asm/x86_emulate.h>
#include <asm/asm_defns.h> /* mark_regs_dirty() */
#include <asm/processor.h> /* current_cpu_info */