@@ -230,7 +230,7 @@
tools/tests/x86_emulator/sse*.[ch]
tools/tests/x86_emulator/test_x86_emulator
tools/tests/x86_emulator/x86_emulate
+tools/tests/x86_emulator/xop*.[ch]
tools/tests/xen-access/xen-access
tools/tests/xenstore/xs-test
tools/tests/regression/installed/*
@@ -11,7 +11,7 @@ all: $(TARGET)
run: $(TARGET)
./$(TARGET)
-SIMD := sse sse2 sse4 avx avx2
+SIMD := sse sse2 sse4 avx avx2 xop
FMA := fma4 fma
SG := avx2-sg
TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -44,6 +44,9 @@ avx2-sg-vecs := $(avx2-vecs)
avx2-sg-idxs := 4 8
avx2-sg-ints := 4 8
avx2-sg-flts := 4 8
+xop-vecs := $(avx-vecs)
+xop-ints := 1 2 4 8
+xop-flts := $(avx-flts)
# For AVX and later, have the compiler avoid XMM0 to widen coverage of
# the VEX.vvvv checks in the emulator.
@@ -98,6 +101,8 @@ $(addsuffix .c,$(SG)):
$(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
+xop.o: simd-fma.c
+
$(TARGET): x86_emulate.o test_x86_emulator.o
$(HOSTCC) -o $@ $^
@@ -471,6 +471,86 @@ static inline bool _to_bool(byte_vec_t b
# endif
# endif
#endif
+#ifdef __XOP__
+# undef select
+# if VEC_SIZE == 16
+# if INT_SIZE == 2 || INT_SIZE == 4
+# include "simd-fma.c"
+# endif
+# define select(d, x, y, m) \
+ (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define swap2(x) \
+ ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
+ (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
+ (2 * (vhi_t)inv - 2))))
+# elif FLOAT_SIZE == 4
+# define frac(x) __builtin_ia32_vfrczps(x)
+# undef swap2
+# define swap2(x) ({ \
+ /* Buggy in gcc 7.1.0 and earlier. */ \
+ /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
+ vec_t t_; \
+ asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
+ "=x" (t_) : \
+ "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
+ t_; \
+})
+# elif FLOAT_SIZE == 8
+# define frac(x) __builtin_ia32_vfrczpd(x)
+# undef swap2
+# define swap2(x) ({ \
+ /* Buggy in gcc 7.1.0 and earlier. */ \
+ /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
+ /* __builtin_ia32_pmovsxdq128( */ \
+ /* __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
+ vdi_t s_ = __builtin_ia32_pmovsxdq128( \
+ __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
+ vec_t t_; \
+ asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
+ "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
+ t_; \
+})
+# endif
+# if INT_SIZE == 1
+# define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
+ __builtin_ia32_vphaddbw((vqi_t)(y))))
+# define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
+ __builtin_ia32_vphsubbw((vqi_t)(y))))
+# elif UINT_SIZE == 1
+# define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
+ __builtin_ia32_vphaddubw((vqi_t)(y))))
+# elif INT_SIZE == 2
+# undef hadd
+# define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
+ __builtin_ia32_vphaddwd(y))
+# undef hsub
+# define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
+ __builtin_ia32_vphsubwd(y))
+# elif UINT_SIZE == 2
+# undef hadd
+# define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
+ __builtin_ia32_vphadduwd((vhi_t)(y))))
+# undef hsub
+# endif
+# elif VEC_SIZE == 32
+# define select(d, x, y, m) \
+ (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+# if FLOAT_SIZE == 4
+# define frac(x) __builtin_ia32_vfrczps256(x)
+# elif FLOAT_SIZE == 8
+# define frac(x) __builtin_ia32_vfrczpd256(x)
+# endif
+# elif VEC_SIZE == FLOAT_SIZE
+# if VEC_SIZE == 4
+# define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
+# elif VEC_SIZE == 8
+# define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
+# endif
+# endif
+#endif
int simd_test(void)
{
@@ -576,6 +656,29 @@ int simd_test(void)
if ( !to_bool(y == z) ) return __LINE__;
# endif
+# ifdef frac
+ touch(src);
+ x = frac(src);
+ touch(src);
+ if ( !to_bool(x == 0) ) return __LINE__;
+
+ x = 1 / (src + 1);
+ touch(x);
+ y = frac(x);
+ touch(x);
+ if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# if defined(trunc) && defined(frac)
+ x = src / 4;
+ touch(x);
+ y = trunc(x);
+ touch(x);
+ z = frac(x);
+ touch(x);
+ if ( !to_bool(x == y + z) ) return __LINE__;
+# endif
+
#else
# if ELEM_SIZE > 1
@@ -677,7 +780,7 @@ int simd_test(void)
y = z << sh;
if ( !to_bool(x == y + y) ) return __LINE__;
-# if defined(__AVX2__) && ELEM_SIZE >= 4
+# if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
touch(sh);
x = y >> sh;
if ( !to_bool(x == z) ) return __LINE__;
@@ -871,6 +974,8 @@ int simd_test(void)
#endif
#ifdef hadd
+# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
+ (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
x = src;
for ( i = ELEM_COUNT; i >>= 1; )
{
@@ -878,6 +983,7 @@ int simd_test(void)
x = hadd((vec_t){}, x);
}
if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+# endif
# ifdef hsub
touch(src);
@@ -889,6 +995,9 @@ int simd_test(void)
# endif
#endif
+#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+ return -fma_test();
+#endif
return 0;
}
@@ -1,6 +1,8 @@
#include "simd.h"
+#ifndef __XOP__
ENTRY(fma_test);
+#endif
#if VEC_SIZE < 16
# define to_bool(cmp) (!~(cmp)[0])
@@ -13,6 +13,7 @@
#include "fma.h"
#include "avx2.h"
#include "avx2-sg.h"
+#include "xop.h"
#define verbose false /* Switch to true for far more logging. */
@@ -63,6 +64,11 @@ static bool simd_check_avx2(void)
}
#define simd_check_avx2_sg simd_check_avx2
+static bool simd_check_xop(void)
+{
+ return cpu_has_xop;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
@@ -191,6 +197,22 @@ static const struct {
SIMD(AVX2 S/G i64[4x32], avx2_sg, 32x4i8),
SIMD(AVX2 S/G i32[4x64], avx2_sg, 32x8i4),
SIMD(AVX2 S/G i64[4x64], avx2_sg, 32x8i8),
+ SIMD(XOP 128bit single, xop, 16f4),
+ SIMD(XOP 256bit single, xop, 32f4),
+ SIMD(XOP 128bit double, xop, 16f8),
+ SIMD(XOP 256bit double, xop, 32f8),
+ SIMD(XOP s8x16, xop, 16i1),
+ SIMD(XOP u8x16, xop, 16u1),
+ SIMD(XOP s16x8, xop, 16i2),
+ SIMD(XOP u16x8, xop, 16u2),
+ SIMD(XOP s32x4, xop, 16i4),
+ SIMD(XOP u32x4, xop, 16u4),
+ SIMD(XOP s64x2, xop, 16i8),
+ SIMD(XOP u64x2, xop, 16u8),
+ SIMD(XOP i8x32, xop, 32i1),
+ SIMD(XOP i16x16, xop, 32i2),
+ SIMD(XOP i32x8, xop, 32i4),
+ SIMD(XOP i64x4, xop, 32i8),
#undef SIMD_
#undef SIMD
};
@@ -172,6 +172,16 @@ static inline uint64_t xgetbv(uint32_t x
(res.c & (1U << 6)) != 0; \
})
+#define cpu_has_xop ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+ res.c = 0; \
+ else \
+ emul_test_cpuid(0x80000001, 0, &res, NULL); \
+ (res.c & (1U << 11)) != 0; \
+})
+
#define cpu_has_fma4 ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
@@ -435,6 +435,7 @@ static const struct {
[0x42] = { .simd_size = simd_packed_int },
[0x44] = { .simd_size = simd_packed_int },
[0x46] = { .simd_size = simd_packed_int },
+ [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -463,6 +464,17 @@ static const struct {
uint8_t two_op:1;
uint8_t four_op:1;
} ext8f08_table[256] = {
+ [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xcc ... 0xcf] = { .simd_size = simd_packed_int },
+ [0xec ... 0xef] = { .simd_size = simd_packed_int },
};
static const struct {
@@ -470,6 +482,16 @@ static const struct {
uint8_t two_op:1;
} ext8f09_table[256] = {
[0x01 ... 0x02] = { .two_op = 1 },
+ [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+ [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
+ [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xd6 ... 0xd7] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xe1 ... 0xe3] = { .simd_size = simd_packed_int, .two_op = 1 },
};
#define REX_PREFIX 0x40
@@ -528,7 +550,7 @@ union vex {
#define copy_VEX(ptr, vex) ({ \
if ( !mode_64bit() ) \
(vex).reg |= 8; \
- (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \
(ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
(ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
@@ -1653,6 +1675,7 @@ static bool vcpu_has(
#define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops)
#define vcpu_has_sse4a() vcpu_has(0x80000001, ECX, 6, ctxt, ops)
#define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX, 7, ctxt, ops)
+#define vcpu_has_xop() vcpu_has(0x80000001, ECX, 11, ctxt, ops)
#define vcpu_has_fma4() vcpu_has(0x80000001, ECX, 16, ctxt, ops)
#define vcpu_has_tbm() vcpu_has(0x80000001, ECX, 21, ctxt, ops)
#define vcpu_has_bmi1() vcpu_has( 7, EBX, 3, ctxt, ops)
@@ -2985,9 +3008,19 @@ x86_decode(
case simd_packed_int:
switch ( vex.pfx )
{
- case vex_none: op_bytes = 8; break;
- case vex_66: op_bytes = 16 << vex.l; break;
- default: op_bytes = 0; break;
+ case vex_none:
+ if ( !vex.opcx )
+ {
+ op_bytes = 8;
+ break;
+ }
+ /* fall through */
+ case vex_66:
+ op_bytes = 16 << vex.l;
+ break;
+ default:
+ op_bytes = 0;
+ break;
}
break;
@@ -7996,6 +8029,13 @@ x86_emulate(
generate_exception_if(vex.w, EXC_UD);
goto simd_0f_imm8_avx;
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+ /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+ /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ host_and_vcpu_must_have(xop);
+ goto simd_0f_imm8_ymm;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
generate_exception_if(vex.w, EXC_UD);
goto simd_0f_int_imm8;
@@ -8133,6 +8173,41 @@ x86_emulate(
asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
break;
+ case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x8e): /* vpmacssdd xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x8f): /* vpmacssdqh xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x95): /* vpmacsww xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x96): /* vpmacswd xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x97): /* vpmacsdql xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x9e): /* vpmacsdd xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0x9f): /* vpmacsdqh xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xa6): /* vpmadcsswd xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xb6): /* vpmadcswd xmm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xc0): /* vprotb $imm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(08, 0xc1): /* vprotw $imm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(08, 0xc2): /* vprotd $imm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(08, 0xc3): /* vprotq $imm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(08, 0xcc): /* vpcomb $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xcd): /* vpcomw $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xce): /* vpcomd $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xcf): /* vpcomq $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xec): /* vpcomub $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xed): /* vpcomuw $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xee): /* vpcomud $imm,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_XOP(08, 0xef): /* vpcomuq $imm,xmm/m128,xmm,xmm */
+ generate_exception_if(vex.w, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_XOP(08, 0xa3): /* vpperm xmm/m128,xmm,xmm,xmm */
+ /* vpperm xmm,xmm/m128,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_XOP(08, 0xa2): /* vpcmov {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+ /* vpcmov {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ host_and_vcpu_must_have(xop);
+ goto simd_0f_imm8_ymm;
+
case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
switch ( modrm_reg & 7 )
{
@@ -8182,6 +8257,61 @@ x86_emulate(
}
goto cannot_emulate;
+ case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m32,xmm */
+ case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m64,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_XOP(09, 0x80): /* vfrczps {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_XOP(09, 0x81): /* vfrczpd {x,y}mm/mem,{x,y}mm */
+ host_and_vcpu_must_have(xop);
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_ymm;
+
+ case X86EMUL_OPC_XOP(09, 0xc1): /* vphaddbw xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xc2): /* vphaddbd xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xc3): /* vphaddbq xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xc6): /* vphaddwd xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xc7): /* vphaddwq xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xcb): /* vphadddq xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xd1): /* vphaddubw xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xd2): /* vphaddubd xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xd3): /* vphaddubq xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xd6): /* vphadduwd xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xd7): /* vphadduwq xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xdb): /* vphaddudq xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xe1): /* vphsubbw xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xe2): /* vphsubwd xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0xe3): /* vphsubdq xmm/m128,xmm */
+ generate_exception_if(vex.w, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_XOP(09, 0x90): /* vprotb xmm/m128,xmm,xmm */
+ /* vprotb xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x91): /* vprotw xmm/m128,xmm,xmm */
+ /* vprotw xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x92): /* vprotd xmm/m128,xmm,xmm */
+ /* vprotd xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x93): /* vprotq xmm/m128,xmm,xmm */
+ /* vprotq xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x94): /* vpshlb xmm/m128,xmm,xmm */
+ /* vpshlb xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x95): /* vpshlw xmm/m128,xmm,xmm */
+ /* vpshlw xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x96): /* vpshld xmm/m128,xmm,xmm */
+ /* vpshld xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x97): /* vpshlq xmm/m128,xmm,xmm */
+ /* vpshlq xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x98): /* vpshab xmm/m128,xmm,xmm */
+ /* vpshab xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x99): /* vpshaw xmm/m128,xmm,xmm */
+ /* vpshaw xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x9a): /* vpshad xmm/m128,xmm,xmm */
+ /* vpshad xmm,xmm/m128,xmm */
+ case X86EMUL_OPC_XOP(09, 0x9b): /* vpshaq xmm/m128,xmm,xmm */
+ /* vpshaq xmm,xmm/m128,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ host_and_vcpu_must_have(xop);
+ goto simd_0f_ymm;
+
case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
{
uint8_t *buf = get_stub(stub);
@@ -76,6 +76,7 @@
#define cpu_has_cmp_legacy boot_cpu_has(X86_FEATURE_CMP_LEGACY)
#define cpu_has_svm boot_cpu_has(X86_FEATURE_SVM)
#define cpu_has_sse4a boot_cpu_has(X86_FEATURE_SSE4A)
+#define cpu_has_xop boot_cpu_has(X86_FEATURE_XOP)
#define cpu_has_lwp boot_cpu_has(X86_FEATURE_LWP)
#define cpu_has_fma4 boot_cpu_has(X86_FEATURE_FMA4)
#define cpu_has_tbm boot_cpu_has(X86_FEATURE_TBM)