@@ -109,8 +109,12 @@ static const struct test avx512f_all[] =
INSN_FP(cmp, 0f, c2),
INSN(comisd, 66, 0f, 2f, el, q, el),
INSN(comiss, , 0f, 2f, el, d, el),
+ INSN(cvtdq2pd, f3, 0f, e6, vl_2, d, vl),
+ INSN(cvtdq2ps, , 0f, 5b, vl, d, vl),
+ INSN(cvtpd2dq, f2, 0f, e6, vl, q, vl),
INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl),
INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl),
+ INSN(cvtps2dq, 66, 0f, 5b, vl, d, vl),
INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl),
INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl),
INSN(cvtsd2ss, f2, 0f, 5a, el, q, el),
@@ -398,6 +402,8 @@ static const struct test avx512dq_all[]
INSN_PFP(and, 0f, 54),
INSN_PFP(andn, 0f, 55),
INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl),
+ INSN(cvtqq2pd, f3, 0f, e6, vl, q, vl),
+ INSN(cvtqq2ps, , 0f, 5b, vl, q, vl),
INSN_PFP(or, 0f, 56),
// pmovd2m, f3, 0f38, 39, d
// pmovm2, f3, 0f38, 38, dq
@@ -92,6 +92,13 @@ static inline bool _to_bool(byte_vec_t b
# define to_int(x) ((vec_t){ (int)(x)[0] })
#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
+#elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
+ (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if FLOAT_SIZE == 4
+# define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
+# elif FLOAT_SIZE == 8
+# define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
+# endif
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
# define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
@@ -1142,15 +1149,21 @@ int simd_test(void)
touch(src);
if ( !eq(x * -alt, -src) ) return __LINE__;
-# if defined(recip) && defined(to_int)
+# ifdef to_int
+
+ touch(src);
+ x = to_int(src);
+ touch(src);
+ if ( !eq(x, src) ) return __LINE__;
+# ifdef recip
touch(src);
x = recip(src);
touch(src);
touch(x);
if ( !eq(to_int(recip(x)), src) ) return __LINE__;
-# ifdef rsqrt
+# ifdef rsqrt
x = src * src;
touch(x);
y = rsqrt(x);
@@ -1158,6 +1171,7 @@ int simd_test(void)
if ( !eq(to_int(recip(y)), src) ) return __LINE__;
touch(src);
if ( !eq(to_int(y), to_int(recip(src))) ) return __LINE__;
+# endif
# endif
# endif
@@ -244,6 +244,7 @@ asm ( ".macro override insn \n\t"
OVR_INT(broadcast);
OVR_SFP(broadcast);
OVR_SFP(comi);
+OVR_VFP(cvtdq2);
OVR_FP(add);
OVR_INT(add);
OVR_BW(adds);
@@ -330,13 +331,19 @@ REN(pandn, , d);
REN(por, , d);
REN(pxor, , d);
# endif
+OVR(cvtpd2dqx);
+OVR(cvtpd2dqy);
OVR(cvtpd2psx);
OVR(cvtpd2psy);
OVR(cvtph2ps);
+OVR(cvtps2dq);
OVR(cvtps2pd);
OVR(cvtps2ph);
OVR(cvtsd2ss);
OVR(cvtss2sd);
+OVR(cvttpd2dqx);
+OVR(cvttpd2dqy);
+OVR(cvttps2dq);
OVR(movddup);
OVR(movntdq);
OVR(movntdqa);
@@ -311,7 +311,7 @@ static const struct twobyte_table {
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
[0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
[0x5a] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
- [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
[0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
[0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
[0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -375,7 +375,7 @@ static const struct twobyte_table {
[0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
[0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
- [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
[0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
[0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
@@ -3081,6 +3081,11 @@ x86_decode(
if ( disp8scale == 2 && evex.pfx == vex_f3 )
disp8scale = 3;
break;
+
+ case 0xe6: /* vcvtdq2pd needs special casing */
+ if ( disp8scale && evex.pfx == vex_f3 && !evex.w && !evex.brs )
+ --disp8scale;
+ break;
}
break;
@@ -6587,6 +6592,22 @@ x86_emulate(
op_bytes = 16 << vex.l;
goto simd_0f_cvt;
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x5b): /* vcvtps2dq [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f, 0x5b): /* vcvttps2dq [xyz]mm/mem,[xyz]mm{k} */
+ generate_exception_if(evex.w, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX(0x0f, 0x5b): /* vcvtdq2ps [xyz]mm/mem,[xyz]mm{k} */
+ /* vcvtqq2ps [xyz]mm/mem,{x,y}mm{k} */
+ if ( evex.w )
+ host_and_vcpu_must_have(avx512dq);
+ else
+ host_and_vcpu_must_have(avx512f);
+ if ( ea.type != OP_REG || !evex.brs )
+ avx512_vlen_check(false);
+ d |= TwoOp;
+ op_bytes = 16 << evex.lr;
+ goto simd_zmm;
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -7251,6 +7272,27 @@ x86_emulate(
op_bytes = 8;
goto simd_0f_xmm;
+ case X86EMUL_OPC_EVEX_66(0x0f, 0xe6): /* vcvttpd2dq [xyz]mm/mem,{x,y}mm{k} */
+ case X86EMUL_OPC_EVEX_F2(0x0f, 0xe6): /* vcvtpd2dq [xyz]mm/mem,{x,y}mm{k} */
+ generate_exception_if(!evex.w, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_F3(0x0f, 0xe6): /* vcvtdq2pd {x,y}mm/mem,[xyz]mm{k} */
+ /* vcvtqq2pd [xyz]mm/mem,[xyz]mm{k} */
+ if ( evex.pfx != vex_f3 )
+ host_and_vcpu_must_have(avx512f);
+ else if ( evex.w )
+ host_and_vcpu_must_have(avx512dq);
+ else
+ {
+ host_and_vcpu_must_have(avx512f);
+ generate_exception_if(ea.type != OP_MEM && evex.brs, EXC_UD);
+ }
+ if ( ea.type != OP_REG || !evex.brs )
+ avx512_vlen_check(false);
+ d |= TwoOp;
+ op_bytes = 8 << (evex.w + evex.lr);
+ goto simd_zmm;
+
case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */
case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
generate_exception_if(ea.type != OP_MEM, EXC_UD);
... including the two AVX512DQ forms, which share their encodings with the AVX512F ones, just with EVEX.W set there. VCVTDQ2PD, sharing its main opcode with others, needs a "manual" override of disp8scale. The simd_size changes for the twobyte_table[] entries are benign to pre-existing code, but allow decode_disp8scale() to work as is here. The placement of the 0xe6 case block, which is wrong at this point, is once again in anticipation of further additions of case labels. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v7: ea.type == OP_* -> ea.type != OP_*. Re-base. v6: Re-base over changes earlier in the series. v4: New.