@@ -410,8 +410,12 @@ static const struct test avx512dq_all[]
INSN_PFP(and, 0f, 54),
INSN_PFP(andn, 0f, 55),
INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl),
+ INSN(cvtpd2qq, 66, 0f, 7b, vl, q, vl),
+ INSN(cvtps2qq, 66, 0f, 7b, vl_2, d, vl),
INSN(cvtqq2pd, f3, 0f, e6, vl, q, vl),
INSN(cvtqq2ps, , 0f, 5b, vl, q, vl),
+ INSN(cvttpd2qq, 66, 0f, 7a, vl, q, vl),
+ INSN(cvttps2qq, 66, 0f, 7a, vl_2, d, vl),
INSN_PFP(or, 0f, 56),
// pmovd2m, f3, 0f38, 39, d
// pmovm2, f3, 0f38, 38, dq
@@ -90,14 +90,35 @@ static inline bool _to_bool(byte_vec_t b
#if VEC_SIZE == FLOAT_SIZE
# define to_int(x) ({ int i_ = (x)[0]; touch(i_); ((vec_t){ i_ }); })
+# ifdef __x86_64__
+# define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); })
+# endif
#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
#elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
(VEC_SIZE == 64 || defined(__AVX512VL__))
# if FLOAT_SIZE == 4
# define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
+# ifdef __AVX512DQ__
+# define to_wint(x) ({ \
+ vsf_half_t t_ = low_half(x); \
+ vdi_t lo_, hi_; \
+ touch(t_); \
+ lo_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+ t_ = high_half(x); \
+ touch(t_); \
+ hi_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+ touch(lo_); touch(hi_); \
+ insert_half(insert_half(undef(), \
+ BR(cvtqq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
+ BR(cvtqq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
+})
+# endif
# elif FLOAT_SIZE == 8
# define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
+# ifdef __AVX512DQ__
+# define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
+# endif
# endif
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
@@ -121,6 +142,21 @@ static inline bool _to_bool(byte_vec_t b
})
#endif
+#if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__)
+# define low_half(x) (x)
+# define high_half(x) B_(movhlps, , undef(), x)
+/*
+ * GCC 7 (and perhaps earlier versions) reports a bogus type mismatch for the
+ * conditional expression in insert_pair() below; this no-op wrapper avoids it.
+ */
+static inline vec_t movlhps(vec_t x, vec_t y) {
+ return __builtin_ia32_movlhps(x, y);
+}
+# define insert_pair(x, y, p) \
+ ((p) ? movlhps(x, y) \
+ : ({ vec_t t_ = (x); t_[0] = (y)[0]; t_[1] = (y)[1]; t_; }))
+#endif
+
#if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
# define max __builtin_ia32_pfmax
# define min __builtin_ia32_pfmin
@@ -149,13 +185,16 @@ static inline bool _to_bool(byte_vec_t b
# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
(ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
(ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
-# define low_half(x) ({ \
+# define _half(x, lh) ({ \
half_t t_; \
- asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+ asm ( "vextractf%c[w]x%c[n] %[sel], %[s], %[d]" \
: [d] "=m" (t_) \
- : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+ : [s] "v" (x), [sel] "i" (lh), \
+ [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
t_; \
})
+# define low_half(x) _half(x, 0)
+# define high_half(x) _half(x, 1)
# endif
# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
(ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
@@ -1176,6 +1215,13 @@ int simd_test(void)
# endif
+# ifdef to_wint
+ touch(src);
+ x = to_wint(src);
+ touch(src);
+ if ( !eq(x, src) ) return __LINE__;
+# endif
+
# ifdef sqrt
x = src * src;
touch(x);
@@ -325,6 +325,8 @@ static const struct twobyte_table {
[0x77] = { DstImplicit|SrcNone },
[0x78] = { ImplicitOps|ModRM },
[0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
+ [0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
+ [0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl },
[0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 },
[0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
@@ -3083,6 +3085,12 @@ x86_decode(
--disp8scale;
break;
+ case 0x7a: /* vcvttps2qq needs special casing */
+ case 0x7b: /* vcvtps2qq needs special casing */
+ if ( disp8scale && evex.pfx == vex_66 && !evex.w && !evex.brs )
+ --disp8scale;
+ break;
+
case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
if ( disp8scale == 2 && evex.pfx == vex_f3 )
disp8scale = 3;
@@ -7355,7 +7363,13 @@ x86_emulate(
if ( evex.pfx != vex_f3 )
host_and_vcpu_must_have(avx512f);
else if ( evex.w )
+ {
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x7a): /* vcvttps2qq {x,y}mm/mem,[xyz]mm{k} */
+ /* vcvttpd2qq [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x7b): /* vcvtps2qq {x,y}mm/mem,[xyz]mm{k} */
+ /* vcvtpd2qq [xyz]mm/mem,[xyz]mm{k} */
host_and_vcpu_must_have(avx512dq);
+ }
else
{
host_and_vcpu_must_have(avx512f);
VCVT{,T}PS2QQ, sharing their main opcodes with others, once again need
"manual" overrides of disp8scale.

While not directly related here, also add a scalar variant of to_wint()
to the test harness.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Re-base.
v6: Workaround for gcc 7 quirk.
v5: Re-base over changes earlier in the series.
v4: New.