diff mbox series

[v8,22/50] x86emul: support AVX512DQ packed quad-int/FP conversion insns

Message ID 5C8B83EE020000780021F20B@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich March 15, 2019, 10:52 a.m. UTC
VCVT{,T}PS2QQ, sharing their main opcodes with others, once again need
"manual" overrides of disp8scale.

While not directly related here, also add a scalar variant of to_wint()
to the test harness.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Re-base.
v6: Workaround for gcc 7 quirk.
v5: Re-base over changes earlier in the series.
v4: New.

Comments

Andrew Cooper May 21, 2019, 11:53 a.m. UTC | #1
On 15/03/2019 10:52, Jan Beulich wrote:
> VCVT{,T}PS2QQ, sharing their main opcodes with others, once again need
> "manual" overrides of disp8scale.
>
> While not directly related here, also add a scalar variant of to_wint()
> to the test harness.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -410,8 +410,12 @@  static const struct test avx512dq_all[]
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
     INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
+    INSN(cvtpd2qq,       66,   0f, 7b,   vl,  q, vl),
+    INSN(cvtps2qq,       66,   0f, 7b, vl_2,  d, vl),
     INSN(cvtqq2pd,       f3,   0f, e6,   vl,  q, vl),
     INSN(cvtqq2ps,         ,   0f, 5b,   vl,  q, vl),
+    INSN(cvttpd2qq,      66,   0f, 7a,   vl,  q, vl),
+    INSN(cvttps2qq,      66,   0f, 7a, vl_2,  d, vl),
     INSN_PFP(or,               0f, 56),
 //       pmovd2m,        f3, 0f38, 39,        d
 //       pmovm2,         f3, 0f38, 38,       dq
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -90,14 +90,35 @@  static inline bool _to_bool(byte_vec_t b
 
 #if VEC_SIZE == FLOAT_SIZE
 # define to_int(x) ({ int i_ = (x)[0]; touch(i_); ((vec_t){ i_ }); })
+# ifdef __x86_64__
+#  define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); })
+# endif
 #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
 # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
 #elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if FLOAT_SIZE == 4
 #  define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
+#  ifdef __AVX512DQ__
+#   define to_wint(x) ({ \
+    vsf_half_t t_ = low_half(x); \
+    vdi_t lo_, hi_; \
+    touch(t_); \
+    lo_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+    t_ = high_half(x); \
+    touch(t_); \
+    hi_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+    touch(lo_); touch(hi_); \
+    insert_half(insert_half(undef(), \
+                            BR(cvtqq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
+                BR(cvtqq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
+})
+#  endif
 # elif FLOAT_SIZE == 8
 #  define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
+#  ifdef __AVX512DQ__
+#   define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
+#  endif
 # endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if FLOAT_SIZE == 4
@@ -121,6 +142,21 @@  static inline bool _to_bool(byte_vec_t b
 })
 #endif
 
+#if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__)
+# define low_half(x) (x)
+# define high_half(x) B_(movhlps, , undef(), x)
+/*
+ * GCC 7 (and perhaps earlier) reports a bogus type mismatch for the
+ * conditional expression in insert_pair() below. This no-op wrapper avoids it.
+ */
+static inline vec_t movlhps(vec_t x, vec_t y) {
+    return __builtin_ia32_movlhps(x, y);
+}
+# define insert_pair(x, y, p) \
+    ((p) ? movlhps(x, y) \
+         : ({ vec_t t_ = (x); t_[0] = (y)[0]; t_[1] = (y)[1]; t_; }))
+#endif
+
 #if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
 # define max __builtin_ia32_pfmax
 # define min __builtin_ia32_pfmin
@@ -149,13 +185,16 @@  static inline bool _to_bool(byte_vec_t b
 # if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
      (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
      (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
-#  define low_half(x) ({ \
+#  define _half(x, lh) ({ \
     half_t t_; \
-    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+    asm ( "vextractf%c[w]x%c[n] %[sel], %[s], %[d]" \
           : [d] "=m" (t_) \
-          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+          : [s] "v" (x), [sel] "i" (lh), \
+            [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
     t_; \
 })
+#  define low_half(x)  _half(x, 0)
+#  define high_half(x) _half(x, 1)
 # endif
 # if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
      (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
@@ -1176,6 +1215,13 @@  int simd_test(void)
 
 # endif
 
+# ifdef to_wint
+    touch(src);
+    x = to_wint(src);
+    touch(src);
+    if ( !eq(x, src) ) return __LINE__;
+# endif
+
 # ifdef sqrt
     x = src * src;
     touch(x);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -325,6 +325,8 @@  static const struct twobyte_table {
     [0x77] = { DstImplicit|SrcNone },
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
+    [0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
+    [0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl },
     [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 },
     [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
@@ -3083,6 +3085,12 @@  x86_decode(
                     --disp8scale;
                 break;
 
+            case 0x7a: /* vcvttps2qq needs special casing */
+            case 0x7b: /* vcvtps2qq needs special casing */
+                if ( disp8scale && evex.pfx == vex_66 && !evex.w && !evex.brs )
+                    --disp8scale;
+                break;
+
             case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
                 if ( disp8scale == 2 && evex.pfx == vex_f3 )
                     disp8scale = 3;
@@ -7355,7 +7363,13 @@  x86_emulate(
         if ( evex.pfx != vex_f3 )
             host_and_vcpu_must_have(avx512f);
         else if ( evex.w )
+        {
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7a):   /* vcvttps2qq {x,y}mm/mem,[xyz]mm{k} */
+                                            /* vcvttpd2qq [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7b):   /* vcvtps2qq {x,y}mm/mem,[xyz]mm{k} */
+                                            /* vcvtpd2qq [xyz]mm/mem,[xyz]mm{k} */
             host_and_vcpu_must_have(avx512dq);
+        }
         else
         {
             host_and_vcpu_must_have(avx512f);