From patchwork Fri Mar 15 10:52:30 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jan Beulich X-Patchwork-Id: 10854489 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 5939513B5 for ; Fri, 15 Mar 2019 10:54:16 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 3DFA12908D for ; Fri, 15 Mar 2019 10:54:16 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 2F81C291B4; Fri, 15 Mar 2019 10:54:16 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120]) (using TLSv1.2 with cipher AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id A34EE2908D for ; Fri, 15 Mar 2019 10:54:15 +0000 (UTC) Received: from localhost ([127.0.0.1] helo=lists.xenproject.org) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kS2-0006aX-HJ; Fri, 15 Mar 2019 10:52:34 +0000 Received: from us1-rack-dfw2.inumbo.com ([104.130.134.6]) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kS0-0006aM-Qr for xen-devel@lists.xenproject.org; Fri, 15 Mar 2019 10:52:32 +0000 X-Inumbo-ID: 6eb0479b-4710-11e9-bc90-bc764e045a96 Received: from prv1-mh.provo.novell.com (unknown [137.65.248.33]) by us1-rack-dfw2.inumbo.com (Halon) with ESMTPS id 6eb0479b-4710-11e9-bc90-bc764e045a96; Fri, 15 Mar 2019 10:52:31 +0000 (UTC) Received: from INET-PRV1-MTA by prv1-mh.provo.novell.com with Novell_GroupWise; Fri, 15 Mar 2019 04:52:30 -0600 Message-Id: <5C8B83EE020000780021F20B@prv1-mh.provo.novell.com> X-Mailer: Novell GroupWise Internet Agent 18.1.0 Date: Fri, 15 Mar 2019 04:52:30 -0600 From: "Jan Beulich" To: "xen-devel" References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com> <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> Mime-Version: 1.0 Content-Disposition: inline Subject: [Xen-devel] [PATCH v8 22/50] x86emul: support AVX512DQ packed quad-int/FP conversion insns X-BeenThere: xen-devel@lists.xenproject.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Xen developer discussion List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Cc: George Dunlap , Andrew Cooper , Wei Liu , Roger Pau Monne Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" X-Virus-Scanned: ClamAV using ClamSMTP VCVT{,T}PS2QQ, sharing their main opcodes with others, once again need "manual" overrides of disp8scale. While not directly related here, also add a scalar variant of to_wint() to the test harness. Signed-off-by: Jan Beulich Acked-by: Andrew Cooper --- v7: Re-base. v6: Workaround for gcc 7 quirk. v5: Re-base over changes earlier in the series. v4: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -410,8 +410,12 @@ static const struct test avx512dq_all[] INSN_PFP(and, 0f, 54), INSN_PFP(andn, 0f, 55), INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl), + INSN(cvtpd2qq, 66, 0f, 7b, vl, q, vl), + INSN(cvtps2qq, 66, 0f, 7b, vl_2, d, vl), INSN(cvtqq2pd, f3, 0f, e6, vl, q, vl), INSN(cvtqq2ps, , 0f, 5b, vl, q, vl), + INSN(cvttpd2qq, 66, 0f, 7a, vl, q, vl), + INSN(cvttps2qq, 66, 0f, 7a, vl_2, d, vl), INSN_PFP(or, 0f, 56), // pmovd2m, f3, 0f38, 39, d // pmovm2, f3, 0f38, 38, dq --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -90,14 +90,35 @@ static inline bool _to_bool(byte_vec_t b #if VEC_SIZE == FLOAT_SIZE # define to_int(x) ({ int i_ = (x)[0]; touch(i_); ((vec_t){ i_ }); }) +# ifdef __x86_64__ +# define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); }) +# endif #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__) # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x)) #elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \ (VEC_SIZE == 64 || defined(__AVX512VL__)) # if FLOAT_SIZE == 4 # define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0) +# ifdef __AVX512DQ__ +# define to_wint(x) ({ \ + vsf_half_t t_ = low_half(x); \ + vdi_t lo_, hi_; \ + touch(t_); \ + lo_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \ + t_ = high_half(x); \ + touch(t_); \ + hi_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \ + touch(lo_); touch(hi_); \ + insert_half(insert_half(undef(), \ + BR(cvtqq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \ + BR(cvtqq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \ +}) +# endif # elif FLOAT_SIZE == 8 # define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0) +# ifdef __AVX512DQ__ +# define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0) +# endif # endif #elif VEC_SIZE == 16 && defined(__SSE2__) # if FLOAT_SIZE == 4 @@ -121,6 +142,21 @@ static inline bool _to_bool(byte_vec_t b }) #endif +#if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__) +# define low_half(x) (x) +# define high_half(x) B_(movhlps, , undef(), x) +/* + * GCC 7 (and perhaps earlier) report a bogus type mismatch for the conditional + * expression below. All works well with this no-op wrapper. + */ +static inline vec_t movlhps(vec_t x, vec_t y) { + return __builtin_ia32_movlhps(x, y); +} +# define insert_pair(x, y, p) \ + ((p) ? movlhps(x, y) \ + : ({ vec_t t_ = (x); t_[0] = (y)[0]; t_[1] = (y)[1]; t_; })) +#endif + #if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__) # define max __builtin_ia32_pfmax # define min __builtin_ia32_pfmin @@ -149,13 +185,16 @@ static inline bool _to_bool(byte_vec_t b # if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \ (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \ (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */ -# define low_half(x) ({ \ +# define _half(x, lh) ({ \ half_t t_; \ - asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \ + asm ( "vextractf%c[w]x%c[n] %[sel], %[s], %[d]" \ : [d] "=m" (t_) \ - : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \ + : [s] "v" (x), [sel] "i" (lh), \ + [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \ t_; \ }) +# define low_half(x) _half(x, 0) +# define high_half(x) _half(x, 1) # endif # if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \ (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */ @@ -1176,6 +1215,13 @@ int simd_test(void) # endif +# ifdef to_wint + touch(src); + x = to_wint(src); + touch(src); + if ( !eq(x, src) ) return __LINE__; +# endif + # ifdef sqrt x = src * src; touch(x); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -325,6 +325,8 @@ static const struct twobyte_table { [0x77] = { DstImplicit|SrcNone }, [0x78] = { ImplicitOps|ModRM }, [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int }, + [0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl }, + [0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl }, [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other }, [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 }, [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl }, @@ -3083,6 +3085,12 @@ x86_decode( --disp8scale; break; + case 0x7a: /* vcvttps2qq needs special casing */ + case 0x7b: /* vcvtps2qq needs special casing */ + if ( disp8scale && evex.pfx == vex_66 && !evex.w && !evex.brs ) + --disp8scale; + break; + case 0x7e: /* vmovq xmm/m64,xmm needs special casing */ if ( disp8scale == 2 && evex.pfx == vex_f3 ) disp8scale = 3; @@ -7355,7 +7363,13 @@ x86_emulate( if ( evex.pfx != vex_f3 ) host_and_vcpu_must_have(avx512f); else if ( evex.w ) + { + case X86EMUL_OPC_EVEX_66(0x0f, 0x7a): /* vcvttps2qq {x,y}mm/mem,[xyz]mm{k} */ + /* vcvttpd2qq [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f, 0x7b): /* vcvtps2qq {x,y}mm/mem,[xyz]mm{k} */ + /* vcvtpd2qq [xyz]mm/mem,[xyz]mm{k} */ host_and_vcpu_must_have(avx512dq); + } else { host_and_vcpu_must_have(avx512f);