[v8,24/50] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns

Message ID	5C8B844B020000780021F211@prv1-mh.provo.novell.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <xen-devel-bounces@lists.xenproject.org> Message-Id: <5C8B844B020000780021F211@prv1-mh.provo.novell.com> Date: Fri, 15 Mar 2019 04:54:03 -0600 From: "Jan Beulich" <JBeulich@suse.com> To: "xen-devel" <xen-devel@lists.xenproject.org> References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com> <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> Mime-Version: 1.0 Content-Disposition: inline Subject: [Xen-devel] [PATCH v8 24/50] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns Precedence: list Cc: George Dunlap <George.Dunlap@eu.citrix.com>, Andrew Cooper <andrew.cooper3@citrix.com>, Wei Liu <wei.liu2@citrix.com>, Roger Pau Monne <roger.pau@citrix.com> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" <xen-devel-bounces@lists.xenproject.org>
Series	x86emul: remaining AVX512 support \| expand [v8,00/50] x86emul: remaining AVX512 support [v8,01/50] x86emul: no need to set fault_suppression to false for VMOVNT* [v8,02/50] x86emul: support AVX512{F, BW, DQ} extract insns [v8,03/50] x86emul: support AVX512{F, BW, DQ} insert insns [v8,04/50] x86emul: basic AVX512F testing [v8,05/50] x86emul: support AVX512{F, BW, DQ} integer broadcast insns [v8,06/50] x86emul: basic AVX512VL testing [v8,07/50] x86emul: support AVX512{F, BW} zero- and sign-extending moves [v8,08/50] x86emul: support AVX512{F, BW} down conversion moves [v8,09/50] x86emul: support AVX512{F, BW} integer unpack insns [v8,10/50] x86emul: support AVX512{F, BW, _VBMI} full permute insns [v8,11/50] x86emul: support AVX512{F, BW} integer shuffle insns [v8,12/50] x86emul: support AVX512{BW, DQ} mask move insns [v8,13/50] x86emul: basic AVX512BW testing [v8,14/50] x86emul: basic AVX512DQ testing [v8,15/50] x86emul: support AVX512F move high/low insns [v8,16/50] x86emul: support AVX512F move duplicate insns [v8,17/50] x86emul: support AVX512{F, BW, _VBMI} permute insns [v8,18/50] x86emul: support AVX512BW pack insns [v8,19/50] x86emul: support AVX512F floating-point conversion insns [v8,20/50] x86emul: support AVX512F legacy-equivalent packed int/FP conversion insns [v8,21/50] x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns [v8,22/50] x86emul: support AVX512DQ packed quad-int/FP conversion insns [v8,23/50] x86emul: support AVX512{F, DQ} uint-to-FP conversion insns [v8,24/50] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns [v8,25/50] x86emul: support remaining AVX512F legacy-equivalent insns [v8,26/50] x86emul: support remaining AVX512BW legacy-equivalent insns [v8,27/50] x86emul: support AVX512{F, ER} reciprocal insns [v8,28/50] x86emul: support AVX512F floating point manipulation insns [v8,29/50] x86emul: support AVX512DQ floating point manipulation insns [v8,30/50] x86emul: support AVX512{F, _VBMI2} compress/expand insns [v8,31/50] x86emul: support remaining misc AVX512{F, BW} insns [v8,32/50] x86emul: support AVX512F gather insns [v8,33/50] x86emul: add high register S/G test cases [v8,34/50] x86emul: support AVX512F scatter insns [v8,35/50] x86emul: support AVX512PF insns [v8,36/50] x86emul: support AVX512CD insns [v8,37/50] x86emul: complete support of AVX512_VBMI insns [v8,38/50] x86emul: support of AVX512* population count insns [v8,39/50] x86emul: support of AVX512_IFMA insns [v8,40/50] x86emul: support remaining AVX512_VBMI2 insns [v8,41/50] x86emul: support AVX512_4FMAPS insns [v8,42/50] x86emul: support AVX512_4VNNIW insns [v8,43/50] x86emul: support AVX512_VNNI insns [v8,44/50] x86emul: support VPCLMULQDQ insns [v8,45/50] x86emul: support VAES insns [v8,46/50] x86emul: support GFNI insns [v8,47/50] x86emul: restore ordering within main switch statement [v8,48/50] x86emul: add an AES/VAES test case to the harness [v8,49/50] x86emul: add a SHA test case to the harness [v8,50/50] x86emul: add a PCLMUL/VPCLMUL test case to the harness

--- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -112,21 +112,29 @@ static const struct test avx512f_all[] = INSN(cvtdq2pd, f3, 0f, e6, vl_2, d, vl), INSN(cvtdq2ps, , 0f, 5b, vl, d, vl), INSN(cvtpd2dq, f2, 0f, e6, vl, q, vl), + INSN(cvtpd2udq, , 0f, 79, vl, q, vl), INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl), INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl), INSN(cvtps2dq, 66, 0f, 5b, vl, d, vl), INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl), INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl), + INSN(cvtps2udq, , 0f, 79, vl, d, vl), INSN(cvtsd2si, f2, 0f, 2d, el, q, el), + INSN(cvtsd2usi, f2, 0f, 79, el, q, el), INSN(cvtsd2ss, f2, 0f, 5a, el, q, el), INSN(cvtsi2sd, f2, 0f, 2a, el, dq64, el), INSN(cvtsi2ss, f3, 0f, 2a, el, dq64, el), INSN(cvtss2sd, f3, 0f, 5a, el, d, el), INSN(cvtss2si, f3, 0f, 2d, el, d, el), + INSN(cvtss2usi, f3, 0f, 79, el, d, el), INSN(cvttpd2dq, 66, 0f, e6, vl, q, vl), + INSN(cvttpd2udq, , 0f, 78, vl, q, vl), INSN(cvttps2dq, f3, 0f, 5b, vl, d, vl), + INSN(cvttps2udq, , 0f, 78, vl, d, vl), INSN(cvttsd2si, f2, 0f, 2c, el, q, el), + INSN(cvttsd2usi, f2, 0f, 78, el, q, el), INSN(cvttss2si, f3, 0f, 2c, el, d, el), + INSN(cvttss2usi, f3, 0f, 78, el, d, el), INSN(cvtudq2pd, f3, 0f, 7a, vl_2, d, vl), INSN(cvtudq2ps, f2, 0f, 7a, vl, d, vl), INSN(cvtusi2sd, f2, 0f, 7b, el, dq64, el), @@ -415,11 +423,15 @@ static const struct test avx512dq_all[] INSN_PFP(andn, 0f, 55), INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl), INSN(cvtpd2qq, 66, 0f, 7b, vl, q, vl), + INSN(cvtpd2uqq, 66, 0f, 79, vl, q, vl), INSN(cvtps2qq, 66, 0f, 7b, vl_2, d, vl), + INSN(cvtps2uqq, 66, 0f, 79, vl_2, d, vl), INSN(cvtqq2pd, f3, 0f, e6, vl, q, vl), INSN(cvtqq2ps, , 0f, 5b, vl, q, vl), INSN(cvttpd2qq, 66, 0f, 7a, vl, q, vl), + INSN(cvttpd2uqq, 66, 0f, 78, vl, q, vl), INSN(cvttps2qq, 66, 0f, 7a, vl_2, d, vl), + INSN(cvttps2uqq, 66, 0f, 78, vl_2, d, vl), INSN(cvtuqq2pd, f3, 0f, 7a, vl, q, vl), INSN(cvtuqq2ps, f2, 0f, 7a, vl, q, vl), INSN_PFP(or, 0f, 56), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -93,31 +93,65 @@ static inline bool _to_bool(byte_vec_t b # ifdef __x86_64__ # define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); }) # endif +# ifdef __AVX512F__ +/* + * Sadly even gcc 9.x, at the time of writing, does not carry out at least + * uint -> FP conversions using VCVTUSI2S{S,D}, so we need to use builtins + * or inline assembly here. The full-vector parameter types of the builtins + * aren't very helpful for our purposes, so use inline assembly. + */ +# if FLOAT_SIZE == 4 +# define to_u_int(type, x) ({ \ + unsigned type u_; \ + float __attribute__((vector_size(16))) t_; \ + asm ( "vcvtss2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \ + asm ( "vcvtusi2ss%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \ + (vec_t){ t_[0] }; \ +}) +# elif FLOAT_SIZE == 8 +# define to_u_int(type, x) ({ \ + unsigned type u_; \ + double __attribute__((vector_size(16))) t_; \ + asm ( "vcvtsd2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \ + asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \ + (vec_t){ t_[0] }; \ +}) +# endif +# define to_uint(x) to_u_int(int, x) +# ifdef __x86_64__ +# define to_uwint(x) to_u_int(long, x) +# endif +# endif #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__) # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x)) #elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \ (VEC_SIZE == 64 || defined(__AVX512VL__)) # if FLOAT_SIZE == 4 # define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0) +# define to_uint(x) BR(cvtudq2ps, _mask, BR(cvtps2udq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0) # ifdef __AVX512DQ__ -# define to_wint(x) ({ \ +# define to_w_int(x, s) ({ \ vsf_half_t t_ = low_half(x); \ vdi_t lo_, hi_; \ touch(t_); \ - lo_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \ + lo_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \ t_ = high_half(x); \ touch(t_); \ - hi_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \ + hi_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \ touch(lo_); touch(hi_); \ insert_half(insert_half(undef(), \ - BR(cvtqq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \ - BR(cvtqq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \ + BR(cvt ## s ## qq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \ + BR(cvt ## s ## qq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \ }) +# define to_wint(x) to_w_int(x, ) +# define to_uwint(x) to_w_int(x, u) # endif # elif FLOAT_SIZE == 8 # define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0) +# define to_uint(x) B(cvtudq2pd, _mask, BR(cvtpd2udq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0) # ifdef __AVX512DQ__ # define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0) +# define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0) # endif # endif #elif VEC_SIZE == 16 && defined(__SSE2__) @@ -1221,6 +1255,20 @@ int simd_test(void) touch(src); if ( !eq(x, src) ) return __LINE__; # endif + +# ifdef to_uint + touch(src); + x = to_uint(src); + touch(src); + if ( !eq(x, src) ) return __LINE__; +# endif + +# ifdef to_uwint + touch(src); + x = to_uwint(src); + touch(src); + if ( !eq(x, src) ) return __LINE__; +# endif # ifdef sqrt x = src * src; --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -323,8 +323,7 @@ static const struct twobyte_table { [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl }, [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl }, [0x77] = { DstImplicit|SrcNone }, - [0x78] = { ImplicitOps|ModRM }, - [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int }, + [0x78 ... 0x79] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl }, [0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl }, [0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_dq64 }, [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other }, @@ -2523,6 +2522,8 @@ x86_decode_twobyte( break; case 0x78: + state->desc = ImplicitOps; + state->simd_size = simd_none; switch ( vex.pfx ) { case vex_66: /* extrq $imm8, $imm8, xmm */ @@ -2535,7 +2536,7 @@ x86_decode_twobyte( case 0x10 ... 0x18: case 0x28 ... 0x2f: case 0x50 ... 0x77: - case 0x79 ... 0x7d: + case 0x7a ... 0x7d: case 0x7f: case 0xc2 ... 0xc3: case 0xc5 ... 0xc6: @@ -2557,6 +2558,12 @@ x86_decode_twobyte( op_bytes = mode_64bit() ? 8 : 4; break; + case 0x79: + state->desc = DstReg | SrcMem; + state->simd_size = simd_packed_int; + ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); + break; + case 0x7e: ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */ @@ -3074,6 +3081,18 @@ x86_decode( modrm_mod = 3; break; + case 0x78: + case 0x79: + if ( !evex.pfx ) + break; + /* vcvt{,t}ps2uqq need special casing */ + if ( evex.pfx == vex_66 ) + { + if ( !evex.w && !evex.brs ) + --disp8scale; + break; + } + /* vcvt{,t}s{s,d}2usi need special casing: fall through */ case 0x2c: /* vcvtts{s,d}2si need special casing */ case 0x2d: /* vcvts{s,d}2si need special casing */ if ( evex_encoded() ) @@ -6329,6 +6348,8 @@ x86_emulate( CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */ CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */ generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk || (ea.type != OP_REG && evex.brs)), EXC_UD); @@ -6690,7 +6711,11 @@ x86_emulate( if ( evex.w ) host_and_vcpu_must_have(avx512dq); else + { + case X86EMUL_OPC_EVEX(0x0f, 0x78): /* vcvttp{s,d}2udq [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(0x0f, 0x79): /* vcvtp{s,d}2udq [xyz]mm/mem,[xyz]mm{k} */ host_and_vcpu_must_have(avx512f); + } if ( ea.type != OP_REG || !evex.brs ) avx512_vlen_check(false); d |= TwoOp; @@ -7373,6 +7398,10 @@ x86_emulate( host_and_vcpu_must_have(avx512f); else if ( evex.w ) { + case X86EMUL_OPC_EVEX_66(0x0f, 0x78): /* vcvttps2uqq {x,y}mm/mem,[xyz]mm{k} */ + /* vcvttpd2uqq [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f, 0x79): /* vcvtps2uqq {x,y}mm/mem,[xyz]mm{k} */ + /* vcvtpd2uqq [xyz]mm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f, 0x7a): /* vcvttps2qq {x,y}mm/mem,[xyz]mm{k} */ /* vcvttpd2qq [xyz]mm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f, 0x7b): /* vcvtps2qq {x,y}mm/mem,[xyz]mm{k} */

[v8,24/50] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns

Commit Message

Comments

Patch