[v8,28/50] x86emul: support AVX512F floating point manipulation insns

Message ID	5C8B84C5020000780021F242@prv1-mh.provo.novell.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <xen-devel-bounces@lists.xenproject.org> Message-Id: <5C8B84C5020000780021F242@prv1-mh.provo.novell.com> Date: Fri, 15 Mar 2019 04:56:05 -0600 From: "Jan Beulich" <JBeulich@suse.com> To: "xen-devel" <xen-devel@lists.xenproject.org> References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com> <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> Mime-Version: 1.0 Content-Disposition: inline Subject: [Xen-devel] [PATCH v8 28/50] x86emul: support AVX512F floating point manipulation insns Precedence: list Cc: George Dunlap <George.Dunlap@eu.citrix.com>, Andrew Cooper <andrew.cooper3@citrix.com>, Wei Liu <wei.liu2@citrix.com>, Roger Pau Monne <roger.pau@citrix.com> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" <xen-devel-bounces@lists.xenproject.org>
Series	x86emul: remaining AVX512 support \| expand [v8,00/50] x86emul: remaining AVX512 support [v8,01/50] x86emul: no need to set fault_suppression to false for VMOVNT* [v8,02/50] x86emul: support AVX512{F, BW, DQ} extract insns [v8,03/50] x86emul: support AVX512{F, BW, DQ} insert insns [v8,04/50] x86emul: basic AVX512F testing [v8,05/50] x86emul: support AVX512{F, BW, DQ} integer broadcast insns [v8,06/50] x86emul: basic AVX512VL testing [v8,07/50] x86emul: support AVX512{F, BW} zero- and sign-extending moves [v8,08/50] x86emul: support AVX512{F, BW} down conversion moves [v8,09/50] x86emul: support AVX512{F, BW} integer unpack insns [v8,10/50] x86emul: support AVX512{F, BW, _VBMI} full permute insns [v8,11/50] x86emul: support AVX512{F, BW} integer shuffle insns [v8,12/50] x86emul: support AVX512{BW, DQ} mask move insns [v8,13/50] x86emul: basic AVX512BW testing [v8,14/50] x86emul: basic AVX512DQ testing [v8,15/50] x86emul: support AVX512F move high/low insns [v8,16/50] x86emul: support AVX512F move duplicate insns [v8,17/50] x86emul: support AVX512{F, BW, _VBMI} permute insns [v8,18/50] x86emul: support AVX512BW pack insns [v8,19/50] x86emul: support AVX512F floating-point conversion insns [v8,20/50] x86emul: support AVX512F legacy-equivalent packed int/FP conversion insns [v8,21/50] x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns [v8,22/50] x86emul: support AVX512DQ packed quad-int/FP conversion insns [v8,23/50] x86emul: support AVX512{F, DQ} uint-to-FP conversion insns [v8,24/50] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns [v8,25/50] x86emul: support remaining AVX512F legacy-equivalent insns [v8,26/50] x86emul: support remaining AVX512BW legacy-equivalent insns [v8,27/50] x86emul: support AVX512{F, ER} reciprocal insns [v8,28/50] x86emul: support AVX512F floating point manipulation insns [v8,29/50] x86emul: support AVX512DQ floating point manipulation insns [v8,30/50] x86emul: support AVX512{F, _VBMI2} compress/expand insns [v8,31/50] x86emul: support remaining misc AVX512{F, BW} insns [v8,32/50] x86emul: support AVX512F gather insns [v8,33/50] x86emul: add high register S/G test cases [v8,34/50] x86emul: support AVX512F scatter insns [v8,35/50] x86emul: support AVX512PF insns [v8,36/50] x86emul: support AVX512CD insns [v8,37/50] x86emul: complete support of AVX512_VBMI insns [v8,38/50] x86emul: support of AVX512* population count insns [v8,39/50] x86emul: support of AVX512_IFMA insns [v8,40/50] x86emul: support remaining AVX512_VBMI2 insns [v8,41/50] x86emul: support AVX512_4FMAPS insns [v8,42/50] x86emul: support AVX512_4VNNIW insns [v8,43/50] x86emul: support AVX512_VNNI insns [v8,44/50] x86emul: support VPCLMULQDQ insns [v8,45/50] x86emul: support VAES insns [v8,46/50] x86emul: support GFNI insns [v8,47/50] x86emul: restore ordering within main switch statement [v8,48/50] x86emul: add an AES/VAES test case to the harness [v8,49/50] x86emul: add a SHA test case to the harness [v8,50/50] x86emul: add a PCLMUL/VPCLMUL test case to the harness

--- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -140,6 +140,8 @@ static const struct test avx512f_all[] = INSN(cvtusi2sd, f2, 0f, 7b, el, dq64, el), INSN(cvtusi2ss, f3, 0f, 7b, el, dq64, el), INSN_FP(div, 0f, 5e), + INSN(fixupimm, 66, 0f3a, 54, vl, sd, vl), + INSN(fixupimm, 66, 0f3a, 55, el, sd, el), INSN(fmadd132, 66, 0f38, 98, vl, sd, vl), INSN(fmadd132, 66, 0f38, 99, el, sd, el), INSN(fmadd213, 66, 0f38, a8, vl, sd, vl), @@ -170,6 +172,10 @@ static const struct test avx512f_all[] = INSN(fnmsub213, 66, 0f38, af, el, sd, el), INSN(fnmsub231, 66, 0f38, be, vl, sd, vl), INSN(fnmsub231, 66, 0f38, bf, el, sd, el), + INSN(getexp, 66, 0f38, 42, vl, sd, vl), + INSN(getexp, 66, 0f38, 43, el, sd, el), + INSN(getmant, 66, 0f3a, 26, vl, sd, vl), + INSN(getmant, 66, 0f3a, 27, el, sd, el), INSN_FP(max, 0f, 5f), INSN_FP(min, 0f, 5d), INSN_SFP(mov, 0f, 10), @@ -286,6 +292,8 @@ static const struct test avx512f_all[] = INSN(rndscaless, 66, 0f3a, 0a, el, d, el), INSN(rsqrt14, 66, 0f38, 4e, vl, sd, vl), INSN(rsqrt14, 66, 0f38, 4f, el, sd, el), + INSN(scalef, 66, 0f38, 2c, vl, sd, vl), + INSN(scalef, 66, 0f38, 2d, el, sd, el), INSN_PFP(shuf, 0f, c6), INSN_FP(sqrt, 0f, 51), INSN_FP(sub, 0f, 5c), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -174,6 +174,11 @@ static inline bool _to_bool(byte_vec_t b asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \ (vec_t){ r_[0] }; \ }) +# define scalar_2op(x, y, op) ({ \ + typeof((x)[0]) __attribute__((vector_size(16))) r_ = { x[0] }; \ + asm ( op : [out] "=&x" (r_) : [in1] "[out]" (r_), [in2] "m" (y) ); \ + (vec_t){ r_[0] }; \ +}) #endif #if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__) @@ -210,6 +215,8 @@ static inline vec_t movlhps(vec_t x, vec }) #elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__) # if FLOAT_SIZE == 4 +# define getexp(x) scalar_1op(x, "vgetexpss %[in], %[out], %[out]") +# define getmant(x) scalar_1op(x, "vgetmantss $0, %[in], %[out], %[out]") # ifdef __AVX512ER__ # define recip(x) scalar_1op(x, "vrcp28ss %[in], %[out], %[out]") # define rsqrt(x) scalar_1op(x, "vrsqrt28ss %[in], %[out], %[out]") @@ -217,9 +224,12 @@ static inline vec_t movlhps(vec_t x, vec # define recip(x) scalar_1op(x, "vrcp14ss %[in], %[out], %[out]") # define rsqrt(x) scalar_1op(x, "vrsqrt14ss %[in], %[out], %[out]") # endif +# define scale(x, y) scalar_2op(x, y, "vscalefss %[in2], %[in1], %[out]") # define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]") # define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]") # elif FLOAT_SIZE == 8 +# define getexp(x) scalar_1op(x, "vgetexpsd %[in], %[out], %[out]") +# define getmant(x) scalar_1op(x, "vgetmantsd $0, %[in], %[out], %[out]") # ifdef __AVX512ER__ # define recip(x) scalar_1op(x, "vrcp28sd %[in], %[out], %[out]") # define rsqrt(x) scalar_1op(x, "vrsqrt28sd %[in], %[out], %[out]") @@ -227,6 +237,7 @@ static inline vec_t movlhps(vec_t x, vec # define recip(x) scalar_1op(x, "vrcp14sd %[in], %[out], %[out]") # define rsqrt(x) scalar_1op(x, "vrsqrt14sd %[in], %[out], %[out]") # endif +# define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]") # define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]") # define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]") # endif @@ -274,9 +285,12 @@ static inline vec_t movlhps(vec_t x, vec # define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0) # define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0) # endif +# define getexp(x) BR(getexpps, _mask, x, undef(), ~0) +# define getmant(x) BR(getmantps, _mask, x, 0, undef(), ~0) # define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0) # define min(x, y) BR_(minps, _mask, x, y, undef(), ~0) # define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE)) +# define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0) # if VEC_SIZE == 64 && defined(__AVX512ER__) # define recip(x) BR(rcp28ps, _mask, x, undef(), ~0) # define rsqrt(x) BR(rsqrt28ps, _mask, x, undef(), ~0) @@ -336,9 +350,12 @@ static inline vec_t movlhps(vec_t x, vec # define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0) # define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0) # endif +# define getexp(x) BR(getexppd, _mask, x, undef(), ~0) +# define getmant(x) BR(getmantpd, _mask, x, 0, undef(), ~0) # define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0) # define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0) # define mix(x, y) B(movapd, _mask, x, y, 0b01010101) +# define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0) # if VEC_SIZE == 64 && defined(__AVX512ER__) # define recip(x) BR(rcp28pd, _mask, x, undef(), ~0) # define rsqrt(x) BR(rsqrt28pd, _mask, x, undef(), ~0) @@ -1766,6 +1783,28 @@ int simd_test(void) # endif #endif +#if defined(getexp) && defined(getmant) + touch(src); + x = getmant(src); + touch(src); + y = getexp(src); + touch(src); + for ( j = i = 0; i < ELEM_COUNT; ++i ) + { + if ( y[i] != j ) return __LINE__; + + if ( !((i + 1) & (i + 2)) ) + ++j; + + if ( !(i & (i + 1)) && x[i] != 1 ) return __LINE__; + } +# ifdef scale + touch(y); + z = scale(x, y); + if ( !eq(src, z) ) return __LINE__; +# endif +#endif + #if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \ (defined(__AVX512F__) && defined(FLOAT_SIZE)) return -fma_test(); --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -3924,6 +3924,44 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing vfixupimmpd $0,8(%edx){1to8},%zmm3,%zmm4..."); + if ( stack_exec && cpu_has_avx512f ) + { + decl_insn(vfixupimmpd); + static const struct { + double d[4]; + } + src = { { -1, 0, 1, 2 } }, + dst = { { 3, 4, 5, 6 } }, + out = { { .5, -1, 90, 2 } }; + + asm volatile ( "vbroadcastf64x4 %1, %%zmm3\n\t" + "vbroadcastf64x4 %2, %%zmm4\n" + put_insn(vfixupimmpd, + "vfixupimmpd $0, 8(%0)%{1to8%}, %%zmm3, %%zmm4") + :: "d" (NULL), "m" (src), "m" (dst) ); + + set_insn(vfixupimmpd); + /* + * Nibble (token) mapping (unused ones simply set to zero): + * 2 (ZERO) -> -1 (0x9) + * 3 (POS_ONE) -> 90 (0xc) + * 6 (NEG) -> 1/2 (0xb) + * 7 (POS) -> src (0x1) + */ + res[2] = 0x1b00c900; + regs.edx = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + asm volatile ( "vmovupd %%zmm4, %0" : "=m" (res[0]) ); + if ( rc != X86EMUL_OKAY || !check_eip(vfixupimmpd) || + memcmp(res + 0, &out, sizeof(out)) || + memcmp(res + 8, &out, sizeof(out)) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + #undef decl_insn #undef put_insn #undef set_insn --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -459,7 +459,8 @@ static const struct ext0f38_table { [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl }, [0x2b] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, - [0x2c ... 0x2d] = { .simd_size = simd_packed_fp }, + [0x2c] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, + [0x2d] = { .simd_size = simd_packed_fp, .d8s = d8s_dq }, [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 }, [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 }, [0x31] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 }, @@ -470,6 +471,8 @@ static const struct ext0f38_table { [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x41] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0x42] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, + [0x43] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq }, [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x4c] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq }, @@ -563,6 +566,8 @@ static const struct ext0f3a_table { [0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 }, [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0x26] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, + [0x27] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq }, [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 }, [0x38] = { .simd_size = simd_128, .d8s = 4 }, [0x3a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 }, @@ -577,6 +582,8 @@ static const struct ext0f3a_table { [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x54] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, + [0x55] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq }, [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 }, [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 }, @@ -2684,6 +2691,10 @@ x86_decode_0f38( ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); break; + case X86EMUL_OPC_EVEX_66(0, 0x2d): /* vscalefs{s,d} */ + state->simd_size = simd_scalar_vexw; + break; + case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */ case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */ case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */ @@ -9095,6 +9106,8 @@ x86_emulate( host_and_vcpu_must_have(fma); goto simd_0f_ymm; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x2c): /* vscalefp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x42): /* vgetexpp{s,d} [xyz]mm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ @@ -9118,6 +9131,8 @@ x86_emulate( avx512_vlen_check(false); goto simd_zmm; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x2d): /* vscalefs{s,d} xmm/mem,xmm,xmm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x43): /* vgetexps{s,d} xmm/mem,xmm,xmm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */ @@ -9681,6 +9696,21 @@ x86_emulate( op_bytes = 4; goto simd_imm8_zmm; + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x26): /* vgetmantp{s,d} $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x54): /* vfixupimmp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + host_and_vcpu_must_have(avx512f); + if ( ea.type != OP_REG || !evex.brs ) + avx512_vlen_check(false); + goto simd_imm8_zmm; + + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x27): /* vgetmants{s,d} $imm8,xmm/mem,xmm,xmm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x55): /* vfixupimms{s,d} $imm8,xmm/mem,xmm,xmm{k} */ + host_and_vcpu_must_have(avx512f); + generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD); + if ( !evex.brs ) + avx512_vlen_check(true); + goto simd_imm8_zmm; + case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */ if ( !vex.w )

[v8,28/50] x86emul: support AVX512F floating point manipulation insns

Commit Message

Comments

Patch