[v8,27/50] x86emul: support AVX512{F,ER} reciprocal insns

Message ID: 5C8B84A8020000780021F23F@prv1-mh.provo.novell.com
State: New, archived
Series: x86emul: remaining AVX512 support

Commit Message

Jan Beulich March 15, 2019, 10:55 a.m. UTC
Also include the only other AVX512ER insn pair, VEXP2P{D,S}.

Note that despite the replacement of the SHA insns' table slots there's
no need to special case their decoding: Their insn-specific code already
sets op_bytes (as was required due to simd_other), and TwoOp is of no
relevance for legacy encoded SIMD insns.

The raising of #UD when EVEX.L'L is 3 for AVX512ER scalar insns is done
to be on the safe side. The SDM does not clarify behavior there, and
it's even more ambiguous here (without AVX512VL in the picture).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Fix vector length check for AVX512ER insns. ea.type == OP_* ->
    ea.type != OP_*. Re-base.
v6: Re-base. AVX512ER tests now also successfully run.
v5: New.
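For reference, the EVEX prefix is the four bytes 0x62, P0, P1, P2, and the
L'L field discussed above occupies bits 6:5 of P2: values 0/1/2 select
128/256/512-bit vector length (or encode rounding control when EVEX.b is
set), while 3 is reserved. A minimal sketch of extracting the field -- not
part of the patch, assuming only a C99 compiler:

#include <stdint.h>
#include <stdio.h>

/* Return EVEX.L'L (bits 6:5 of P2, the prefix's fourth byte), or ~0u
 * if the buffer does not start with an EVEX prefix. */
static unsigned int evex_ll(const uint8_t *insn)
{
    return insn[0] == 0x62 ? (insn[3] >> 5) & 3 : ~0u;
}

int main(void)
{
    /* vrcp14sd %xmm0,%xmm0,%xmm0, hand-encoded with L'L forced to 3. */
    static const uint8_t bad[] = { 0x62, 0xf2, 0xfd, 0x68, 0x4d, 0xc0 };

    printf("EVEX.L'L = %u\n", evex_ll(bad)); /* prints 3 */
    return 0;
}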

Comments

Andrew Cooper May 23, 2019, 4:15 p.m. UTC | #1
On 15/03/2019 10:55, Jan Beulich wrote:
> [...]
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Seeing as I have some ER hardware, is there an easy way to get
GCC/binutils to emit a weird L'L field, or will this involve some manual
opcode generation to test?

~Andrew
Jan Beulich May 24, 2019, 6:43 a.m. UTC | #2
>>> On 23.05.19 at 18:15, <andrew.cooper3@citrix.com> wrote:
> On 15/03/2019 10:55, Jan Beulich wrote:
>> [...]
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, also for the others.

> Seeing as I have some ER hardware, is there an easy way to get
> GCC/binutils to emit a weird L'L field, or will this involve some manual
> opcode generation to test?

gcc does not provide any control at all, afaict. binutils allows "weird"
VEX.L or EVEX.L'L only for insns it believes ignore that field. So yes,
I'm afraid this will involve using .byte.

Jan
Andrew Cooper May 24, 2019, 8:48 p.m. UTC | #3
On 24/05/2019 07:43, Jan Beulich wrote:
>>>> On 23.05.19 at 18:15, <andrew.cooper3@citrix.com> wrote:
>> On 15/03/2019 10:55, Jan Beulich wrote:
>>> [...]
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Thanks, also for the others.
>
>> Seeing as I have some ER hardware, is there an easy way to get
>> GCC/binutils to emit a weird L'L field, or will this involve some manual
>> opcode generation to test?
> gcc does not provide any control at all, afaict. binutils allows "weird"
> VEX.L or EVEX.L'L only for insns it believes ignore that field. So yes,
> I'm afraid this will involve using .byte.

Ok.  Given a test program of:

#include <stdio.h>

int main(void)
{
    printf("Real:\n");     /* assembler-encoded, EVEX P2 = 0x08 (L'L = 0) */
    asm volatile ("vrcp14sd %xmm0,%xmm0,%xmm0");

    printf("Bytes:\n");    /* the same insn, hand-encoded */
    asm volatile (".byte 0x62, 0xf2, 0xfd, 0x08, 0x4d, 0xc0");

    printf("Bad 0x28:\n"); /* L'L = 1 */
    asm volatile (".byte 0x62, 0xf2, 0xfd, 0x28, 0x4d, 0xc0");

    printf("Bad 0x48:\n"); /* L'L = 2 */
    asm volatile (".byte 0x62, 0xf2, 0xfd, 0x48, 0x4d, 0xc0");

    printf("Bad 0x68:\n"); /* L'L = 3, reserved */
    asm volatile (".byte 0x62, 0xf2, 0xfd, 0x68, 0x4d, 0xc0");
}

Then the L'L = 3 case (0x68 at the end) does indeed take #UD for both
KNL and KNM.

~Andrew
Jan Beulich May 27, 2019, 8:02 a.m. UTC | #4
>>> On 24.05.19 at 22:48, <andrew.cooper3@citrix.com> wrote:
> On 24/05/2019 07:43, Jan Beulich wrote:
>>>>> On 23.05.19 at 18:15, <andrew.cooper3@citrix.com> wrote:
>>> On 15/03/2019 10:55, Jan Beulich wrote:
>>>> [...]
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Thanks, also for the others.
>>
>>> Seeing as I have some ER hardware, is there an easy way to get
>>> GCC/binutils to emit a weird L'L field, or will this involve some manual
>>> opcode generation to test?
>> gcc does not provide any control at all, afaict. binutils allows "weird"
>> VEX.L or EVEX.L'L only for insns it believes ignore that field. So yes,
>> I'm afraid this will involve using .byte.
> 
> Ok.  Given a test program of:
> 
> [...]
> 
> Then the L'L = 3 case (0x68 at the end) does indeed take #UD for both
> KNL and KNM.

And by implication I take it that the L'L=1 and L'L=2 cases indeed do not
#UD there?

Thanks for having tried this out,
Jan
Andrew Cooper May 29, 2019, 10 a.m. UTC | #5
On 27/05/2019 09:02, Jan Beulich wrote:
>>>> On 24.05.19 at 22:48, <andrew.cooper3@citrix.com> wrote:
>> On 24/05/2019 07:43, Jan Beulich wrote:
>>>>>> On 23.05.19 at 18:15, <andrew.cooper3@citrix.com> wrote:
>>>> On 15/03/2019 10:55, Jan Beulich wrote:
>>>>> [...]
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Thanks, also for the others.
>>>
>>>> Seeing as I have some ER hardware, is there an easy way to get
>>>> GCC/binutils to emit a weird L'L field, or will this involve some manual
>>>> opcode generation to test?
>>> gcc does not provide any control at all, afaict. binutils allows "weird"
>>> VEX.L or EVEX.L'L only for insns it believes ignore that field. So yes,
>>> I'm afraid this will involve using .byte.
>> Ok.  Given a test program of:
>>
>> [...]
>>
>> Then the L'L = 3 case (0x68 at the end) does indeed take #UD for both
>> KNL and KNM.
> And by implication I take it that the L'L=1 and L'L=2 cases indeed do not
> #UD there?

Correct.  It would appear that, unhelpfully, the "L'L == 3 is reserved"
rule takes precedence over the "L'L is ignored" rule for this class
of instruction.

~Andrew
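The behaviour measured above matches the checks the patch adds below:
packed ER insns demand a 512-bit length (or a register source with SAE),
while scalar ones only reject the reserved L'L value. A standalone
paraphrase of that logic -- a hypothetical helper, not the emulator's
actual code:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical paraphrase (not the emulator's code) of the #UD
 * conditions added for AVX512ER encodings. */
static bool avx512er_should_ud(unsigned int ll, bool evex_b,
                               bool src_is_reg, bool scalar)
{
    if ( scalar )
        /* SAE requires a register source; otherwise only the
         * reserved L'L == 3 faults, as measured on KNL/KNM. */
        return evex_b ? !src_is_reg : ll == 3;

    /* Packed forms: 512-bit only, except register-source SAE. */
    return (!src_is_reg || !evex_b) && ll != 2;
}

int main(void)
{
    printf("scalar, L'L=3: %s\n",
           avx512er_should_ud(3, false, true, true) ? "#UD" : "ok");
    return 0;
}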

Patch

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@  vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -72,6 +72,9 @@  avx512bw-flts :=
 avx512dq-vecs := $(avx512f-vecs)
 avx512dq-ints := $(avx512f-ints)
 avx512dq-flts := $(avx512f-flts)
+avx512er-vecs := 64
+avx512er-ints :=
+avx512er-flts := 4 8
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1 2
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -278,10 +278,14 @@  static const struct test avx512f_all[] =
     INSN(punpckldq,    66,   0f, 62,    vl,      d, vl),
     INSN(punpcklqdq,   66,   0f, 6c,    vl,      q, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
+    INSN(rcp14,        66, 0f38, 4c,    vl,     sd, vl),
+    INSN(rcp14,        66, 0f38, 4d,    el,     sd, el),
     INSN(rndscalepd,   66, 0f3a, 09,    vl,      q, vl),
     INSN(rndscaleps,   66, 0f3a, 08,    vl,      d, vl),
     INSN(rndscalesd,   66, 0f3a, 0b,    el,      q, el),
     INSN(rndscaless,   66, 0f3a, 0a,    el,      d, el),
+    INSN(rsqrt14,      66, 0f38, 4e,    vl,     sd, vl),
+    INSN(rsqrt14,      66, 0f38, 4f,    el,     sd, el),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
@@ -477,6 +481,14 @@  static const struct test avx512dq_512[]
     INSN(inserti32x8,    66, 0f3a, 3a, el_8, d, vl),
 };
 
+static const struct test avx512er_512[] = {
+    INSN(exp2,    66, 0f38, c8, vl, sd, vl),
+    INSN(rcp28,   66, 0f38, ca, vl, sd, vl),
+    INSN(rcp28,   66, 0f38, cb, el, sd, el),
+    INSN(rsqrt28, 66, 0f38, cc, vl, sd, vl),
+    INSN(rsqrt28, 66, 0f38, cd, el, sd, el),
+};
+
 static const struct test avx512_vbmi_all[] = {
     INSN(permb,         66, 0f38, 8d, vl, b, vl),
     INSN(permi2b,       66, 0f38, 75, vl, b, vl),
@@ -837,5 +849,6 @@  void evex_disp8_test(void *instr, struct
     RUN(avx512dq, 128);
     RUN(avx512dq, no128);
     RUN(avx512dq, 512);
+    RUN(avx512er, 512);
     RUN(avx512_vbmi, all);
 }
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -210,9 +210,23 @@  static inline vec_t movlhps(vec_t x, vec
 })
 #elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
 # if FLOAT_SIZE == 4
+#  ifdef __AVX512ER__
+#   define recip(x) scalar_1op(x, "vrcp28ss %[in], %[out], %[out]")
+#   define rsqrt(x) scalar_1op(x, "vrsqrt28ss %[in], %[out], %[out]")
+#  else
+#   define recip(x) scalar_1op(x, "vrcp14ss %[in], %[out], %[out]")
+#   define rsqrt(x) scalar_1op(x, "vrsqrt14ss %[in], %[out], %[out]")
+#  endif
 #  define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
 #  define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]")
 # elif FLOAT_SIZE == 8
+#  ifdef __AVX512ER__
+#   define recip(x) scalar_1op(x, "vrcp28sd %[in], %[out], %[out]")
+#   define rsqrt(x) scalar_1op(x, "vrsqrt28sd %[in], %[out], %[out]")
+#  else
+#   define recip(x) scalar_1op(x, "vrcp14sd %[in], %[out], %[out]")
+#   define rsqrt(x) scalar_1op(x, "vrsqrt14sd %[in], %[out], %[out]")
+#  endif
 #  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
 #  define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
 # endif
@@ -263,6 +277,13 @@  static inline vec_t movlhps(vec_t x, vec
 #  define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#  if VEC_SIZE == 64 && defined(__AVX512ER__)
+#   define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
+#   define rsqrt(x) BR(rsqrt28ps, _mask, x, undef(), ~0)
+#  else
+#   define recip(x) B(rcp14ps, _mask, x, undef(), ~0)
+#   define rsqrt(x) B(rsqrt14ps, _mask, x, undef(), ~0)
+#  endif
 #  define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
 #  define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
 #  define trunc(x) BR(rndscaleps_, _mask, x, 0b1011, undef(), ~0)
@@ -318,6 +339,13 @@  static inline vec_t movlhps(vec_t x, vec
 #  define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+#  if VEC_SIZE == 64 && defined(__AVX512ER__)
+#   define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
+#   define rsqrt(x) BR(rsqrt28pd, _mask, x, undef(), ~0)
+#  else
+#   define recip(x) B(rcp14pd, _mask, x, undef(), ~0)
+#   define rsqrt(x) B(rsqrt14pd, _mask, x, undef(), ~0)
+#  endif
 #  define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
 #  define trunc(x) BR(rndscalepd_, _mask, x, 0b1011, undef(), ~0)
 #  if VEC_SIZE == 16
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -178,14 +178,20 @@  DECL_OCTET(half);
 /* Sadly there are a few exceptions to the general naming rules. */
 # define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
 # define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
+# define __builtin_ia32_exp2pd512_mask __builtin_ia32_exp2pd_mask
+# define __builtin_ia32_exp2ps512_mask __builtin_ia32_exp2ps_mask
 # define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
 # define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
 # define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
 # define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
 # define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
 # define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
+# define __builtin_ia32_rcp28pd512_mask __builtin_ia32_rcp28pd_mask
+# define __builtin_ia32_rcp28ps512_mask __builtin_ia32_rcp28ps_mask
 # define __builtin_ia32_rndscalepd_512_mask __builtin_ia32_rndscalepd_mask
 # define __builtin_ia32_rndscaleps_512_mask __builtin_ia32_rndscaleps_mask
+# define __builtin_ia32_rsqrt28pd512_mask __builtin_ia32_rsqrt28pd_mask
+# define __builtin_ia32_rsqrt28ps512_mask __builtin_ia32_rsqrt28ps_mask
 # define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
 # define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
 # define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -24,6 +24,7 @@  asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512f.h"
 #include "avx512bw.h"
 #include "avx512dq.h"
+#include "avx512er.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -106,6 +107,11 @@  static bool simd_check_avx512dq_vl(void)
     return cpu_has_avx512dq && cpu_has_avx512vl;
 }
 
+static bool simd_check_avx512er(void)
+{
+    return cpu_has_avx512er;
+}
+
 static bool simd_check_avx512bw(void)
 {
     return cpu_has_avx512bw;
@@ -327,6 +333,10 @@  static const struct {
     AVX512VL(DQ+VL u64x2,    avx512dq,      16u8),
     AVX512VL(DQ+VL s64x4,    avx512dq,      32i8),
     AVX512VL(DQ+VL u64x4,    avx512dq,      32u8),
+    SIMD(AVX512ER f32 scalar,avx512er,        f4),
+    SIMD(AVX512ER f32x16,    avx512er,      64f4),
+    SIMD(AVX512ER f64 scalar,avx512er,        f8),
+    SIMD(AVX512ER f64x8,     avx512er,      64f8),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -134,6 +134,7 @@  static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_bmi2       cp.feat.bmi2
 #define cpu_has_avx512f   (cp.feat.avx512f  && xcr0_mask(0xe6))
 #define cpu_has_avx512dq  (cp.feat.avx512dq && xcr0_mask(0xe6))
+#define cpu_has_avx512er  (cp.feat.avx512er && xcr0_mask(0xe6))
 #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
 #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
 #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -471,6 +471,10 @@  static const struct ext0f38_table {
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x4c] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
     [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
@@ -510,7 +514,12 @@  static const struct ext0f38_table {
     [0xbd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xbe] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0xbf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
-    [0xc8 ... 0xcd] = { .simd_size = simd_other },
+    [0xc8] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0xc9] = { .simd_size = simd_other },
+    [0xca] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
     [0xf0] = { .two_op = 1 },
@@ -1873,6 +1882,7 @@  static bool vcpu_has(
 #define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
 #define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
+#define vcpu_has_avx512er()    vcpu_has(         7, EBX, 27, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
@@ -6168,6 +6178,8 @@  x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x45): /* vpsrlv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x46): /* vpsrav{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x47): /* vpsllv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4c): /* vrcp14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4e): /* vrsqrt14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.brs, EXC_UD);
@@ -8865,6 +8877,13 @@  x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4d): /* vrcp14s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4f): /* vrsqrt14s{s,d} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(evex.brs, EXC_UD);
+        avx512_vlen_check(true);
+        goto simd_zmm;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
         generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
@@ -9112,6 +9131,7 @@  x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
         host_and_vcpu_must_have(avx512f);
+    simd_zmm_scalar_sae:
         generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
         if ( !evex.brs )
             avx512_vlen_check(true);
@@ -9127,6 +9147,19 @@  x86_emulate(
         op_bytes = 16;
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xc8): /* vexp2p{s,d} zmm/mem,zmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xca): /* vrcp28p{s,d} zmm/mem,zmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcc): /* vrsqrt28p{s,d} zmm/mem,zmm{k} */
+        host_and_vcpu_must_have(avx512er);
+        generate_exception_if((ea.type != OP_REG || !evex.brs) && evex.lr != 2,
+                              EXC_UD);
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcb): /* vrcp28s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcd): /* vrsqrt28s{s,d} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512er);
+        goto simd_zmm_scalar_sae;
+
     case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
     case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
         vcpu_must_have(movbe);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -102,6 +102,7 @@ 
 #define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
 #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
+#define cpu_has_avx512er        boot_cpu_has(X86_FEATURE_AVX512ER)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 #define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
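
As a rough illustration of what the new simd.c recip() path exercises for
64-byte vectors when __AVX512ER__ is defined, here is a standalone sketch
using the corresponding intrinsic rather than the test harness's builtin
wrappers (build with -mavx512er; runs only on AVX512ER hardware such as
KNL/KNM):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m512d x = _mm512_set1_pd(4.0);
    __m512d r = _mm512_rcp28_pd(x); /* VRCP28PD: relative error < 2^-28 */
    double out[8];

    _mm512_storeu_pd(out, r);
    printf("%f\n", out[0]); /* approximately 0.25 */
    return 0;
}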