
[v8,46/50] x86emul: support GFNI insns

Message ID 5C8B8733020000780021F323@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Series x86emul: remaining AVX512 support

Commit Message

Jan Beulich March 15, 2019, 11:06 a.m. UTC
Note that the ISA extensions document revision 035 is ambiguous
regarding fault suppression for VGF2P8MULB: Text says it's supported,
while the exception specification listed is E4NF. Given the wording here
and for the other two insns I'm inclined to trust the text more than the
exception reference, which was also confirmed informally.

As to the feature dependency adjustment, while strictly speaking SSE is
a sufficient prereq (to have XMM registers), vectors of bytes and qwords
were introduced only with SSE2. gcc, for example, uses a similar
connection in its respective intrinsics header.
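
To make the data type point concrete, a minimal sketch using only standard
SSE2 intrinsics (the helper name is mine, purely illustrative):

#include <emmintrin.h>  /* SSE2: first level with byte/qword integer ops on XMM */

/*
 * GFNI operates on vectors of bytes (the data), and the affine forms
 * additionally use 8x8 bit matrices, each packed into a qword.  XMM
 * integer operations on both element widths only appeared with SSE2
 * (e.g. _mm_add_epi8 / _mm_add_epi64 live in emmintrin.h).
 */
static __m128i bump_bytes(__m128i x)
{
    return _mm_add_epi8(x, _mm_set1_epi8(1));
}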

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Add {evex}-producing vgf2p8mulb alias to simd.h. Add missing simd.h
    dependency. Re-base.
v7: New.

Comments

Andrew Cooper June 21, 2019, 1:19 p.m. UTC | #1
On 15/03/2019 11:06, Jan Beulich wrote:
> Note that the ISA extensions document revision 035 is ambiguous
> regarding fault suppression for VGF2P8MULB: Text says it's supported,
> while the exception specification listed is E4NF. Given the wording here
> and for the other two insns I'm inclined to trust the text more than the
> exception reference, which was also confirmed informally.

Version 037 has the exception reference as E4 rather than E4NF, so I
think this entire paragraph is stale now and can be dropped.

(On a tangent, AVX512_VP2INTERSECT now exists in the extensions doc.)

> As to the feature dependency adjustment, while strictly speaking SSE is
> a sufficient prereq (to have XMM registers), vectors of bytes and qwords
> were introduced only with SSE2. gcc, for example, uses a similar
> connection in its respective intrinsics header.

This is stale now that you've moved the other integer dependencies to SSE2.

Feature-wise, GFNI is rather awkward.  The single feature bit covers
the legacy SSE, VEX, and EVEX encodings.  This is clearly a brand new
functional block in the vector pipeline which has been wired into all
instruction paths (probably because that is easier than trying to
exclude the legacy SSE part).

> @@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
>      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
>  }
>  
> +static bool simd_check_sse2_gf(void)
> +{
> +    return cpu_has_gfni && cpu_has_sse2;

This dependency doesn't match the manual.  The legacy encoding needs
GFNI alone.

gen-cpuid.py is trying to reduce the ability to create totally
implausible configurations via levelling, but for software checks, we
should follow the manual to the letter.

> +}
> +
> +static bool simd_check_avx2_gf(void)
> +{
> +    return cpu_has_gfni && cpu_has_avx2;

Here, the dependency is only on AVX, which I think is probably trying to
express a dependency on xcr0.ymm

> +}
> +
> +static bool simd_check_avx512bw_gf(void)
> +{
> +    return cpu_has_gfni && cpu_has_avx512bw;

I don't see any BW interaction anywhere (in the manual), despite the
fact it operates on a datatype of int8.

~Andrew

> +}
> +
> +static bool simd_check_avx512bw_gf_vl(void)
> +{
> +    return cpu_has_gfni && cpu_has_avx512vl;
> +}
> +
>  static void simd_set_regs(struct cpu_user_regs *regs)
>  {
>      if ( cpu_has_mmx )
>
Andrew Cooper June 21, 2019, 1:33 p.m. UTC | #2
On 21/06/2019 14:19, Andrew Cooper wrote:
>> +}
>> +
>> +static bool simd_check_avx512bw_gf(void)
>> +{
>> +    return cpu_has_gfni && cpu_has_avx512bw;
> I don't see any BW interaction anywhere (in the manual), despite the
> fact it operates on a datatype of int8.

Actually, it is an integer operation (and looks suspiciously like a
lookup table) rather than a vector operation, so probably doesn't count
as using the int8 datatype, which explains the lack of interaction with BW.
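
For reference, a scalar sketch of the per-byte operation gf2p8mulb carries
out - multiplication in GF(2^8) reduced modulo x^8 + x^4 + x^3 + x + 1
(the AES field polynomial); the helper below is purely illustrative:

static unsigned char gf256_mul(unsigned char a, unsigned char b)
{
    unsigned int r = 0, i;

    /* Carry-less multiply: XOR-accumulate shifted copies of 'a'. */
    for ( i = 0; i < 8; ++i )
        if ( b & (1u << i) )
            r ^= (unsigned int)a << i;

    /* Reduce the up-to-15-bit product modulo 0x11b. */
    for ( i = 14; i >= 8; --i )
        if ( r & (1u << i) )
            r ^= 0x11bu << (i - 8);

    return r;
}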

~Andrew
Jan Beulich June 21, 2019, 2 p.m. UTC | #3
>>> On 21.06.19 at 15:19, <andrew.cooper3@citrix.com> wrote:
> On 15/03/2019 11:06, Jan Beulich wrote:
>> Note that the ISA extensions document revision 035 is ambiguous
>> regarding fault suppression for VGF2P8MULB: Text says it's supported,
>> while the exception specification listed is E4NF. Given the wording here
>> and for the other two insns I'm inclined to trust the text more than the
>> exception reference, which was also confirmed informally.
> 
> Version 037 has the exception reference as E4 rather than E4NF, so I
> think this entire paragraph is stale now and can be dropped.

Oh, indeed, they've corrected that.

> (On a tangent, AVX512_VP2INTERSECT now exists in the extensions doc.)

And I have it implemented, but no way to test until sde supports it.

>> As to the feature dependency adjustment, while strictly speaking SSE is
>> a sufficient prereq (to have XMM registers), vectors of bytes and qwords
>> were introduced only with SSE2. gcc, for example, uses a similar
>> connection in its respective intrinsics header.
> 
> This is stale now that you've moved the other integer dependences to SSE2.

Hmm, it's redundant with that other change, but not really stale.
I can drop it if you want.

>> @@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
>>      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
>>  }
>>  
>> +static bool simd_check_sse2_gf(void)
>> +{
>> +    return cpu_has_gfni && cpu_has_sse2;
> 
> This dependency doesn't match the manual.  The legacy encoding needs
> GFNI alone.
> 
> gen-cpuid.py is trying to reduce the ability to create totally
> implausible configurations via levelling, but for software checks, we
> should follow the manual to the letter.

This is test harness code - I'd rather be a little more strict here than
having to needlessly spend time fixing an issue in there. Furthermore
this matches how gcc enforces dependencies.

>> +}
>> +
>> +static bool simd_check_avx2_gf(void)
>> +{
>> +    return cpu_has_gfni && cpu_has_avx2;
> 
> Here, the dependency is only on AVX, which I think is probably trying to
> express a dependency on xcr0.ymm

Mostly as per above, except that here gcc (imo wrongly) enables just
AVX.

>> +}
>> +
>> +static bool simd_check_avx512bw_gf(void)
>> +{
>> +    return cpu_has_gfni && cpu_has_avx512bw;
> 
> I don't see any BW interaction anywhere (in the manual), despite the
> fact it operates on a datatype of int8.

But by operating on vectors of bytes, it requires 64 bits wide mask
registers, which is the connection to BW. Again gcc also does so.
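
To put numbers on it: a ZMM register holds 512 / 8 = 64 byte elements, so
the per-element write mask needs all 64 bits of a mask register, and mask
registers wider than 16 bits are an AVX512BW addition.  A minimal sketch
along the lines of simd-gf.c's AVX512BW path (using the same gcc builtin;
needs -mgfni -mavx512bw to build):

typedef char v64qi_t __attribute__((vector_size(64)));

/* Masked gf2p8mulb on a full ZMM vector of bytes: the mask covers one
 * bit per byte element, i.e. 64 bits.
 */
static v64qi_t masked_gf2p8mulb(v64qi_t x, v64qi_t y, unsigned long long k)
{
    return __builtin_ia32_vgf2p8mulb_v64qi_mask(x, y, (v64qi_t){}, k);
}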

Jan
Andrew Cooper June 21, 2019, 2:20 p.m. UTC | #4
On 21/06/2019 15:00, Jan Beulich wrote:
>> On 15/03/2019 11:06, Jan Beulich wrote:
>> (On a tangent, AVX512_VP2INTERSECT now exists in the extensions doc.)
> And I have it implemented, but no way to test until sde supports it.

Fair enough.

>>> @@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
>>>      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
>>>  }
>>>  
>>> +static bool simd_check_sse2_gf(void)
>>> +{
>>> +    return cpu_has_gfni && cpu_has_sse2;
>> This dependency doesn't match the manual.  The legacy encoding needs
>> GFNI alone.
>>
>> gen-cpuid.py is trying to reduce the ability to create totally
>> implausible configurations via levelling, but for software checks, we
>> should follow the manual to the letter.
> This is test harness code - I'd rather be a little more strict here than
> having to needlessly spend time fixing an issue in there. Furthermore
> this matches how gcc enforces dependencies.
>>> +}
>>> +
>>> +static bool simd_check_avx2_gf(void)
>>> +{
>>> +    return cpu_has_gfni && cpu_has_avx2;
>> Here, the dependency is only on AVX, which I think is probably trying to
>> express a dependency on xcr0.ymm
> Mostly as per above, except that here gcc (imo wrongly) enables just
> AVX.
>
>>> +}
>>> +
>>> +static bool simd_check_avx512bw_gf(void)
>>> +{
>>> +    return cpu_has_gfni && cpu_has_avx512bw;
>> I don't see any BW interaction anywhere (in the manual), despite the
>> fact it operates on a datatype of int8.
> But by operating on vectors of bytes, it requires 64 bits wide mask
> registers, which is the connection to BW. Again gcc also does so.

To be honest, it doesn't matter what GCC does.

What matters is the expectation of arbitrary library/application code,
written in adherence to the Intel manual, when running with a levelled
CPUID policy, because *that* is the set of corner cases where things
might end up exploding.

I see your point about needing a full width mask register, which to me
suggests that the extension manual is documenting the dependency
incorrectly.

It also means that I need to change how we do feature dependency
derivation, because this is the first example of a conditional
dependency.  I.e. AVX512F but not AVX512BW implies no GFNI even if
hardware has it, but on the same hardware when levelling AVX512F out,
GFNI could be used via its legacy or VEX version.

~Andrew
Jan Beulich June 21, 2019, 3:02 p.m. UTC | #5
>>> On 21.06.19 at 16:20, <andrew.cooper3@citrix.com> wrote:
> On 21/06/2019 15:00, Jan Beulich wrote:
>>> On 15/03/2019 11:06, Jan Beulich wrote:
>>> (On a tangent, AVX512_VP2INTERSECT now exists in the extensions doc.)
>> And I have it implemented, but no way to test until sde supports it.
> 
> Fair enough.
> 
>>>> @@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
>>>>      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
>>>>  }
>>>>  
>>>> +static bool simd_check_sse2_gf(void)
>>>> +{
>>>> +    return cpu_has_gfni && cpu_has_sse2;
>>> This dependency doesn't match the manual.  The legacy encoding needs
>>> GFNI alone.
>>>
>>> gen-cpuid.py is trying to reduce the ability to create totally
>>> implausible configurations via levelling, but for software checks, we
>>> should follow the manual to the letter.
>> This is test harness code - I'd rather be a little more strict here than
>> having to needlessly spend time fixing an issue in there. Furthermore
>> this matches how gcc enforces dependencies.
>>>> +}
>>>> +
>>>> +static bool simd_check_avx2_gf(void)
>>>> +{
>>>> +    return cpu_has_gfni && cpu_has_avx2;
>>> Here, the dependency is only on AVX, which I think is probably trying to
>>> express a dependency on xcr0.ymm
>> Mostly as per above, except that here gcc (imo wrongly) enables just
>> AVX.
>>
>>>> +}
>>>> +
>>>> +static bool simd_check_avx512bw_gf(void)
>>>> +{
>>>> +    return cpu_has_gfni && cpu_has_avx512bw;
>>> I don't see any BW interaction anywhere (in the manual), despite the
>>> fact it operates on a datatype of int8.
>> But by operating on vectors of bytes, it requires 64 bits wide mask
>> registers, which is the connection to BW. Again gcc also does so.
> 
> To be honest, it doesn't matter what GCC does.
> 
> What matters is the expectation of arbitrary library/application code,
> written in adherence to the Intel manual, when running with a levelled
> CPUID policy, because *that* is the set of corner cases where things
> might end up exploding.

I agree, and I would agree to making the changes you've asked for if
this were code in the main emulator. But as said - you've commented
on test harness qualification functions.

> I see your point about needing a full width mask register, which to me
> suggests that the extension manual is documenting the dependency
> incorrectly.
> 
> It also means that I need to change how we do feature dependency
> derivation, because this is the first example of a conditional
> dependency.  I.e. AVX512F but not AVX512BW implies no GFNI even if
> hardware has it, but on the same hardware when levelling AVX512F out,
> GFNI could be used via its legacy or VEX version.

Right, the way GFNI is specified isn't overly helpful. Otoh - why the
former? The legacy and VEX versions are usable in that case, too.

Jan
Jan Beulich June 25, 2019, 6:48 a.m. UTC | #6
>>> On 21.06.19 at 16:20, <andrew.cooper3@citrix.com> wrote:
> On 21/06/2019 15:00, Jan Beulich wrote:
>>> On 15/03/2019 11:06, Jan Beulich wrote:
>>>> @@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
>>>>      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
>>>>  }
>>>>  
>>>> +static bool simd_check_sse2_gf(void)
>>>> +{
>>>> +    return cpu_has_gfni && cpu_has_sse2;
>>> This dependency doesn't match the manual.  The legacy encoding needs
>>> GFNI alone.
>>>
>>> gen-cpuid.py is trying to reduce the ability to create totally
>>> implausible configurations via levelling, but for software checks, we
>>> should follow the manual to the letter.
>> This is test harness code - I'd rather be a little more strict here than
>> having to needlessly spend time fixing an issue in there. Furthermore
>> this matches how gcc enforces dependencies.
>>>> +}
>>>> +
>>>> +static bool simd_check_avx2_gf(void)
>>>> +{
>>>> +    return cpu_has_gfni && cpu_has_avx2;
>>> Here, the dependency is only on AVX, which I think is probably trying to
>>> express a dependency on xcr0.ymm
>> Mostly as per above, except that here gcc (imo wrongly) enables just
>> AVX.
>>
>>>> +}
>>>> +
>>>> +static bool simd_check_avx512bw_gf(void)
>>>> +{
>>>> +    return cpu_has_gfni && cpu_has_avx512bw;
>>> I don't see any BW interaction anywhere (in the manual), despite the
>>> fact it operates on a datatype of int8.
>> But by operating on vectors of bytes, it requires 64 bits wide mask
>> registers, which is the connection to BW. Again gcc also does so.
> 
> To be honest, it doesn't matter what GCC does.

Coming back to this - it very much matters _here_ (i.e. in test harness
code): For one, the checks above need to be in line with the -m<isa>
options we pass to the compiler. I.e. if anything, the question would
rather be about the Makefile additions, namely why I enable SSE2, AVX2,
and AVX512BW respectively.

While I could simply claim that this is my choice for producing
sensible test case binary blobs, there's a compiler aspect _there_:
gcc's intrinsics header enables SSE2, AVX, and AVX512F / AVX512BW
around the definitions of the inline wrappers around the builtins.
This in turn is because the availability of the respective builtins
depends on these ISAs being enabled alongside GFNI itself. We
therefore may not use any ISA level _lower_ than what the
builtins require. As mentioned before, seeing them use SSE2 as the
prereq for the legacy-encoded insns, I question their use of AVX
instead of AVX2, and hence I'd prefer to stick with AVX2; if nothing
else, then simply because of what I've said in the first half-sentence
of this paragraph. (My guess is that it's the availability of
{,V}MOVDQ{A,U}* that determined their choice, rather than the
general availability of vector operations on the given vector and
element types and sizes. I'm sure this would lead to some rather
"funny" generated code when inputs come from other
transformations rather than straight from memory.)
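
As a concrete illustration of the sse2-gf flavour: the test blob boils down
to uses of the legacy-encoded builtin below, which (per the above) gcc only
exposes with both GFNI and SSE2 enabled - hence the paired
cpu_has_gfni && cpu_has_sse2 runtime check.  Sketch only, built with
-mgfni -msse2 and no AVX:

typedef char v16qi_t __attribute__((vector_size(16)));

/* Legacy-encoded gf2p8mulb, matching simd_check_sse2_gf() in the harness. */
static v16qi_t gf_mul_legacy(v16qi_t x, v16qi_t y)
{
    return __builtin_ia32_vgf2p8mulb_v16qi(x, y);
}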

Jan

Patch

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,7 +19,8 @@  CFLAGS += $(CFLAGS_xeninclude)
 SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
 FMA := fma4 fma
 SG := avx2-sg avx512f-sg avx512vl-sg
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
+GF := sse2-gf avx2-gf avx512bw-gf
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
 
 OPMASK := avx512f avx512dq avx512bw
 
@@ -142,12 +143,17 @@  $(1)-cflags := \
 	   $(foreach flt,$($(1)-flts), \
 	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define simd-gf-defs
+$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
+	         "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
 define opmask-defs
 $(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
 endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
 $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -197,7 +203,10 @@  $(addsuffix .c,$(FMA)):
 $(addsuffix .c,$(SG)):
 	ln -sf simd-sg.c $@
 
-$(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
+$(addsuffix .c,$(GF)):
+	ln -sf simd-gf.c $@
+
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
 
 xop.h avx512f.h: simd-fma.c
 
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,12 @@  static const struct test avx512_vpopcntd
     INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
 };
 
+static const struct test gfni_all[] = {
+    INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
+    INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
+    INSN(gf2p8mulb,        66, 0f38, cf, vl, b, vl),
+};
+
 /*
  * The uses of b in this table are simply (one of) the shortest form(s) of
  * saying "no broadcast" without introducing a 128-bit granularity enumerator.
@@ -987,6 +993,7 @@  void evex_disp8_test(void *instr, struct
 
     if ( cpu_has_avx512f )
     {
+        RUN(gfni, all);
         RUN(vaes, all);
         RUN(vpclmulqdq, all);
     }
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -371,6 +371,7 @@  OVR(cvttsd2siq);
 OVR(cvttss2si);
 OVR(cvttss2sil);
 OVR(cvttss2siq);
+OVR(gf2p8mulb);
 OVR(movddup);
 OVR(movntdq);
 OVR(movntdqa);
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-gf.c
@@ -0,0 +1,80 @@ 
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(gf_test);
+
+#if VEC_SIZE == 16
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a)
+#elif VEC_SIZE == 32
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a)
+#elif VEC_SIZE == 64
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a)
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)
+# define transform(m, dir, x, c) ({ \
+    vec_t t_; \
+    asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], %[dst]" \
+          : [dst] "=v" (t_) \
+          : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE / 8) ); \
+    t_; \
+})
+#else
+# if defined(__AVX2__)
+#  define bcstq(x) ({ \
+    vdi_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \
+    t_; \
+})
+#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+#  define bcstq(x) ((vdi_t){x, x})
+#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y))
+# define transform(m, dir, x, c) ({ \
+    vdi_t m_ = bcstq(m); \
+    touch(m_); \
+    ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \
+})
+#endif
+
+const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL;
+
+int gf_test(void)
+{
+    unsigned int i;
+    vec_t src, one;
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i;
+        one[i] = 1;
+    }
+
+    /* Special case for first iteration. */
+    one[0] = 0;
+
+    do {
+        vec_t inv = transform(ident, inv, src, 0);
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(src, inv), one) ) return __LINE__;
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(inv, src), one) ) return __LINE__;
+
+        one[0] = 1;
+
+        src += ELEM_COUNT;
+        i += ELEM_COUNT;
+    } while ( i < 256 );
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,12 +11,14 @@  asm ( ".pushsection .test, \"ax\", @prog
 #include "3dnow.h"
 #include "sse.h"
 #include "sse2.h"
+#include "sse2-gf.h"
 #include "sse4.h"
 #include "avx.h"
 #include "fma4.h"
 #include "fma.h"
 #include "avx2.h"
 #include "avx2-sg.h"
+#include "avx2-gf.h"
 #include "xop.h"
 #include "avx512f-opmask.h"
 #include "avx512dq-opmask.h"
@@ -25,6 +27,7 @@  asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512f-sg.h"
 #include "avx512vl-sg.h"
 #include "avx512bw.h"
+#include "avx512bw-gf.h"
 #include "avx512dq.h"
 #include "avx512er.h"
 #include "avx512vbmi.h"
@@ -138,6 +141,26 @@  static bool simd_check_avx512vbmi_vl(voi
     return cpu_has_avx512_vbmi && cpu_has_avx512vl;
 }
 
+static bool simd_check_sse2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_sse2;
+}
+
+static bool simd_check_avx2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_gf_vl(void)
+{
+    return cpu_has_gfni && cpu_has_avx512vl;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -395,6 +418,12 @@  static const struct {
     AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
     AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
     AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(GFNI (legacy),       sse2_gf,        16),
+    SIMD(GFNI (VEX/x16),      avx2_gf,        16),
+    SIMD(GFNI (VEX/x32),      avx2_gf,        32),
+    SIMD(GFNI (EVEX/x64), avx512bw_gf,        64),
+    AVX512VL(VL+GFNI (x16), avx512bw_gf,      16),
+    AVX512VL(VL+GFNI (x32), avx512bw_gf,      32),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@  static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
 #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
 #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_gfni       cp.feat.gfni
 #define cpu_has_vaes      (cp.feat.vaes && xcr0_mask(6))
 #define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
 #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -540,6 +540,7 @@  static const struct ext0f38_table {
     [0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xf0] = { .two_op = 1 },
@@ -619,6 +620,7 @@  static const struct ext0f3a_table {
     [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0xcc] = { .simd_size = simd_other },
+    [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = {},
 };
@@ -1922,6 +1924,7 @@  static bool vcpu_has(
 #define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
 #define vcpu_has_avx512_vbmi() vcpu_has(         7, ECX,  1, ctxt, ops)
 #define vcpu_has_avx512_vbmi2() vcpu_has(        7, ECX,  6, ctxt, ops)
+#define vcpu_has_gfni()        vcpu_has(         7, ECX,  8, ctxt, ops)
 #define vcpu_has_vaes()        vcpu_has(         7, ECX,  9, ctxt, ops)
 #define vcpu_has_vpclmulqdq()  vcpu_has(         7, ECX, 10, ctxt, ops)
 #define vcpu_has_avx512_vnni() vcpu_has(         7, ECX, 11, ctxt, ops)
@@ -9652,6 +9655,21 @@  x86_emulate(
         host_and_vcpu_must_have(avx512er);
         goto simd_zmm_scalar_sae;
 
+    case X86EMUL_OPC_66(0x0f38, 0xcf):      /* gf2p8mulb xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f38_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xcf):  /* vgf2p8mulb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(evex.w || evex.brs, EXC_UD);
+        elem_bytes = 1;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdc):  /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdd):  /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xde):  /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10395,6 +10413,24 @@  x86_emulate(
         op_bytes = 16;
         goto simd_0f3a_common;
 
+    case X86EMUL_OPC_66(0x0f3a, 0xce):      /* gf2p8affineqb $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0xcf):      /* gf2p8affineinvqb $imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xce):  /* vgf2p8affineqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf):  /* vgf2p8affineinvqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!evex.w, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm8_no_sae;
+
     case X86EMUL_OPC_66(0x0f3a, 0xdf):     /* aeskeygenassist $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(aesni);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -112,6 +112,7 @@ 
 /* CPUID level 0x00000007:0.ecx */
 #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
 #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_gfni            boot_cpu_has(X86_FEATURE_GFNI)
 #define cpu_has_vaes            boot_cpu_has(X86_FEATURE_VAES)
 #define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
 #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@  XEN_CPUFEATURE(UMIP,          6*32+ 2) /
 XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
 XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
 XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(GFNI,          6*32+ 8) /*A  Galois Field Instrs */
 XEN_CPUFEATURE(VAES,          6*32+ 9) /*A  Vector AES Instrs */
 XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication Instrs */
 XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -197,7 +197,7 @@  def crunch_numbers(state):
         # %XMM support, without specific inter-dependencies.  Additionally
         # AMD has a special mis-alignment sub-mode.
         SSE: [SSE2, SSE3, SSSE3, SSE4A, MISALIGNSSE,
-              AESNI, PCLMULQDQ, SHA],
+              AESNI, PCLMULQDQ, SHA, GFNI],
 
         # SSE2 was re-specified as core instructions for 64bit.
         SSE2: [LM],