@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
CFLAGS += $(CFLAGS_xeninclude)
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
FMA := fma4 fma
SG := avx2-sg
TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -66,6 +66,9 @@ xop-flts := $(avx-flts)
avx512f-vecs := 64 16 32
avx512f-ints := 4 8
avx512f-flts := 4 8
+avx512bw-vecs := $(avx512f-vecs)
+avx512bw-ints := 1 2
+avx512bw-flts :=
avx512f-opmask-vecs := 2
avx512dq-opmask-vecs := 1
@@ -31,6 +31,10 @@ ENTRY(simd_test);
# define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
# elif FLOAT_SIZE == 8
# define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
+# define eq(x, y) (B(pcmpeqw, _mask, (vhi_t)(x), (vhi_t)(y), -1) == ALL_TRUE)
# elif INT_SIZE == 4 || UINT_SIZE == 4
# define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
# elif INT_SIZE == 8 || UINT_SIZE == 8
@@ -374,6 +378,87 @@ static inline bool _to_bool(byte_vec_t b
# define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# endif
+#elif (INT_SIZE == 1 || UINT_SIZE == 1 || INT_SIZE == 2 || UINT_SIZE == 2) && \
+ defined(__AVX512BW__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define broadcast(x) ({ \
+ vec_t t_; \
+ asm ( "%{evex%} vpbroadcastb %1, %0" \
+ : "=v" (t_) : "m" (*(char[1]){ x }) ); \
+ t_; \
+})
+# define broadcast2(x) ({ \
+ vec_t t_; \
+ asm ( "vpbroadcastb %k1, %0" : "=v" (t_) : "r" (x) ); \
+ t_; \
+})
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) ((vec_t)B(punpckhbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+# define interleave_lo(x, y) ((vec_t)B(punpcklbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+# define swap(x) ((vec_t)B(pshufb, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
+# elif defined(__AVX512VBMI__)
+# define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
+# define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
+# endif
+# define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
+ (0b0101010101010101010101010101010101010101010101010101010101010101LL & ALL_TRUE)))
+# define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
+# define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
+# define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define broadcast(x) ({ \
+ vec_t t_; \
+ asm ( "%{evex%} vpbroadcastw %1, %0" \
+ : "=v" (t_) : "m" (*(short[1]){ x }) ); \
+ t_; \
+})
+# define broadcast2(x) ({ \
+ vec_t t_; \
+ asm ( "vpbroadcastw %k1, %0" : "=v" (t_) : "r" (x) ); \
+ t_; \
+})
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) ((vec_t)B(punpckhwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+# define interleave_lo(x, y) ((vec_t)B(punpcklwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, \
+ (vsi_t)B(pshufhw, _mask, \
+ B(pshuflw, _mask, (vhi_t)(x), 0b00011011, (vhi_t)undef(), ~0), \
+ 0b00011011, (vhi_t)undef(), ~0), \
+ 0b01001110, (vsi_t)undef(), ~0))
+# else
+# define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
+# define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
+# endif
+# define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
+ (0b01010101010101010101010101010101 & ALL_TRUE)))
+# define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
+# define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
+# endif
+# if INT_SIZE == 1
+# define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+# define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+# define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
+# define widen2(x) ((vec_t)B(pmovsxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
+# define widen3(x) ((vec_t)B(pmovsxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 1
+# define max(x, y) ((vec_t)B(pmaxub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+# define min(x, y) ((vec_t)B(pminub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+# define widen1(x) ((vec_t)B(pmovzxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
+# define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
+# define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 2
+# define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
+# define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
+# define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
+# define widen1(x) ((vec_t)B(pmovsxwd, _mask, x, (vsi_t)undef(), ~0))
+# define widen2(x) ((vec_t)B(pmovsxwq, _mask, x, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 2
+# define max(x, y) ((vec_t)B(pmaxuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+# define min(x, y) ((vec_t)B(pminuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+# define mul_hi(x, y) ((vec_t)B(pmulhuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+# define widen1(x) ((vec_t)B(pmovzxwd, _mask, (vhi_half_t)(x), (vsi_t)undef(), ~0))
+# define widen2(x) ((vec_t)B(pmovzxwq, _mask, (vhi_quarter_t)(x), (vdi_t)undef(), ~0))
+# endif
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if INT_SIZE == 1 || UINT_SIZE == 1
# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
@@ -565,7 +650,7 @@ static inline bool _to_bool(byte_vec_t b
# endif
# endif
#endif
-#if VEC_SIZE == 16 && defined(__SSSE3__)
+#if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
# if INT_SIZE == 1
# define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
# elif INT_SIZE == 2
@@ -789,6 +874,40 @@ static inline half_t low_half(vec_t x)
}
# endif
+# if !defined(low_quarter) && defined(QUARTER_SIZE)
+static inline quarter_t low_quarter(vec_t x)
+{
+# if QUARTER_SIZE < VEC_SIZE
+ quarter_t y;
+ unsigned int i;
+
+ for ( i = 0; i < ELEM_COUNT / 4; ++i )
+ y[i] = x[i];
+
+ return y;
+# else
+ return x;
+# endif
+}
+# endif
+
+# if !defined(low_eighth) && defined(EIGHTH_SIZE)
+static inline eighth_t low_eighth(vec_t x)
+{
+# if EIGHTH_SIZE < VEC_SIZE
+ eighth_t y;
+ unsigned int i;
+
+ for ( i = 0; i < ELEM_COUNT / 8; ++i )
+ y[i] = x[i];
+
+ return y;
+# else
+ return x;
+# endif
+}
+# endif
+
#endif
#if defined(__AVX512F__) && defined(FLOAT_SIZE)
@@ -1117,7 +1236,7 @@ int simd_test(void)
y = interleave_lo(alt < 0, alt < 0);
y = interleave_lo(z, y);
touch(x);
- z = widen2(x);
+ z = widen2(low_quarter(x));
touch(x);
if ( !eq(z, y) ) return __LINE__;
@@ -1126,7 +1245,7 @@ int simd_test(void)
y = interleave_lo(y, y);
y = interleave_lo(z, y);
touch(x);
- z = widen3(x);
+ z = widen3(low_eighth(x));
touch(x);
if ( !eq(z, y) ) return __LINE__;
# endif
@@ -1148,14 +1267,14 @@ int simd_test(void)
# ifdef widen2
touch(src);
- x = widen2(src);
+ x = widen2(low_quarter(src));
touch(src);
if ( !eq(x, z) ) return __LINE__;
# endif
# ifdef widen3
touch(src);
- x = widen3(src);
+ x = widen3(low_eighth(src));
touch(src);
if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
# endif
@@ -1175,6 +1294,36 @@ int simd_test(void)
if ( aux2[i] != src[i] )
return __LINE__;
}
+#endif
+
+#if defined(widen2) && defined(shrink2)
+ {
+ quarter_t aux1 = low_quarter(src), aux2;
+
+ touch(aux1);
+ x = widen2(aux1);
+ touch(x);
+ aux2 = shrink2(x);
+ touch(aux2);
+ for ( i = 0; i < ELEM_COUNT / 4; ++i )
+ if ( aux2[i] != src[i] )
+ return __LINE__;
+ }
+#endif
+
+#if defined(widen3) && defined(shrink3)
+ {
+ eighth_t aux1 = low_eighth(src), aux2;
+
+ touch(aux1);
+ x = widen3(aux1);
+ touch(x);
+ aux2 = shrink3(x);
+ touch(aux2);
+ for ( i = 0; i < ELEM_COUNT / 8; ++i )
+ if ( aux2[i] != src[i] )
+ return __LINE__;
+ }
#endif
#ifdef dup_lo
@@ -95,6 +95,32 @@ typedef int __attribute__((vector_size(H
typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
# endif
+# if ELEM_COUNT >= 4
+# if VEC_SIZE > 64
+# define QUARTER_SIZE (VEC_SIZE / 4)
+# else
+# define QUARTER_SIZE 16
+# endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) quarter_t;
+typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
+typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
+typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
+typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
+# endif
+
+# if ELEM_COUNT >= 8
+# if VEC_SIZE > 128
+# define EIGHTH_SIZE (VEC_SIZE / 8)
+# else
+# define EIGHTH_SIZE 16
+# endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) eighth_t;
+typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
+typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
+typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
+typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
+# endif
+
#endif
#if VEC_SIZE == 16
@@ -182,6 +208,9 @@ OVR_SFP(broadcast);
OVR_SFP(comi);
OVR_FP(add);
OVR_INT(add);
+OVR_BW(adds);
+OVR_BW(addus);
+OVR_BW(avg);
OVR_FP(div);
OVR(extractps);
OVR_FMA(fmadd, FP);
@@ -214,6 +243,8 @@ OVR_INT(srl);
OVR_DQ(srlv);
OVR_FP(sub);
OVR_INT(sub);
+OVR_BW(subs);
+OVR_BW(subus);
OVR_SFP(ucomi);
OVR_VFP(unpckh);
OVR_VFP(unpckl);
@@ -275,6 +306,31 @@ OVR(punpckldq);
OVR(punpcklqdq);
# endif
+# ifdef __AVX512BW__
+OVR(pextrb);
+OVR(pextrw);
+OVR(pinsrb);
+OVR(pinsrw);
+# ifdef __AVX512VL__
+OVR(pmaddwd);
+OVR(pmovsxbw);
+OVR(pmovzxbw);
+OVR(pmulhuw);
+OVR(pmulhw);
+OVR(pmullw);
+OVR(psadbw);
+OVR(pshufb);
+OVR(pshufhw);
+OVR(pshuflw);
+OVR(pslldq);
+OVR(psrldq);
+OVR(punpckhbw);
+OVR(punpckhwd);
+OVR(punpcklbw);
+OVR(punpcklwd);
+# endif
+# endif
+
# undef OVR_VFP
# undef OVR_SFP
# undef OVR_INT
@@ -22,6 +22,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512dq-opmask.h"
#include "avx512bw-opmask.h"
#include "avx512f.h"
+#include "avx512bw.h"
#define verbose false /* Switch to true for far more logging. */
@@ -105,6 +106,11 @@ static bool simd_check_avx512bw(void)
}
#define simd_check_avx512bw_opmask simd_check_avx512bw
+static bool simd_check_avx512bw_vl(void)
+{
+ return cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
@@ -284,6 +290,18 @@ static const struct {
AVX512VL(VL u64x2, avx512f, 16u8),
AVX512VL(VL s64x4, avx512f, 32i8),
AVX512VL(VL u64x4, avx512f, 32u8),
+ SIMD(AVX512BW s8x64, avx512bw, 64i1),
+ SIMD(AVX512BW u8x64, avx512bw, 64u1),
+ SIMD(AVX512BW s16x32, avx512bw, 64i2),
+ SIMD(AVX512BW u16x32, avx512bw, 64u2),
+ AVX512VL(BW+VL s8x16, avx512bw, 16i1),
+ AVX512VL(BW+VL u8x16, avx512bw, 16u1),
+ AVX512VL(BW+VL s8x32, avx512bw, 32i1),
+ AVX512VL(BW+VL u8x32, avx512bw, 32u1),
+ AVX512VL(BW+VL s16x8, avx512bw, 16i2),
+ AVX512VL(BW+VL u16x8, avx512bw, 16u2),
+ AVX512VL(BW+VL s16x16, avx512bw, 32i2),
+ AVX512VL(BW+VL u16x16, avx512bw, 32u2),
#undef AVX512VL_
#undef AVX512VL
#undef SIMD_
Test various of the insns which have been implemented already. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v8: Correct PS{R,L}LDQ overrides. v6: Re-base over changes earlier in the series. v4: Add __AVX512VL__ conditional around majority of OVR() additions. Correct eq() for 1- and 2-byte cases. v3: New.